318 lines
9 KiB
Scheme
318 lines
9 KiB
Scheme
;;
|
|
;; util-utf8.scm
|
|
;;
|
|
;; UTF-8 support
|
|
;;
|
|
;; ISC License
|
|
;;
|
|
;; Copyright 2023 Dominik Pantůček <dominik.pantucek@trustica.cz>
|
|
;;
|
|
;; Permission to use, copy, modify, and/or distribute this software
|
|
;; for any purpose with or without fee is hereby granted, provided
|
|
;; that the above copyright notice and this permission notice appear
|
|
;; in all copies.
|
|
;;
|
|
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
|
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
|
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
|
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
|
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
|
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
|
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
|
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
;;
|
|
|
|
(declare (unit util-utf8))
|
|
|
|
(module
|
|
util-utf8
|
|
(
|
|
utf8-char->string
|
|
|
|
make-utf8-string
|
|
|
|
string-append-utf8-char
|
|
|
|
utf8-string->lists
|
|
|
|
utf8-bytes->lists
|
|
|
|
utf8-string-length
|
|
|
|
utf8-string-next-char
|
|
|
|
utf8-string->list
|
|
list->utf8-string
|
|
|
|
string-utf8?
|
|
|
|
util-utf8-tests!
|
|
)
|
|
|
|
(import scheme
|
|
(chicken base)
|
|
(chicken bitwise)
|
|
testing)
|
|
|
|
;; Encodes given character as UTF-8
|
|
(define (utf8-char->string ch)
|
|
(let ((n (char->integer ch)))
|
|
(cond ((>= n #x10000)
|
|
(let ((r (make-string 4))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
|
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
|
|
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
|
|
(string-set! r 0 (integer->char (bitwise-ior b3 #b11110000)))
|
|
(string-set! r 1 (integer->char (bitwise-ior b2 #b10000000)))
|
|
(string-set! r 2 (integer->char (bitwise-ior b1 #b10000000)))
|
|
(string-set! r 3 (integer->char (bitwise-ior b0 #b10000000)))
|
|
r))
|
|
((>= n #x800)
|
|
(let ((r (make-string 3))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
|
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
|
|
(string-set! r 0 (integer->char (bitwise-ior b2 #b11100000)))
|
|
(string-set! r 1 (integer->char (bitwise-ior b1 #b10000000)))
|
|
(string-set! r 2 (integer->char (bitwise-ior b0 #b10000000)))
|
|
r))
|
|
((>= n #x80)
|
|
(let ((r (make-string 2))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
|
|
(string-set! r 0 (integer->char (bitwise-ior b1 #b11000000)))
|
|
(string-set! r 1 (integer->char (bitwise-ior b0 #b10000000)))
|
|
r))
|
|
(else
|
|
(make-string 1 ch)))))
|
|
|
|
;; UTF-8 version of make-string
|
|
(define (make-utf8-string n . chs)
|
|
(let* ((ch (if (null? chs) #\space (car chs)))
|
|
(s (utf8-char->string ch))
|
|
(sl (string-length s))
|
|
(rl (* n sl))
|
|
(r (make-string rl)))
|
|
(let loop ((ri 0)
|
|
(si 0))
|
|
(if (= ri rl)
|
|
r
|
|
(let ((nsi (add1 si)))
|
|
(string-set! r ri (string-ref s si))
|
|
(loop (add1 ri)
|
|
(if (= nsi sl) 0 nsi)))))))
|
|
|
|
;; UTF-8 character append
|
|
(define (string-append-utf8-char s ch)
|
|
(string-append s (utf8-char->string ch)))
|
|
|
|
;; Converts a UTF-8 string into two lists: list of UTF-8 characters
|
|
;; of the string and a list of remaining bytes (as integers).
|
|
(define (utf8-string->lists str)
|
|
(utf8-bytes->lists
|
|
(map char->integer
|
|
(string->list str))))
|
|
|
|
;; The same as above but accepts a list of bytes (as integers)
|
|
(define (utf8-bytes->lists chars)
|
|
(let loop ((bytes chars)
|
|
(rpending '())
|
|
(pending 0)
|
|
(expected #f)
|
|
(res '()))
|
|
(if (null? bytes)
|
|
(values (reverse res)
|
|
(reverse rpending))
|
|
(let ((byte (car bytes)))
|
|
(cond (expected
|
|
;; Decode UTF-8 sequence
|
|
(cond ((= expected 1)
|
|
;; Last byte
|
|
(let ((char (integer->char (bitwise-ior pending
|
|
(bitwise-and byte #b111111)))))
|
|
(loop (cdr bytes)
|
|
'()
|
|
0
|
|
#f
|
|
(cons char res))))
|
|
(else
|
|
;; Intermediate bytes
|
|
(loop (cdr bytes)
|
|
(cons byte rpending)
|
|
(arithmetic-shift (bitwise-ior pending
|
|
(bitwise-and byte #b111111)) 6)
|
|
(sub1 expected)
|
|
res))))
|
|
(else
|
|
;; ASCII or first of UTF-8 sequence
|
|
(cond ((= (bitwise-and byte #b10000000) 0)
|
|
;; ASCII
|
|
(loop (cdr bytes)
|
|
'()
|
|
0
|
|
#f
|
|
(cons (integer->char byte) res)))
|
|
(else
|
|
;; First byte of UTF-8 sequence
|
|
(let-values
|
|
(((first-byte char-bytes)
|
|
(cond ((= (bitwise-and byte #b11000000) #b11000000)
|
|
(values (bitwise-and byte #b11111)
|
|
2))
|
|
((= (bitwise-and byte #b11100000) #b11100000)
|
|
(values (bitwise-and byte #b1111)
|
|
3))
|
|
((= (bitwise-and byte #b11110000) #b11110000)
|
|
(values (bitwise-and byte #b111)
|
|
4))
|
|
(else
|
|
;; Should not happen
|
|
(values 0 0)))))
|
|
(loop (cdr bytes)
|
|
(list byte)
|
|
(arithmetic-shift first-byte 6)
|
|
(sub1 char-bytes)
|
|
res))))))))))
|
|
|
|
;; Returns the position right after the character at specified
|
|
;; position.
|
|
(define (utf8-string-next-char str . sis)
|
|
(let ((len (string-length str))
|
|
(si0 (if (null? sis) 0 (car sis))))
|
|
(let loop ((si si0)
|
|
(pc 0))
|
|
(if (or (= si len)
|
|
(and (> si si0)
|
|
(eq? pc 0)))
|
|
si
|
|
(let ((b (char->integer (string-ref str si))))
|
|
(loop (add1 si)
|
|
(if (= pc 0)
|
|
(if (= (bitwise-and b #b11111000) #b11110000)
|
|
3
|
|
(if (= (bitwise-and b #b11110000) #b11100000)
|
|
2
|
|
(if (= (bitwise-and b 128) 128)
|
|
1
|
|
0)))
|
|
(if (= (bitwise-and b 128) 128)
|
|
(sub1 pc)
|
|
0))))))))
|
|
|
|
;; Calculates the length of given UTF-8 string
|
|
(define (utf8-string-length s)
|
|
(let ((l (string-length s)))
|
|
(let loop ((si 0)
|
|
(ci 0)
|
|
(pc 0))
|
|
(if (= si l)
|
|
ci
|
|
(let ((b (char->integer (string-ref s si))))
|
|
(loop (add1 si)
|
|
(if (or (= pc 0)
|
|
(= (bitwise-and b 128) 0))
|
|
(add1 ci)
|
|
ci)
|
|
(if (= pc 0)
|
|
(if (= (bitwise-and b #b11111000) #b11110000)
|
|
3
|
|
(if (= (bitwise-and b #b11110000) #b11100000)
|
|
2
|
|
(if (= (bitwise-and b 128) 128)
|
|
1
|
|
0)))
|
|
(if (= (bitwise-and b 128) 128)
|
|
(sub1 pc)
|
|
0))))))))
|
|
|
|
;; Converts utf8 string to list of unicode characters
|
|
(define (utf8-string->list s)
|
|
(let-values (((lst _) (utf8-string->lists s)))
|
|
lst))
|
|
|
|
;; Prepends 1-byte characters representing utf8 encoding of given
|
|
;; unicode character to the list
|
|
(define (prepend-unicode-char-to-utf8-list ch lst)
|
|
(let ((n (char->integer ch)))
|
|
(cond ((>= n #x10000)
|
|
(let ((r (make-string 4))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
|
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
|
|
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
|
|
(cons (integer->char (bitwise-ior b0 #b10000000))
|
|
(cons (integer->char (bitwise-ior b1 #b10000000))
|
|
(cons (integer->char (bitwise-ior b2 #b10000000))
|
|
(cons (integer->char (bitwise-ior b3 #b11110000))
|
|
lst))))))
|
|
((>= n #x800)
|
|
(let ((r (make-string 3))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
|
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
|
|
(cons (integer->char (bitwise-ior b0 #b10000000))
|
|
(cons (integer->char (bitwise-ior b1 #b10000000))
|
|
(cons (integer->char (bitwise-ior b2 #b11100000))
|
|
lst)))))
|
|
((>= n #x80)
|
|
(let ((r (make-string 2))
|
|
(b0 (bitwise-and n #b111111))
|
|
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
|
|
(cons (integer->char (bitwise-ior b0 #b10000000))
|
|
(cons (integer->char (bitwise-ior b1 #b11000000))
|
|
lst))))
|
|
(else
|
|
(cons ch lst)))))
|
|
|
|
;; Converts list of unicode characters into utf8 string
|
|
(define (list->utf8-string lst)
|
|
(let loop ((lst lst)
|
|
(res '()))
|
|
(if (null? lst)
|
|
(list->string (reverse res))
|
|
(loop (cdr lst)
|
|
(prepend-unicode-char-to-utf8-list (car lst) res)))))
|
|
|
|
;; Returns true, if given string contains UTF-8 characters
|
|
(define (string-utf8? s)
|
|
(let ((asciilen (string-length s))
|
|
(utf8len (utf8-string-length s)))
|
|
(not (= asciilen utf8len))))
|
|
|
|
;; Module self-tests
|
|
(define (util-utf8-tests!)
|
|
(run-tests
|
|
util-utf8
|
|
(test-equal? utf8-char->string
|
|
(utf8-char->string #\ř)
|
|
"ř")
|
|
(test-equal? make-utf8-string
|
|
(make-utf8-string 4 #\č)
|
|
"čččč")
|
|
(test-equal? string-append-utf8-char
|
|
(string-append-utf8-char "ččč" #\ř)
|
|
"čččř")
|
|
(test-equal? utf8-string->list
|
|
(utf8-string->list "ěščř")
|
|
'(#\ě #\š #\č #\ř))
|
|
(test-equal? list->utf8-string
|
|
(list->utf8-string '(#\ě #\š #\č #\ř))
|
|
"ěščř")
|
|
(test-equal? utf8-string-length
|
|
(utf8-string-length "ěščř")
|
|
4)
|
|
(test-equal? utf8-string-length
|
|
(utf8-string-length "ěxšy")
|
|
4)
|
|
(test-equal? utf8-string-next-char
|
|
(utf8-string-next-char "ěščř")
|
|
2)
|
|
(test-equal? utf8-string-next-char
|
|
(utf8-string-next-char "ěščř" 2)
|
|
4)
|
|
(test-true string-utf8? (string-utf8? "ěščř"))
|
|
(test-false string-utf8? (string-utf8? "Hello World!"))
|
|
))
|
|
|
|
)
|