hackerbase/src/util-utf8.scm

318 lines
9 KiB
Scheme

;;
;; util-utf8.scm
;;
;; UTF-8 support
;;
;; ISC License
;;
;; Copyright 2023 Dominik Pantůček <dominik.pantucek@trustica.cz>
;;
;; Permission to use, copy, modify, and/or distribute this software
;; for any purpose with or without fee is hereby granted, provided
;; that the above copyright notice and this permission notice appear
;; in all copies.
;;
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;;
(declare (unit util-utf8))
(module
util-utf8
(
utf8-char->string
make-utf8-string
string-append-utf8-char
utf8-string->lists
utf8-bytes->lists
utf8-string-length
utf8-string-next-char
utf8-string->list
list->utf8-string
string-utf8?
util-utf8-tests!
)
(import scheme
(chicken base)
(chicken bitwise)
testing)
;; Encodes given character as UTF-8
(define (utf8-char->string ch)
(let ((n (char->integer ch)))
(cond ((>= n #x10000)
(let ((r (make-string 4))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
(string-set! r 0 (integer->char (bitwise-ior b3 #b11110000)))
(string-set! r 1 (integer->char (bitwise-ior b2 #b10000000)))
(string-set! r 2 (integer->char (bitwise-ior b1 #b10000000)))
(string-set! r 3 (integer->char (bitwise-ior b0 #b10000000)))
r))
((>= n #x800)
(let ((r (make-string 3))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
(string-set! r 0 (integer->char (bitwise-ior b2 #b11100000)))
(string-set! r 1 (integer->char (bitwise-ior b1 #b10000000)))
(string-set! r 2 (integer->char (bitwise-ior b0 #b10000000)))
r))
((>= n #x80)
(let ((r (make-string 2))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
(string-set! r 0 (integer->char (bitwise-ior b1 #b11000000)))
(string-set! r 1 (integer->char (bitwise-ior b0 #b10000000)))
r))
(else
(make-string 1 ch)))))
;; UTF-8 version of make-string
(define (make-utf8-string n . chs)
(let* ((ch (if (null? chs) #\space (car chs)))
(s (utf8-char->string ch))
(sl (string-length s))
(rl (* n sl))
(r (make-string rl)))
(let loop ((ri 0)
(si 0))
(if (= ri rl)
r
(let ((nsi (add1 si)))
(string-set! r ri (string-ref s si))
(loop (add1 ri)
(if (= nsi sl) 0 nsi)))))))
;; UTF-8 character append
(define (string-append-utf8-char s ch)
(string-append s (utf8-char->string ch)))
;; Converts a UTF-8 string into two lists: list of UTF-8 characters
;; of the string and a list of remaining bytes (as integers).
(define (utf8-string->lists str)
(utf8-bytes->lists
(map char->integer
(string->list str))))
;; The same as above but accepts a list of bytes (as integers)
(define (utf8-bytes->lists chars)
(let loop ((bytes chars)
(rpending '())
(pending 0)
(expected #f)
(res '()))
(if (null? bytes)
(values (reverse res)
(reverse rpending))
(let ((byte (car bytes)))
(cond (expected
;; Decode UTF-8 sequence
(cond ((= expected 1)
;; Last byte
(let ((char (integer->char (bitwise-ior pending
(bitwise-and byte #b111111)))))
(loop (cdr bytes)
'()
0
#f
(cons char res))))
(else
;; Intermediate bytes
(loop (cdr bytes)
(cons byte rpending)
(arithmetic-shift (bitwise-ior pending
(bitwise-and byte #b111111)) 6)
(sub1 expected)
res))))
(else
;; ASCII or first of UTF-8 sequence
(cond ((= (bitwise-and byte #b10000000) 0)
;; ASCII
(loop (cdr bytes)
'()
0
#f
(cons (integer->char byte) res)))
(else
;; First byte of UTF-8 sequence
(let-values
(((first-byte char-bytes)
(cond ((= (bitwise-and byte #b11000000) #b11000000)
(values (bitwise-and byte #b11111)
2))
((= (bitwise-and byte #b11100000) #b11100000)
(values (bitwise-and byte #b1111)
3))
((= (bitwise-and byte #b11110000) #b11110000)
(values (bitwise-and byte #b111)
4))
(else
;; Should not happen
(values 0 0)))))
(loop (cdr bytes)
(list byte)
(arithmetic-shift first-byte 6)
(sub1 char-bytes)
res))))))))))
;; Returns the position right after the character at specified
;; position.
(define (utf8-string-next-char str . sis)
(let ((len (string-length str))
(si0 (if (null? sis) 0 (car sis))))
(let loop ((si si0)
(pc 0))
(if (or (= si len)
(and (> si si0)
(eq? pc 0)))
si
(let ((b (char->integer (string-ref str si))))
(loop (add1 si)
(if (= pc 0)
(if (= (bitwise-and b #b11111000) #b11110000)
3
(if (= (bitwise-and b #b11110000) #b11100000)
2
(if (= (bitwise-and b 128) 128)
1
0)))
(if (= (bitwise-and b 128) 128)
(sub1 pc)
0))))))))
;; Calculates the length of given UTF-8 string
(define (utf8-string-length s)
(let ((l (string-length s)))
(let loop ((si 0)
(ci 0)
(pc 0))
(if (= si l)
ci
(let ((b (char->integer (string-ref s si))))
(loop (add1 si)
(if (or (= pc 0)
(= (bitwise-and b 128) 0))
(add1 ci)
ci)
(if (= pc 0)
(if (= (bitwise-and b #b11111000) #b11110000)
3
(if (= (bitwise-and b #b11110000) #b11100000)
2
(if (= (bitwise-and b 128) 128)
1
0)))
(if (= (bitwise-and b 128) 128)
(sub1 pc)
0))))))))
;; Converts utf8 string to list of unicode characters
(define (utf8-string->list s)
(let-values (((lst _) (utf8-string->lists s)))
lst))
;; Prepends 1-byte characters representing utf8 encoding of given
;; unicode character to the list
(define (prepend-unicode-char-to-utf8-list ch lst)
(let ((n (char->integer ch)))
(cond ((>= n #x10000)
(let ((r (make-string 4))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
(cons (integer->char (bitwise-ior b0 #b10000000))
(cons (integer->char (bitwise-ior b1 #b10000000))
(cons (integer->char (bitwise-ior b2 #b10000000))
(cons (integer->char (bitwise-ior b3 #b11110000))
lst))))))
((>= n #x800)
(let ((r (make-string 3))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
(cons (integer->char (bitwise-ior b0 #b10000000))
(cons (integer->char (bitwise-ior b1 #b10000000))
(cons (integer->char (bitwise-ior b2 #b11100000))
lst)))))
((>= n #x80)
(let ((r (make-string 2))
(b0 (bitwise-and n #b111111))
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
(cons (integer->char (bitwise-ior b0 #b10000000))
(cons (integer->char (bitwise-ior b1 #b11000000))
lst))))
(else
(cons ch lst)))))
;; Converts list of unicode characters into utf8 string
(define (list->utf8-string lst)
(let loop ((lst lst)
(res '()))
(if (null? lst)
(list->string (reverse res))
(loop (cdr lst)
(prepend-unicode-char-to-utf8-list (car lst) res)))))
;; Returns true, if given string contains UTF-8 characters
(define (string-utf8? s)
(let ((asciilen (string-length s))
(utf8len (utf8-string-length s)))
(not (= asciilen utf8len))))
;; Module self-tests
(define (util-utf8-tests!)
(run-tests
util-utf8
(test-equal? utf8-char->string
(utf8-char->string #\ř)
"ř")
(test-equal? make-utf8-string
(make-utf8-string 4 #\č)
"čččč")
(test-equal? string-append-utf8-char
(string-append-utf8-char "ččč" #\ř)
"čččř")
(test-equal? utf8-string->list
(utf8-string->list "ěščř")
'(#\ě #\š #\č #\ř))
(test-equal? list->utf8-string
(list->utf8-string '(#\ě #\š #\č #\ř))
"ěščř")
(test-equal? utf8-string-length
(utf8-string-length "ěščř")
4)
(test-equal? utf8-string-length
(utf8-string-length "ěxšy")
4)
(test-equal? utf8-string-next-char
(utf8-string-next-char "ěščř")
2)
(test-equal? utf8-string-next-char
(utf8-string-next-char "ěščř" 2)
4)
(test-true string-utf8? (string-utf8? "ěščř"))
(test-false string-utf8? (string-utf8? "Hello World!"))
))
)