;; ;; util-utf8.scm ;; ;; UTF-8 support ;; ;; ISC License ;; ;; Copyright 2023 Dominik Pantůček ;; ;; Permission to use, copy, modify, and/or distribute this software ;; for any purpose with or without fee is hereby granted, provided ;; that the above copyright notice and this permission notice appear ;; in all copies. ;; ;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL ;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED ;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE ;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR ;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS ;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, ;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN ;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;; (declare (unit util-utf8)) (import duck) (module* util-utf8 #:doc ("High-performance UTF-8 support.") ( utf8-char->string make-utf8-string string-append-utf8-char utf8-string->lists utf8-bytes->lists utf8-string-length utf8-string-next-char utf8-string->list list->utf8-string string-utf8? util-utf8-tests! ) (import scheme (chicken base) (chicken bitwise) testing racket-kwargs) (define/doc (utf8-char->string ch) ("Encodes given character as UTF-8.") (let ((n (char->integer ch))) (cond ((>= n #x10000) (let ((r (make-string 4)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b111111)) (b3 (bitwise-and (arithmetic-shift n -18) #b111))) (string-set! r 0 (integer->char (bitwise-ior b3 #b11110000))) (string-set! r 1 (integer->char (bitwise-ior b2 #b10000000))) (string-set! r 2 (integer->char (bitwise-ior b1 #b10000000))) (string-set! r 3 (integer->char (bitwise-ior b0 #b10000000))) r)) ((>= n #x800) (let ((r (make-string 3)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b1111))) (string-set! r 0 (integer->char (bitwise-ior b2 #b11100000))) (string-set! r 1 (integer->char (bitwise-ior b1 #b10000000))) (string-set! r 2 (integer->char (bitwise-ior b0 #b10000000))) r)) ((>= n #x80) (let ((r (make-string 2)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b11111))) (string-set! r 0 (integer->char (bitwise-ior b1 #b11000000))) (string-set! r 1 (integer->char (bitwise-ior b0 #b10000000))) r)) (else (make-string 1 ch))))) (define*/doc (make-utf8-string n (ch #\space)) ("UTF-8 version of make-string.") (let* ((s (utf8-char->string ch)) (sl (string-length s)) (rl (* n sl)) (r (make-string rl))) (let loop ((ri 0) (si 0)) (if (= ri rl) r (let ((nsi (add1 si))) (string-set! r ri (string-ref s si)) (loop (add1 ri) (if (= nsi sl) 0 nsi))))))) (define/doc (string-append-utf8-char s ch) ("UTF-8 character append.") (string-append s (utf8-char->string ch))) (define/doc (utf8-string->lists str) ("Converts a UTF-8 string into two lists: list of UTF-8 characters of the string and a list of remaining bytes (as integers).") (utf8-bytes->lists (map char->integer (string->list str)))) (define/doc (utf8-bytes->lists chars) ("The same as above but accepts a list of bytes (as integers).") (let loop ((bytes chars) (rpending '()) (pending 0) (expected #f) (res '())) (if (null? bytes) (values (reverse res) (reverse rpending)) (let ((byte (car bytes))) (cond (expected ;; Decode UTF-8 sequence (cond ((= expected 1) ;; Last byte (let ((char (integer->char (bitwise-ior pending (bitwise-and byte #b111111))))) (loop (cdr bytes) '() 0 #f (cons char res)))) (else ;; Intermediate bytes (loop (cdr bytes) (cons byte rpending) (arithmetic-shift (bitwise-ior pending (bitwise-and byte #b111111)) 6) (sub1 expected) res)))) (else ;; ASCII or first of UTF-8 sequence (cond ((= (bitwise-and byte #b10000000) 0) ;; ASCII (loop (cdr bytes) '() 0 #f (cons (integer->char byte) res))) (else ;; First byte of UTF-8 sequence (let-values (((first-byte char-bytes) (cond ((= (bitwise-and byte #b11000000) #b11000000) (values (bitwise-and byte #b11111) 2)) ((= (bitwise-and byte #b11100000) #b11100000) (values (bitwise-and byte #b1111) 3)) ((= (bitwise-and byte #b11110000) #b11110000) (values (bitwise-and byte #b111) 4)) (else ;; Should not happen (values 0 0))))) (loop (cdr bytes) (list byte) (arithmetic-shift first-byte 6) (sub1 char-bytes) res)))))))))) (define*/doc (utf8-string-next-char str (si0 0)) ("Returns the position right after the character at specified position.") (let ((len (string-length str))) (let loop ((si si0) (pc 0)) (if (or (= si len) (and (> si si0) (eq? pc 0))) si (let ((b (char->integer (string-ref str si)))) (loop (add1 si) (if (= pc 0) (if (= (bitwise-and b #b11111000) #b11110000) 3 (if (= (bitwise-and b #b11110000) #b11100000) 2 (if (= (bitwise-and b 128) 128) 1 0))) (if (= (bitwise-and b 128) 128) (sub1 pc) 0)))))))) (define/doc (utf8-string-length s) ("Calculates the length of given UTF-8 string.") (let ((l (string-length s))) (let loop ((si 0) (ci 0) (pc 0)) (if (= si l) ci (let ((b (char->integer (string-ref s si)))) (loop (add1 si) (if (or (= pc 0) (= (bitwise-and b 128) 0)) (add1 ci) ci) (if (= pc 0) (if (= (bitwise-and b #b11111000) #b11110000) 3 (if (= (bitwise-and b #b11110000) #b11100000) 2 (if (= (bitwise-and b 128) 128) 1 0))) (if (= (bitwise-and b 128) 128) (sub1 pc) 0)))))))) (define/doc (utf8-string->list s) ("Converts utf8 string to list of unicode characters.") (let-values (((lst _) (utf8-string->lists s))) lst)) ;; Prepends 1-byte characters representing utf8 encoding of given ;; unicode character to the list (define (prepend-unicode-char-to-utf8-list ch lst) (let ((n (char->integer ch))) (cond ((>= n #x10000) (let ((r (make-string 4)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b111111)) (b3 (bitwise-and (arithmetic-shift n -18) #b111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b10000000)) (cons (integer->char (bitwise-ior b2 #b10000000)) (cons (integer->char (bitwise-ior b3 #b11110000)) lst)))))) ((>= n #x800) (let ((r (make-string 3)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b1111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b10000000)) (cons (integer->char (bitwise-ior b2 #b11100000)) lst))))) ((>= n #x80) (let ((r (make-string 2)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b11111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b11000000)) lst)))) (else (cons ch lst))))) (define/doc (list->utf8-string lst) ("Converts list of unicode characters into utf8 string.") (let loop ((lst lst) (res '())) (if (null? lst) (list->string (reverse res)) (loop (cdr lst) (prepend-unicode-char-to-utf8-list (car lst) res))))) (define/doc (string-utf8? s) ("Returns true, if given string contains UTF-8 characters.") (let ((asciilen (string-length s)) (utf8len (utf8-string-length s))) (not (= asciilen utf8len)))) ;; Module self-tests (define (util-utf8-tests!) (run-tests util-utf8 (test-equal? utf8-char->string (utf8-char->string #\ř) "ř") (test-equal? make-utf8-string (make-utf8-string 4 #\č) "čččč") (test-equal? string-append-utf8-char (string-append-utf8-char "ččč" #\ř) "čččř") (test-equal? utf8-string->list (utf8-string->list "ěščř") '(#\ě #\š #\č #\ř)) (test-equal? list->utf8-string (list->utf8-string '(#\ě #\š #\č #\ř)) "ěščř") (test-equal? utf8-string-length (utf8-string-length "ěščř") 4) (test-equal? utf8-string-length (utf8-string-length "ěxšy") 4) (test-equal? utf8-string-next-char (utf8-string-next-char "ěščř") 2) (test-equal? utf8-string-next-char (utf8-string-next-char "ěščř" 2) 4) (test-true string-utf8? (string-utf8? "ěščř")) (test-false string-utf8? (string-utf8? "Hello World!")) )) )