;; ;; util-utf8.scm ;; ;; UTF-8 support ;; ;; ISC License ;; ;; Copyright 2023 Dominik Pantůček ;; ;; Permission to use, copy, modify, and/or distribute this software ;; for any purpose with or without fee is hereby granted, provided ;; that the above copyright notice and this permission notice appear ;; in all copies. ;; ;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL ;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED ;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE ;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR ;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS ;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, ;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN ;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;; (declare (unit util-utf8)) (module util-utf8 ( utf8-char->string make-utf8-string string-append-utf8-char utf8-string->lists utf8-bytes->lists utf8-string-length utf8-string-next-char utf8-string->list list->utf8-string util-utf8-tests! ) (import scheme (chicken base) (chicken bitwise) testing) ;; Encodes given character as UTF-8 (define (utf8-char->string ch) (let ((n (char->integer ch))) (cond ((>= n #x10000) (let ((r (make-string 4)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b111111)) (b3 (bitwise-and (arithmetic-shift n -18) #b111))) (string-set! r 0 (integer->char (bitwise-ior b3 #b11110000))) (string-set! r 1 (integer->char (bitwise-ior b2 #b10000000))) (string-set! r 2 (integer->char (bitwise-ior b1 #b10000000))) (string-set! r 3 (integer->char (bitwise-ior b0 #b10000000))) r)) ((>= n #x800) (let ((r (make-string 3)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b1111))) (string-set! r 0 (integer->char (bitwise-ior b2 #b11100000))) (string-set! r 1 (integer->char (bitwise-ior b1 #b10000000))) (string-set! r 2 (integer->char (bitwise-ior b0 #b10000000))) r)) ((>= n #x80) (let ((r (make-string 2)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b11111))) (string-set! r 0 (integer->char (bitwise-ior b1 #b11000000))) (string-set! r 1 (integer->char (bitwise-ior b0 #b10000000))) r)) (else (make-string 1 ch))))) ;; UTF-8 version of make-string (define (make-utf8-string n . chs) (let* ((ch (if (null? chs) #\space (car chs))) (s (utf8-char->string ch)) (sl (string-length s)) (rl (* n sl)) (r (make-string rl))) (let loop ((ri 0) (si 0)) (if (= ri rl) r (let ((nsi (add1 si))) (string-set! r ri (string-ref s si)) (loop (add1 ri) (if (= nsi sl) 0 nsi))))))) ;; UTF-8 character append (define (string-append-utf8-char s ch) (string-append s (utf8-char->string ch))) ;; Converts a UTF-8 string into two lists: list of UTF-8 characters ;; of the string and a list of remaining bytes (as integers). (define (utf8-string->lists str) (utf8-bytes->lists (map char->integer (string->list str)))) ;; The same as above but accepts a list of bytes (as integers) (define (utf8-bytes->lists chars) (let loop ((bytes chars) (rpending '()) (pending 0) (expected #f) (res '())) (if (null? bytes) (values (reverse res) (reverse rpending)) (let ((byte (car bytes))) (cond (expected ;; Decode UTF-8 sequence (cond ((= expected 1) ;; Last byte (let ((char (integer->char (bitwise-ior pending (bitwise-and byte #b111111))))) (loop (cdr bytes) '() 0 #f (cons char res)))) (else ;; Intermediate bytes (loop (cdr bytes) (cons byte rpending) (arithmetic-shift (bitwise-ior pending (bitwise-and byte #b111111)) 6) (sub1 expected) res)))) (else ;; ASCII or first of UTF-8 sequence (cond ((= (bitwise-and byte #b10000000) 0) ;; ASCII (loop (cdr bytes) '() 0 #f (cons (integer->char byte) res))) (else ;; First byte of UTF-8 sequence (let-values (((first-byte char-bytes) (cond ((= (bitwise-and byte #b11000000) #b11000000) (values (bitwise-and byte #b11111) 2)) ((= (bitwise-and byte #b11100000) #b11100000) (values (bitwise-and byte #b1111) 3)) ((= (bitwise-and byte #b11110000) #b11110000) (values (bitwise-and byte #b111) 4)) (else ;; Should not happen (values 0 0))))) (loop (cdr bytes) (list byte) (arithmetic-shift first-byte 6) (sub1 char-bytes) res)))))))))) ;; Returns the position right after the character at specified ;; position. (define (utf8-string-next-char str . sis) (let ((len (string-length str)) (si0 (if (null? sis) 0 (car sis)))) (let loop ((si si0) (pc 0)) (if (or (= si len) (and (> si si0) (eq? pc 0))) si (let ((b (char->integer (string-ref str si)))) (loop (add1 si) (if (= pc 0) (if (= (bitwise-and b #b11111000) #b11110000) 3 (if (= (bitwise-and b #b11110000) #b11100000) 2 (if (= (bitwise-and b 128) 128) 1 0))) (if (= (bitwise-and b 128) 128) (sub1 pc) 0)))))))) ;; Calculates the length of given UTF-8 string (define (utf8-string-length s) (let ((l (string-length s))) (let loop ((si 0) (ci 0) (pc 0)) (if (= si l) ci (let ((b (char->integer (string-ref s si)))) (loop (add1 si) (if (or (= pc 0) (= (bitwise-and b 128) 0)) (add1 ci) ci) (if (= pc 0) (if (= (bitwise-and b #b11111000) #b11110000) 3 (if (= (bitwise-and b #b11110000) #b11100000) 2 (if (= (bitwise-and b 128) 128) 1 0))) (if (= (bitwise-and b 128) 128) (sub1 pc) 0)))))))) ;; Converts utf8 string to list of unicode characters (define (utf8-string->list s) (let-values (((lst _) (utf8-string->lists s))) lst)) ;; Prepends 1-byte characters representing utf8 encoding of given ;; unicode character to the list (define (prepend-unicode-char-to-utf8-list ch lst) (let ((n (char->integer ch))) (cond ((>= n #x10000) (let ((r (make-string 4)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b111111)) (b3 (bitwise-and (arithmetic-shift n -18) #b111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b10000000)) (cons (integer->char (bitwise-ior b2 #b10000000)) (cons (integer->char (bitwise-ior b3 #b11110000)) lst)))))) ((>= n #x800) (let ((r (make-string 3)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b111111)) (b2 (bitwise-and (arithmetic-shift n -12) #b1111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b10000000)) (cons (integer->char (bitwise-ior b2 #b11100000)) lst))))) ((>= n #x80) (let ((r (make-string 2)) (b0 (bitwise-and n #b111111)) (b1 (bitwise-and (arithmetic-shift n -6) #b11111))) (cons (integer->char (bitwise-ior b0 #b10000000)) (cons (integer->char (bitwise-ior b1 #b11000000)) lst)))) (else (cons ch lst))))) ;; Converts list of unicode characters into utf8 string (define (list->utf8-string lst) (let loop ((lst lst) (res '())) (if (null? lst) (list->string (reverse res)) (loop (cdr lst) (prepend-unicode-char-to-utf8-list (car lst) res))))) ;; Module self-tests (define (util-utf8-tests!) (run-tests util-utf8 (test-equal? utf8-char->string (utf8-char->string #\ř) "ř") (test-equal? make-utf8-string (make-utf8-string 4 #\č) "čččč") (test-equal? string-append-utf8-char (string-append-utf8-char "ččč" #\ř) "čččř") (test-equal? utf8-string->list (utf8-string->list "ěščř") '(#\ě #\š #\č #\ř)) (test-equal? list->utf8-string (list->utf8-string '(#\ě #\š #\č #\ř)) "ěščř") (test-equal? utf8-string-length (utf8-string-length "ěščř") 4) (test-equal? utf8-string-length (utf8-string-length "ěxšy") 4) (test-equal? utf8-string-next-char (utf8-string-next-char "ěščř") 2) (test-equal? utf8-string-next-char (utf8-string-next-char "ěščř" 2) 4) )) )