Import new table renderer.
This commit is contained in:
parent
3a59a9293a
commit
3f7f1356a4
12 changed files with 3859 additions and 1 deletions
305
src/util-utf8.scm
Normal file
305
src/util-utf8.scm
Normal file
|
@ -0,0 +1,305 @@
|
|||
;;
|
||||
;; util-utf8.scm
|
||||
;;
|
||||
;; UTF-8 support
|
||||
;;
|
||||
;; ISC License
|
||||
;;
|
||||
;; Copyright 2023 Dominik Pantůček <dominik.pantucek@trustica.cz>
|
||||
;;
|
||||
;; Permission to use, copy, modify, and/or distribute this software
|
||||
;; for any purpose with or without fee is hereby granted, provided
|
||||
;; that the above copyright notice and this permission notice appear
|
||||
;; in all copies.
|
||||
;;
|
||||
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
||||
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
||||
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
||||
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
|
||||
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
||||
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
;;
|
||||
|
||||
(declare (unit util-utf8))
|
||||
|
||||
(module
|
||||
util-utf8
|
||||
(
|
||||
utf8-char->string
|
||||
|
||||
make-utf8-string
|
||||
|
||||
string-append-utf8-char
|
||||
|
||||
utf8-string->lists
|
||||
|
||||
utf8-bytes->lists
|
||||
|
||||
utf8-string-length
|
||||
|
||||
utf8-string-next-char
|
||||
|
||||
utf8-string->list
|
||||
list->utf8-string
|
||||
|
||||
util-utf8-tests!
|
||||
)
|
||||
|
||||
(import scheme
|
||||
(chicken base)
|
||||
(chicken bitwise)
|
||||
testing)
|
||||
|
||||
;; Encodes given character as UTF-8
|
||||
(define (utf8-char->string ch)
|
||||
(let ((n (char->integer ch)))
|
||||
(cond ((>= n #x10000)
|
||||
(let ((r (make-string 4))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
||||
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
|
||||
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
|
||||
(string-set! r 0 (integer->char (bitwise-ior b3 #b11110000)))
|
||||
(string-set! r 1 (integer->char (bitwise-ior b2 #b10000000)))
|
||||
(string-set! r 2 (integer->char (bitwise-ior b1 #b10000000)))
|
||||
(string-set! r 3 (integer->char (bitwise-ior b0 #b10000000)))
|
||||
r))
|
||||
((>= n #x800)
|
||||
(let ((r (make-string 3))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
||||
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
|
||||
(string-set! r 0 (integer->char (bitwise-ior b2 #b11100000)))
|
||||
(string-set! r 1 (integer->char (bitwise-ior b1 #b10000000)))
|
||||
(string-set! r 2 (integer->char (bitwise-ior b0 #b10000000)))
|
||||
r))
|
||||
((>= n #x80)
|
||||
(let ((r (make-string 2))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
|
||||
(string-set! r 0 (integer->char (bitwise-ior b1 #b11000000)))
|
||||
(string-set! r 1 (integer->char (bitwise-ior b0 #b10000000)))
|
||||
r))
|
||||
(else
|
||||
(make-string 1 ch)))))
|
||||
|
||||
;; UTF-8 version of make-string
|
||||
(define (make-utf8-string n . chs)
|
||||
(let* ((ch (if (null? chs) #\space (car chs)))
|
||||
(s (utf8-char->string ch))
|
||||
(sl (string-length s))
|
||||
(rl (* n sl))
|
||||
(r (make-string rl)))
|
||||
(let loop ((ri 0)
|
||||
(si 0))
|
||||
(if (= ri rl)
|
||||
r
|
||||
(let ((nsi (add1 si)))
|
||||
(string-set! r ri (string-ref s si))
|
||||
(loop (add1 ri)
|
||||
(if (= nsi sl) 0 nsi)))))))
|
||||
|
||||
;; UTF-8 character append
|
||||
(define (string-append-utf8-char s ch)
|
||||
(string-append s (utf8-char->string ch)))
|
||||
|
||||
;; Converts a UTF-8 string into two lists: list of UTF-8 characters
|
||||
;; of the string and a list of remaining bytes (as integers).
|
||||
(define (utf8-string->lists str)
|
||||
(utf8-bytes->lists
|
||||
(map char->integer
|
||||
(string->list str))))
|
||||
|
||||
;; The same as above but accepts a list of bytes (as integers)
|
||||
(define (utf8-bytes->lists chars)
|
||||
(let loop ((bytes chars)
|
||||
(rpending '())
|
||||
(pending 0)
|
||||
(expected #f)
|
||||
(res '()))
|
||||
(if (null? bytes)
|
||||
(values (reverse res)
|
||||
(reverse rpending))
|
||||
(let ((byte (car bytes)))
|
||||
(cond (expected
|
||||
;; Decode UTF-8 sequence
|
||||
(cond ((= expected 1)
|
||||
;; Last byte
|
||||
(let ((char (integer->char (bitwise-ior pending
|
||||
(bitwise-and byte #b111111)))))
|
||||
(loop (cdr bytes)
|
||||
'()
|
||||
0
|
||||
#f
|
||||
(cons char res))))
|
||||
(else
|
||||
;; Intermediate bytes
|
||||
(loop (cdr bytes)
|
||||
(cons byte rpending)
|
||||
(arithmetic-shift (bitwise-ior pending
|
||||
(bitwise-and byte #b111111)) 6)
|
||||
(sub1 expected)
|
||||
res))))
|
||||
(else
|
||||
;; ASCII or first of UTF-8 sequence
|
||||
(cond ((= (bitwise-and byte #b10000000) 0)
|
||||
;; ASCII
|
||||
(loop (cdr bytes)
|
||||
'()
|
||||
0
|
||||
#f
|
||||
(cons (integer->char byte) res)))
|
||||
(else
|
||||
;; First byte of UTF-8 sequence
|
||||
(let-values
|
||||
(((first-byte char-bytes)
|
||||
(cond ((= (bitwise-and byte #b11000000) #b11000000)
|
||||
(values (bitwise-and byte #b11111)
|
||||
2))
|
||||
((= (bitwise-and byte #b11100000) #b11100000)
|
||||
(values (bitwise-and byte #b1111)
|
||||
3))
|
||||
((= (bitwise-and byte #b11110000) #b11110000)
|
||||
(values (bitwise-and byte #b111)
|
||||
4)))))
|
||||
(loop (cdr bytes)
|
||||
(list byte)
|
||||
(arithmetic-shift first-byte 6)
|
||||
(sub1 char-bytes)
|
||||
res))))))))))
|
||||
|
||||
;; Returns the position right after the character at specified
|
||||
;; position.
|
||||
(define (utf8-string-next-char str . sis)
|
||||
(let ((len (string-length str))
|
||||
(si0 (if (null? sis) 0 (car sis))))
|
||||
(let loop ((si si0)
|
||||
(pc 0))
|
||||
(if (or (= si len)
|
||||
(and (> si si0)
|
||||
(eq? pc 0)))
|
||||
si
|
||||
(let ((b (char->integer (string-ref str si))))
|
||||
(loop (add1 si)
|
||||
(if (= pc 0)
|
||||
(if (= (bitwise-and b #b11111000) #b11110000)
|
||||
3
|
||||
(if (= (bitwise-and b #b11110000) #b11100000)
|
||||
2
|
||||
(if (= (bitwise-and b 128) 128)
|
||||
1
|
||||
0)))
|
||||
(if (= (bitwise-and b 128) 128)
|
||||
(sub1 pc)
|
||||
0))))))))
|
||||
|
||||
;; Calculates the length of given UTF-8 string
|
||||
(define (utf8-string-length s)
|
||||
(let ((l (string-length s)))
|
||||
(let loop ((si 0)
|
||||
(ci 0)
|
||||
(pc 0))
|
||||
(if (= si l)
|
||||
ci
|
||||
(let ((b (char->integer (string-ref s si))))
|
||||
(loop (add1 si)
|
||||
(if (or (= pc 0)
|
||||
(= (bitwise-and b 128) 0))
|
||||
(add1 ci)
|
||||
ci)
|
||||
(if (= pc 0)
|
||||
(if (= (bitwise-and b #b11111000) #b11110000)
|
||||
3
|
||||
(if (= (bitwise-and b #b11110000) #b11100000)
|
||||
2
|
||||
(if (= (bitwise-and b 128) 128)
|
||||
1
|
||||
0)))
|
||||
(if (= (bitwise-and b 128) 128)
|
||||
(sub1 pc)
|
||||
0))))))))
|
||||
|
||||
;; Converts utf8 string to list of unicode characters
|
||||
(define (utf8-string->list s)
|
||||
(let-values (((lst _) (utf8-string->lists s)))
|
||||
lst))
|
||||
|
||||
;; Prepends 1-byte characters representing utf8 encoding of given
|
||||
;; unicode character to the list
|
||||
(define (prepend-unicode-char-to-utf8-list ch lst)
|
||||
(let ((n (char->integer ch)))
|
||||
(cond ((>= n #x10000)
|
||||
(let ((r (make-string 4))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
||||
(b2 (bitwise-and (arithmetic-shift n -12) #b111111))
|
||||
(b3 (bitwise-and (arithmetic-shift n -18) #b111)))
|
||||
(cons (integer->char (bitwise-ior b0 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b1 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b2 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b3 #b11110000))
|
||||
lst))))))
|
||||
((>= n #x800)
|
||||
(let ((r (make-string 3))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b111111))
|
||||
(b2 (bitwise-and (arithmetic-shift n -12) #b1111)))
|
||||
(cons (integer->char (bitwise-ior b0 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b1 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b2 #b11100000))
|
||||
lst)))))
|
||||
((>= n #x80)
|
||||
(let ((r (make-string 2))
|
||||
(b0 (bitwise-and n #b111111))
|
||||
(b1 (bitwise-and (arithmetic-shift n -6) #b11111)))
|
||||
(cons (integer->char (bitwise-ior b0 #b10000000))
|
||||
(cons (integer->char (bitwise-ior b1 #b11000000))
|
||||
lst))))
|
||||
(else
|
||||
(cons ch lst)))))
|
||||
|
||||
;; Converts list of unicode characters into utf8 string
|
||||
(define (list->utf8-string lst)
|
||||
(let loop ((lst lst)
|
||||
(res '()))
|
||||
(if (null? lst)
|
||||
(list->string (reverse res))
|
||||
(loop (cdr lst)
|
||||
(prepend-unicode-char-to-utf8-list (car lst) res)))))
|
||||
|
||||
;; Module self-tests
|
||||
(define (util-utf8-tests!)
|
||||
(run-tests
|
||||
util-utf8
|
||||
(test-equal? utf8-char->string
|
||||
(utf8-char->string #\ř)
|
||||
"ř")
|
||||
(test-equal? make-utf8-string
|
||||
(make-utf8-string 4 #\č)
|
||||
"čččč")
|
||||
(test-equal? string-append-utf8-char
|
||||
(string-append-utf8-char "ččč" #\ř)
|
||||
"čččř")
|
||||
(test-equal? utf8-string->list
|
||||
(utf8-string->list "ěščř")
|
||||
'(#\ě #\š #\č #\ř))
|
||||
(test-equal? list->utf8-string
|
||||
(list->utf8-string '(#\ě #\š #\č #\ř))
|
||||
"ěščř")
|
||||
(test-equal? utf8-string-length
|
||||
(utf8-string-length "ěščř")
|
||||
4)
|
||||
(test-equal? utf8-string-length
|
||||
(utf8-string-length "ěxšy")
|
||||
4)
|
||||
(test-equal? utf8-string-next-char
|
||||
(utf8-string-next-char "ěščř")
|
||||
2)
|
||||
(test-equal? utf8-string-next-char
|
||||
(utf8-string-next-char "ěščř" 2)
|
||||
4)
|
||||
))
|
||||
|
||||
)
|
Loading…
Add table
Add a link
Reference in a new issue