Fix utf-8 3-byte handling.

This commit is contained in:
Dominik Pantůček 2024-02-08 21:05:08 +01:00
parent ed55660c80
commit dc3044026c

View file

@@ -120,13 +120,13 @@ of the string and a list of remaining bytes (as integers).")
(define/doc (utf8-bytes->lists chars) (define/doc (utf8-bytes->lists chars)
("The same as above but accepts a list of bytes (as integers).") ("The same as above but accepts a list of bytes (as integers).")
(let loop ((bytes chars) (let loop ((bytes chars)
(rpending '()) (rpending chars)
(pending 0) (pending 0)
(expected #f) (expected #f)
(res '())) (res '()))
(if (null? bytes) (if (null? bytes)
(values (reverse res) (values (reverse res)
(reverse rpending)) rpending)
(let ((byte (car bytes))) (let ((byte (car bytes)))
(cond (expected (cond (expected
;; Decode UTF-8 sequence ;; Decode UTF-8 sequence
@@ -135,14 +135,14 @@ of the string and a list of remaining bytes (as integers).")
(let ((char (integer->char (bitwise-ior pending (let ((char (integer->char (bitwise-ior pending
(bitwise-and byte #b111111))))) (bitwise-and byte #b111111)))))
(loop (cdr bytes) (loop (cdr bytes)
'() (cdr bytes)
0 0
#f #f
(cons char res)))) (cons char res))))
(else (else
;; Intermediate bytes ;; Intermediate bytes
(loop (cdr bytes) (loop (cdr bytes)
(cons byte rpending) rpending
(arithmetic-shift (bitwise-ior pending (arithmetic-shift (bitwise-ior pending
(bitwise-and byte #b111111)) 6) (bitwise-and byte #b111111)) 6)
(sub1 expected) (sub1 expected)
@@ -152,7 +152,7 @@ of the string and a list of remaining bytes (as integers).")
(cond ((= (bitwise-and byte #b10000000) 0) (cond ((= (bitwise-and byte #b10000000) 0)
;; ASCII ;; ASCII
(loop (cdr bytes) (loop (cdr bytes)
'() (cdr bytes)
0 0
#f #f
(cons (integer->char byte) res))) (cons (integer->char byte) res)))
@@ -160,20 +160,20 @@ of the string and a list of remaining bytes (as integers).")
;; First byte of UTF-8 sequence ;; First byte of UTF-8 sequence
(let-values (let-values
(((first-byte char-bytes) (((first-byte char-bytes)
(cond ((= (bitwise-and byte #b11000000) #b11000000) (cond ((= (bitwise-and byte #b11100000) #b11000000)
(values (bitwise-and byte #b11111) (values (bitwise-and byte #b11111)
2)) 2))
((= (bitwise-and byte #b11100000) #b11100000) ((= (bitwise-and byte #b11110000) #b11100000)
(values (bitwise-and byte #b1111) (values (bitwise-and byte #b1111)
3)) 3))
((= (bitwise-and byte #b11110000) #b11110000) ((= (bitwise-and byte #b11111000) #b11110000)
(values (bitwise-and byte #b111) (values (bitwise-and byte #b111)
4)) 4))
(else (else
;; Should not happen ;; Should not happen
(values 0 0))))) (values 0 0)))))
(loop (cdr bytes) (loop (cdr bytes)
(list byte) bytes
(arithmetic-shift first-byte 6) (arithmetic-shift first-byte 6)
(sub1 char-bytes) (sub1 char-bytes)
res)))))))))) res))))))))))