Fix UTF-8 3-byte sequence handling.

This commit is contained in:
Dominik Pantůček 2024-02-08 21:05:08 +01:00
parent ed55660c80
commit dc3044026c

View file

@ -120,13 +120,13 @@ of the string and a list of remaining bytes (as integers).")
(define/doc (utf8-bytes->lists chars)
("The same as above but accepts a list of bytes (as integers).")
(let loop ((bytes chars)
(rpending '())
(rpending chars)
(pending 0)
(expected #f)
(res '()))
(if (null? bytes)
(values (reverse res)
(reverse rpending))
rpending)
(let ((byte (car bytes)))
(cond (expected
;; Decode UTF-8 sequence
@ -135,14 +135,14 @@ of the string and a list of remaining bytes (as integers).")
(let ((char (integer->char (bitwise-ior pending
(bitwise-and byte #b111111)))))
(loop (cdr bytes)
'()
(cdr bytes)
0
#f
(cons char res))))
(else
;; Intermediate bytes
(loop (cdr bytes)
(cons byte rpending)
rpending
(arithmetic-shift (bitwise-ior pending
(bitwise-and byte #b111111)) 6)
(sub1 expected)
@ -152,7 +152,7 @@ of the string and a list of remaining bytes (as integers).")
(cond ((= (bitwise-and byte #b10000000) 0)
;; ASCII
(loop (cdr bytes)
'()
(cdr bytes)
0
#f
(cons (integer->char byte) res)))
@ -160,20 +160,20 @@ of the string and a list of remaining bytes (as integers).")
;; First byte of UTF-8 sequence
(let-values
(((first-byte char-bytes)
(cond ((= (bitwise-and byte #b11000000) #b11000000)
(cond ((= (bitwise-and byte #b11100000) #b11000000)
(values (bitwise-and byte #b11111)
2))
((= (bitwise-and byte #b11100000) #b11100000)
((= (bitwise-and byte #b11110000) #b11100000)
(values (bitwise-and byte #b1111)
3))
((= (bitwise-and byte #b11110000) #b11110000)
((= (bitwise-and byte #b11111000) #b11110000)
(values (bitwise-and byte #b111)
4))
(else
;; Should not happen
(values 0 0)))))
(loop (cdr bytes)
(list byte)
bytes
(arithmetic-shift first-byte 6)
(sub1 char-bytes)
res))))))))))