Finish ducking.

This commit is contained in:
Dominik Pantůček 2023-07-07 14:42:57 +02:00
parent 1b18c99d3c
commit c42582a799
4 changed files with 107 additions and 31 deletions

View file

@ -609,3 +609,74 @@ Directory handling which didn't fit elsewhere.
Makes sure given path exists and it is a directory. Throws an error Makes sure given path exists and it is a directory. Throws an error
if it exists and it is not a directory. if it exists and it is not a directory.
## util-utf8 [module]
(import util-utf8)
High-performance UTF-8 support.
### utf8-char->string [procedure]
(utf8-char->string ch)
Encodes given character as UTF-8.
### make-utf8-string [procedure]
(make-utf8-string n
(ch #\space))
UTF-8 version of make-string.
### string-append-utf8-char [procedure]
(string-append-utf8-char s
ch)
UTF-8 character append.
### utf8-string->lists [procedure]
(utf8-string->lists str)
Converts a UTF-8 string into two lists: list of UTF-8 characters
of the string and a list of remaining bytes (as integers).
### utf8-bytes->lists [procedure]
(utf8-bytes->lists chars)
The same as above but accepts a list of bytes (as integers).
### utf8-string-next-char [procedure]
(utf8-string-next-char str
(si0 0))
Returns the position right after the character at specified
position.
### utf8-string-length [procedure]
(utf8-string-length s)
Calculates the length of given UTF-8 string.
### utf8-string->list [procedure]
(utf8-string->list s)
Converts utf8 string to list of unicode characters.
### list->utf8-string [procedure]
(list->utf8-string lst)
Converts list of unicode characters into utf8 string.
### string-utf8? [procedure]
(string-utf8? s)
Returns true, if given string contains UTF-8 characters.

View file

@ -65,14 +65,15 @@ GENDOC-SOURCES=gendoc.scm duck-extract.import.scm \
util-proc.import.scm util-format.import.scm \ util-proc.import.scm util-format.import.scm \
util-tag.import.scm util-string.import.scm \ util-tag.import.scm util-string.import.scm \
util-bst.import.scm util-bst-bdict.import.scm \ util-bst.import.scm util-bst-bdict.import.scm \
util-bst-ldict.import.scm util-dir.import.scm util-bst-ldict.import.scm util-dir.import.scm \
util-utf8.import.scm
GENDOC-OBJS=gendoc.o duck-extract.o util-time.o util-csv.o util-io.o \ GENDOC-OBJS=gendoc.o duck-extract.o util-time.o util-csv.o util-io.o \
progress.o testing.o util-proc.o util-git.o util-io.o \ progress.o testing.o util-proc.o util-git.o util-io.o \
util-stdout.o util-parser.o util-list.o util-proc.o \ util-stdout.o util-parser.o util-list.o util-proc.o \
util-format.o racket-kwargs.o util-bst-ldict.o util-tag.o \ util-format.o racket-kwargs.o util-bst-ldict.o util-tag.o \
duck.o util-string.o util-bst.o util-bst-bdict.o \ duck.o util-string.o util-bst.o util-bst-bdict.o \
util-bst-ldict.o util-dir.o util-bst-ldict.o util-dir.o util-utf8.o
.PHONY: imports .PHONY: imports
imports: $(HACKERBASE-DEPS) imports: $(HACKERBASE-DEPS)
@ -444,7 +445,8 @@ SGR-STATE-SOURCES=sgr-state.scm testing.import.scm \
sgr-state.o: sgr-state.import.scm sgr-state.o: sgr-state.import.scm
sgr-state.import.scm: $(SGR-STATE-SOURCES) sgr-state.import.scm: $(SGR-STATE-SOURCES)
UTIL-UTF8-SOURCES=util-utf8.scm testing.import.scm UTIL-UTF8-SOURCES=util-utf8.scm testing.import.scm duck.import.scm \
racket-kwargs.import.scm
util-utf8.o: util-utf8.import.scm util-utf8.o: util-utf8.import.scm
util-utf8.import.scm: $(UTIL-UTF8-SOURCES) util-utf8.import.scm: $(UTIL-UTF8-SOURCES)

View file

@ -37,4 +37,5 @@
util-bst-ldict util-bst-ldict
util-bst-lset util-bst-lset
util-dir util-dir
util-utf8
) )

View file

@ -24,8 +24,11 @@
(declare (unit util-utf8)) (declare (unit util-utf8))
(module (import duck)
(module*
util-utf8 util-utf8
#:doc ("High-performance UTF-8 support.")
( (
utf8-char->string utf8-char->string
@ -52,10 +55,11 @@
(import scheme (import scheme
(chicken base) (chicken base)
(chicken bitwise) (chicken bitwise)
testing) testing
racket-kwargs)
;; Encodes given character as UTF-8 (define/doc (utf8-char->string ch)
(define (utf8-char->string ch) ("Encodes given character as UTF-8.")
(let ((n (char->integer ch))) (let ((n (char->integer ch)))
(cond ((>= n #x10000) (cond ((>= n #x10000)
(let ((r (make-string 4)) (let ((r (make-string 4))
@ -87,10 +91,9 @@
(else (else
(make-string 1 ch))))) (make-string 1 ch)))))
;; UTF-8 version of make-string (define*/doc (make-utf8-string n (ch #\space))
(define (make-utf8-string n . chs) ("UTF-8 version of make-string.")
(let* ((ch (if (null? chs) #\space (car chs))) (let* ((s (utf8-char->string ch))
(s (utf8-char->string ch))
(sl (string-length s)) (sl (string-length s))
(rl (* n sl)) (rl (* n sl))
(r (make-string rl))) (r (make-string rl)))
@ -103,19 +106,19 @@
(loop (add1 ri) (loop (add1 ri)
(if (= nsi sl) 0 nsi))))))) (if (= nsi sl) 0 nsi)))))))
;; UTF-8 character append (define/doc (string-append-utf8-char s ch)
(define (string-append-utf8-char s ch) ("UTF-8 character append.")
(string-append s (utf8-char->string ch))) (string-append s (utf8-char->string ch)))
;; Converts a UTF-8 string into two lists: list of UTF-8 characters (define/doc (utf8-string->lists str)
;; of the string and a list of remaining bytes (as integers). ("Converts a UTF-8 string into two lists: list of UTF-8 characters
(define (utf8-string->lists str) of the string and a list of remaining bytes (as integers).")
(utf8-bytes->lists (utf8-bytes->lists
(map char->integer (map char->integer
(string->list str)))) (string->list str))))
;; The same as above but accepts a list of bytes (as integers) (define/doc (utf8-bytes->lists chars)
(define (utf8-bytes->lists chars) ("The same as above but accepts a list of bytes (as integers).")
(let loop ((bytes chars) (let loop ((bytes chars)
(rpending '()) (rpending '())
(pending 0) (pending 0)
@ -175,11 +178,10 @@
(sub1 char-bytes) (sub1 char-bytes)
res)))))))))) res))))))))))
;; Returns the position right after the character at specified (define*/doc (utf8-string-next-char str (si0 0))
;; position. ("Returns the position right after the character at specified
(define (utf8-string-next-char str . sis) position.")
(let ((len (string-length str)) (let ((len (string-length str)))
(si0 (if (null? sis) 0 (car sis))))
(let loop ((si si0) (let loop ((si si0)
(pc 0)) (pc 0))
(if (or (= si len) (if (or (= si len)
@ -200,8 +202,8 @@
(sub1 pc) (sub1 pc)
0)))))))) 0))))))))
;; Calculates the length of given UTF-8 string (define/doc (utf8-string-length s)
(define (utf8-string-length s) ("Calculates the length of given UTF-8 string.")
(let ((l (string-length s))) (let ((l (string-length s)))
(let loop ((si 0) (let loop ((si 0)
(ci 0) (ci 0)
@ -226,8 +228,8 @@
(sub1 pc) (sub1 pc)
0)))))))) 0))))))))
;; Converts utf8 string to list of unicode characters (define/doc (utf8-string->list s)
(define (utf8-string->list s) ("Converts utf8 string to list of unicode characters.")
(let-values (((lst _) (utf8-string->lists s))) (let-values (((lst _) (utf8-string->lists s)))
lst)) lst))
@ -265,8 +267,8 @@
(else (else
(cons ch lst))))) (cons ch lst)))))
;; Converts list of unicode characters into utf8 string (define/doc (list->utf8-string lst)
(define (list->utf8-string lst) ("Converts list of unicode characters into utf8 string.")
(let loop ((lst lst) (let loop ((lst lst)
(res '())) (res '()))
(if (null? lst) (if (null? lst)
@ -274,8 +276,8 @@
(loop (cdr lst) (loop (cdr lst)
(prepend-unicode-char-to-utf8-list (car lst) res))))) (prepend-unicode-char-to-utf8-list (car lst) res)))))
;; Returns true, if given string contains UTF-8 characters (define/doc (string-utf8? s)
(define (string-utf8? s) ("Returns true, if given string contains UTF-8 characters.")
(let ((asciilen (string-length s)) (let ((asciilen (string-length s))
(utf8len (utf8-string-length s))) (utf8len (utf8-string-length s)))
(not (= asciilen utf8len)))) (not (= asciilen utf8len))))