diff --git a/doc/d-utils.md b/doc/d-utils.md index c9b0d45..6b3641a 100644 --- a/doc/d-utils.md +++ b/doc/d-utils.md @@ -609,3 +609,74 @@ Directory handling which didn't fit elsewhere. Makes sure given path exists and it is a directory. Throws an error if it exists and it is not a directory. + +## util-utf8 [module] + + (import util-utf8) + +High-performance UTF-8 support. + +### utf8-char->string [procedure] + + (utf8-char->string ch) + +Encodes given character as UTF-8. + +### make-utf8-string [procedure] + + (make-utf8-string n + (ch #\space)) + +UTF-8 version of make-string. + +### string-append-utf8-char [procedure] + + (string-append-utf8-char s + ch) + +UTF-8 character append. + +### utf8-string->lists [procedure] + + (utf8-string->lists str) + +Converts a UTF-8 string into two lists: list of UTF-8 characters +of the string and a list of remaining bytes (as integers). + +### utf8-bytes->lists [procedure] + + (utf8-bytes->lists chars) + +The same as above but accepts a list of bytes (as integers). + +### utf8-string-next-char [procedure] + + (utf8-string-next-char str + (si0 0)) + +Returns the position right after the character at specified +position. + +### utf8-string-length [procedure] + + (utf8-string-length s) + +Calculates the length of given UTF-8 string. + +### utf8-string->list [procedure] + + (utf8-string->list s) + +Converts utf8 string to list of unicode characters. + +### list->utf8-string [procedure] + + (list->utf8-string lst) + +Converts list of unicode characters into utf8 string. + +### string-utf8? [procedure] + + (string-utf8? s) + +Returns true, if given string contains UTF-8 characters. diff --git a/src/Makefile b/src/Makefile index 5d86cce..de3e6c9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -65,14 +65,15 @@ GENDOC-SOURCES=gendoc.scm duck-extract.import.scm \ util-proc.import.scm util-format.import.scm \ util-tag.import.scm util-string.import.scm \ util-bst.import.scm util-bst-bdict.import.scm \ - util-bst-ldict.import.scm util-dir.import.scm + util-bst-ldict.import.scm util-dir.import.scm \ + util-utf8.import.scm GENDOC-OBJS=gendoc.o duck-extract.o util-time.o util-csv.o util-io.o \ progress.o testing.o util-proc.o util-git.o util-io.o \ util-stdout.o util-parser.o util-list.o util-proc.o \ util-format.o racket-kwargs.o util-bst-ldict.o util-tag.o \ duck.o util-string.o util-bst.o util-bst-bdict.o \ - util-bst-ldict.o util-dir.o + util-bst-ldict.o util-dir.o util-utf8.o .PHONY: imports imports: $(HACKERBASE-DEPS) @@ -444,7 +445,8 @@ SGR-STATE-SOURCES=sgr-state.scm testing.import.scm \ sgr-state.o: sgr-state.import.scm sgr-state.import.scm: $(SGR-STATE-SOURCES) -UTIL-UTF8-SOURCES=util-utf8.scm testing.import.scm +UTIL-UTF8-SOURCES=util-utf8.scm testing.import.scm duck.import.scm \ + racket-kwargs.import.scm util-utf8.o: util-utf8.import.scm util-utf8.import.scm: $(UTIL-UTF8-SOURCES) diff --git a/src/gendoc.scm b/src/gendoc.scm index 31f8a0e..cf91230 100644 --- a/src/gendoc.scm +++ b/src/gendoc.scm @@ -37,4 +37,5 @@ util-bst-ldict util-bst-lset util-dir + util-utf8 ) diff --git a/src/util-utf8.scm b/src/util-utf8.scm index 84ff0a9..14a4c6b 100644 --- a/src/util-utf8.scm +++ b/src/util-utf8.scm @@ -24,8 +24,11 @@ (declare (unit util-utf8)) -(module +(import duck) + +(module* util-utf8 + #:doc ("High-performance UTF-8 support.") ( utf8-char->string @@ -52,10 +55,11 @@ (import scheme (chicken base) (chicken bitwise) - testing) + testing + racket-kwargs) - ;; Encodes given character as UTF-8 - (define (utf8-char->string ch) + (define/doc (utf8-char->string ch) + ("Encodes given character as UTF-8.") (let ((n (char->integer ch))) (cond ((>= n #x10000) (let ((r (make-string 4)) @@ -87,10 +91,9 @@ (else (make-string 1 ch))))) - ;; UTF-8 version of make-string - (define (make-utf8-string n . chs) - (let* ((ch (if (null? chs) #\space (car chs))) - (s (utf8-char->string ch)) + (define*/doc (make-utf8-string n (ch #\space)) + ("UTF-8 version of make-string.") + (let* ((s (utf8-char->string ch)) (sl (string-length s)) (rl (* n sl)) (r (make-string rl))) @@ -103,19 +106,19 @@ (loop (add1 ri) (if (= nsi sl) 0 nsi))))))) - ;; UTF-8 character append - (define (string-append-utf8-char s ch) + (define/doc (string-append-utf8-char s ch) + ("UTF-8 character append.") (string-append s (utf8-char->string ch))) - ;; Converts a UTF-8 string into two lists: list of UTF-8 characters - ;; of the string and a list of remaining bytes (as integers). - (define (utf8-string->lists str) + (define/doc (utf8-string->lists str) + ("Converts a UTF-8 string into two lists: list of UTF-8 characters +of the string and a list of remaining bytes (as integers).") (utf8-bytes->lists (map char->integer (string->list str)))) - ;; The same as above but accepts a list of bytes (as integers) - (define (utf8-bytes->lists chars) + (define/doc (utf8-bytes->lists chars) + ("The same as above but accepts a list of bytes (as integers).") (let loop ((bytes chars) (rpending '()) (pending 0) @@ -175,11 +178,10 @@ (sub1 char-bytes) res)))))))))) - ;; Returns the position right after the character at specified - ;; position. - (define (utf8-string-next-char str . sis) - (let ((len (string-length str)) - (si0 (if (null? sis) 0 (car sis)))) + (define*/doc (utf8-string-next-char str (si0 0)) + ("Returns the position right after the character at specified +position.") + (let ((len (string-length str))) (let loop ((si si0) (pc 0)) (if (or (= si len) @@ -200,8 +202,8 @@ (sub1 pc) 0)))))))) - ;; Calculates the length of given UTF-8 string - (define (utf8-string-length s) + (define/doc (utf8-string-length s) + ("Calculates the length of given UTF-8 string.") (let ((l (string-length s))) (let loop ((si 0) (ci 0) @@ -226,8 +228,8 @@ (sub1 pc) 0)))))))) - ;; Converts utf8 string to list of unicode characters - (define (utf8-string->list s) + (define/doc (utf8-string->list s) + ("Converts utf8 string to list of unicode characters.") (let-values (((lst _) (utf8-string->lists s))) lst)) @@ -265,8 +267,8 @@ (else (cons ch lst))))) - ;; Converts list of unicode characters into utf8 string - (define (list->utf8-string lst) + (define/doc (list->utf8-string lst) + ("Converts list of unicode characters into utf8 string.") (let loop ((lst lst) (res '())) (if (null? lst) @@ -274,8 +276,8 @@ (loop (cdr lst) (prepend-unicode-char-to-utf8-list (car lst) res))))) - ;; Returns true, if given string contains UTF-8 characters - (define (string-utf8? s) + (define/doc (string-utf8? s) + ("Returns true, if given string contains UTF-8 characters.") (let ((asciilen (string-length s)) (utf8len (utf8-string-length s))) (not (= asciilen utf8len))))