Heavily optimize CSV reader.

This commit is contained in:
Dominik Pantůček 2023-03-31 19:20:49 +02:00
parent a216b27825
commit 306cca2ae3

View file

@ -41,49 +41,52 @@
testing testing
progress) progress)
;; Curry version of line parser with configurable cell separator and ;; Curry version of line parser with configurable cell separator and
;; string delimiter. Returns a list of lists of strings. ;; string delimiter. Returns a list of lists of strings.
(define ((make-csv-line-parser separator string-delimiter) line) (define ((make-csv-line-parser separator string-delimiter) line)
(let loop ((tokens (irregex-extract (irregex "." 'u) line)) (let loop ((tokens (string->list line))
(res '()) (res '())
(state 1)) (state 1))
(if (null? tokens) (if (null? tokens)
(reverse res) (reverse
(map
(lambda (cell)
(list->string (reverse cell)))
res))
(let ((token (car tokens))) (let ((token (car tokens)))
(case state (case state
((0) ; Parsing regular unquoted cell data - separator creates new cell ((0) ; Parsing regular unquoted cell data - separator creates new cell
(if (equal? token separator) (if (eq? token separator)
(loop (cdr tokens) (loop (cdr tokens)
res res
1) ; Start a new cell 1) ; Start a new cell
(loop (cdr tokens) (loop (cdr tokens)
(cons (string-append (car res) token) (cdr res)) (cons (cons token (car res)) (cdr res))
0))) 0)))
((1) ; Starting a new cell - check for string delimiter ((1) ; Starting a new cell - check for string delimiter
(if (equal? token string-delimiter) (if (eq? token string-delimiter)
(loop (cdr tokens) (loop (cdr tokens)
(cons "" res) ; If it is quoted, keep even empty strings there (cons '() res) ; If it is quoted, keep even empty strings there
2) 2)
(if (equal? token separator) (if (eq? token separator)
(loop (cdr tokens) (loop (cdr tokens)
(cons "" res) ; This was an empty cell (cons '() res) ; This was an empty cell
1) ; Another new cell awaiting 1) ; Another new cell awaiting
(loop (cdr tokens) (loop (cdr tokens)
(cons token res) ; first token of regular new cell (cons (list token) res) ; first token of regular new cell
0)))) 0))))
((2) ; Parsing quoted cell data - no support for escaping string delimiter! ((2) ; Parsing quoted cell data - no support for escaping string delimiter!
(if (equal? token string-delimiter) (if (eq? token string-delimiter)
(loop (cdr tokens) (loop (cdr tokens)
res res
0) ; There shouldn't be anything more, but it is safe to append the rest as normal unquoted data 0) ; There shouldn't be anything more, but it is safe to append the rest as normal unquoted data
(loop (cdr tokens) (loop (cdr tokens)
(cons (string-append (car res) token) (cdr res)) (cons (cons token (car res)) (cdr res))
2)))))))) ; Continue inside quoted data 2)))))))) ; Continue inside quoted data
;; Parses given CSV lines list ;; Parses given CSV lines list
(define (csv-parse-lines lines . args) (define (csv-parse-lines lines . args)
(let* ((separator (get-keyword #:separator args (lambda () ";"))) (let* ((separator (get-keyword #:separator args (lambda () #\;)))
(string-delimiter (get-keyword #:string-delimiter args (lambda () "\""))) (string-delimiter (get-keyword #:string-delimiter args (lambda () #\")))
(csv-parse-line (make-csv-line-parser separator string-delimiter)) (csv-parse-line (make-csv-line-parser separator string-delimiter))
(total (max (sub1 (length lines)) 1))) (total (max (sub1 (length lines)) 1)))
(let loop ((lines lines) (let loop ((lines lines)