From 306cca2ae3b20a975ea03e6d3f0fb219fb9d005c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Pant=C5=AF=C4=8Dek?= Date: Fri, 31 Mar 2023 19:20:49 +0200 Subject: [PATCH] Heavily optimize CSV reader. --- csv-simple.scm | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/csv-simple.scm b/csv-simple.scm index 60a3331..e20689a 100644 --- a/csv-simple.scm +++ b/csv-simple.scm @@ -41,49 +41,52 @@ testing progress) - ;; Curry version of line parser with configurable cell separator and + ;; Curry version of line parser with configurable cell separator and ;; string delimiter. Returns a list of lists of strings. (define ((make-csv-line-parser separator string-delimiter) line) - (let loop ((tokens (irregex-extract (irregex "." 'u) line)) + (let loop ((tokens (string->list line)) (res '()) (state 1)) (if (null? tokens) - (reverse res) + (reverse + (map + (lambda (cell) + (list->string (reverse cell))) + res)) (let ((token (car tokens))) (case state ((0) ; Parsing regular unquoted cell data - separator creates new cell - (if (equal? token separator) + (if (eq? token separator) (loop (cdr tokens) res 1) ; Start a new cell (loop (cdr tokens) - (cons (string-append (car res) token) (cdr res)) + (cons (cons token (car res)) (cdr res)) 0))) ((1) ; Starting a new cell - check for string delimiter - (if (equal? token string-delimiter) + (if (eq? token string-delimiter) (loop (cdr tokens) - (cons "" res) ; If it is quoted, keep even empty strings there + (cons '() res) ; If it is quoted, keep even empty strings there 2) - (if (equal? token separator) + (if (eq? token separator) (loop (cdr tokens) - (cons "" res) ; This was an empty cell + (cons '() res) ; This was an empty cell 1) ; Another new cell awaiting (loop (cdr tokens) - (cons token res) ; first token of regular new cell + (cons (list token) res) ; first token of regular new cell 0)))) ((2) ; Parsing quoted cell data - no support for escaping string delimiter! - (if (equal? token string-delimiter) + (if (eq? token string-delimiter) (loop (cdr tokens) res 0) ; There shouldn't be anything more, but it is safe to append the rest as normal unquoted data (loop (cdr tokens) - (cons (string-append (car res) token) (cdr res)) + (cons (cons token (car res)) (cdr res)) 2)))))))) ; Continue inside quoted data - ;; Parses given CSV lines list (define (csv-parse-lines lines . args) - (let* ((separator (get-keyword #:separator args (lambda () ";"))) - (string-delimiter (get-keyword #:string-delimiter args (lambda () "\""))) + (let* ((separator (get-keyword #:separator args (lambda () #\;))) + (string-delimiter (get-keyword #:string-delimiter args (lambda () #\"))) (csv-parse-line (make-csv-line-parser separator string-delimiter)) (total (max (sub1 (length lines)) 1))) (let loop ((lines lines)