hackerbase/src/util-csv.scm

161 lines
4.6 KiB
Scheme

;;
;; util-csv.scm
;;
;; Simple, incomplete and incorrect but fast CSV loader.
;;
;; ISC License
;;
;; Copyright 2023 Brmlab, z.s.
;; Dominik Pantůček <dominik.pantucek@trustica.cz>
;;
;; Permission to use, copy, modify, and/or distribute this software
;; for any purpose with or without fee is hereby granted, provided
;; that the above copyright notice and this permission notice appear
;; in all copies.
;;
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;;
(declare (unit util-csv))
(import duck)
(module*
util-csv
#:doc ("This module provides a very simple, incomplete and incorrect but fast
CSV loader.")
(
make-csv-line-parser
csv-parse
csv-split-header
csv-simple-tests!
)
(import scheme
(chicken base)
(chicken keyword)
(chicken irregex)
(chicken condition)
testing
progress
util-io
racket-kwargs)
(define/doc ((make-csv-line-parser separator string-delimiter) line)
("Curried version of fast CSV line parser with given separator and string delimiter.
* ```separator``` - separator character
* ```string-delimiger``` - string quotation character
* ```line``` - line to parse")
(let loop ((tokens (string->list line))
(res '())
(state 1))
(if (null? tokens)
(reverse
(map
(lambda (cell)
(list->string (reverse cell)))
res))
(let ((token (car tokens)))
(case state
((0) ; Parsing regular unquoted cell data - separator creates new cell
(if (eq? token separator)
(loop (cdr tokens)
res
1) ; Start a new cell
(loop (cdr tokens)
(cons (cons token (car res)) (cdr res))
0)))
((1) ; Starting a new cell - check for string delimiter
(if (eq? token string-delimiter)
(loop (cdr tokens)
(cons '() res) ; If it is quoted, keep even empty strings there
2)
(if (eq? token separator)
(loop (cdr tokens)
(cons '() res) ; This was an empty cell
1) ; Another new cell awaiting
(loop (cdr tokens)
(cons (list token) res) ; first token of regular new cell
0))))
((2) ; Parsing quoted cell data - no support for escaping string delimiter!
(if (eq? token string-delimiter)
(loop (cdr tokens)
res
0) ; There shouldn't be anything more, but it is safe to append the rest as normal unquoted data
(loop (cdr tokens)
(cons (cons token (car res)) (cdr res))
2)))))))) ; Continue inside quoted data
(define*/doc (csv-parse-lines lines
#:separator (separator #\;)
#:string-delimiter (string-delimiter #\"))
("Parses given lines and returns list of lists of strings.")
(let* ((csv-parse-line (make-csv-line-parser separator string-delimiter))
(total (max (sub1 (length lines)) 1)))
(let loop ((lines lines)
(idx 0)
(res '()))
(if (null? lines)
(reverse res)
(let ((line (car lines)))
(progress%-advance (/ idx total))
(loop (cdr lines)
(add1 idx)
(cons (csv-parse-line line)
res)))))))
(define/doc (csv-parse fn . args)
("Uses ```csv-parse-lines``` on lines read from given file ```fn```.")
(call/cc
(lambda (ret)
(with-exception-handler
(lambda (ex)
(ret #f))
(lambda ()
(let ((lines (read-lines/no-bom (open-input-file fn))))
(apply csv-parse-lines lines args)))))))
(define/doc (csv-split-header csv)
("Splits given loaded CSV into two tables at the first empty row.")
(let loop ((body csv)
(rhead '()))
(if (null? body)
(list (reverse rhead) '())
(let ((row (car body)))
(if (null? row)
(list (reverse rhead)
(cdr body))
(loop (cdr body)
(cons row rhead)))))))
;; Module self-tests
(define (csv-simple-tests!)
(run-tests
csv-simple
(test-equal? csv-parse-line
((make-csv-line-parser #\; #\") "test;2;3")
'("test" "2" "3"))
(test-equal? csv-parse-line
((make-csv-line-parser #\; #\") "test;\"2;quoted\";3")
'("test" "2;quoted" "3"))
(test-equal? csv-split-header
(csv-split-header '((1 2) () (3 4)))
'(((1 2)) ((3 4))))
(test-equal? csv-split-header
(csv-split-header '((1 2) (5 6) (3 4)))
'(((1 2) (5 6) (3 4)) ()))
(test-equal? csv-parse-lines
(csv-parse-lines '("a;b;c" "1;2"))
'(("a" "b" "c") ("1" "2")))
))
)