hackerbase/csv-simple.scm

122 lines
3.7 KiB
Scheme

;;
;; csv-simple.scm
;;
;; Simple, incomplete and incorrect but fast CSV loader.
;;
;; ISC License
;;
;; Copyright 2023 Brmlab, z.s.
;; Dominik Pantůček <dominik.pantucek@trustica.cz>
;;
;; Permission to use, copy, modify, and/or distribute this software
;; for any purpose with or without fee is hereby granted, provided
;; that the above copyright notice and this permission notice appear
;; in all copies.
;;
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;;
(declare (unit csv-simple))
(module
csv-simple
(
csv-parse
csv-split-header
csv-simple-tests!
)
(import scheme
(chicken base)
(chicken keyword)
(chicken io)
(chicken irregex)
testing)
;; Curry version of line parser with configurable cell separator and
;; string delimiter. Returns a list of lists of strings.
(define ((make-csv-line-parser separator string-delimiter) line)
(let loop ((tokens (irregex-extract (irregex "." 'u) line))
(res '())
(state 1))
(if (null? tokens)
(reverse res)
(let ((token (car tokens)))
(case state
((0) ; Parsing regular unquoted cell data - separator creates new cell
(if (equal? token separator)
(loop (cdr tokens)
res
1) ; Start a new cell
(loop (cdr tokens)
(cons (string-append (car res) token) (cdr res))
0)))
((1) ; Starting a new cell - check for string delimiter
(if (equal? token string-delimiter)
(loop (cdr tokens)
(cons "" res) ; If it is quoted, keep even empty strings there
2)
(if (equal? token separator)
(loop (cdr tokens)
(cons "" res) ; This was an empty cell
1) ; Another new cell awaiting
(loop (cdr tokens)
(cons token res) ; first token of regular new cell
0))))
((2) ; Parsing quoted cell data - no support for escaping string delimiter!
(if (equal? token string-delimiter)
(loop (cdr tokens)
res
0) ; There shouldn't be anything more, but it is safe to append the rest as normal unquoted data
(loop (cdr tokens)
(cons (string-append (car res) token) (cdr res))
2)))))))) ; Continue inside quoted data
;; Loads given CSV file and parses its lines into lists
(define (csv-parse fn . args)
(let* ((separator (get-keyword #:separator args (lambda () ";")))
(string-delimiter (get-keyword #:string-delimiter args (lambda () "\"")))
(lines (read-lines (open-input-file fn)))
(csv-parse-line (make-csv-line-parser separator string-delimiter)))
(map csv-parse-line lines)))
;; Splits CSV into header and body based on the first empty row.
(define (csv-split-header csv)
(let loop ((body csv)
(rhead '()))
(if (null? body)
(list (reverse rhead) '())
(let ((row (car body)))
(if (null? row)
(list (reverse rhead)
(cdr body))
(loop (cdr body)
(cons row rhead)))))))
;; Module self-tests
(define (csv-simple-tests!)
(run-tests
csv-simple
(test-equal? csv-parse-line
((make-csv-line-parser ";" "\"") "test;2;3")
'("test" "2" "3"))
(test-equal? csv-parse-line
((make-csv-line-parser ";" "\"") "test;\"2;quoted\";3")
'("test" "2;quoted" "3"))
(test-equal? csv-split-header
(csv-split-header '((1 2) () (3 4)))
'(((1 2)) ((3 4))))
(test-equal? csv-split-header
(csv-split-header '((1 2) (5 6) (3 4)))
'(((1 2) (5 6) (3 4)) ()))
))
)