hackerbase/src/sgr-list.scm

;;
;; sgr-list.scm
;;
;; Intermediate representation of strings with SGR state changes.
;;
;; ISC License
;;
;; Copyright 2023 Dominik Pantůček <dominik.pantucek@trustica.cz>
;;
;; Permission to use, copy, modify, and/or distribute this software
;; for any purpose with or without fee is hereby granted, provided
;; that the above copyright notice and this permission notice appear
;; in all copies.
;;
;; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
;; WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
;; WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
;; AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
;; CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
;; OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
;; NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
;; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;;

(declare (unit sgr-list))

(module
 sgr-list
 (
  string->sgr-list
  string->sgr-list/words

  sgr-list->string

  sgr-list-length
  sgr-list-length/stretch
  sgr-list-length-w/o-glues

  sgr-token-spaces?
  sgr-token-glue?
  sgr-token-newline?
  sgr-token-unglue?

  sgr-token-neutralize
  sgr-list-neutralize

  sgr-list-num-glues

  sgr-list-min-width

  sgr-list-last-state

  sgr-list-tests!
  )

 (import scheme
	 (chicken base)
	 (chicken string)
	 racket-kwargs
	 sgr-state
	 testing
	 util-utf8)

 ;; Converts given string into a list of string parts and sgr-states
 (define* (string->sgr-list str
			    (initial-state empty-sgr-state))
   (let ((src-len (string-length str)))
     (let loop ((src-idx 0)
		(token-start 0)
		(token-end 0)
		(token-pos 0)
		(state initial-state)
		(res '()))
       (if (>= src-idx src-len)
	   (reverse
	    (if (= token-start token-end)
		res
		(cons (cons (substring str token-start token-end)
			    token-pos)
		      res)))
	   (let ((ch (string-ref str src-idx)))
	     (cond
	      ((eq? ch #\x1b)
	       (let-values (((sgr-parsed next-idx)
			     (parse-csi-sgr-sequence+pos str src-idx state)))
		 (let ((token (update-sgr-state-from-list state sgr-parsed)))
		   (loop next-idx
			 next-idx
			 next-idx
			 0
			 token
			 (cons token
			       (if (= token-start token-end)
				   res
				   (cons (cons (substring str token-start token-end)
					       token-pos)
					 res)))))))
	      (else
	       (let ((next-idx (utf8-string-next-char str src-idx)))
		 (loop next-idx
		       token-start
		       next-idx
		       (add1 token-pos)
		       state
		       res)))))))))

 ;; Converts given string into a list of string parts and sgr-states,
 ;; handling continuous whitespace as separate tokens.
 (define* (string->sgr-list/words str
				  (initial-state empty-sgr-state))
   (let ((src-len (string-length str)))
     (let loop ((src-idx 0)
		(token-start 0)
		(token-end 0)
		(token-pos 0)
		(state initial-state)
		(space? #f)
		(res '()))
       (if (>= src-idx src-len)
	   (reverse
	    (if (= token-start token-end)
		res
		(cons (cons (substring str token-start token-end)
			    token-pos)
		      res)))
	   (let ((ch (string-ref str src-idx)))
	     (cond
	      ((eq? ch #\x1b)
	       (let-values (((sgr-parsed next-idx)
			     (parse-csi-sgr-sequence+pos str src-idx state)))
		 (let ((token (update-sgr-state-from-list state sgr-parsed)))
		   (loop next-idx
			 next-idx
			 next-idx
			 0
			 token
			 #f
			 (cons token
			       (if (= token-start token-end)
				   res
				   (cons (cons (substring str token-start token-end)
					       token-pos)
					 res)))))))
	      ((eq? ch #\backspace)
	       (let ((next-idx (add1 src-idx)))
		 (loop next-idx
		       next-idx
		       next-idx
		       0
		       state
		       #f
		       (cons (cons "\b" 1)
			     (if (= token-start token-end)
				 res
				 (cons (cons (substring str token-start token-end)
					     token-pos)
				       res))))))
	      ((eq? ch #\newline)
	       (let ((next-idx (add1 src-idx)))
		 (loop next-idx
		       next-idx
		       next-idx
		       0
		       state
		       #f
		       (cons (cons "\n" 1)
			     (if (= token-start token-end)
				 res
				 (cons (cons (substring str token-start token-end)
					     token-pos)
				       res))))))
	      ((memq ch '(#\space #\tab))
	       (let ((next-idx (add1 src-idx)))
		 (loop next-idx
		       (if space?
			   token-start
			   src-idx)
		       next-idx
		       (if space?
			   (add1 token-pos)
			   1)
		       state
		       #t
		       (if (and (not space?)
				(> token-end token-start))
			   (cons (cons (substring str token-start token-end)
				       token-pos)
				 res)
			   res))))
	      (else
	       (let ((next-idx (utf8-string-next-char str src-idx)))
		 (loop next-idx
		       (if space?
			   src-idx
			   token-start)
		       next-idx
		       (if space?
			   1
			   (add1 token-pos))
		       state
		       #f
		       (if space?
			   (cons (cons (substring str token-start token-end)
				       token-pos)
				 res)
			   res))))))))))

 ;; Converts a SGR list into a single string
 (define* (sgr-list->string lst
			    (initial-state empty-sgr-state)
			    #:reset-state (reset-state #f))
   (let loop ((lst lst)
	      (res '())
	      (state initial-state))
     (if (null? lst)
	 (string-intersperse
	  (reverse (if reset-state
		       (cons reset-state res)
		       res))
	  "")
	 (let ((token (car lst)))
	   (if (sgr-state? token)
	       (loop (cdr lst)
		     (cons (sgr-state-change->string state token) res)
		     token)
	       (loop (cdr lst)
		     (cons (car token) res)
		     state))))))

 ;; Predicate for SGR list tokens
 (define (sgr-list-token? v)
   (and (pair? v)
	(string? (car v))
	(fixnum? (cdr v))))

 ;; Returns the length of all utf8 strings in the sgr-list
 (define (sgr-list-length sl)
   (foldl
    (lambda (acc tk)
      (if (sgr-list-token? tk)
	  (+ acc (cdr tk))
	  acc))
    0 sl))

 ;; Returns the length of all utf8 strings without glues. Glues must
 ;; have at least width 1 if they are not the first or last
 ;; token. Those can have zero length. All spaces have width of
 ;; 1. Used for wrapping blocks.
 (define (sgr-list-length/stretch sl)
   (let loop ((sl sl)
	      (len 0)
	      (seen-string? #f)
	      (last-space? #f))
     (if (null? sl)
	 (if last-space?
	     (sub1 len)
	     len)
	 (let* ((tk (car sl))
		(is-spaces? (sgr-token-spaces? tk))
		(is-string? (and (not (sgr-state? tk))
				 (not is-spaces?))))
	   (loop (cdr sl)
		 (cond ((sgr-token-spaces? tk)
			(if seen-string?
			    (add1 len)
			    len))
		       ((sgr-state? tk)
			len)
		       (else
			(+ len (cdr tk))))
		 (or seen-string?
		     is-string?)
		 (if is-string?
		     #f
		     (if is-spaces?
			 #t
			 last-space?)))))))

 ;; Returns length without glues
 (define (sgr-list-length-w/o-glues sl)
   (foldl (lambda (acc tk)
	    (if (or (sgr-state? tk)
		    (sgr-token-glue? tk)
		    (sgr-token-unglue? tk))
		acc
		(+ acc (cdr tk))))
	  0 sl))


 ;; Predicate for string token containing only spaces and tabs
 (define (sgr-token-spaces? t)
   (and (pair? t)
	(string? (car t))
	(let loop ((i 0))
	  (if (= i (cdr t))
	      #t
	      (if (memq (string-ref (car t) i) '(#\space #\tab))
		  (loop (add1 i))
		  #f)))))

 ;; Predicate for string token with only spaces and at least one tab
 (define (sgr-token-glue? t)
   (and (pair? t)
	(string? (car t))
	(let loop ((i 0)
		   (glue #f))
	  (if (= i (cdr t))
	      glue
	      (let ((ch (string-ref (car t) i)))
		(if (eq? ch #\tab)
		    (loop (add1 i) #t)
		    (if (eq? ch #\space)
			(loop (add1 i) glue)
			#f)))))))

 ;; Predicate for string containing only a newline
 (define (sgr-token-newline? t)
   (and (pair? t)
	(string? (car t))
	(eq? (cdr t) 1)
	(eq? (string-ref (car t) 0) #\newline)))

 ;; Predicate for unglue (used for justify)
 (define (sgr-token-unglue? t)
   (and (pair? t)
	(string? (car t))
	(eq? (cdr t) 1)
	(eq? (string-ref (car t) 0) #\backspace)))

 ;; Replaces all occurences of #\tab with #\space and removes trailing
 ;; #\backspace
 (define (sgr-token-neutralize t)
   (if (and (pair? t)
	    (string? (car t)))
       (let* ((str (string-copy (car t)))
	      (len (string-length str)))
	 (if (> len 0)
	     (let loop ((idx 0))
	       (if (= idx len)
		   (if (eq? (string-ref str (sub1 len)) #\backspace)
		       (cons (substring str 0 (sub1 len))
			     (sub1 len))
		       (cons str len))
		   (let ((ch (string-ref str idx)))
		     (when (eq? ch #\tab)
		       (string-set! str idx #\space))
		     (loop (add1 idx)))))
	     t))))

 ;; Neutralizes whole SGR list
 (define (sgr-list-neutralize sl)
   (map sgr-token-neutralize sl))

 ;; Returns the number of glues in given SGR list
 (define (sgr-list-num-glues sl)
   (foldl (lambda (acc t)
	    (if (sgr-token-glue? t)
		(add1 acc)
		acc))
	  0 sl))

 ;; Returns the longest word
 (define (sgr-list-min-width sl)
   (let loop ((sl sl)
	      (res 0))
     (if (null? sl)
	 res
	 (let ((tk (car sl)))
	   (loop (cdr sl)
		 (if (sgr-state? tk)
		     res
		     (max res (cdr tk))))))))

 ;; Returns the last SGR state in given sgr-list
 (define* (sgr-list-last-state sl (initial-state empty-sgr-state))
   (let loop ((sl sl)
	      (state initial-state))
     (if (null? sl)
	 state
	 (loop (cdr sl)
	       (if (sgr-state? (car sl))
		   (car sl)
		   state)))))

 ;; Module self-tests
 (define (sgr-list-tests!)
   (run-tests
    sgr-list
    (test-equal? sgr-list->string
		 (sgr-list->string '(("Hello" . 5)))
		 "Hello")
    (test-true sgr-list-token?
	       (sgr-list-token? '("Hello" . 5)))
    (test-false sgr-list-token?
		(sgr-list-token? empty-sgr-state))
    (test-false sgr-list-token?
		(sgr-list-token? (set-sgr-state-foreground
				  empty-sgr-state
				  (make-sgr-truecolor 1 2 3))))
    (test-equal? sgr-list-length
		 (sgr-list-length '(("Hello" . 5)))
		 5)
    (test-true sgr-token-spaces?
	       (sgr-token-spaces? '("     " . 5)))
    (test-true sgr-token-spaces?
	       (sgr-token-spaces? '("  \x09  " . 5)))
    (test-false sgr-token-spaces?
		(sgr-token-spaces? '("  x  " . 5)))
    (test-true sgr-token-newline?
	       (sgr-token-newline? '("\n" . 1)))
    (test-false sgr-token-newline?
		(sgr-token-newline? '("\na" . 2)))
    (test-false sgr-token-newline?
		(sgr-token-newline? '("x" . 1)))
    (test-true sgr-token-glue?
	       (sgr-token-glue? '("  \t  " . 5)))
    (test-true sgr-token-glue?
	       (sgr-token-glue? '("\t" . 1)))
    (test-false sgr-token-glue?
		(sgr-token-glue? '("     " . 5)))
    (test-false sgr-token-glue?
		(sgr-token-glue? '("  x  " . 5)))
    (test-equal? string->sgr-list
		 (string->sgr-list "Hello")
		 '(("Hello" . 5)))
    (test-equal? string->sgr-list
		 (string->sgr-list "\x1b[1mHello \x1b[0mWorld!")
		 '(1 ("Hello " . 6) 0 ("World!" . 6)))
    (test-equal? string->sgr-list/words
		 (string->sgr-list/words "Hello World!")
		 '(("Hello" . 5)
		   (" " . 1)
		   ("World!" . 6)))
    (test-equal? string->sgr-list/words
		 (string->sgr-list/words "\x1b[1mHello \x1b[0mWorld!")
		 '(1
		   ("Hello" . 5)
		   (" " . 1)
		   0
		   ("World!" . 6)))
    (test-equal? string->sgr-list/words
		 (string->sgr-list/words "Hello World!\b")
		 '(("Hello" . 5)
		   (" " . 1)
		   ("World!" . 6)
		   ("\b" . 1)))
    (test-equal? string->sgr-list/words
		 (string->sgr-list/words "Hello\nWorld!")
		 '(("Hello" . 5)
		   ("\n" . 1)
		   ("World!" . 6)))
    (test-equal? sgr-list-length/stretch
		 (sgr-list-length/stretch
		  '(("Hello" . 5)))
		 5)
    (test-equal? sgr-list-length/stretch
		 (sgr-list-length/stretch
		  '(("Hello" . 5) ("    " . 4)))
		 5)
    (test-equal? sgr-list-length/stretch
		 (sgr-list-length/stretch
		  '(("Hello" . 5) ("    " . 4) 1))
		 5)
    (test-equal? sgr-list-length/stretch
		 (sgr-list-length/stretch
		  '(1 ("    " . 4) ("Hello" . 5) ("    " . 4) 1))
		 5)
    (test-equal? sgr-list-length/stretch
		 (sgr-list-length/stretch
		  '(("Hello" . 5) ("    " . 4) ("World" . 5)))
		 11)
    (test-equal? sgr-list-length-w/o-glues
		 (sgr-list-length-w/o-glues
		  '(("Hello" . 5) ("  \t  " . 5) ("World" . 5)))
		 10)
    (test-true sgr-token-unglue?
	       (sgr-token-unglue? '("\b" . 1)))
    (test-false sgr-token-unglue?
		(sgr-token-unglue? '(" \b" . 2)))
    (test-equal? sgr-token-neutralize
		 (sgr-token-neutralize '("Hello\b" . 6))
		 '("Hello" . 5))
    ))

 )