;;;; regex.scm - Unit for using the POSIX regex package
;
; Copyright (c) 2000-2003, Felix L. Winkelmann
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
; conditions are met:
;
;   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
;     disclaimer. 
;   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
;     disclaimer in the documentation and/or other materials provided with the distribution. 
;   Neither the name of the author nor the names of its contributors may be used to endorse or promote
;     products derived from this software without specific prior written permission. 
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.
;
; Send bugs, suggestions and ideas to: 
;
; felix@call-with-current-continuation.org
;
; Felix L. Winkelmann
; Steinweg 1A
; 37130 Gleichen, OT Weissenborn
; Germany


(cond-expand
 [chicken-compile-shared]
 [else (declare (unit regex))] )

(declare 
  (usual-integrations)
  (fixnum)
  (disable-interrupts)
  (hide ##regexp#re-match ##regexp#buffers ##regexp#gather-results ##regexp#gather-result-positions
	##regexp#buffer-index ##regexp#compile ##regexp#compile-pattern)
  (foreign-declare #<<EOF
#include <regex.h>

#define C_MAXIMAL_NUMBER_OF_SUB_MATCHES 256

regmatch_t C_match_registers[ C_MAXIMAL_NUMBER_OF_SUB_MATCHES + 1 ];

#define C_regexp_alloc_buffer(ptr) (C_set_block_item((ptr), 0, (C_word)calloc(1, sizeof(regex_t))))
#define C_regexp_count_matches(ptr) C_fix(((regex_t *)C_slot(ptr, 0))->re_nsub + 1)
#define C_regexp_register_start(i) C_fix(C_match_registers[ C_unfix(i) ].rm_so)
#define C_regexp_register_end(i) C_fix(C_match_registers[ C_unfix(i) ].rm_eo)
EOF
) )

(cond-expand
 [paranoia]
 [else
  (declare
    (no-bound-checks)
    (bound-to-procedure
     ##sys#check-string ##sys#check-exact ##sys#make-pointer ##sys#cons ##sys#size ##sys#slot
     ##regexp#compile ##regexp#gather-results ##regexp#re-match
     ##regexp#re-compile-pattern) ) ] )

(cond-expand
 [unsafe
  (eval-when (compile)
    (define-macro (##sys#check-structure . _) '(##core#undefined))
    (define-macro (##sys#check-range . _) '(##core#undefined))
    (define-macro (##sys#check-pair . _) '(##core#undefined))
    (define-macro (##sys#check-list . _) '(##core#undefined))
    (define-macro (##sys#check-symbol . _) '(##core#undefined))
    (define-macro (##sys#check-string . _) '(##core#undefined))
    (define-macro (##sys#check-char . _) '(##core#undefined))
    (define-macro (##sys#check-exact . _) '(##core#undefined))
    (define-macro (##sys#check-port . _) '(##core#undefined))
    (define-macro (##sys#check-number . _) '(##core#undefined))
    (define-macro (##sys#check-byte-vector . _) '(##core#undefined)) ) ]
 [else] )

(register-feature! 'regex)


;;; Create global pattern buffer and initalize:

(define-constant ##regexp#buffer-count 5)

(define ##regexp#buffers
  (list (cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer)) ) )

(define ##regexp#buffer-index 0)

(for-each (lambda (b) (##core#inline "C_regexp_alloc_buffer" (cdr b))) ##regexp#buffers)
(set! ##regexp#buffers (list->vector ##regexp#buffers))


;;; Compile regular expression into pattern buffer:

(define ##regexp#re-compile-pattern
  (foreign-lambda* int ((c-string rx) (c-pointer buffer))
    "regfree(buffer);"
    "return(regcomp((regex_t *)buffer, rx, REG_EXTENDED));") )

(define (##regexp#compile regexp loc)
  (##sys#check-string regexp loc)
  (let ([index #f])
    (let loop ([i 0])
      (cond [(fx>= i ##regexp#buffer-count)
	     (set! index ##regexp#buffer-index)
	     (set! ##regexp#buffer-index (fx+ index 1)) 
	     (when (fx>= ##regexp#buffer-index ##regexp#buffer-count)
	       (set! ##regexp#buffer-index 0) ) ]
	    [(string=? regexp (##sys#slot (##sys#slot ##regexp#buffers i) 0))
	     (set! index i) ]
	    [else (loop (fx+ i 1))] ) )
    (let ([b (##sys#slot ##regexp#buffers index)])
      (if (zero? (##regexp#re-compile-pattern regexp (##sys#slot b 1)))
	  (##sys#setslot b 0 regexp) 
	  (##sys#error loc "can not compile regular expression" regexp) )
      (##sys#slot b 1) ) ) )

(define regexp 
  (let ([alloc
	 (foreign-lambda* c-pointer () "return(calloc(1, sizeof(regex_t)));") ]
	[free (foreign-lambda* void ((c-pointer rx))
		"regfree((regex_t *)rx);"
		"C_free(rx);") ]
	[comp 
	 (foreign-lambda* int ([c-string rx] [c-pointer ptr])
	   "return(regcomp((regex_t *)ptr, rx, REG_EXTENDED));") ] )
    (lambda (rx)
      (##sys#check-string rx 'regexp)
      (let ([rt (alloc)])
	(set-finalizer! rt free)
	(if (zero? (comp rx rt))
	    (##sys#make-structure 'regexp rt)
	    (##sys#error 'regexp "can not compile regular expression" rx) ) ) ) ) )

(define (regexp? x)
  (##sys#structure? x 'regexp) )


;;; Gather matched result strings or positions:

(define (##regexp#gather-result-positions result b)
  (and (zero? result)
       (let ([n (##core#inline "C_regexp_count_matches" b)])
	 (let loop ([i 0])
	   (if (fx>= i n)
	       '()
	       (let ([start (##core#inline "C_regexp_register_start" i)])
		 (cons
		  (if (fx>= start 0)
		      (cons start (cons (##core#inline "C_regexp_register_end" i) '()))
		      #f)
		  (loop (fx+ i 1)) ) ) ) ) ) ) )

(define ##regexp#gather-results
  (let ([substring substring])
    (lambda (result str b)
      (let ([ps (##regexp#gather-result-positions result b)])
	(and ps
	     (##sys#map (lambda (poss) (and poss (apply substring str poss)))
		   ps) ) ) ) ) )


;;; Match string with regular expression:

(define ##regexp#re-match
  (foreign-lambda* int ((c-pointer buffer) (c-string str) (int start) (int range))
    "int i, r, n;"
    "regex_t *rx = (regex_t *)buffer;"
    "if(range) str[ range ] = '\0';"
    "n = rx->re_nsub + 1;"
    "r = regexec((regex_t *)buffer, str + start, n, C_match_registers, 0);"
    "if(start != 0) {"
    "  for(i = 0; i < n; ++i) {"
    "    C_match_registers[ i ].rm_so += start;"
    "    C_match_registers[ i ].rm_eo += start;"
    "  }"
    "}"
    "return(r);") )

(let ([b #f]
      [string-append string-append] )

  (define (prepare regexp str start loc)
    (##sys#check-string str loc)
    (let ([si (if (pair? start) (##sys#slot start 0) 0)])
      (##sys#check-exact si loc)
      (set! b 
	(cond [(string? regexp) (##regexp#compile (string-append "^" regexp "$") loc)]
	      [(##sys#structure? regexp 'regexp) (##sys#slot regexp 1)]
	      [else (##sys#signal-hook #:type-error loc "bad argument type - not a string or compiled regexp" regexp)] ) )
      (##regexp#re-match b str si 0) ) )

  (set! string-match
    (lambda (regexp str . start)
      (let ([m (prepare regexp str start 'string-match)])
	(##regexp#gather-results m str b) ) ) )

  (set! string-match-positions
    (lambda (regexp str . start)
      (let ([m (prepare regexp str start 'string-match-positions)])
	(##regexp#gather-result-positions m b) ) ) ) )


;;; Search string with regular expression:

(let ([b #f])

  (define (prepare regexp str start-and-range loc)
    (##sys#check-string str loc)
    (let* ([range (and (##core#inline "C_blockp" start-and-range) 
		       (##sys#slot start-and-range 1) ) ]
	   [si (if range (##sys#slot start-and-range 0) 0)]
	   [ri (if (##core#inline "C_blockp" range) (##sys#slot range 0) 0)] )
      (##sys#check-exact si loc)
      (##sys#check-exact ri loc)
      (set! b
	(cond [(string? regexp) (##regexp#compile regexp loc)]
	      [(##sys#structure? regexp 'regexp) (##sys#slot regexp 1)]
	      [else (##sys#signal-hook #:type-error loc "bad argument type - not a string or compiled regexp" regexp)] ) )
      (##regexp#re-match b str si ri) ) )

  (set! string-search 
    (lambda (regexp str . start-and-range)
      (let ([s (prepare regexp str start-and-range 'string-search)])
	(##regexp#gather-results s str b) ) ) )

  (set! string-search-positions
    (lambda (regexp str . start-and-range)
      (let ([s (prepare regexp str start-and-range 'string-search-positions)])
	(##regexp#gather-result-positions s b) ) ) ) )


;;; Split string into fields:

(define string-split-fields
  (let ([reverse reverse]
	[substring substring]
	[string-search-positions string-search-positions] )
    (lambda (regexp str . mode-and-start)
      (##sys#check-string str 'string-split-fields)
      (let* ([argc (length mode-and-start)]
	     [len (##sys#size str)]
	     [mode (if (fx> argc 0) (car mode-and-start) #t)]
	     [start (if (fx> argc 1) (cadr mode-and-start) 0)] 
	     [fini (case mode
		     [(#:suffix)
		      (lambda (ms start)
			(if (fx< start len)
			    (##sys#error 'string-split-fields "record does not end with suffix" str regexp)
			    (reverse ms) ) ) ]
		     [(#:infix)
		      (lambda (ms start)
			(if (fx>= start len)
			    (reverse ms)
			    (reverse (cons (substring str start len) ms)) ) ) ]
		     [else (lambda (ms start) (reverse ms)) ] ) ]
	     [fetch (case mode
		      [(#:infix #:suffix) (lambda (start from to) (substring str start from))]
		      [else (lambda (start from to) (substring str from to))] ) ] )
	(let loop ([ms '()] [start start])
	  (let ([m (string-search-positions regexp str start)])
	    (if m
		(let* ([mp (##sys#slot m 0)]
		       [from (##sys#slot mp 0)]
		       [to (cadr mp)] )
		  (if (fx= from to)
		      (if (fx= to len)
			  (fini ms start)
			  (loop (cons (fetch start (fx+ from 1) (fx+ to 2)) ms) (fx+ to 1)) )
		      (loop (cons (fetch start from to) ms) to) ) )
		(fini ms start) ) ) ) ) ) ) )


;;; Substitute matching strings:

(define string-substitute
  (let ([substring substring]
	[reverse reverse]
	[make-string make-string]
	[string-search-positions string-search-positions] )
    (lambda (regex subst string . flag)
      (##sys#check-string subst 'string-substitute)
      (let* ([which (if (pair? flag) (car flag) 1)]
	     [substlen (##sys#size subst)]
	     [substlen-1 (fx- substlen 1)]
	     [result '()] 
	     [total 0] )

	(define (push x) 
	  (set! result (cons x result))
	  (set! total (fx+ total (##sys#size x))) )
	
	(define (substitute matches)
	  (let loop ([start 0] [index 0])
	    (if (fx>= index substlen-1)
		(push (if (fx= start 0) subst (substring subst start substlen)))
		(let ([c (##core#inline "C_subchar" subst index)]
		      [index+1 (fx+ index 1)] )
		  (if (char=? c #\\) 
		      (let ([c2 (##core#inline "C_subchar" subst index+1)])
			(if (not (char=? #\\ c2))
			    (let ([mi (list-ref matches (fx- (char->integer c2) 48))])
			      (push (substring subst start index))
			      (push (substring string (car mi) (cadr mi))) 
			      (loop (fx+ index 2) index+1) )
			    (loop start (fx+ index+1 1)) ) )
		      (loop start index+1) ) ) ) ) )

	(define (concatenate strs)
	  (let ([str (make-string total)])
	    (let loop ([ss strs] [index 0])
	      (if (null? ss) 
		  str
		  (let* ([si (car ss)]
			 [len (##sys#size si)] )
		    (##core#inline "C_substring_copy" si str 0 len index)
		    (loop (cdr ss) (fx+ index len)) ) ) ) ) )

	(let loop ([index 0] [count 1])
	  (let ([matches (string-search-positions regex string index)])
	    (cond [matches
		   (let* ([range (car matches)]
			  [upto (cadr range)] )
		     (cond [(or (not (fixnum? which)) (fx= count which))
			    (push (substring string index (car range)))
			    (substitute matches)
			    (loop upto #f) ]
			   [else
			    (push (substring string index upto))
			    (loop upto (fx+ count 1)) ] ) ) ]
		  [else
		   (push (substring string index (##sys#size string)))
		   (concatenate (reverse result)) ] ) ) ) ) ) ) )

(define string-substitute*
  (let ([string-search-positions string-search-positions])
    (lambda (str smap)
      (##sys#check-string str 'string-substitute*)
      (##sys#check-list smap 'string-substitute*)
      (let ([len (##sys#size str)])
	(define (collect i from total fs)
	  (if (fx>= i len)
	      (##sys#fragments->string
	       total
	       (reverse 
		(if (fx> i from) 
		    (cons (##sys#substring str from i) fs)
		    fs) ) )
	      (let loop ([smap smap] [pos len])
		(if (null? smap)
		    (collect pos from (fx+ total (fx- pos i)) fs)
		    (let* ([p (car smap)]
			   [sm (car p)]
			   [st (cdr p)] 
			   [m (string-search-positions sm str i)] 
			   [ma (and m (##sys#slot m 0))] )
		      (if (and ma (fx= i (##sys#slot ma 0)))
			  (let ([i2 (##sys#slot (##sys#slot ma 1) 0)])
			    (when (fx> i from)
			      (set! fs 
				(cons (##sys#substring str from i) fs)) ) 
			    (collect 
			     i2 i2
			     (fx+ total (string-length st))
			     (cons st fs) ) ) 
			  (loop (cdr smap) 
				(if ma 
				    (fxmin pos (##sys#slot ma 0)) 
				    pos) ) ) ) ) ) ) ) 
	(collect 0 0 0 '()) ) ) ) )


;;; Some useful things:

(define pattern->regexp
  (let ([list->string list->string]
	[string->list string->list] )
    (lambda (s)
      (##sys#check-string s 'pattern->regexp)
      (list->string
       (let loop ([cs (string->list s)])
	 (if (null? cs)
	     '()
	     (let ([c (car cs)]
		   [rest (cdr cs)] )
	       (cond [(char=? c #\*) `(#\. #\* ,@(loop rest))]
		     [(char=? c #\?) (cons '#\. (loop rest))]
		     [(or (char-alphabetic? c) (char-numeric? c)) (cons c (loop rest))]
		     [else `(#\\ ,c ,@(loop rest))] ) ) ) ) ) ) ) )

(define grep
  (let ([string-match string-match])
    (lambda (rx lst)
      (##sys#check-list lst 'grep)
      (let loop ([lst lst])
	(if (null? lst)
	    '()
	    (let ([x (car lst)]
		  [r (cdr lst)] )
	      (if (string-match rx x)
		  (cons x (loop r))
		  (loop r) ) ) ) ) ) ) )
