pgloader/lib/abnf/abnf.lisp

;;;
;;; Augmented BNF for Syntax Specifications: ABNF
;;;
;;; Parsing ABNF syntaxes so that we can offer users to edit them
;;;
;;; see http://tools.ietf.org/html/draft-ietf-syslog-protocol-15#page-10
;;; and http://tools.ietf.org/html/rfc2234
;;;
(in-package #:abnf)

(defvar *abnf-rfc2234-abnf-definition*
  "     rulelist       =  1*( rule / (*c-wsp c-nl) )

        rule           =  rulename defined-as elements c-nl
                               ; continues if next line starts
                               ;  with white space

        rulename       =  ALPHA *(ALPHA / DIGIT / \"-\")

        defined-as     =  *c-wsp (\"=\" / \"=/\") *c-wsp
                               ; basic rules definition and
                               ;  incremental alternatives

        elements       =  alternation *c-wsp

        c-wsp          =  WSP / (c-nl WSP)

        c-nl           =  comment / CRLF
                               ; comment or newline

        comment        =  \";\" *(WSP / VCHAR) CRLF

        alternation    =  concatenation
                          *(*c-wsp \"/\" *c-wsp concatenation)

        concatenation  =  repetition *(1*c-wsp repetition)

        repetition     =  [repeat] element

        repeat         =  1*DIGIT / (*DIGIT \"*\" *DIGIT)

        element        =  rulename / group / option /
                          char-val / num-val / prose-val

        group          =  \"(\" *c-wsp alternation *c-wsp \")\"

        option         =  \"[\" *c-wsp alternation *c-wsp \"]\"

        char-val       =  DQUOTE *(%x20-21 / %x23-7E) DQUOTE
                               ; quoted string of SP and VCHAR without DQUOTE

        num-val        =  \"%\" (bin-val / dec-val / hex-val)

        bin-val        =  \"b\" 1*BIT
                          [ 1*(\".\" 1*BIT) / (\"-\" 1*BIT) ]
                               ; series of concatenated bit values
                               ; or single ONEOF range

        dec-val        =  \"d\" 1*DIGIT
                          [ 1*(\".\" 1*DIGIT) / (\"-\" 1*DIGIT) ]

        hex-val        =  \"x\" 1*HEXDIG
                          [ 1*(\".\" 1*HEXDIG) / (\"-\" 1*HEXDIG) ]

        prose-val      =  \"<\" *(%x20-3D / %x3F-7E) \">\"
                               ; bracketed string of SP and VCHAR without angles
                               ; prose description, to be used as last resort
  "
  "See http://tools.ietf.org/html/rfc2234#section-4")

(defvar *abnf-rfc-syslog-draft-15*
  "SYSLOG-MSG      = HEADER SP STRUCTURED-DATA [SP MSG]

   HEADER          = VERSION SP FACILITY SP SEVERITY SP
                     TRUNCATE SP TIMESTAMP SP HOSTNAME
                     SP APP-NAME SP PROCID SP MSGID
   VERSION         = NONZERO-DIGIT 0*2DIGIT
   FACILITY        = \"0\" / (NONZERO-DIGIT 0*9DIGIT)
                         ; range 0..2147483647 ;
   SEVERITY        = \"0\" / \"1\" / \"2\" / \"3\" / \"4\" / \"5\" /
   \"6\" / \"7\"
   TRUNCATE        = 2DIGIT
   HOSTNAME        = 1*255PRINTUSASCII

   APP-NAME        = 1*48PRINTUSASCII
   PROCID          = \"-\" / 1*128PRINTUSASCII
   MSGID           = \"-\" / 1*32PRINTUSASCII

   TIMESTAMP       = FULL-DATE \"T\" FULL-TIME
   FULL-DATE       = DATE-FULLYEAR \"-\" DATE-MONTH \"-\" DATE-MDAY
   DATE-FULLYEAR   = 4DIGIT
   DATE-MONTH      = 2DIGIT  ; 01-12
   DATE-MDAY       = 2DIGIT  ; 01-28, 01-29, 01-30, 01-31 based on
                                 ; month/year ;
   FULL-TIME       = PARTIAL-TIME TIME-OFFSET
   PARTIAL-TIME    = TIME-HOUR \":\" TIME-MINUTE \":\" TIME-SECOND
                     [TIME-SECFRAC]
   TIME-HOUR       = 2DIGIT  ; 00-23
   TIME-MINUTE     = 2DIGIT  ; 00-59
   TIME-SECOND     = 2DIGIT  ; 00-58, 00-59, 00-60 based on leap
                                 ; second rules ;
   TIME-SECFRAC    = \".\" 1*6DIGIT
   TIME-OFFSET     = \"Z\" / TIME-NUMOFFSET
   TIME-NUMOFFSET  = (\"+\" / \"-\") TIME-HOUR \":\" TIME-MINUTE


   STRUCTURED-DATA = 1*SD-ELEMENT / \"-\"
   SD-ELEMENT      = \"[\" SD-ID *(SP SD-PARAM) \"]\"
   SD-PARAM        = PARAM-NAME \"=\" %d34 PARAM-VALUE %d34
   SD-ID           = SD-NAME
   PARAM-NAME      = SD-NAME
   PARAM-VALUE     = UTF-8-STRING ; characters '\"', '\' and
                                      ; ']' MUST be escaped. ;
   SD-NAME         = 1*32PRINTUSASCII
                         ; except '=', SP, ']', %d34 (\") ;

   MSG             = UTF-8-STRING
   UTF-8-STRING    = *OCTET ; Any VALID UTF-8 String
                         ; \"shortest form\" MUST be used ;

   OCTET           = %d00-255
   SP              = %d32
   PRINTUSASCII    = %d33-126
   NONZERO-DIGIT   = \"1\" / \"2\" / \"3\" / \"4\" / \"5\" /
                     \"6\" / \"7\" / \"8\" / \"9\"
   DIGIT           = \"0\" / NONZERO-DIGIT"
  "See http://tools.ietf.org/html/draft-ietf-syslog-protocol-15#page-10")

(defvar *abnf-rsyslog*
  (concatenate 'string
	       "RSYSLOG-MSG = \"<\" PRIVAL \">\" VERSION SP TIMESTAMP
                              SP HOSTNAME SP APP-NAME SP PROCID SP MSGID
                              SP [SD-ID SP] DATA

                DATA        = ~/.*/

                PRIVAL      = 1*3DIGIT ; range 0 .. 191"
	       '(#\Newline #\Newline)
	       *abnf-rfc-syslog-draft-15*)
  "See http://www.rsyslog.com/doc/syslog_protocol.html")

(defvar *abnf-rfc5424-syslog-protocol*
  "   SYSLOG-MSG      = HEADER SP STRUCTURED-DATA [SP MSG]

      HEADER          = PRI VERSION SP TIMESTAMP SP HOSTNAME
                        SP APP-NAME SP PROCID SP MSGID
      PRI             = \"<\" PRIVAL \">\"
      PRIVAL          = 1*3DIGIT ; range 0 .. 191
      VERSION         = NONZERO-DIGIT 0*2DIGIT
      HOSTNAME        = NILVALUE / 1*255PRINTUSASCII

      APP-NAME        = NILVALUE / 1*48PRINTUSASCII
      PROCID          = NILVALUE / 1*128PRINTUSASCII
      MSGID           = NILVALUE / 1*32PRINTUSASCII

      TIMESTAMP       = NILVALUE / FULL-DATE \"T\" FULL-TIME
      FULL-DATE       = DATE-FULLYEAR \"-\" DATE-MONTH \"-\" DATE-MDAY
      DATE-FULLYEAR   = 4DIGIT
      DATE-MONTH      = 2DIGIT  ; 01-12
      DATE-MDAY       = 2DIGIT  ; 01-28, 01-29, 01-30, 01-31 based on
                                ; month/year
      FULL-TIME       = PARTIAL-TIME TIME-OFFSET
      PARTIAL-TIME    = TIME-HOUR \":\" TIME-MINUTE \":\" TIME-SECOND
                        [TIME-SECFRAC]
      TIME-HOUR       = 2DIGIT  ; 00-23
      TIME-MINUTE     = 2DIGIT  ; 00-59
      TIME-SECOND     = 2DIGIT  ; 00-59
      TIME-SECFRAC    = \".\" 1*6DIGIT
      TIME-OFFSET     = \"Z\" / TIME-NUMOFFSET
      TIME-NUMOFFSET  = (\"+\" / \"-\") TIME-HOUR \":\" TIME-MINUTE


      STRUCTURED-DATA = NILVALUE / 1*SD-ELEMENT
      SD-ELEMENT      = \"[\" SD-ID *(SP SD-PARAM) \"]\"
      SD-PARAM        = PARAM-NAME \"=\" %d34 PARAM-VALUE %d34
      SD-ID           = SD-NAME
      PARAM-NAME      = SD-NAME
      PARAM-VALUE     = UTF-8-STRING ; characters '\"', '\' and
                                     ; ']' MUST be escaped.
      SD-NAME         = 1*32PRINTUSASCII
                        ; except '=', SP, ']', %d34 (\")

      MSG             = MSG-ANY / MSG-UTF8
      MSG-ANY         = *OCTET ; not starting with BOM
      MSG-UTF8        = BOM UTF-8-STRING
      BOM             = %xEF.BB.BF

      UTF-8-STRING    = *OCTET ; UTF-8 string as specified
                        ; in RFC 3629

      OCTET           = %d00-255
      SP              = %d32
      PRINTUSASCII    = %d33-126
      NONZERO-DIGIT   = %d49-57
      DIGIT           = %d48 / NONZERO-DIGIT
      NILVALUE        = \"-\""
  "See http://tools.ietf.org/html/rfc5424#section-6")

#|

This table comes from http://tools.ietf.org/html/rfc2234#page-11 and 12.

        ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
        BIT            =  "0" / "1"
        CHAR           =  %x01-7F
        CR             =  %x0D
        CRLF           =  CR LF
        CTL            =  %x00-1F / %x7F
        DIGIT          =  %x30-39
        DQUOTE         =  %x22
        HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
        HTAB           =  %x09
        LF             =  %x0A
        LWSP           =  *(WSP / CRLF WSP)
        OCTET          =  %x00-FF
        SP             =  %x20
        VCHAR          =  %x21-7E
        WSP            =  SP / HTAB
|#

(defvar *abnf-default-rules*
  `((:abnf-alpha  (:char-class (:range #\A #\Z) (:range #\a #\z)))
    (:abnf-bit    (:char-class #\0 #\1))
    (:abnf-char   (:char-class (:range ,(code-char #x1) ,(code-char #x7f))))
    (:abnf-cr     #\Newline)
    (:abnf-crlf   (:sequence #\Newline #\Return))
    (:abnf-ctl    (:char-class (:range ,(code-char #x0) ,(code-char #x1f))
			  ,(code-char #x7f)))
    (:abnf-digit  (:char-class (:range #\0 #\9)))
    (:abnf-dquote #\")
    (:abnf-hexdig (:char-class (:range #\0 #\9) (:range #\A #\F)))
    (:abnf-htab   #\Tab)
    (:abnf-lf     #\Newline)
    (:abnf-lwsp   (:regex "\s+"))
    (:abnf-octet  (:char-class (:range ,(code-char #x0) ,(code-char #xff))))
    (:abnf-sp     #\Space)
    (:abnf-vchar  (:char-class (:range ,(code-char #x21) ,(code-char #x7e))))
    (:abnf-wsp    (:char-class #\Space #\Tab)))
  "An alist of the usual rules needed for ABNF grammars")

(defun rule-name-character-p (character)
  (or (alphanumericp character)
      (char= character #\-)))

(defun vcharp (character)
  (<= #x21 (char-code character) #x7E))

(defrule vchar (+ (vcharp character))                   (:text t))
(defrule wsp (or #\Space #\Tab)                         (:constant :wsp))

(defrule comment (and ";" (* (or wsp vchar)) #\Newline) (:constant :comment))
(defrule c-nl (or comment #\Newline)                    (:constant :c-nl))
(defrule c-wsp (or wsp c-nl)                            (:constant :c-wsp))
(defrule n-wsp (* c-wsp)                                (:constant :c-wsp))

(defun rule-name-symbol (rule-name &key find-symbol)
  "Turn the string we read in the ABNF into internal symbol."
  (let ((symbol-fun  (if find-symbol #'find-symbol #'intern))
	(symbol-name (string-upcase (format nil "abnf-~a" rule-name))))
    (funcall symbol-fun symbol-name :keyword)))

(defrule rule-name (and (alpha-char-p character)
			(+ (rule-name-character-p character)))
  (:lambda (name)
    (rule-name-symbol (text name))))

(defrule equal (and n-wsp #\= n-wsp)       (:constant :equal))
(defrule end-of-rule n-wsp                 (:constant :eor))

(defrule digit (digit-char-p character)
  (:lambda (digit)
    (parse-integer (text digit))))

(defrule digits (+ (digit-char-p character))
  (:lambda (digits)
    (code-char (parse-integer (text digits)))))

(defun char-val-char-p (character)
  (let ((code (char-code character)))
    (or (<= #x20 code #x21)
	(<= #x23 code #x7E))))

(defrule char-val (and #\" (* (char-val-char-p character)) #\")
  (:lambda (char)
    (destructuring-bind (open val close) char
      (declare (ignore open close))
      (text val))))

(defrule dec-string (and digits (+ (and "." digits)))
  (:lambda (string)
    (destructuring-bind (first rest) string
      `(:sequence ,first ,@(mapcar #'cadr rest)))))

(defrule dec-range (and digits "-" digits)
  (:lambda (range)
    (destructuring-bind (min sep max) range
      (declare (ignore sep))
      `(:char-class (:range ,min ,max)))))

(defrule dec-val (and "d" (or dec-string dec-range digits))
  (:lambda (dv)
    (destructuring-bind (d val) dv
      (declare (ignore d))
      val)))

(defun hexadecimal-char-p (character)
  (member character #. (quote (coerce "0123456789abcdefABCDEF" 'list))))

(defrule hexdigits (+ (hexadecimal-char-p character))
  (:lambda (hx)
    (code-char (parse-integer (text hx) :radix 16))))

(defrule hex-string (and hexdigits (+ (and "." hexdigits)))
  (:lambda (string)
    (destructuring-bind (first rest) string
      `(:sequence ,first ,@(mapcar #'cadr rest)))))

(defrule hex-range (and hexdigits range-sep hexdigits)
  (:lambda (range)
    (destructuring-bind (min sep max) range
      (declare (ignore sep))
      `(:char-class (:range ,min ,max)))))

(defrule hex-val (and "x" (or hex-string hex-range hexdigits))
  (:lambda (dv)
    (destructuring-bind (d val) dv
      (declare (ignore d))
      val)))

(defrule num-val (and "%" (or dec-val hex-val))
  (:lambda (nv)
    (destructuring-bind (percent val) nv
      (declare (ignore percent))
      val)))

;;; allow to parse rule definitions without a separating blank line
(defrule rule-name-reference (and rule-name (! equal))
  (:lambda (ref)
    (destructuring-bind (rule-name nil) ref
      rule-name)))

;;; what about allowing regular expressions directly?
(defun process-quoted-regex (pr)
  "Helper function to process different kinds of quotes for regexps"
  (destructuring-bind (open regex close) pr
      (declare (ignore open close))
      `(:regex ,(text regex))))

(defrule single-quoted-regex (and #\' (+ (not #\')) #\')
  (:function process-quoted-regex))

(defrule double-quoted-regex (and #\" (+ (not #\")) #\")
  (:function process-quoted-regex))

(defrule parens-quoted-regex (and #\( (+ (not #\))) #\))
  (:function process-quoted-regex))

(defrule braces-quoted-regex (and #\{ (+ (not #\})) #\})
  (:function process-quoted-regex))

(defrule chevron-quoted-regex (and #\< (+ (not #\>)) #\>)
  (:function process-quoted-regex))

(defrule brackets-quoted-regex (and #\[ (+ (not #\])) #\])
  (:function process-quoted-regex))

(defrule slash-quoted-regex (and #\/ (+ (not #\/)) #\/)
  (:function process-quoted-regex))

(defrule pipe-quoted-regex (and #\| (+ (not #\|)) #\|)
  (:function process-quoted-regex))

(defrule sharp-quoted-regex (and #\# (+ (not #\#)) #\#)
  (:function process-quoted-regex))

(defrule quoted-regex (and "~" (or single-quoted-regex
				   double-quoted-regex
				   parens-quoted-regex
				   braces-quoted-regex
				   chevron-quoted-regex
				   brackets-quoted-regex
				   slash-quoted-regex
				   pipe-quoted-regex
				   sharp-quoted-regex))
  (:lambda (qr)
    (destructuring-bind (tilde regex) qr
      (declare (ignore tilde))
      regex)))

(defrule element (or rule-name-reference char-val num-val quoted-regex))

(defrule number (+ (digit-char-p character))
  (:lambda (number)
    (parse-integer (text number))))

(defrule repeat-var (and (? number) "*" (? number))
  (:lambda (rv)
    (destructuring-bind (min star max) rv
      (declare (ignore star))
      (cons (or min 0) max))))

(defrule repeat-specific number
  (:lambda (number)
    (cons number number)))

(defrule repeat (or repeat-var repeat-specific))

(defrule repetition (and (? repeat) toplevel-element)
  (:lambda (repetition)
    (destructuring-bind (repeat element) repetition
	(if repeat
	    (destructuring-bind (min . max) repeat
	      `(:non-greedy-repetition ,min ,max ,element))
	    ;; no repeat clause
	    element))))

(defrule concatenation-element (and n-wsp repetition)
  (:lambda (ce)
    (destructuring-bind (n-wsp rep) ce
      (declare (ignore n-wsp))
      rep)))

(defrule concatenation (and repetition (* concatenation-element))
  (:lambda (concat)
    (destructuring-bind (rep1 rlist) concat
      (if rlist
	  `(:sequence ,@(list* rep1 rlist))
	  ;; concatenation of a single element
	  rep1))))

(defrule alternation-element (and n-wsp "/" n-wsp concatenation)
  (:lambda (ae)
    (destructuring-bind (ws1 sl ws2 concatenation) ae
      (declare (ignore ws1 sl ws2))
      concatenation)))

(defrule alternation (and concatenation (* alternation-element))
  (:lambda (alternation)
    (destructuring-bind (c1 clist) alternation
      (if clist
	  `(:alternation ,@(list* c1 clist))
	  c1))))

(defrule group (and "(" n-wsp alternation n-wsp ")")
  (:lambda (group)
    (destructuring-bind (open ws1 a ws2 close) group
      (declare (ignore open close ws1 ws2))
      ;; we need the grouping when parsing the ABNF syntax, but once this
      ;; parsing is done there's no ambiguity possible left and we don't
      ;; need the grouping anymore in the resulting regular-expression parse
      ;; tree.
      a)))

(defrule option (and "[" n-wsp alternation n-wsp "]")
  (:lambda (option)
    (destructuring-bind (open ws1 a ws2 close) option
      (declare (ignore open close ws1 ws2))
      `(:non-greedy-repetition 0 1 ,a))))

(defrule toplevel-element (or group option element))

(defrule alternations (and n-wsp alternation)
  (:lambda (alts)
    (destructuring-bind (n-wsp alt) alts
      (declare (ignore n-wsp))
      alt)))

(defrule elements (and (+ alternations) end-of-rule)
  (:lambda (alist)
    (destructuring-bind (alts eor) alist
      (declare (ignore eor))
      alts)))

(defrule rule (and n-wsp rule-name equal elements)
  (:lambda (rule)
    (destructuring-bind (n-wsp rule-name eq definition) rule
      (declare (ignore n-wsp eq))
      (cons rule-name definition))))

(defrule rule-list (+ rule))

;;;
;;; Now that we are able to transform ABNF rule set into an alist of
;;; cl-ppcre parse trees and references to other rules in the set, we need
;;; to expand each symbol's definition to get a real cl-ppcre scanner parse
;;; tree.
;;;
(defun expand-rule-definition (definition
			       rule-set
			       registering-rules
			       already-expanded-rules)
  "Expand given rule DEFINITION within given RULE-SET"
  (typecase definition
    (list
     ;; walk the definition and expand its elements
     (loop
	for element in definition
	collect (expand-rule-definition element
					rule-set
					registering-rules
					already-expanded-rules)))

    (symbol
     (if (member definition '(:sequence
			      :alternation
			      :regex
			      :char-class
			      :range
			      :non-greedy-repetition))
	 ;; that's a cl-ppcre scanner parse-tree symbol
	 ;; only put in that list those cl-ppcre symbols we actually produce
	 definition

	 ;; here we have to actually expand the symbol
	 (progn
	   ;; first protect against infinite recursion
	   (when (member definition already-expanded-rules)
	     (error "Can not expand recursive rule: ~S." definition))

	   (destructuring-bind (rule-name rule-definition)
	       (or (assoc definition rule-set)
		   (assoc definition *abnf-default-rules*))
	     (let* ((already-expanded-rules
		     (cons definition already-expanded-rules))

		    (expanded-definition
		     (expand-rule-definition rule-definition
					     rule-set
					     registering-rules
					     already-expanded-rules)))
	       (if (member rule-name registering-rules)
		   `(:register ,expanded-definition)
		   expanded-definition))))))

    ;; all other types of data are "constants" in our parse-tree
    (t definition)))

(defun expand-rule (rule-name rule-set &optional registering-rules)
  "Given a rule, expand it completely removing references to other parsed
   rules"
  (let ((rule (rule-name-symbol rule-name :find-symbol t)))
    (destructuring-bind (rule-name definition)
	   (assoc rule rule-set)
      `(:sequence
	:start-anchor
	,(expand-rule-definition definition
				rule-set
				(loop
				   for rr in registering-rules
				   collect (rule-name-symbol rr :find-symbol t))
				(list rule-name))))))

(defun parse-abnf-grammar (abnf-string top-level-rule
			   &key registering-rules junk-allowed)
  "Parse STRING as an ABNF grammar as defined in RFC 2234. Returns a cl-ppcre
   scanner that will only match strings conforming to given grammar.

   See http://tools.ietf.org/html/rfc2234 for details about the ABNF specs.
   Added to that grammar is support for regular expression, that are
   expected in the ELEMENT production and spelled ~/regex/. The allowed
   delimiters are: ~// ~[] ~{} ~() ~<> ~\"\" ~'' ~|| and ~##."
  (let ((rule-set
	 (parse 'rule-list abnf-string :junk-allowed junk-allowed)))
    (cl-ppcre:create-scanner
     (expand-rule top-level-rule
		  ;; in case of duplicates only keep the latest addition
		  (remove-duplicates rule-set :key #'car)
		  registering-rules))))

(defun test (&key (times 10000))
  "This serves as a test and an example: if you're going to use the same
   scanner more than one, be sure to compute it only once."
  (let* ((cl-ppcre:*use-bmh-matchers* t)
	 (cl-ppcre:*optimize-char-classes* t)
	 (scanner
	  (parse-abnf-grammar *abnf-rfc-syslog-draft-15*
			      :timestamp
			      :registering-rules '(:full-date
						   :partial-time
						   :time-offset))))
   (loop
      repeat times
      do (cl-ppcre:register-groups-bind
	      (date time zone)
	    (scanner "2013-09-08T00:02:03.123456Z+02:00")
	  (list date time zone)))))