;;; ;;; Augmented BNF for Syntax Specifications: ABNF ;;; ;;; Parsing ABNF syntaxes so that we can offer users to edit them ;;; ;;; see http://tools.ietf.org/html/draft-ietf-syslog-protocol-15#page-10 ;;; and http://tools.ietf.org/html/rfc2234 ;;; (in-package #:abnf) (defvar *abnf-rfc2234-abnf-definition* " rulelist = 1*( rule / (*c-wsp c-nl) ) rule = rulename defined-as elements c-nl ; continues if next line starts ; with white space rulename = ALPHA *(ALPHA / DIGIT / \"-\") defined-as = *c-wsp (\"=\" / \"=/\") *c-wsp ; basic rules definition and ; incremental alternatives elements = alternation *c-wsp c-wsp = WSP / (c-nl WSP) c-nl = comment / CRLF ; comment or newline comment = \";\" *(WSP / VCHAR) CRLF alternation = concatenation *(*c-wsp \"/\" *c-wsp concatenation) concatenation = repetition *(1*c-wsp repetition) repetition = [repeat] element repeat = 1*DIGIT / (*DIGIT \"*\" *DIGIT) element = rulename / group / option / char-val / num-val / prose-val group = \"(\" *c-wsp alternation *c-wsp \")\" option = \"[\" *c-wsp alternation *c-wsp \"]\" char-val = DQUOTE *(%x20-21 / %x23-7E) DQUOTE ; quoted string of SP and VCHAR without DQUOTE num-val = \"%\" (bin-val / dec-val / hex-val) bin-val = \"b\" 1*BIT [ 1*(\".\" 1*BIT) / (\"-\" 1*BIT) ] ; series of concatenated bit values ; or single ONEOF range dec-val = \"d\" 1*DIGIT [ 1*(\".\" 1*DIGIT) / (\"-\" 1*DIGIT) ] hex-val = \"x\" 1*HEXDIG [ 1*(\".\" 1*HEXDIG) / (\"-\" 1*HEXDIG) ] prose-val = \"<\" *(%x20-3D / %x3F-7E) \">\" ; bracketed string of SP and VCHAR without angles ; prose description, to be used as last resort " "See http://tools.ietf.org/html/rfc2234#section-4") (defvar *abnf-rfc-syslog-draft-15* "SYSLOG-MSG = HEADER SP STRUCTURED-DATA [SP MSG] HEADER = VERSION SP FACILITY SP SEVERITY SP TRUNCATE SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID VERSION = NONZERO-DIGIT 0*2DIGIT FACILITY = \"0\" / (NONZERO-DIGIT 0*9DIGIT) ; range 0..2147483647 ; SEVERITY = \"0\" / \"1\" / \"2\" / \"3\" / \"4\" / \"5\" / \"6\" / \"7\" TRUNCATE = 2DIGIT HOSTNAME = 1*255PRINTUSASCII APP-NAME = 1*48PRINTUSASCII PROCID = \"-\" / 1*128PRINTUSASCII MSGID = \"-\" / 1*32PRINTUSASCII TIMESTAMP = FULL-DATE \"T\" FULL-TIME FULL-DATE = DATE-FULLYEAR \"-\" DATE-MONTH \"-\" DATE-MDAY DATE-FULLYEAR = 4DIGIT DATE-MONTH = 2DIGIT ; 01-12 DATE-MDAY = 2DIGIT ; 01-28, 01-29, 01-30, 01-31 based on ; month/year ; FULL-TIME = PARTIAL-TIME TIME-OFFSET PARTIAL-TIME = TIME-HOUR \":\" TIME-MINUTE \":\" TIME-SECOND [TIME-SECFRAC] TIME-HOUR = 2DIGIT ; 00-23 TIME-MINUTE = 2DIGIT ; 00-59 TIME-SECOND = 2DIGIT ; 00-58, 00-59, 00-60 based on leap ; second rules ; TIME-SECFRAC = \".\" 1*6DIGIT TIME-OFFSET = \"Z\" / TIME-NUMOFFSET TIME-NUMOFFSET = (\"+\" / \"-\") TIME-HOUR \":\" TIME-MINUTE STRUCTURED-DATA = 1*SD-ELEMENT / \"-\" SD-ELEMENT = \"[\" SD-ID *(SP SD-PARAM) \"]\" SD-PARAM = PARAM-NAME \"=\" %d34 PARAM-VALUE %d34 SD-ID = SD-NAME PARAM-NAME = SD-NAME PARAM-VALUE = UTF-8-STRING ; characters '\"', '\' and ; ']' MUST be escaped. ; SD-NAME = 1*32PRINTUSASCII ; except '=', SP, ']', %d34 (\") ; MSG = UTF-8-STRING UTF-8-STRING = *OCTET ; Any VALID UTF-8 String ; \"shortest form\" MUST be used ; OCTET = %d00-255 SP = %d32 PRINTUSASCII = %d33-126 NONZERO-DIGIT = \"1\" / \"2\" / \"3\" / \"4\" / \"5\" / \"6\" / \"7\" / \"8\" / \"9\" DIGIT = \"0\" / NONZERO-DIGIT" "See http://tools.ietf.org/html/draft-ietf-syslog-protocol-15#page-10") (defvar *abnf-rsyslog* (concatenate 'string "RSYSLOG-MSG = \"<\" PRIVAL \">\" VERSION SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID SP [SD-ID SP] DATA DATA = ~/.*/ PRIVAL = 1*3DIGIT ; range 0 .. 191" '(#\Newline #\Newline) *abnf-rfc-syslog-draft-15*) "See http://www.rsyslog.com/doc/syslog_protocol.html") (defvar *abnf-rfc5424-syslog-protocol* " SYSLOG-MSG = HEADER SP STRUCTURED-DATA [SP MSG] HEADER = PRI VERSION SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID PRI = \"<\" PRIVAL \">\" PRIVAL = 1*3DIGIT ; range 0 .. 191 VERSION = NONZERO-DIGIT 0*2DIGIT HOSTNAME = NILVALUE / 1*255PRINTUSASCII APP-NAME = NILVALUE / 1*48PRINTUSASCII PROCID = NILVALUE / 1*128PRINTUSASCII MSGID = NILVALUE / 1*32PRINTUSASCII TIMESTAMP = NILVALUE / FULL-DATE \"T\" FULL-TIME FULL-DATE = DATE-FULLYEAR \"-\" DATE-MONTH \"-\" DATE-MDAY DATE-FULLYEAR = 4DIGIT DATE-MONTH = 2DIGIT ; 01-12 DATE-MDAY = 2DIGIT ; 01-28, 01-29, 01-30, 01-31 based on ; month/year FULL-TIME = PARTIAL-TIME TIME-OFFSET PARTIAL-TIME = TIME-HOUR \":\" TIME-MINUTE \":\" TIME-SECOND [TIME-SECFRAC] TIME-HOUR = 2DIGIT ; 00-23 TIME-MINUTE = 2DIGIT ; 00-59 TIME-SECOND = 2DIGIT ; 00-59 TIME-SECFRAC = \".\" 1*6DIGIT TIME-OFFSET = \"Z\" / TIME-NUMOFFSET TIME-NUMOFFSET = (\"+\" / \"-\") TIME-HOUR \":\" TIME-MINUTE STRUCTURED-DATA = NILVALUE / 1*SD-ELEMENT SD-ELEMENT = \"[\" SD-ID *(SP SD-PARAM) \"]\" SD-PARAM = PARAM-NAME \"=\" %d34 PARAM-VALUE %d34 SD-ID = SD-NAME PARAM-NAME = SD-NAME PARAM-VALUE = UTF-8-STRING ; characters '\"', '\' and ; ']' MUST be escaped. SD-NAME = 1*32PRINTUSASCII ; except '=', SP, ']', %d34 (\") MSG = MSG-ANY / MSG-UTF8 MSG-ANY = *OCTET ; not starting with BOM MSG-UTF8 = BOM UTF-8-STRING BOM = %xEF.BB.BF UTF-8-STRING = *OCTET ; UTF-8 string as specified ; in RFC 3629 OCTET = %d00-255 SP = %d32 PRINTUSASCII = %d33-126 NONZERO-DIGIT = %d49-57 DIGIT = %d48 / NONZERO-DIGIT NILVALUE = \"-\"" "See http://tools.ietf.org/html/rfc5424#section-6") #| This table comes from http://tools.ietf.org/html/rfc2234#page-11 and 12. ALPHA = %x41-5A / %x61-7A ; A-Z / a-z BIT = "0" / "1" CHAR = %x01-7F CR = %x0D CRLF = CR LF CTL = %x00-1F / %x7F DIGIT = %x30-39 DQUOTE = %x22 HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" HTAB = %x09 LF = %x0A LWSP = *(WSP / CRLF WSP) OCTET = %x00-FF SP = %x20 VCHAR = %x21-7E WSP = SP / HTAB |# (defvar *abnf-default-rules* `((:abnf-alpha (:char-class (:range #\A #\Z) (:range #\a #\z))) (:abnf-bit (:char-class #\0 #\1)) (:abnf-char (:char-class (:range ,(code-char #x1) ,(code-char #x7f)))) (:abnf-cr #\Newline) (:abnf-crlf (:sequence #\Newline #\Return)) (:abnf-ctl (:char-class (:range ,(code-char #x0) ,(code-char #x1f)) ,(code-char #x7f))) (:abnf-digit (:char-class (:range #\0 #\9))) (:abnf-dquote #\") (:abnf-hexdig (:char-class (:range #\0 #\9) (:range #\A #\F))) (:abnf-htab #\Tab) (:abnf-lf #\Newline) (:abnf-lwsp (:regex "\s+")) (:abnf-octet (:char-class (:range ,(code-char #x0) ,(code-char #xff)))) (:abnf-sp #\Space) (:abnf-vchar (:char-class (:range ,(code-char #x21) ,(code-char #x7e)))) (:abnf-wsp (:char-class #\Space #\Tab))) "An alist of the usual rules needed for ABNF grammars") (defun rule-name-character-p (character) (or (alphanumericp character) (char= character #\-))) (defun vcharp (character) (<= #x21 (char-code character) #x7E)) (defrule vchar (+ (vcharp character)) (:text t)) (defrule wsp (or #\Space #\Tab) (:constant :wsp)) (defrule comment (and ";" (* (or wsp vchar)) #\Newline) (:constant :comment)) (defrule c-nl (or comment #\Newline) (:constant :c-nl)) (defrule c-wsp (or wsp c-nl) (:constant :c-wsp)) (defrule n-wsp (* c-wsp) (:constant :c-wsp)) (defun rule-name-symbol (rule-name &key find-symbol) "Turn the string we read in the ABNF into internal symbol." (let ((symbol-fun (if find-symbol #'find-symbol #'intern)) (symbol-name (string-upcase (format nil "abnf-~a" rule-name)))) (funcall symbol-fun symbol-name :keyword))) (defrule rule-name (and (alpha-char-p character) (+ (rule-name-character-p character))) (:lambda (name) (rule-name-symbol (text name)))) (defrule equal (and n-wsp #\= n-wsp) (:constant :equal)) (defrule end-of-rule n-wsp (:constant :eor)) (defrule digit (digit-char-p character) (:lambda (digit) (parse-integer (text digit)))) (defrule digits (+ (digit-char-p character)) (:lambda (digits) (code-char (parse-integer (text digits))))) (defun char-val-char-p (character) (let ((code (char-code character))) (or (<= #x20 code #x21) (<= #x23 code #x7E)))) (defrule char-val (and #\" (* (char-val-char-p character)) #\") (:lambda (char) (destructuring-bind (open val close) char (declare (ignore open close)) (text val)))) (defrule dec-string (and digits (+ (and "." digits))) (:lambda (string) (destructuring-bind (first rest) string `(:sequence ,first ,@(mapcar #'cadr rest))))) (defrule dec-range (and digits "-" digits) (:lambda (range) (destructuring-bind (min sep max) range (declare (ignore sep)) `(:char-class (:range ,min ,max))))) (defrule dec-val (and "d" (or dec-string dec-range digits)) (:lambda (dv) (destructuring-bind (d val) dv (declare (ignore d)) val))) (defun hexadecimal-char-p (character) (member character #. (quote (coerce "0123456789abcdefABCDEF" 'list)))) (defrule hexdigits (+ (hexadecimal-char-p character)) (:lambda (hx) (code-char (parse-integer (text hx) :radix 16)))) (defrule hex-string (and hexdigits (+ (and "." hexdigits))) (:lambda (string) (destructuring-bind (first rest) string `(:sequence ,first ,@(mapcar #'cadr rest))))) (defrule hex-range (and hexdigits range-sep hexdigits) (:lambda (range) (destructuring-bind (min sep max) range (declare (ignore sep)) `(:char-class (:range ,min ,max))))) (defrule hex-val (and "x" (or hex-string hex-range hexdigits)) (:lambda (dv) (destructuring-bind (d val) dv (declare (ignore d)) val))) (defrule num-val (and "%" (or dec-val hex-val)) (:lambda (nv) (destructuring-bind (percent val) nv (declare (ignore percent)) val))) ;;; allow to parse rule definitions without a separating blank line (defrule rule-name-reference (and rule-name (! equal)) (:lambda (ref) (destructuring-bind (rule-name nil) ref rule-name))) ;;; what about allowing regular expressions directly? (defun process-quoted-regex (pr) "Helper function to process different kinds of quotes for regexps" (destructuring-bind (open regex close) pr (declare (ignore open close)) `(:regex ,(text regex)))) (defrule single-quoted-regex (and #\' (+ (not #\')) #\') (:function process-quoted-regex)) (defrule double-quoted-regex (and #\" (+ (not #\")) #\") (:function process-quoted-regex)) (defrule parens-quoted-regex (and #\( (+ (not #\))) #\)) (:function process-quoted-regex)) (defrule braces-quoted-regex (and #\{ (+ (not #\})) #\}) (:function process-quoted-regex)) (defrule chevron-quoted-regex (and #\< (+ (not #\>)) #\>) (:function process-quoted-regex)) (defrule brackets-quoted-regex (and #\[ (+ (not #\])) #\]) (:function process-quoted-regex)) (defrule slash-quoted-regex (and #\/ (+ (not #\/)) #\/) (:function process-quoted-regex)) (defrule pipe-quoted-regex (and #\| (+ (not #\|)) #\|) (:function process-quoted-regex)) (defrule sharp-quoted-regex (and #\# (+ (not #\#)) #\#) (:function process-quoted-regex)) (defrule quoted-regex (and "~" (or single-quoted-regex double-quoted-regex parens-quoted-regex braces-quoted-regex chevron-quoted-regex brackets-quoted-regex slash-quoted-regex pipe-quoted-regex sharp-quoted-regex)) (:lambda (qr) (destructuring-bind (tilde regex) qr (declare (ignore tilde)) regex))) (defrule element (or rule-name-reference char-val num-val quoted-regex)) (defrule number (+ (digit-char-p character)) (:lambda (number) (parse-integer (text number)))) (defrule repeat-var (and (? number) "*" (? number)) (:lambda (rv) (destructuring-bind (min star max) rv (declare (ignore star)) (cons (or min 0) max)))) (defrule repeat-specific number (:lambda (number) (cons number number))) (defrule repeat (or repeat-var repeat-specific)) (defrule repetition (and (? repeat) toplevel-element) (:lambda (repetition) (destructuring-bind (repeat element) repetition (if repeat (destructuring-bind (min . max) repeat `(:non-greedy-repetition ,min ,max ,element)) ;; no repeat clause element)))) (defrule concatenation-element (and n-wsp repetition) (:lambda (ce) (destructuring-bind (n-wsp rep) ce (declare (ignore n-wsp)) rep))) (defrule concatenation (and repetition (* concatenation-element)) (:lambda (concat) (destructuring-bind (rep1 rlist) concat (if rlist `(:sequence ,@(list* rep1 rlist)) ;; concatenation of a single element rep1)))) (defrule alternation-element (and n-wsp "/" n-wsp concatenation) (:lambda (ae) (destructuring-bind (ws1 sl ws2 concatenation) ae (declare (ignore ws1 sl ws2)) concatenation))) (defrule alternation (and concatenation (* alternation-element)) (:lambda (alternation) (destructuring-bind (c1 clist) alternation (if clist `(:alternation ,@(list* c1 clist)) c1)))) (defrule group (and "(" n-wsp alternation n-wsp ")") (:lambda (group) (destructuring-bind (open ws1 a ws2 close) group (declare (ignore open close ws1 ws2)) ;; we need the grouping when parsing the ABNF syntax, but once this ;; parsing is done there's no ambiguity possible left and we don't ;; need the grouping anymore in the resulting regular-expression parse ;; tree. a))) (defrule option (and "[" n-wsp alternation n-wsp "]") (:lambda (option) (destructuring-bind (open ws1 a ws2 close) option (declare (ignore open close ws1 ws2)) `(:non-greedy-repetition 0 1 ,a)))) (defrule toplevel-element (or group option element)) (defrule alternations (and n-wsp alternation) (:lambda (alts) (destructuring-bind (n-wsp alt) alts (declare (ignore n-wsp)) alt))) (defrule elements (and (+ alternations) end-of-rule) (:lambda (alist) (destructuring-bind (alts eor) alist (declare (ignore eor)) alts))) (defrule rule (and n-wsp rule-name equal elements) (:lambda (rule) (destructuring-bind (n-wsp rule-name eq definition) rule (declare (ignore n-wsp eq)) (cons rule-name definition)))) (defrule rule-list (+ rule)) ;;; ;;; Now that we are able to transform ABNF rule set into an alist of ;;; cl-ppcre parse trees and references to other rules in the set, we need ;;; to expand each symbol's definition to get a real cl-ppcre scanner parse ;;; tree. ;;; (defun expand-rule-definition (definition rule-set registering-rules already-expanded-rules) "Expand given rule DEFINITION within given RULE-SET" (typecase definition (list ;; walk the definition and expand its elements (loop for element in definition collect (expand-rule-definition element rule-set registering-rules already-expanded-rules))) (symbol (if (member definition '(:sequence :alternation :regex :char-class :range :non-greedy-repetition)) ;; that's a cl-ppcre scanner parse-tree symbol ;; only put in that list those cl-ppcre symbols we actually produce definition ;; here we have to actually expand the symbol (progn ;; first protect against infinite recursion (when (member definition already-expanded-rules) (error "Can not expand recursive rule: ~S." definition)) (destructuring-bind (rule-name rule-definition) (or (assoc definition rule-set) (assoc definition *abnf-default-rules*)) (let* ((already-expanded-rules (cons definition already-expanded-rules)) (expanded-definition (expand-rule-definition rule-definition rule-set registering-rules already-expanded-rules))) (if (member rule-name registering-rules) `(:register ,expanded-definition) expanded-definition)))))) ;; all other types of data are "constants" in our parse-tree (t definition))) (defun expand-rule (rule-name rule-set &optional registering-rules) "Given a rule, expand it completely removing references to other parsed rules" (let ((rule (rule-name-symbol rule-name :find-symbol t))) (destructuring-bind (rule-name definition) (assoc rule rule-set) `(:sequence :start-anchor ,(expand-rule-definition definition rule-set (loop for rr in registering-rules collect (rule-name-symbol rr :find-symbol t)) (list rule-name)))))) (defun parse-abnf-grammar (abnf-string top-level-rule &key registering-rules junk-allowed) "Parse STRING as an ABNF grammar as defined in RFC 2234. Returns a cl-ppcre scanner that will only match strings conforming to given grammar. See http://tools.ietf.org/html/rfc2234 for details about the ABNF specs. Added to that grammar is support for regular expression, that are expected in the ELEMENT production and spelled ~/regex/. The allowed delimiters are: ~// ~[] ~{} ~() ~<> ~\"\" ~'' ~|| and ~##." (let ((rule-set (parse 'rule-list abnf-string :junk-allowed junk-allowed))) (cl-ppcre:create-scanner (expand-rule top-level-rule ;; in case of duplicates only keep the latest addition (remove-duplicates rule-set :key #'car) registering-rules)))) (defun test (&key (times 10000)) "This serves as a test and an example: if you're going to use the same scanner more than one, be sure to compute it only once." (let* ((cl-ppcre:*use-bmh-matchers* t) (cl-ppcre:*optimize-char-classes* t) (scanner (parse-abnf-grammar *abnf-rfc-syslog-draft-15* :timestamp :registering-rules '(:full-date :partial-time :time-offset)))) (loop repeat times do (cl-ppcre:register-groups-bind (date time zone) (scanner "2013-09-08T00:02:03.123456Z+02:00") (list date time zone)))))