;;; ;;; JavaScript 2.0 regular expression parser ;;; ;;; Waldemar Horwat (waldemar@acm.org) ;;; (progn (defparameter *rw* (generate-world "R" '((lexer regexp-lexer :lr-1 :regular-expression-pattern ((:unicode-character (% every (:text "Any Unicode character")) () t) (:unicode-alphanumeric (% alphanumeric (:text "Any Unicode alphabetic or decimal digit character (includes ASCII " (:character-literal #\0) :nbhy (:character-literal #\9) ", " (:character-literal #\A) :nbhy (:character-literal #\Z) ", and " (:character-literal #\a) :nbhy (:character-literal #\z) ")")) () t) (:line-terminator (#?000A #?000D #?0085 #?2028 #?2029) () t) (:decimal-digit (#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (($default-action $default-action) (decimal-value $digit-value))) (:non-zero-digit (#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) ((decimal-value $digit-value))) (:hex-digit (#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\A #\B #\C #\D #\E #\F #\a #\b #\c #\d #\e #\f) ((hex-value $digit-value))) (:control-letter (++ (#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z) (#\a #\b #\c #\d #\e #\f #\g #\h #\i #\j #\k #\l #\m #\n #\o #\p #\q #\r #\s #\t #\u #\v #\w #\x #\y #\z)) (($default-action $default-action))) (:pattern-character (- :unicode-character (#\^ #\$ #\\ #\. #\* #\+ #\? #\( #\) #\[ #\] #\{ #\} #\|)) (($default-action $default-action))) ((:class-character dash) (- :unicode-character (#\\ #\])) (($default-action $default-action))) ((:class-character no-dash) (- (:class-character dash) (#\-)) (($default-action $default-action))) (:identity-escape (- :unicode-character (+ (#\_) :unicode-alphanumeric)) (($default-action $default-action)))) (($default-action char16 nil identity) ($digit-value integer digit-value digit-char-36))) (deftag syntax-error) (deftype semantic-exception (tag syntax-error)) (%heading 1 "Unicode Character Classes") (%charclass :unicode-character) (%charclass :unicode-alphanumeric) (%charclass :line-terminator) (define line-terminators (range-set char16) (range-set-of char16 #?000A #?000D #?2028 #?2029)) (define re-whitespaces (range-set char16) (range-set-of char16 #?000C #?000A #?000D #?0009 #?000B #\space)) (define re-digits (range-set char16) (range-set-of-ranges char16 #\0 #\9)) (define re-word-characters (range-set char16) (range-set-of-ranges char16 #\0 #\9 #\A #\Z #\a #\z #\_ nil)) (%print-actions) (%heading 1 "Regular Expression Definitions") (deftuple r-e-input (str string) (ignore-case boolean) (multiline boolean) (span boolean)) (%text :semantics "Field " (:label r-e-input str) " is the input string. " (:label r-e-input ignore-case) ", " (:label r-e-input multiline) ", and " (:label r-e-input span) " are the corresponding regular expression flags.") (deftag undefined) (deftype capture (union string (tag undefined))) (deftuple r-e-match (end-index integer) (captures (vector capture))) (deftag failure) (deftype r-e-result (union r-e-match (tag failure))) (%text :semantics "A " (:type r-e-match) " holds an intermediate state during the pattern-matching process. " (:label r-e-match end-index) " is the index of the next input character to be matched by the next component in a regular expression pattern. " "If we are at the end of the pattern, " (:label r-e-match end-index) " is one plus the index of the last matched input character. " (:label r-e-match captures) " is a zero-based array of the strings captured so far by capturing parentheses.") (deftype continuation (-> (r-e-match) r-e-result)) (%text :semantics "A " (:type continuation) " is a function that attempts to match the remaining portion of the pattern against the input string, " "starting at the intermediate state given by its " (:type r-e-match) " argument. " "If a match is possible, it returns a " (:type r-e-match) " result that contains the final state; if no match is possible, it returns a " (:tag failure) " result.") (deftype matcher (-> (r-e-input r-e-match continuation) r-e-result)) (%text :semantics "A " (:type matcher) " is a function that attempts to match a middle portion of the pattern against the input string, " "starting at the intermediate state given by its " (:type r-e-match) " argument. " "Since the remainder of the pattern heavily influences whether (and how) a middle portion will match, we " "must pass in a " (:type continuation) " function that checks whether the rest of the pattern matched. " "If the continuation returns " (:tag failure) ", the matcher function may call it repeatedly, " "trying various alternatives at pattern choice points.") (%text :semantics "The " (:type r-e-input) " parameter contains the input string and is merely passed down to subroutines.") (%text :semantics "A " (:type (-> (integer) matcher)) " is a function executed at the time the regular expression is compiled that returns a " (:type matcher) " for a part " "of the pattern. The " (:type integer) " parameter contains the number of capturing left parentheses seen so far in the " "pattern and is used to assign static, consecutive numbers to capturing parentheses.") (define (character-set-matcher (acceptance-set (range-set char16)) (invert boolean)) matcher ;*********ignore case? (function (m (t r-e-input) (x r-e-match) (c continuation)) r-e-result (const i integer (& end-index x)) (const s string (& str t)) (cond ((= i (length s)) (return failure)) ((xor (set-in (nth s i) acceptance-set) invert) (return (c (new r-e-match (+ i 1) (& captures x))))) (nil (return failure)))) (return m)) (%text :semantics (:global character-set-matcher) " returns a " (:type matcher) " that matches a single input string character. If " (:local invert) " is " (:tag false) ", the match succeeds if the character is a member of the " (:local acceptance-set) " set of characters (possibly ignoring case). If " (:local invert) " is " (:tag true) ", the match succeeds if the character is not a member of the " (:local acceptance-set) " set of characters (possibly ignoring case).") (define (character-matcher (ch char16)) matcher (return (character-set-matcher (range-set-of char16 ch) false))) (%text :semantics (:global character-matcher) " returns a " (:type matcher) " that matches a single input string character. The match succeeds if the character is the same as " (:local ch) " (possibly ignoring case).") (%print-actions) (%heading 1 "Regular Expression Patterns") (rule :regular-expression-pattern ((execute (-> (r-e-input integer) r-e-result))) (production :regular-expression-pattern (:disjunction) regular-expression-pattern-disjunction (execute (begin (const m1 matcher ((gen-matcher :disjunction) 0)) (function (e (t r-e-input) (index integer)) r-e-result (const x r-e-match (new r-e-match index (repeat capture undefined (count-parens :disjunction)))) (return (m1 t x success-continuation))) (return e))))) (%print-actions) (define (success-continuation (x r-e-match)) r-e-result (return x)) (%heading 2 "Disjunctions") (rule :disjunction ((gen-matcher (-> (integer) matcher)) (count-parens integer)) (production :disjunction (:alternative) disjunction-one ((gen-matcher paren-index) (return ((gen-matcher :alternative) paren-index))) (count-parens (count-parens :alternative))) (production :disjunction (:alternative #\| :disjunction) disjunction-more ((gen-matcher paren-index) (const m1 matcher ((gen-matcher :alternative) paren-index)) (const m2 matcher ((gen-matcher :disjunction) (+ paren-index (count-parens :alternative)))) (function (m3 (t r-e-input) (x r-e-match) (c continuation)) r-e-result (const y r-e-result (m1 t x c)) (case y (:select r-e-match (return y)) (:select (tag failure) (return (m2 t x c))))) (return m3)) (count-parens (+ (count-parens :alternative) (count-parens :disjunction))))) (%print-actions) (%heading 2 "Alternatives") (rule :alternative ((gen-matcher (-> (integer) matcher)) (count-parens integer)) (production :alternative () alternative-none ((gen-matcher (paren-index :unused)) (function (m (t r-e-input :unused) (x r-e-match) (c continuation)) r-e-result (return (c x))) (return m)) (count-parens 0)) (production :alternative (:alternative :term) alternative-some ((gen-matcher paren-index) (const m1 matcher ((gen-matcher :alternative) paren-index)) (const m2 matcher ((gen-matcher :term) (+ paren-index (count-parens :alternative)))) (function (m3 (t r-e-input) (x r-e-match) (c continuation)) r-e-result (function (d (y r-e-match)) r-e-result (return (m2 t y c))) (return (m1 t x d))) (return m3)) (count-parens (+ (count-parens :alternative) (count-parens :term))))) (%print-actions) (%heading 2 "Terms") (rule :term ((gen-matcher (-> (integer) matcher)) (count-parens integer)) (production :term (:assertion) term-assertion ((gen-matcher (paren-index :unused)) (function (m (t r-e-input) (x r-e-match) (c continuation)) r-e-result (if ((test-assertion :assertion) t x) (return (c x)) (return failure))) (return m)) (count-parens 0)) (production :term (:atom) term-atom ((gen-matcher paren-index) (return ((gen-matcher :atom) paren-index))) (count-parens (count-parens :atom))) (production :term (:atom :quantifier) term-quantified-atom ((gen-matcher paren-index) (const m matcher ((gen-matcher :atom) paren-index)) (const min integer (minimum :quantifier)) (const max limit (maximum :quantifier)) (const greedy boolean (greedy :quantifier)) (when (not-in max (tag +infinity) :narrow-true) (rwhen (< max min) (throw syntax-error))) (return (repeat-matcher m min max greedy paren-index (count-parens :atom)))) (count-parens (count-parens :atom)))) (%print-actions) (rule :quantifier ((minimum integer) (maximum limit) (greedy boolean)) (production :quantifier (:quantifier-prefix) quantifier-eager (minimum (minimum :quantifier-prefix)) (maximum (maximum :quantifier-prefix)) (greedy true)) (production :quantifier (:quantifier-prefix #\?) quantifier-greedy (minimum (minimum :quantifier-prefix)) (maximum (maximum :quantifier-prefix)) (greedy false))) (rule :quantifier-prefix ((minimum integer) (maximum limit)) (production :quantifier-prefix (#\*) quantifier-prefix-zero-or-more (minimum 0) (maximum +infinity)) (production :quantifier-prefix (#\+) quantifier-prefix-one-or-more (minimum 1) (maximum +infinity)) (production :quantifier-prefix (#\?) quantifier-prefix-zero-or-one (minimum 0) (maximum 1)) (production :quantifier-prefix (#\{ :decimal-digits #\}) quantifier-prefix-repeat (minimum (integer-value :decimal-digits)) (maximum (integer-value :decimal-digits))) (production :quantifier-prefix (#\{ :decimal-digits #\, #\}) quantifier-prefix-repeat-or-more (minimum (integer-value :decimal-digits)) (maximum +infinity)) (production :quantifier-prefix (#\{ :decimal-digits #\, :decimal-digits #\}) quantifier-prefix-repeat-range (minimum (integer-value :decimal-digits 1)) (maximum (integer-value :decimal-digits 2)))) (rule :decimal-digits ((integer-value integer)) (production :decimal-digits (:decimal-digit) decimal-digits-first (integer-value (decimal-value :decimal-digit))) (production :decimal-digits (:decimal-digits :decimal-digit) decimal-digits-rest (integer-value (+ (* 10 (integer-value :decimal-digits)) (decimal-value :decimal-digit))))) (%charclass :decimal-digit) (deftag +infinity) (deftype limit (union integer (tag +infinity))) (define (reset-parens (x r-e-match) (p integer) (n-parens integer)) r-e-match (var captures (vector capture) (& captures x)) (var i integer p) (while (< i (+ p n-parens)) (<- captures (set-nth captures i undefined)) (<- i (+ i 1))) (return (new r-e-match (& end-index x) captures))) (define (repeat-matcher (body matcher) (min integer) (max limit) (greedy boolean) (paren-index integer) (n-body-parens integer)) matcher (function (m (t r-e-input) (x r-e-match) (c continuation)) r-e-result (rwhen (= max 0 limit) (return (c x))) (function (d (y r-e-match)) r-e-result (rwhen (and (= min 0) (= (& end-index y) (& end-index x))) (return failure)) (var new-min integer min) (when (/= min 0) (<- new-min (- min 1))) (var new-max limit max) (when (not-in max (tag +infinity) :narrow-true) (<- new-max (- max 1))) (const m2 matcher (repeat-matcher body new-min new-max greedy paren-index n-body-parens)) (return (m2 t y c))) (const xr r-e-match (reset-parens x paren-index n-body-parens)) (cond ((/= min 0) (return (body t xr d))) (greedy (const z r-e-result (body t xr d)) (case z (:select r-e-match (return z)) (:select (tag failure) (return (c x))))) (nil (const z r-e-result (c x)) (case z (:select r-e-match (return z)) (:select (tag failure) (return (body t xr d))))))) (return m)) (%print-actions) (%heading 2 "Assertions") (rule :assertion ((test-assertion (-> (r-e-input r-e-match) boolean))) (production :assertion (#\^) assertion-beginning ((test-assertion t x) (return (or (= (& end-index x) 0) (and (& multiline t) (set-in (nth (& str t) (- (& end-index x) 1)) line-terminators)))))) (production :assertion (#\$) assertion-end ((test-assertion t x) (return (or (= (& end-index x) (length (& str t))) (and (& multiline t) (set-in (nth (& str t) (& end-index x)) line-terminators)))))) (production :assertion (#\\ #\b) assertion-word-boundary ((test-assertion t x) (return (at-word-boundary (& end-index x) (& str t))))) (production :assertion (#\\ #\B) assertion-non-word-boundary ((test-assertion t x) (return (not (at-word-boundary (& end-index x) (& str t))))))) (%print-actions) (define (at-word-boundary (i integer) (s string)) boolean (return (xor (in-word (- i 1) s) (in-word i s)))) (define (in-word (i integer) (s string)) boolean (if (or (= i -1) (= i (length s))) (return false) (return (set-in (nth s i) re-word-characters)))) (%heading 1 "Atoms") (rule :atom ((gen-matcher (-> (integer) matcher)) (count-parens integer)) (production :atom (:pattern-character) atom-pattern-character ((gen-matcher (paren-index :unused)) (return (character-matcher ($default-action :pattern-character)))) (count-parens 0)) (production :atom (#\.) atom-dot ((gen-matcher (paren-index :unused)) (function (m1 (t r-e-input) (x r-e-match) (c continuation)) r-e-result (const a (range-set char16) (if (& span t) (range-set-of char16) line-terminators)) (const m2 matcher (character-set-matcher a true)) (return (m2 t x c))) (return m1)) (count-parens 0)) (production :atom (:null-escape) atom-null-escape ((gen-matcher (paren-index :unused)) (function (m (t r-e-input :unused) (x r-e-match) (c continuation)) r-e-result (return (c x))) (return m)) (count-parens 0)) (production :atom (#\\ :atom-escape) atom-atom-escape ((gen-matcher paren-index) (return ((gen-matcher :atom-escape) paren-index))) (count-parens 0)) (production :atom (:character-class) atom-character-class ((gen-matcher (paren-index :unused)) (const a (range-set char16) (acceptance-set :character-class)) (return (character-set-matcher a (invert :character-class)))) (count-parens 0)) (production :atom (#\( :disjunction #\)) atom-parentheses ((gen-matcher paren-index) (const m1 matcher ((gen-matcher :disjunction) (+ paren-index 1))) (function (m2 (t r-e-input) (x r-e-match) (c continuation)) r-e-result (function (d (y r-e-match)) r-e-result (const ref capture (subseq (& str t) (& end-index x) (- (& end-index y) 1))) (const updated-captures (vector capture) (set-nth (& captures y) paren-index ref)) (return (c (new r-e-match (& end-index y) updated-captures)))) (return (m1 t x d))) (return m2)) (count-parens (+ (count-parens :disjunction) 1))) (production :atom (#\( #\? #\: :disjunction #\)) atom-non-capturing-parentheses ((gen-matcher paren-index) (return ((gen-matcher :disjunction) paren-index))) (count-parens (count-parens :disjunction))) (production :atom (#\( #\? #\= :disjunction #\)) atom-positive-lookahead ((gen-matcher paren-index) (const m1 matcher ((gen-matcher :disjunction) paren-index)) (function (m2 (t r-e-input) (x r-e-match) (c continuation)) r-e-result ;(function (d (y r-e-match)) r-e-result ; (return (c (new r-e-match (& end-index x) (& captures y))))) ;(return (m1 t x d))))) (const y r-e-result (m1 t x success-continuation)) (case y (:narrow r-e-match (return (c (new r-e-match (& end-index x) (& captures y))))) (:select (tag failure) (return failure)))) (return m2)) (count-parens (count-parens :disjunction))) (production :atom (#\( #\? #\! :disjunction #\)) atom-negative-lookahead ((gen-matcher paren-index) (const m1 matcher ((gen-matcher :disjunction) paren-index)) (function (m2 (t r-e-input) (x r-e-match) (c continuation)) r-e-result (case (m1 t x success-continuation) (:select r-e-match (return failure)) (:select (tag failure) (return (c x))))) (return m2)) (count-parens (count-parens :disjunction)))) (%charclass :pattern-character) (%print-actions) (%heading 1 "Escapes") (production :null-escape (#\\ #\_) null-escape-underscore) (rule :atom-escape ((gen-matcher (-> (integer) matcher))) (production :atom-escape (:decimal-escape) atom-escape-decimal ((gen-matcher paren-index) (const n integer (escape-value :decimal-escape)) (cond ((= n 0) (return (character-matcher #?0000))) ((> n paren-index) (throw syntax-error)) (nil (return (backreference-matcher n)))))) (production :atom-escape (:character-escape) atom-escape-character ((gen-matcher (paren-index :unused)) (return (character-matcher (character-value :character-escape))))) (production :atom-escape (:character-class-escape) atom-escape-character-class ((gen-matcher (paren-index :unused)) (return (character-set-matcher (acceptance-set :character-class-escape) false))))) (%print-actions) (define (backreference-matcher (n integer)) matcher (function (m (t r-e-input) (x r-e-match) (c continuation)) r-e-result (const ref capture (nth-backreference x n)) (case ref (:narrow string (const i integer (& end-index x)) (const s string (& str t)) (const j integer (+ i (length ref))) (if (and (<= j (length s)) (= (subseq s i (- j 1)) ref string)) ;*********ignore case? (return (c (new r-e-match j (& captures x)))) (return failure))) (:select (tag undefined) (return (c x))))) (return m)) (define (nth-backreference (x r-e-match) (n integer)) capture (return (nth (& captures x) (- n 1)))) (rule :character-escape ((character-value char16)) (production :character-escape (:control-escape) character-escape-control (character-value (character-value :control-escape))) (production :character-escape (#\c :control-letter) character-escape-control-letter (character-value (integer-to-char16 (bitwise-and (char16-to-integer ($default-action :control-letter)) 31)))) (production :character-escape (:hex-escape) character-escape-hex (character-value (character-value :hex-escape))) (production :character-escape (:identity-escape) character-escape-identity (character-value ($default-action :identity-escape)))) (%charclass :control-letter) (%charclass :identity-escape) (rule :control-escape ((character-value char16)) (production :control-escape (#\f) control-escape-form-feed (character-value #?000C)) (production :control-escape (#\n) control-escape-new-line (character-value #?000A)) (production :control-escape (#\r) control-escape-return (character-value #?000D)) (production :control-escape (#\t) control-escape-tab (character-value #?0009)) (production :control-escape (#\v) control-escape-vertical-tab (character-value #?000B))) (%print-actions) (%heading 2 "Decimal Escapes") (rule :decimal-escape ((escape-value integer)) (production :decimal-escape (:decimal-integer-literal (:- :decimal-digit)) decimal-escape-integer (escape-value (integer-value :decimal-integer-literal)))) (rule :decimal-integer-literal ((integer-value integer)) (production :decimal-integer-literal (#\0) decimal-integer-literal-0 (integer-value 0)) (production :decimal-integer-literal (:non-zero-decimal-digits) decimal-integer-literal-nonzero (integer-value (integer-value :non-zero-decimal-digits)))) (rule :non-zero-decimal-digits ((integer-value integer)) (production :non-zero-decimal-digits (:non-zero-digit) non-zero-decimal-digits-first (integer-value (decimal-value :non-zero-digit))) (production :non-zero-decimal-digits (:non-zero-decimal-digits :decimal-digit) non-zero-decimal-digits-rest (integer-value (+ (* 10 (integer-value :non-zero-decimal-digits)) (decimal-value :decimal-digit))))) (%charclass :non-zero-digit) (%print-actions) (%heading 2 "Hexadecimal Escapes") (rule :hex-escape ((character-value char16)) (production :hex-escape (#\x :hex-digit :hex-digit) hex-escape-2 (character-value (integer-to-char16 (+ (* 16 (hex-value :hex-digit 1)) (hex-value :hex-digit 2))))) (production :hex-escape (#\u :hex-digit :hex-digit :hex-digit :hex-digit) hex-escape-4 (character-value (integer-to-char16 (+ (+ (+ (* 4096 (hex-value :hex-digit 1)) (* 256 (hex-value :hex-digit 2))) (* 16 (hex-value :hex-digit 3))) (hex-value :hex-digit 4)))))) (%charclass :hex-digit) (%print-actions) (%heading 2 "Character Class Escapes") (rule :character-class-escape ((acceptance-set (range-set char16))) (production :character-class-escape (#\s) character-class-escape-whitespace (acceptance-set re-whitespaces)) (production :character-class-escape (#\S) character-class-escape-non-whitespace (acceptance-set (set- (range-set-of-ranges char16 #?0000 #?FFFF) re-whitespaces))) (production :character-class-escape (#\d) character-class-escape-digit (acceptance-set re-digits)) (production :character-class-escape (#\D) character-class-escape-non-digit (acceptance-set (set- (range-set-of-ranges char16 #?0000 #?FFFF) re-digits))) (production :character-class-escape (#\w) character-class-escape-word (acceptance-set re-word-characters)) (production :character-class-escape (#\W) character-class-escape-non-word (acceptance-set (set- (range-set-of-ranges char16 #?0000 #?FFFF) re-word-characters)))) (%print-actions) (%heading 1 "User-Specified Character Classes") (rule :character-class ((acceptance-set (range-set char16)) (invert boolean)) (production :character-class (#\[ (:- #\^) :class-ranges #\]) character-class-positive (acceptance-set (acceptance-set :class-ranges)) (invert false)) (production :character-class (#\[ #\^ :class-ranges #\]) character-class-negative (acceptance-set (acceptance-set :class-ranges)) (invert true))) (rule :class-ranges ((acceptance-set (range-set char16))) (production :class-ranges () class-ranges-none (acceptance-set (range-set-of char16))) (production :class-ranges ((:nonempty-class-ranges dash)) class-ranges-some (acceptance-set (acceptance-set :nonempty-class-ranges)))) (grammar-argument :delta dash no-dash) (rule (:nonempty-class-ranges :delta) ((acceptance-set (range-set char16))) (production (:nonempty-class-ranges :delta) ((:class-atom dash)) nonempty-class-ranges-final (acceptance-set (acceptance-set :class-atom))) (production (:nonempty-class-ranges :delta) ((:class-atom :delta) (:nonempty-class-ranges no-dash)) nonempty-class-ranges-non-final (acceptance-set (set+ (acceptance-set :class-atom) (acceptance-set :nonempty-class-ranges)))) (production (:nonempty-class-ranges :delta) ((:class-atom :delta) #\- (:class-atom dash) :class-ranges) nonempty-class-ranges-range (acceptance-set (set+ (character-range (acceptance-set :class-atom 1) (acceptance-set :class-atom 2)) (acceptance-set :class-ranges)))) (production (:nonempty-class-ranges :delta) (:null-escape :class-ranges) nonempty-class-ranges-null-escape (acceptance-set (acceptance-set :class-ranges)))) (%print-actions) (define (character-range (low (range-set char16)) (high (range-set char16))) (range-set char16) (rwhen (or (/= (length low) 1) (/= (length high) 1)) (throw syntax-error)) (const l char16 (unique-elt-of low)) (const h char16 (unique-elt-of high)) (if (<= l h char16) (return (range-set-of-ranges char16 l h)) (throw syntax-error))) (%heading 2 "Character Class Range Atoms") (rule (:class-atom :delta) ((acceptance-set (range-set char16))) (production (:class-atom :delta) ((:class-character :delta)) class-atom-character (acceptance-set (range-set-of char16 ($default-action :class-character)))) (production (:class-atom :delta) (#\\ :class-escape) class-atom-escape (acceptance-set (acceptance-set :class-escape)))) (%charclass (:class-character dash)) (%charclass (:class-character no-dash)) (rule :class-escape ((acceptance-set (range-set char16))) (production :class-escape (:decimal-escape) class-escape-decimal (acceptance-set (begin (if (= (escape-value :decimal-escape) 0) (return (range-set-of char16 #?0000)) (throw syntax-error))))) (production :class-escape (#\b) class-escape-backspace (acceptance-set (range-set-of char16 #?0008))) (production :class-escape (:character-escape) class-escape-character-escape (acceptance-set (range-set-of char16 (character-value :character-escape)))) (production :class-escape (:character-class-escape) class-escape-character-class-escape (acceptance-set (acceptance-set :character-class-escape)))) (%print-actions) ))) (defparameter *rl* (world-lexer *rw* 'regexp-lexer)) (defparameter *rg* (lexer-grammar *rl*))) (eval-when (:load-toplevel :execute) (defun run-regexp (regexp input &key ignore-case multiline span) (let ((execute (first (lexer-parse *rl* regexp)))) (dotimes (i (length input) :failure) (let ((result (funcall execute (list 'r:r-e-input input ignore-case multiline span) i))) (unless (eq result :failure) (assert-true (eq (first result) 'r:r-e-match)) (return (list* i (subseq input i (second result)) (cddr result))))))))) (defun dump-regexp () (values (depict-rtf-to-local-file "JS20/RegExpGrammar.rtf" "Regular Expression Grammar" #'(lambda (rtf-stream) (depict-world-commands rtf-stream *rw* :heading-offset 1 :visible-semantics nil))) (depict-rtf-to-local-file "JS20/RegExpSemantics.rtf" "Regular Expression Semantics" #'(lambda (rtf-stream) (depict-world-commands rtf-stream *rw* :heading-offset 1))) (depict-html-to-local-file "JS20/RegExpGrammar.html" "Regular Expression Grammar" t #'(lambda (html-stream) (depict-world-commands html-stream *rw* :heading-offset 1 :visible-semantics nil)) :external-link-base "notation.html") (depict-html-to-local-file "JS20/RegExpSemantics.html" "Regular Expression Semantics" t #'(lambda (html-stream) (depict-world-commands html-stream *rw* :heading-offset 1)) :external-link-base "notation.html"))) #| (dump-regexp) (with-local-output (s "JS20/RegExpGrammar.txt") (print-lexer *rl* s) (print-grammar *rg* s)) (lexer-pparse *rl* "a+" :trace t) (lexer-pparse *rl* "[]+" :trace t) (run-regexp "(0x|0)2" "0x20") (run-regexp "(a*)b\\1+c" "aabaaaac") (run-regexp "(a*)b\\1+c" "aabaabaaaac") (run-regexp "(a*)b\\1+" "baaaac") (run-regexp "b(a+)(a+)?(a+)c" "baaaac") (run-regexp "(((a+)?(b+)?c)*)" "aacbbbcac") (run-regexp "(\\s\\S\\s)" "aac xa d fds fds sac") (run-regexp "(\\s)" "aac xa deac") (run-regexp "[01234]+aa+" "93-43aabbc") (run-regexp "[\\101A-ae-]+" "93ABC-@ezy43abc") (run-regexp "[\\181A-ae-]+" "93ABC-@ezy43abc") (run-regexp "b[ace]+" "baaaacecfe") (run-regexp "b[^a]+" "baaaabc") (run-regexp "(?=(a+))a*b\\1" "baaabac") (run-regexp "(?=(a+))" "baaabac") (run-regexp "(.*?)a(?!(a+)b\\2c)\\2(.*)" "baaabaac") (run-regexp "(aa|aabaac|ba|b|c)*" "aabaac") (run-regexp "[\\_^01234]+\\_aa+" "93-43aabbc") (run-regexp "a." "AAab") (run-regexp "a." "AAab" :ignore-case t) (run-regexp "a.." (concatenate 'string "a" (string #\newline) "bacd")) (run-regexp "a.." (concatenate 'string "a" (string #\newline) "bacd") :span t) (run-regexp "(a|b*)*" "a") (run-regexp "(b*)*" "a") (run-regexp "^\\-?(\\d{1,}|\\.{0,})*(\\,\\d{1,})?$" "100.00") (run-regexp "^\\-?(\\d{1,}|\\.{0,})*(\\,\\d{1,})?$" "100,00") (run-regexp "^\\-?(\\d{1,}|\\.{0,})*(\\,\\d{1,})?$" "1.000,00") (run-regexp "^(a\\1?){4}$" "aaaaaaaaaa") (run-regexp "(?:(f)(o)(o)|(b)(a)(r))*" "foobar") (run-regexp "(a)?a(b)" "ab") (run-regexp "^(?:b|a(?=(.)))*\\1" "abc") |# #+allegro (clean-grammar *rg*) ;Remove this line if you wish to print the grammar's state tables. (length (grammar-states *rg*))