1999-11-03 06:58:07 +03:00
|
|
|
;;; The contents of this file are subject to the Mozilla Public
|
|
|
|
;;; License Version 1.1 (the "License"); you may not use this file
|
|
|
|
;;; except in compliance with the License. You may obtain a copy of
|
|
|
|
;;; the License at http://www.mozilla.org/MPL/
|
|
|
|
;;;
|
|
|
|
;;; Software distributed under the License is distributed on an "AS
|
|
|
|
;;; IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
|
|
|
;;; implied. See the License for the specific language governing
|
|
|
|
;;; rights and limitations under the License.
|
|
|
|
;;;
|
|
|
|
;;; The Original Code is the Language Design and Prototyping Environment.
|
|
|
|
;;;
|
|
|
|
;;; The Initial Developer of the Original Code is Netscape Communications
|
|
|
|
;;; Corporation. Portions created by Netscape Communications Corporation are
|
|
|
|
;;; Copyright (C) 1999 Netscape Communications Corporation. All
|
|
|
|
;;; Rights Reserved.
|
|
|
|
;;;
|
|
|
|
;;; Contributor(s): Waldemar Horwat <waldemar@acm.org>
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
;;;
|
|
|
|
;;; Lexer grammar generator
|
|
|
|
;;;
|
1999-11-03 06:58:07 +03:00
|
|
|
;;; Waldemar Horwat (waldemar@acm.org)
|
1999-01-30 04:19:00 +03:00
|
|
|
;;;
|
|
|
|
|
|
|
|
|
|
|
|
;;; A lexer grammar is an extension of a standard grammar that combines both parsing and combining
|
|
|
|
;;; characters into character classes.
|
|
|
|
;;;
|
|
|
|
;;; A lexer grammar is comprised of the following:
|
|
|
|
;;; a start nonterminal;
|
|
|
|
;;; a list of grammar productions, in which each terminal must be a character;
|
|
|
|
;;; a list of character classes, where each class is a list of:
|
|
|
|
;;; a nonterminal C;
|
|
|
|
;;; an expression <set-expr> that denotes the set of characters in character class C;
|
|
|
|
;;; a list of bindings, each containing:
|
|
|
|
;;; an action name;
|
|
|
|
;;; a lexer-action name;
|
|
|
|
;;; a list of lexer-action bindings, each containing:
|
|
|
|
;;; a lexer-action name;
|
|
|
|
;;; the type of this lexer-action's value;
|
|
|
|
;;; the name of a lisp function (char -> value) that performs the lexer-action on a character.
|
|
|
|
;;;
|
|
|
|
;;; Grammar productions may refer to character classes C as nonterminals.
|
|
|
|
;;;
|
|
|
|
;;; An expression <set-expr> can be any of the following:
|
|
|
|
;;; C The name of a previously defined character class.
|
|
|
|
;;; (char1 char2 ... charn) The set of characters {char1, char2, ..., charn}
|
|
|
|
;;; (+ <set-expr1> ... <set-exprn>) The set union of <set-expr1>, ..., <set-exprn>,
|
|
|
|
;;; which should be disjoint.
|
|
|
|
;;; (++ <set-expr1> ... <set-exprn>) Same as +, but printed on separate lines.
|
|
|
|
;;; (- <set-expr1> <set-expr2>) The set of characters in <set-expr1> but not <set-expr2>;
|
|
|
|
;;; <set-expr2> should be a subset of <set-expr1>.
|
1999-05-11 01:04:53 +04:00
|
|
|
;;; (% <builtin-class> . <description>) A predefined set of characters. <description> is suitable for
|
|
|
|
;;; depicting.
|
|
|
|
;;;
|
|
|
|
;;; <builtin-class> can be one of the following:
|
|
|
|
;;; every The set of all characters
|
|
|
|
;;; initial-alpha The set of characters suitable for the beginning of a Unicode identifier
|
|
|
|
;;; alphanumeric The set of Unicode identifier continuation characters
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; SETS OF CHARACTERS
|
|
|
|
|
|
|
|
;;; A character set is represented by an integer.
|
|
|
|
;;; The set may be infinite as long as its complement is finite.
|
|
|
|
;;; Bit n is set if the character with code n is a member of the set.
|
|
|
|
;;; The integer is negative if the set is infinite.
|
|
|
|
|
|
|
|
|
|
|
|
; Print the charset
|
|
|
|
(defun print-charset (charset &optional (stream t))
|
|
|
|
(pprint-logical-block (stream (bitmap-to-ranges charset) :prefix "{" :suffix "}")
|
|
|
|
(pprint-exit-if-list-exhausted)
|
|
|
|
(loop
|
|
|
|
(flet
|
|
|
|
((int-to-char (i)
|
|
|
|
(if (or (eq i :infinity) (= i char-code-limit))
|
|
|
|
:infinity
|
|
|
|
(code-char i))))
|
|
|
|
(let* ((range (pprint-pop))
|
|
|
|
(lo (int-to-char (car range)))
|
|
|
|
(hi (int-to-char (cdr range))))
|
|
|
|
(write (if (eql lo hi) lo (list lo hi)) :stream stream :pretty t)
|
|
|
|
(pprint-exit-if-list-exhausted)
|
|
|
|
(format stream " ~:_"))))))
|
|
|
|
|
|
|
|
|
1999-05-11 01:04:53 +04:00
|
|
|
(defconstant *empty-charset* 0)
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
; Return the character set consisting of the single character char.
|
|
|
|
(declaim (inline char-charset))
|
|
|
|
(defun char-charset (char)
|
|
|
|
(ash 1 (char-code char)))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the character set consisting of adding char to the given charset.
|
|
|
|
(defun charset-add-char (charset char)
|
|
|
|
(let ((i (char-code char)))
|
|
|
|
(if (logbitp i charset)
|
|
|
|
charset
|
|
|
|
(logior charset (ash 1 i)))))
|
|
|
|
|
|
|
|
|
1999-05-11 01:04:53 +04:00
|
|
|
; Return the character set consisting of adding the character range to the given charset.
|
|
|
|
(defun charset-add-range (charset low-char high-char)
|
|
|
|
(let ((low (char-code low-char))
|
|
|
|
(high (char-code high-char)))
|
|
|
|
(assert-true (>= high low))
|
|
|
|
(dpb -1 (byte (1+ (- high low)) low) charset)))
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
; Return the union of the two character sets, which should be disjoint.
|
|
|
|
(defun charset-union (charset1 charset2)
|
|
|
|
(unless (zerop (logand charset1 charset2))
|
|
|
|
(error "Union of overlapping character sets"))
|
|
|
|
(logior charset1 charset2))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the difference of the two character sets, the second of which should be
|
|
|
|
; a subset of the first.
|
|
|
|
(defun charset-difference (charset1 charset2)
|
|
|
|
(unless (zerop (logandc1 charset1 charset2))
|
|
|
|
(error "Difference of non-subset character sets"))
|
|
|
|
(logandc2 charset1 charset2))
|
|
|
|
|
|
|
|
|
|
|
|
; Return true if the character set is empty.
|
|
|
|
(declaim (inline charset-empty?))
|
|
|
|
(defun charset-empty? (charset)
|
|
|
|
(zerop charset))
|
|
|
|
|
|
|
|
|
|
|
|
; Return true if the character set is infinite.
|
|
|
|
(declaim (inline charset-infinite?))
|
|
|
|
(defun charset-infinite? (charset)
|
|
|
|
(minusp charset))
|
|
|
|
|
|
|
|
|
1999-05-11 01:04:53 +04:00
|
|
|
; Return true if the character set contains the given character.
|
|
|
|
(declaim (inline char-in-charset?))
|
|
|
|
(defun char-in-charset? (charset char)
|
|
|
|
(logbitp (char-code char) charset))
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
; If the character set contains exactly one character, return that character;
|
|
|
|
; otherwise, return nil.
|
|
|
|
(defun charset-char (charset)
|
|
|
|
(let ((hi (1- (integer-length charset))))
|
|
|
|
(and (plusp charset) (= charset (ash 1 hi)) (code-char hi))))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the highest character in the character set, which must be finite and nonempty.
|
|
|
|
(declaim (inline charset-highest-char))
|
|
|
|
(defun charset-highest-char (charset)
|
|
|
|
(assert-true (plusp charset))
|
|
|
|
(code-char (1- (integer-length charset))))
|
|
|
|
|
|
|
|
|
|
|
|
; Given a list of charsets, return a list of the largest possible
|
|
|
|
; charsets (called partitions) such that:
|
|
|
|
; for any input charset C and partition P, either P is entirely contained in C or it is disjoint from C;
|
|
|
|
; all partitions are mutually disjoint;
|
|
|
|
; the union of all partitions is the infinite set of all characters.
|
|
|
|
(defun compute-partitions (charsets)
|
|
|
|
(labels
|
|
|
|
((split-partitions (partitions charset)
|
|
|
|
(mapcan #'(lambda (partition)
|
|
|
|
(remove-if #'zerop (list (logand partition charset) (logandc2 partition charset))))
|
|
|
|
partitions))
|
|
|
|
(partition< (partition1 partition2)
|
|
|
|
(cond
|
|
|
|
((minusp partition1) nil)
|
|
|
|
((minusp partition2) t)
|
|
|
|
(t (< partition1 partition2)))))
|
|
|
|
(sort (reduce #'split-partitions charsets :initial-value '(-1))
|
|
|
|
#'partition<)))
|
|
|
|
|
|
|
|
|
1999-05-11 01:04:53 +04:00
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; PREDEFINED SETS OF CHARACTERS
|
|
|
|
|
|
|
|
(defmacro predefined-character-set (symbol)
|
|
|
|
`(get ,symbol 'predefined-character-set))
|
|
|
|
|
|
|
|
|
|
|
|
; Predefine a character set with the given name. The set is specified by char-ranges, which is a
|
|
|
|
; list of single characters or two-elements (low-char high-char) lists; both low-char and high-char
|
|
|
|
; are inclusive.
|
|
|
|
|
|
|
|
(defun define-character-set (symbol char-ranges)
|
|
|
|
(let ((charset *empty-charset*))
|
|
|
|
(dolist (char-range char-ranges)
|
|
|
|
(setq charset
|
|
|
|
(if (characterp char-range)
|
|
|
|
(charset-add-char charset char-range)
|
|
|
|
(charset-add-range charset (first char-range) (second char-range)))))
|
|
|
|
(setf (predefined-character-set symbol) charset)))
|
|
|
|
|
|
|
|
|
|
|
|
(setf (predefined-character-set 'every) -1)
|
|
|
|
(define-character-set 'initial-alpha '((#\A #\Z) (#\a #\z)))
|
|
|
|
(define-character-set 'alphanumeric '((#\0 #\9) (#\A #\Z) (#\a #\z)))
|
|
|
|
|
|
|
|
(define-character-set '*ordinary-initial-identifier-character* '(#\$ #\_ (#\A #\Z) (#\a #\z)))
|
|
|
|
(define-character-set '*ordinary-continuing-identifier-character* '(#\$ #\_ (#\0 #\9) (#\A #\Z) (#\a #\z)))
|
|
|
|
|
|
|
|
(defun ordinary-initial-identifier-character? (char)
|
|
|
|
(char-in-charset? (predefined-character-set '*ordinary-initial-identifier-character*) char))
|
|
|
|
|
|
|
|
(defun ordinary-continuing-identifier-character? (char)
|
|
|
|
(char-in-charset? (predefined-character-set '*ordinary-continuing-identifier-character*) char))
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; LEXER-ACTIONS
|
|
|
|
|
1999-07-27 05:00:05 +04:00
|
|
|
(defstruct (lexer-action (:constructor make-lexer-action (name number type-expr function-name function))
|
1999-01-30 04:19:00 +03:00
|
|
|
(:copier nil)
|
|
|
|
(:predicate lexer-action?))
|
|
|
|
(name nil :type identifier :read-only t) ;The action name to use for this lexer-action
|
|
|
|
(number nil :type integer :read-only t) ;Serial number of this lexer-action
|
1999-07-27 05:00:05 +04:00
|
|
|
(type-expr nil :read-only t) ;A type expression that specifies the result type of function
|
|
|
|
(function-name nil :type (or null identifier) :read-only t) ;Name of external function to use when depicting this lexer-action
|
|
|
|
(function nil :type identifier :read-only t)) ;A lisp function (char -> value) that performs the lexer-action on a character
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
|
|
|
(defun print-lexer-action (lexer-action &optional (stream t))
|
|
|
|
(format stream "~@<~A ~@_~:I: ~<<<~;~W~;>>~:> ~_= ~<<~;#'~W~;>~:>~:>"
|
|
|
|
(lexer-action-name lexer-action)
|
|
|
|
(list (lexer-action-type-expr lexer-action))
|
1999-07-27 05:00:05 +04:00
|
|
|
(list (lexer-action-function lexer-action))))
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; CHARCLASSES
|
|
|
|
|
1999-05-11 01:04:53 +04:00
|
|
|
(defstruct (charclass (:constructor make-charclass (nonterminal charset-source charset actions hidden))
|
1999-01-30 04:19:00 +03:00
|
|
|
(:predicate charclass?))
|
|
|
|
(nonterminal nil :type nonterminal :read-only t) ;The nonterminal on the left-hand side of this production
|
|
|
|
(charset-source nil :read-only t) ;The source expression for the charset
|
|
|
|
(charset nil :type integer :read-only t) ;The set of characters in this class
|
1999-05-11 01:04:53 +04:00
|
|
|
(actions nil :type list :read-only t) ;List of (action-name . lexer-action)
|
|
|
|
(hidden nil :type bool :read-only t)) ;True if this charclass should not be in the grammar
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
1999-05-17 09:04:07 +04:00
|
|
|
; Return a copy of the charset expr with all parametrized nonterminals interned.
|
|
|
|
(defun intern-charset-expr (parametrization expr)
|
|
|
|
(cond
|
|
|
|
((or (not (consp expr)) (eq (first expr) '%)) expr)
|
|
|
|
((keywordp (first expr)) (assert-type (grammar-parametrization-intern parametrization expr) nonterminal))
|
|
|
|
(t (mapcar #'(lambda (subexpr)
|
|
|
|
(intern-charset-expr parametrization subexpr))
|
|
|
|
expr))))
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
; Evaluate a <set-expr> whose syntax is given at the top of this file.
|
|
|
|
; Return the charset.
|
|
|
|
; charclasses-hash is a hash table of nonterminal -> charclass.
|
|
|
|
(defun eval-charset-expr (charclasses-hash expr)
|
|
|
|
(cond
|
|
|
|
((null expr) 0)
|
1999-05-17 09:04:07 +04:00
|
|
|
((nonterminal? expr)
|
1999-01-30 04:19:00 +03:00
|
|
|
(charclass-charset
|
|
|
|
(or (gethash expr charclasses-hash)
|
|
|
|
(error "Character class ~S not defined" expr))))
|
|
|
|
((consp expr)
|
|
|
|
(labels
|
|
|
|
((recursive-eval (expr)
|
|
|
|
(eval-charset-expr charclasses-hash expr)))
|
1999-05-11 01:04:53 +04:00
|
|
|
(case (first expr)
|
|
|
|
((+ ++) (reduce #'charset-union (rest expr) :initial-value 0 :key #'recursive-eval))
|
|
|
|
(- (unless (rest expr)
|
1999-01-30 04:19:00 +03:00
|
|
|
(error "Bad character set expression ~S" expr))
|
1999-05-11 01:04:53 +04:00
|
|
|
(reduce #'charset-difference (rest expr) :key #'recursive-eval))
|
|
|
|
(% (assert-non-null (predefined-character-set (second expr))))
|
1999-01-30 04:19:00 +03:00
|
|
|
(t (reduce #'charset-union expr :key #'char-charset)))))
|
|
|
|
(t (error "Bad character set expression ~S" expr))))
|
|
|
|
|
|
|
|
|
|
|
|
(defun print-charclass (charclass &optional (stream t))
|
|
|
|
(pprint-logical-block (stream nil)
|
|
|
|
(format stream "~W -> ~@_~:I" (charclass-nonterminal charclass))
|
|
|
|
(print-charset (charclass-charset charclass) stream)
|
|
|
|
(format stream " ~_")
|
1999-05-11 01:04:53 +04:00
|
|
|
(pprint-fill stream (mapcar #'car (charclass-actions charclass)))
|
|
|
|
(when (charclass-hidden charclass)
|
|
|
|
(format stream " ~_hidden"))))
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
|
|
|
; Emit markup for the lexer charset expression.
|
|
|
|
(defun depict-charset-source (markup-stream expr)
|
|
|
|
(cond
|
|
|
|
((null expr) (error "Can't emit null charset expression"))
|
1999-05-17 09:04:07 +04:00
|
|
|
((nonterminal? expr) (depict-general-nonterminal markup-stream expr :reference))
|
1999-01-30 04:19:00 +03:00
|
|
|
((consp expr)
|
1999-05-11 01:04:53 +04:00
|
|
|
(case (first expr)
|
|
|
|
((+ ++) (depict-list markup-stream #'depict-charset-source (rest expr) :separator " | "))
|
1999-01-30 04:19:00 +03:00
|
|
|
(- (depict-charset-source markup-stream (second expr))
|
|
|
|
(depict markup-stream " " :but-not " ")
|
|
|
|
(depict-list markup-stream #'depict-charset-source (cddr expr) :separator " | "))
|
1999-05-11 01:04:53 +04:00
|
|
|
(% (depict-styled-text markup-stream (cddr expr)))
|
1999-01-30 04:19:00 +03:00
|
|
|
(t (depict-list markup-stream #'depict-terminal expr :separator " | "))))
|
|
|
|
(t (error "Bad character set expression ~S" expr))))
|
|
|
|
|
|
|
|
|
|
|
|
; Emit markup paragraphs for the lexer charclass.
|
|
|
|
(defun depict-charclass (markup-stream charclass)
|
1999-05-11 01:04:53 +04:00
|
|
|
(depict-block-style (markup-stream ':grammar-rule)
|
|
|
|
(let ((nonterminal (charclass-nonterminal charclass))
|
|
|
|
(expr (charclass-charset-source charclass)))
|
|
|
|
(if (and (consp expr) (eq (first expr) '++))
|
|
|
|
(let* ((subexprs (rest expr))
|
|
|
|
(length (length subexprs)))
|
|
|
|
(depict-paragraph (markup-stream ':grammar-lhs)
|
|
|
|
(depict-general-nonterminal markup-stream nonterminal :definition)
|
|
|
|
(depict markup-stream " " ':derives-10))
|
|
|
|
(dotimes (i length)
|
|
|
|
(depict-paragraph (markup-stream (if (= i (1- length)) ':grammar-rhs-last ':grammar-rhs))
|
|
|
|
(if (zerop i)
|
|
|
|
(depict markup-stream ':tab3)
|
|
|
|
(depict markup-stream "|" ':tab2))
|
|
|
|
(depict-charset-source markup-stream (nth i subexprs)))))
|
|
|
|
(depict-paragraph (markup-stream ':grammar-lhs-last)
|
|
|
|
(depict-general-nonterminal markup-stream (charclass-nonterminal charclass) :definition)
|
|
|
|
(depict markup-stream " " ':derives-10 " ")
|
|
|
|
(depict-charset-source markup-stream expr))))))
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
|
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; PARTITIONS
|
|
|
|
|
|
|
|
(defstruct (partition (:constructor make-partition (charset lexer-actions))
|
|
|
|
(:predicate partition?))
|
|
|
|
(charset nil :type integer :read-only t) ;The set of characters in this partition
|
|
|
|
(lexer-actions nil :type list :read-only t)) ;List of lexer-actions needed on characters in this partition
|
|
|
|
|
|
|
|
(defconstant *default-partition-name* '$_other_) ;partition-name to use for characters not found in lexer-char-tokens
|
|
|
|
|
|
|
|
|
|
|
|
(defun print-partition (partition-name partition &optional (stream t))
|
|
|
|
(pprint-logical-block (stream nil)
|
|
|
|
(format stream "~W -> ~@_~:I" partition-name)
|
|
|
|
(print-charset (partition-charset partition) stream)
|
|
|
|
(format stream " ~_")
|
|
|
|
(pprint-fill stream (mapcar #'lexer-action-name (partition-lexer-actions partition)))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
;;; LEXER
|
|
|
|
|
|
|
|
|
|
|
|
(defstruct (lexer (:constructor allocate-lexer)
|
|
|
|
(:copier nil)
|
|
|
|
(:predicate lexer?))
|
|
|
|
(lexer-actions nil :type hash-table :read-only t) ;Hash table of lexer-action-name -> lexer-action
|
|
|
|
(charclasses nil :type list :read-only t) ;List of charclasses in the order in which they were given
|
|
|
|
(charclasses-hash nil :type hash-table :read-only t) ;Hash table of nonterminal -> charclass
|
|
|
|
(char-tokens nil :type hash-table :read-only t) ;Hash table of character -> (character or partition-name)
|
|
|
|
(partition-names nil :type list :read-only t) ;List of partition names in the order in which they were created
|
|
|
|
(partitions nil :type hash-table :read-only t) ;Hash table of partition-name -> partition
|
|
|
|
(grammar nil :type (or null grammar)) ;Grammar that accepts exactly one lexer token
|
|
|
|
(metagrammar nil :type (or null metagrammar))) ;Grammar that accepts the longest input sequence that forms a token
|
|
|
|
|
|
|
|
|
|
|
|
; Return a function (character -> terminal) that classifies an input character
|
|
|
|
; as either itself or a partition-name.
|
|
|
|
; If the returned function is called on a non-character, it returns its input unchanged.
|
|
|
|
(defun lexer-classifier (lexer)
|
|
|
|
(let ((char-tokens (lexer-char-tokens lexer)))
|
|
|
|
#'(lambda (char)
|
|
|
|
(if (characterp char)
|
|
|
|
(gethash char char-tokens *default-partition-name*)
|
|
|
|
char))))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the charclass that defines the given lexer nonterminal or nil if none.
|
|
|
|
(defun lexer-charclass (lexer nonterminal)
|
|
|
|
(gethash nonterminal (lexer-charclasses-hash lexer)))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the charset of all characters that appear as terminals in grammar-source.
|
|
|
|
(defun grammar-singletons (grammar-source)
|
|
|
|
(assert-type grammar-source (list (tuple t (list t) identifier)))
|
|
|
|
(let ((singletons 0))
|
1999-06-04 00:52:04 +04:00
|
|
|
(labels
|
|
|
|
((scan-for-singletons (list)
|
|
|
|
(dolist (element list)
|
|
|
|
(cond
|
|
|
|
((characterp element)
|
|
|
|
(setq singletons (charset-add-char singletons element)))
|
|
|
|
((consp element)
|
|
|
|
(case (first element)
|
|
|
|
(:- (scan-for-singletons (rest element)))
|
|
|
|
(:-- (scan-for-singletons (cddr element)))))))))
|
|
|
|
|
|
|
|
(dolist (production-source grammar-source)
|
|
|
|
(scan-for-singletons (second production-source))))
|
1999-01-30 04:19:00 +03:00
|
|
|
singletons))
|
|
|
|
|
|
|
|
|
|
|
|
; Return the list of all lexer-action-names that appear in at least one charclass of which this
|
|
|
|
; partition is a subset.
|
|
|
|
(defun collect-lexer-action-names (charclasses partition)
|
|
|
|
(let ((lexer-action-names nil))
|
|
|
|
(dolist (charclass charclasses)
|
|
|
|
(unless (zerop (logand (charclass-charset charclass) partition))
|
|
|
|
(dolist (action (charclass-actions charclass))
|
|
|
|
(pushnew (cdr action) lexer-action-names))))
|
|
|
|
(sort lexer-action-names #'< :key #'lexer-action-number)))
|
|
|
|
|
|
|
|
|
|
|
|
; Make a lexer structure corresponding to a grammar with the given source.
|
|
|
|
; charclasses-source is a list of character classes, where each class is a list of:
|
1999-05-11 01:04:53 +04:00
|
|
|
; a nonterminal C (may be a list to specify an attributed-nonterminal);
|
1999-01-30 04:19:00 +03:00
|
|
|
; an expression <set-expr> that denotes the set of characters in character class C;
|
|
|
|
; a list of bindings, each containing:
|
|
|
|
; an action name;
|
1999-05-11 01:04:53 +04:00
|
|
|
; a lexer-action name;
|
|
|
|
; an optional flag that indicatest that the character class should not be in the grammar.
|
1999-01-30 04:19:00 +03:00
|
|
|
; lexer-actions-source is a list of lexer-action bindings, each containing:
|
|
|
|
; a lexer-action name;
|
|
|
|
; the type of this lexer-action's value;
|
1999-07-27 05:00:05 +04:00
|
|
|
; the name of a primitive to use when depicting this lexer-action's definition;
|
1999-01-30 04:19:00 +03:00
|
|
|
; the name of a lisp function (char -> value) that performs the lexer-action on a character.
|
|
|
|
; This does not make the lexer's grammar; use make-lexer-and-grammar for that.
|
1999-05-11 01:04:53 +04:00
|
|
|
(defun make-lexer (parametrization charclasses-source lexer-actions-source grammar-source)
|
|
|
|
(assert-type charclasses-source (list (cons t (cons t (cons (list (tuple identifier identifier)) t)))))
|
1999-07-27 05:00:05 +04:00
|
|
|
(assert-type lexer-actions-source (list (tuple identifier t (or null identifier) identifier)))
|
1999-01-30 04:19:00 +03:00
|
|
|
(let ((lexer-actions (make-hash-table :test #'eq))
|
|
|
|
(charclasses nil)
|
|
|
|
(charclasses-hash (make-hash-table :test *grammar-symbol-=*))
|
|
|
|
(charsets nil)
|
|
|
|
(singletons (grammar-singletons grammar-source)))
|
|
|
|
(let ((lexer-action-number 0))
|
|
|
|
(dolist (lexer-action-source lexer-actions-source)
|
|
|
|
(let ((name (first lexer-action-source))
|
|
|
|
(type-expr (second lexer-action-source))
|
1999-07-27 05:00:05 +04:00
|
|
|
(function-name (third lexer-action-source))
|
|
|
|
(function (fourth lexer-action-source)))
|
1999-01-30 04:19:00 +03:00
|
|
|
(when (gethash name lexer-actions)
|
|
|
|
(error "Attempt to redefine lexer action ~S" name))
|
|
|
|
(setf (gethash name lexer-actions)
|
1999-07-27 05:00:05 +04:00
|
|
|
(make-lexer-action name (incf lexer-action-number) type-expr function-name function)))))
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
(dolist (charclass-source charclasses-source)
|
1999-05-17 09:04:07 +04:00
|
|
|
(let* ((nonterminal (assert-type (grammar-parametrization-intern parametrization (first charclass-source)) nonterminal))
|
|
|
|
(charset-source (intern-charset-expr parametrization (ensure-proper-form (second charclass-source))))
|
|
|
|
(charset (eval-charset-expr charclasses-hash charset-source))
|
|
|
|
(actions
|
|
|
|
(mapcar #'(lambda (action-source)
|
|
|
|
(let* ((lexer-action-name (second action-source))
|
|
|
|
(lexer-action (gethash lexer-action-name lexer-actions)))
|
|
|
|
(unless lexer-action
|
|
|
|
(error "Unknown lexer-action ~S" lexer-action-name))
|
|
|
|
(cons (first action-source) lexer-action)))
|
|
|
|
(third charclass-source))))
|
1999-01-30 04:19:00 +03:00
|
|
|
(when (gethash nonterminal charclasses-hash)
|
|
|
|
(error "Attempt to redefine character class ~S" nonterminal))
|
|
|
|
(when (charset-empty? charset)
|
|
|
|
(error "Empty character class ~S" nonterminal))
|
1999-05-17 09:04:07 +04:00
|
|
|
(let ((charclass (make-charclass nonterminal charset-source charset actions (fourth charclass-source))))
|
1999-01-30 04:19:00 +03:00
|
|
|
(push charclass charclasses)
|
|
|
|
(setf (gethash nonterminal charclasses-hash) charclass)
|
|
|
|
(push charset charsets))))
|
|
|
|
(setq charclasses (nreverse charclasses))
|
|
|
|
(bitmap-each-bit #'(lambda (i) (push (ash 1 i) charsets))
|
|
|
|
singletons)
|
|
|
|
(let ((char-tokens (make-hash-table :test #'eql))
|
|
|
|
(partition-names nil)
|
|
|
|
(partitions (make-hash-table :test #'eq))
|
|
|
|
(current-partition-number 0))
|
|
|
|
(dolist (partition (compute-partitions charsets))
|
|
|
|
(let ((singleton (charset-char partition)))
|
|
|
|
(cond
|
|
|
|
(singleton (setf (gethash singleton char-tokens) singleton))
|
|
|
|
((charset-infinite? partition)
|
|
|
|
(push *default-partition-name* partition-names)
|
|
|
|
(setf (gethash *default-partition-name* partitions)
|
|
|
|
(make-partition partition (collect-lexer-action-names charclasses partition))))
|
|
|
|
(t (let ((token (intern (format nil "$_CHARS~D_" (incf current-partition-number)))))
|
|
|
|
(bitmap-each-bit #'(lambda (i)
|
|
|
|
(setf (gethash (code-char i) char-tokens) token))
|
|
|
|
partition)
|
|
|
|
(push token partition-names)
|
|
|
|
(setf (gethash token partitions)
|
|
|
|
(make-partition partition (collect-lexer-action-names charclasses partition))))))))
|
|
|
|
(allocate-lexer
|
|
|
|
:lexer-actions lexer-actions
|
|
|
|
:charclasses charclasses
|
|
|
|
:charclasses-hash charclasses-hash
|
|
|
|
:char-tokens char-tokens
|
|
|
|
:partition-names (nreverse partition-names)
|
|
|
|
:partitions partitions))))
|
|
|
|
|
|
|
|
|
|
|
|
(defun print-lexer (lexer &optional (stream t))
|
|
|
|
(let* ((lexer-actions (lexer-lexer-actions lexer))
|
|
|
|
(lexer-action-names (sort (hash-table-keys lexer-actions) #'<
|
|
|
|
:key #'(lambda (lexer-action-name)
|
|
|
|
(lexer-action-number (gethash lexer-action-name lexer-actions)))))
|
|
|
|
(charclasses (lexer-charclasses lexer))
|
|
|
|
(partition-names (lexer-partition-names lexer))
|
|
|
|
(partitions (lexer-partitions lexer))
|
|
|
|
(singletons nil))
|
|
|
|
|
|
|
|
(when lexer-action-names
|
|
|
|
(pprint-logical-block (stream lexer-action-names)
|
|
|
|
(format stream "Lexer Actions:~2I")
|
|
|
|
(loop
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(let ((lexer-action (gethash (pprint-pop) lexer-actions)))
|
|
|
|
(print-lexer-action lexer-action stream))
|
|
|
|
(pprint-exit-if-list-exhausted)))
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(pprint-newline :mandatory stream))
|
|
|
|
|
|
|
|
(when charclasses
|
|
|
|
(pprint-logical-block (stream charclasses)
|
|
|
|
(format stream "Charclasses:~2I")
|
|
|
|
(loop
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(let ((charclass (pprint-pop)))
|
|
|
|
(print-charclass charclass stream))
|
|
|
|
(pprint-exit-if-list-exhausted)))
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(pprint-newline :mandatory stream))
|
|
|
|
|
|
|
|
(when partition-names
|
|
|
|
(pprint-logical-block (stream partition-names)
|
|
|
|
(format stream "Partitions:~2I")
|
|
|
|
(loop
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(let ((partition-name (pprint-pop)))
|
|
|
|
(print-partition partition-name (gethash partition-name partitions) stream))
|
|
|
|
(pprint-exit-if-list-exhausted)))
|
|
|
|
(pprint-newline :mandatory stream)
|
|
|
|
(pprint-newline :mandatory stream))
|
|
|
|
|
|
|
|
(maphash
|
|
|
|
#'(lambda (char char-or-partition)
|
|
|
|
(if (eql char char-or-partition)
|
|
|
|
(push char singletons)
|
|
|
|
(assert-type char-or-partition identifier)))
|
|
|
|
(lexer-char-tokens lexer))
|
|
|
|
(setq singletons (sort singletons #'char<))
|
|
|
|
(when singletons
|
|
|
|
(format stream "Singletons: ~@_~<~@{~W ~:_~}~:>~:@_~:@_" singletons))))
|
|
|
|
|
|
|
|
|
|
|
|
(defmethod print-object ((lexer lexer) stream)
|
|
|
|
(print-unreadable-object (lexer stream :identity t)
|
|
|
|
(write-string "lexer" stream)))
|
|
|
|
|
|
|
|
|
|
|
|
;;; ------------------------------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
1999-06-04 00:52:04 +04:00
|
|
|
; Return a freshly consed list of partitions for the given charclass.
|
|
|
|
(defun charclass-partitions (lexer charclass)
|
|
|
|
(do ((partitions nil)
|
|
|
|
(charset (charclass-charset charclass)))
|
|
|
|
((charset-empty? charset) partitions)
|
|
|
|
(let* ((partition-name (if (charset-infinite? charset)
|
|
|
|
*default-partition-name*
|
|
|
|
(gethash (charset-highest-char charset) (lexer-char-tokens lexer))))
|
|
|
|
(partition-charset (if (characterp partition-name)
|
|
|
|
(char-charset partition-name)
|
|
|
|
(partition-charset (gethash partition-name (lexer-partitions lexer))))))
|
|
|
|
(push partition-name partitions)
|
|
|
|
(setq charset (charset-difference charset partition-charset)))))
|
|
|
|
|
|
|
|
|
1999-11-02 04:45:14 +03:00
|
|
|
; Return an updated grammar-source whose character class nonterminals are replaced with sets of
|
1999-06-04 00:52:04 +04:00
|
|
|
; terminals inside :- and :-- constraints.
|
|
|
|
(defun update-constraint-nonterminals (lexer grammar-source)
|
|
|
|
(mapcar
|
|
|
|
#'(lambda (production-source)
|
|
|
|
(let ((rhs (second production-source)))
|
|
|
|
(if (some #'(lambda (rhs-component)
|
|
|
|
(and (consp rhs-component)
|
|
|
|
(member (first rhs-component) '(:- :--))))
|
|
|
|
rhs)
|
|
|
|
(list*
|
|
|
|
(first production-source)
|
|
|
|
(mapcar
|
|
|
|
#'(lambda (component)
|
|
|
|
(when (consp component)
|
|
|
|
(let ((tag (first component)))
|
|
|
|
(when (eq tag ':-)
|
|
|
|
(setq component (list* ':-- (rest component) (rest component)))
|
|
|
|
(setq tag ':--))
|
|
|
|
(when (eq tag ':--)
|
|
|
|
(setq component
|
|
|
|
(list* tag
|
|
|
|
(second component)
|
|
|
|
(mapcan #'(lambda (grammar-symbol)
|
|
|
|
(if (nonterminal? grammar-symbol)
|
|
|
|
(charclass-partitions lexer (assert-non-null (lexer-charclass lexer grammar-symbol)))
|
|
|
|
(list grammar-symbol)))
|
|
|
|
(cddr component)))))))
|
|
|
|
component)
|
|
|
|
rhs)
|
|
|
|
(cddr production-source))
|
|
|
|
production-source)))
|
|
|
|
grammar-source))
|
|
|
|
|
|
|
|
|
1999-01-30 04:19:00 +03:00
|
|
|
; Return two values:
|
1999-06-04 00:52:04 +04:00
|
|
|
; An updated grammar-source that includes:
|
|
|
|
; grammar productions that define the character class nonterminals out of characters and tokens;
|
|
|
|
; character class nonterminals replaced with sets of terminals inside :- and :-- constraints.
|
|
|
|
; Extra commands that:
|
1999-01-30 04:19:00 +03:00
|
|
|
; define the partitions used in this lexer;
|
|
|
|
; define the actions of these productions.
|
1999-06-04 00:52:04 +04:00
|
|
|
(defun lexer-grammar-and-commands (lexer grammar-source)
|
1999-01-30 04:19:00 +03:00
|
|
|
(labels
|
|
|
|
((component-partitions (charset partitions)
|
|
|
|
(if (charset-empty? charset)
|
|
|
|
partitions
|
|
|
|
(let* ((partition-name (if (charset-infinite? charset)
|
|
|
|
*default-partition-name*
|
|
|
|
(gethash (charset-highest-char charset) (lexer-char-tokens lexer))))
|
|
|
|
(partition (gethash partition-name (lexer-partitions lexer))))
|
|
|
|
(component-partitions (charset-difference charset (partition-charset partition))
|
|
|
|
(cons partition partitions))))))
|
|
|
|
(let ((productions nil)
|
|
|
|
(commands nil))
|
|
|
|
(dolist (charclass (lexer-charclasses lexer))
|
1999-05-11 01:04:53 +04:00
|
|
|
(unless (charclass-hidden charclass)
|
|
|
|
(let* ((nonterminal (charclass-nonterminal charclass))
|
|
|
|
(nonterminal-source (general-grammar-symbol-source nonterminal))
|
|
|
|
(production-prefix (if (consp nonterminal-source)
|
|
|
|
(format nil "~{~A~^-~}" nonterminal-source)
|
|
|
|
nonterminal-source))
|
|
|
|
(production-number 0))
|
|
|
|
(dolist (action (charclass-actions charclass))
|
|
|
|
(let ((lexer-action (cdr action)))
|
|
|
|
(push (list 'declare-action (car action) nonterminal-source (lexer-action-type-expr lexer-action)) commands)))
|
|
|
|
(do ((charset (charclass-charset charclass)))
|
|
|
|
((charset-empty? charset))
|
|
|
|
(let* ((partition-name (if (charset-infinite? charset)
|
|
|
|
*default-partition-name*
|
|
|
|
(gethash (charset-highest-char charset) (lexer-char-tokens lexer))))
|
|
|
|
(partition-charset (if (characterp partition-name)
|
|
|
|
(char-charset partition-name)
|
|
|
|
(partition-charset (gethash partition-name (lexer-partitions lexer)))))
|
|
|
|
(production-name (intern (format nil "~A-~D" production-prefix (incf production-number)))))
|
|
|
|
(push (list nonterminal-source (list partition-name) production-name) productions)
|
|
|
|
(dolist (action (charclass-actions charclass))
|
|
|
|
(let* ((lexer-action (cdr action))
|
|
|
|
(body (if (characterp partition-name)
|
1999-07-27 05:00:05 +04:00
|
|
|
(let* ((lexer-action-function (lexer-action-function lexer-action))
|
1999-05-11 01:04:53 +04:00
|
|
|
(result (funcall lexer-action-function partition-name)))
|
|
|
|
(typecase result
|
|
|
|
(integer result)
|
|
|
|
(character result)
|
|
|
|
((eql nil) 'false)
|
|
|
|
((eql t) 'true)
|
|
|
|
(t (error "Cannot infer the type of ~S's result ~S" lexer-action-function result))))
|
|
|
|
(list (lexer-action-name lexer-action) partition-name))))
|
|
|
|
(push (list 'action (car action) production-name body nil) commands)))
|
|
|
|
(setq charset (charset-difference charset partition-charset)))))))
|
1999-01-30 04:19:00 +03:00
|
|
|
|
|
|
|
(let ((partition-commands
|
|
|
|
(mapcan
|
|
|
|
#'(lambda (partition-name)
|
|
|
|
(mapcan #'(lambda (lexer-action)
|
|
|
|
(let ((lexer-action-name (lexer-action-name lexer-action)))
|
|
|
|
(list
|
|
|
|
(list 'declare-action lexer-action-name partition-name (lexer-action-type-expr lexer-action))
|
1999-07-27 05:00:05 +04:00
|
|
|
(list 'terminal-action lexer-action-name partition-name (lexer-action-function lexer-action)))))
|
1999-01-30 04:19:00 +03:00
|
|
|
(partition-lexer-actions (gethash partition-name (lexer-partitions lexer)))))
|
|
|
|
(lexer-partition-names lexer))))
|
|
|
|
(values
|
1999-06-04 00:52:04 +04:00
|
|
|
(nreconc productions (update-constraint-nonterminals lexer grammar-source))
|
1999-01-30 04:19:00 +03:00
|
|
|
(nconc partition-commands (nreverse commands)))))))
|
|
|
|
|
|
|
|
|
|
|
|
; Make a lexer and grammar from the given source.
|
|
|
|
; kind should be either :lalr-1 or :lr-1.
|
|
|
|
; charclasses-source is a list of character classes, and
|
|
|
|
; lexer-actions-source is a list of lexer-action bindings; see make-lexer.
|
|
|
|
; start-symbol is the grammar's start symbol, and grammar-source is its source.
|
|
|
|
; Return two values:
|
|
|
|
; the lexer (including the grammar in its grammar field);
|
|
|
|
; list of extra commands that:
|
|
|
|
; define the partitions used in this lexer;
|
|
|
|
; define the actions of these productions.
|
1999-02-10 09:39:58 +03:00
|
|
|
(defun make-lexer-and-grammar (kind charclasses-source lexer-actions-source parametrization start-symbol grammar-source &optional excluded-nonterminals-source)
|
1999-05-11 01:04:53 +04:00
|
|
|
(let ((lexer (make-lexer parametrization charclasses-source lexer-actions-source grammar-source)))
|
1999-06-04 00:52:04 +04:00
|
|
|
(multiple-value-bind (lexer-grammar-source extra-commands) (lexer-grammar-and-commands lexer grammar-source)
|
1999-02-10 09:39:58 +03:00
|
|
|
(let ((grammar (make-and-compile-grammar kind parametrization start-symbol
|
1999-06-04 00:52:04 +04:00
|
|
|
lexer-grammar-source excluded-nonterminals-source)))
|
1999-01-30 04:19:00 +03:00
|
|
|
(setf (lexer-grammar lexer) grammar)
|
|
|
|
(values lexer extra-commands)))))
|
|
|
|
|
|
|
|
|
|
|
|
; Parse the input string to produce a list of action results.
|
|
|
|
; If trace is:
|
|
|
|
; nil, don't print trace information
|
|
|
|
; :code, print trace information, including action code
|
|
|
|
; other print trace information
|
|
|
|
; Return two values:
|
|
|
|
; the list of action results;
|
|
|
|
; the list of action results' types.
|
|
|
|
(defun lexer-parse (lexer string &key trace)
|
|
|
|
(let ((in (coerce string 'list)))
|
|
|
|
(action-parse (lexer-grammar lexer) (lexer-classifier lexer) in :trace trace)))
|
|
|
|
|
|
|
|
|
|
|
|
; Same as lexer-parse except that also print the action results nicely.
|
|
|
|
(defun lexer-pparse (lexer string &key (stream t) trace)
|
|
|
|
(multiple-value-bind (results types) (lexer-parse lexer string :trace trace)
|
|
|
|
(print-values results types stream)
|
|
|
|
(terpri stream)
|
|
|
|
(values results types)))
|
|
|
|
|
|
|
|
|
|
|
|
; Compute the lexer grammar's metagrammar.
|
|
|
|
(defun set-up-lexer-metagrammar (lexer)
|
|
|
|
(setf (lexer-metagrammar lexer) (make-metagrammar (lexer-grammar lexer))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; Parse the input string into elements, where each element is the longest
|
|
|
|
; possible string of input characters that is accepted by the grammar.
|
|
|
|
; The grammar's terminals are all characters that may appear in the input
|
|
|
|
; string plus the symbol $END which is inserted after the last character of
|
|
|
|
; the string.
|
|
|
|
; Return the list of lists of action results of the elements.
|
1999-05-11 01:04:53 +04:00
|
|
|
;
|
|
|
|
; If initial-state and state-transition are non-nil, the parser has state.
|
|
|
|
; initial-state is a list of input symbols to be prepended to the input string
|
|
|
|
; before the first element is parsed. state-transition is a function that
|
|
|
|
; takes the result of each successful action and produces two values:
|
|
|
|
; a modified result of that action;
|
|
|
|
; a list of input symbols to be prepended to the input string before the next
|
|
|
|
; element is parsed.
|
|
|
|
;
|
1999-01-30 04:19:00 +03:00
|
|
|
; If trace is:
|
|
|
|
; nil, don't print trace information
|
|
|
|
; :code, print trace information, including action code
|
|
|
|
; other print trace information
|
1999-05-11 01:04:53 +04:00
|
|
|
;
|
|
|
|
; Return three values:
|
1999-01-30 04:19:00 +03:00
|
|
|
; the list of lists of action results;
|
|
|
|
; the list of action results' types. Each of the lists of action results has
|
|
|
|
; this type signature.
|
1999-05-11 01:04:53 +04:00
|
|
|
; the last state
|
|
|
|
(defun lexer-metaparse (lexer string &key initial-state state-transition trace)
|
1999-01-30 04:19:00 +03:00
|
|
|
(let ((metagrammar (lexer-metagrammar lexer)))
|
|
|
|
(do ((in (append (coerce string 'list) '($end)))
|
|
|
|
(results-lists nil))
|
1999-05-11 01:04:53 +04:00
|
|
|
((endp in) (values (nreverse results-lists)
|
|
|
|
(grammar-user-start-action-types (metagrammar-grammar metagrammar))
|
|
|
|
initial-state))
|
|
|
|
(multiple-value-bind (results in-rest)
|
|
|
|
(action-metaparse metagrammar (lexer-classifier lexer) (append initial-state in) :trace trace)
|
|
|
|
(when state-transition
|
|
|
|
(multiple-value-setq (results initial-state) (funcall state-transition results)))
|
1999-01-30 04:19:00 +03:00
|
|
|
(setq in in-rest)
|
|
|
|
(push results results-lists)))))
|
|
|
|
|
|
|
|
|
|
|
|
; Same as lexer-metaparse except that also print the action results nicely.
|
1999-05-11 01:04:53 +04:00
|
|
|
(defun lexer-pmetaparse (lexer string &key initial-state state-transition (stream t) trace)
|
|
|
|
(multiple-value-bind (results-lists types final-state)
|
|
|
|
(lexer-metaparse lexer string :initial-state initial-state :state-transition state-transition :trace trace)
|
1999-01-30 04:19:00 +03:00
|
|
|
(pprint-logical-block (stream results-lists)
|
|
|
|
(pprint-exit-if-list-exhausted)
|
|
|
|
(loop
|
|
|
|
(print-values (pprint-pop) types stream :prefix "(" :suffix ")")
|
|
|
|
(pprint-exit-if-list-exhausted)
|
|
|
|
(format stream " ~_")))
|
|
|
|
(terpri stream)
|
1999-05-11 01:04:53 +04:00
|
|
|
(values results-lists types final-state)))
|
1999-01-30 04:19:00 +03:00
|
|
|
|