pjs/js2/semantics/HTML-To-RTF/Convert.lisp

539 строки
21 KiB
Common Lisp

;;; The contents of this file are subject to the Mozilla Public
;;; License Version 1.1 (the "License"); you may not use this file
;;; except in compliance with the License. You may obtain a copy of
;;; the License at http://www.mozilla.org/MPL/
;;;
;;; Software distributed under the License is distributed on an "AS
;;; IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
;;; implied. See the License for the specific language governing
;;; rights and limitations under the License.
;;;
;;; The Original Code is the Language Design and Prototyping Environment.
;;;
;;; The Initial Developer of the Original Code is Netscape Communications
;;; Corporation. Portions created by Netscape Communications Corporation are
;;; Copyright (C) 1999-2002 Netscape Communications Corporation. All
;;; Rights Reserved.
;;;
;;; Contributor(s): Waldemar Horwat <waldemar@acm.org>
;;;
;;; Alternatively, the contents of this file may be used under the terms of
;;; either the GNU General Public License Version 2 or later (the "GPL"), or
;;; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
;;; in which case the provisions of the GPL or the LGPL are applicable instead
;;; of those above. If you wish to allow use of your version of this file only
;;; under the terms of either the GPL or the LGPL, and not to allow others to
;;; use your version of this file under the terms of the MPL, indicate your
;;; decision by deleting the provisions above and replace them with the notice
;;; and other provisions required by the GPL or the LGPL. If you do not delete
;;; the provisions above, a recipient may use your version of this file under
;;; the terms of any one of the MPL, the GPL or the LGPL.
;;;
;;; Custom HTML-to-RTF Converter
;;;
;;; Waldemar Horwat (waldemar@acm.org)
;;;
(defconstant *missing-marker* "*****")
; Return the html-name-token of the tag of the given html element.
(defun tag-name (element)
(html-parser:name (instance-of element)))
(defun match-tag-name (element tag-name)
(eq (tag-name element) tag-name))
; Return the value of the given attribute in <element> or nil if not found.
(defun attribute-value (element attribute-name)
(cdr (assoc attribute-name (attr-values element) :key #'html-parser:name)))
; Return true if the element has the given given <tag-name>, all of required-attributes, and perhaps
; the optional-attributes.
(defun match-element (element tag-name required-attributes optional-attributes)
(and (match-tag-name element tag-name)
(let ((attribute-values (attr-values element)))
(and
(every #'(lambda (required-attribute)
(assoc required-attribute attribute-values :key #'html-parser:name))
required-attributes)
(every #'(lambda (attribute-value)
(let ((attribute (html-parser:name (car attribute-value))))
(or (member attribute required-attributes)
(member attribute optional-attributes))))
attribute-values)))))
; Ensure that <element> has the given given <tag-name>, all of required-attributes, and perhaps
; the optional-attributes.
(defun ensure-element (element tag-name required-attributes optional-attributes)
(unless (match-element element tag-name required-attributes optional-attributes)
(error "Tag ~S ~S ~S expected; got ~S" tag-name required-attributes optional-attributes element)))
; Return the children of <element> that have the given <tag-name>, all of required-attributes, and perhaps
; the optional-attributes.
(defun matching-parts (element tag-name required-attributes optional-attributes)
(remove-if-not #'(lambda (child) (match-element child tag-name required-attributes optional-attributes))
(parts element)))
; Return the unique child of <element> that has the given <tag-name>, all of required-attributes, and perhaps
; the optional-attributes.
(defun matching-part (element tag-name required-attributes optional-attributes)
(let ((parts (matching-parts element tag-name required-attributes optional-attributes)))
(unless (and parts (endp (cdr parts)))
(error "Element ~S should have only one ~S child" element tag-name))
(car parts)))
; Convert control characters in the given string into spaces.
(defun normalize (string)
(let ((l nil))
(dotimes (i (length string))
(let ((ch (char string i)))
(if (<= (char-code ch) 32)
(unless (eql (car l) #\Space)
(push #\Space l))
(push ch l))))
(coerce (nreverse l) 'string)))
(defun normalize-preformatted (string)
(map 'list #'(lambda (ch)
(if (< (char-code ch) 32)
'line
(string ch)))
string))
(defvar *preformatted* nil)
(defun emit-string (markup-stream string)
(if *preformatted*
(dolist (segment (normalize-preformatted string))
(depict markup-stream segment))
(depict markup-stream (normalize string))))
(defparameter *special-char-code-map*
'((#x00AB . :left-angle-quote)
(#x00BB . :right-angle-quote)
(#x2013 . endash)
(#x2014 . emdash)
(#x2018 . :left-single-quote)
(#x2019 . :right-single-quote)
(#x201C . :left-double-quote)
(#x201D . :right-double-quote)))
(defun emit-special-character (markup-stream char-num)
(let ((code (cdr (assoc char-num *special-char-code-map*))))
(if code
(depict markup-stream code)
(progn
(depict markup-stream *missing-marker*)
(format *terminal-io* "Ignoring character code ~S~%" char-num)))))
(defparameter *character-style-map*
'(("control" . :character-literal-control)
("terminal" . :terminal)
("terminal-keyword" . :terminal-keyword)
("terminal-sub" . :terminal-sub)
("nonterminal" . :nonterminal)
("nonterminal-attribute" . :nonterminal-attribute)
("nonterminal-argument" . :nonterminal-argument)
("nonterminal-sub" . :nonterminal-sub)
("semantic-keyword" . :semantic-keyword)
("domain-name" . :domain-name)
("domain-sub" . :domain-sub)
("field-name" . :field-name)
("field-sub" . :field-sub)
("tag-name" . :tag-name)
("id-name" . :id-name)
("global-variable" . :global-variable)
("var-sub" . :variable-sub)
("action-name" . :action-name)
("small-caps" . scaps)))
(defun class-to-character-style (element)
(let ((class (attribute-value element '#t"CLASS")))
(if (null class)
nil
(let ((style (cdr (assoc class *character-style-map* :test #'equal))))
(unless style
(format *terminal-io* "Ignoring character style ~S~%" class))
style))))
(defparameter *u-styles*
'(("U_bull" . :bullet)
("U_ne" . :not-equal)
("U_le" . :less-or-equal)
("U_ge" . :greater-or-equal)
("U_infin" . :infinity)
("U_forall" . :for-all-10)
("U_exist" . :exists-10)
("U_perp" . :bottom-10)
("U_larr" . :assign-10)
("U_uarr" . :up-arrow-10)
("U_rarr" . :function-arrow-10)
("U_times" . :cartesian-product-10)
("U_equiv" . :identical-10)
("U_oplus" . :circle-plus-10)
("U_empty" . :empty-10)
("U_cap" . :intersection-10)
("U_cup" . :union-10)
("U_sub" . :subset-10)
("U_sube" . :subset-eq-10)
("U_isin" . :member-10)
("U_notin" . :not-member-10)
("U_lArr" . :label-assign-10)
("U_rArr" . :derives-10)
("U_lang" . :left-triangle-bracket-10)
("U_lceil" . :left-ceiling-10)
("U_lfloor" . :left-floor-10)
("U_rang" . :right-triangle-bracket-10)
("U_rceil" . :right-ceiling-10)
("U_rfloor" . :right-floor-10)
("U_alpha" . :alpha)
("U_beta" . :beta)
("U_chi" . :chi)
("U_delta" . :delta)
("U_epsilon" . :epsilon)
("U_phi" . :phi)
("U_gamma" . :gamma)
("U_eta" . :eta)
("U_iota" . :iota)
("U_kappa" . :kappa)
("U_lambda" . :lambda)
("U_mu" . :mu)
("U_nu" . :nu)
("U_omicron" . :omicron)
("U_pi" . :pi)
("U_theta" . :theta)
("U_rho" . :rho)
("U_sigma" . :sigma)
("U_tau" . :tau)
("U_upsilon" . :upsilon)
("U_omega" . :omega)
("U_xi" . :xi)
("U_psi" . :psi)
("U_zeta" . :zeta)
("U_Omega" . :capital-omega)))
; s is a sequence of elements, some of which may be eql to item. Return a list of
; runs of s's elements between the occurrences of item.
(defun split-sequence (s item)
(let ((pos (position item s)))
(if pos
(cons (subseq s 0 pos) (split-sequence (subseq s (1+ pos)) item))
(list s))))
(defun emit-script-element (markup-stream element)
(let* ((children (parts element))
(child (first children)))
(if (and
(= (length children) 1)
(stringp child)
(> (length child) 16)
(equal (subseq child 0 15) "document.write(")
(eql (char child (1- (length child))) #\)))
(dolist (u-name (split-sequence (subseq child 15 (1- (length child))) #\+))
(let ((u-style (cdr (assoc u-name *u-styles* :test #'equal))))
(if u-style
(depict markup-stream u-style)
(progn
(depict markup-stream *missing-marker*)
(format *terminal-io* "Ignoring SCRIPT document.write entry ~S in ~S~%" u-name child)))))
(progn
(depict markup-stream *missing-marker*)
(format *terminal-io* "Ignoring SCRIPT element ~S ~S~%" element children)))))
(defparameter *entity-map*
'((#e"nbsp" . ~)
(#e"lt" . "<")
(#e"gt" . ">")
(#e"amp" . "&")
(#e"quot" . "\"")))
(defun emit-entity (markup-stream entity)
(let ((rtf (cdr (assoc entity *entity-map*))))
(if rtf
(depict markup-stream rtf)
(progn
(depict markup-stream "*****[" (html-parser:token-name entity) "]")
(format *terminal-io* "Ignoring entity ~S~%" entity)))))
(defparameter *inline-element-map*
'((#t"VAR" . :variable)
(#t"B" . b)
(#t"I" . i)
(#t"TT" . :courier)))
(defun emit-inline-element (markup-stream element)
(cond
((stringp element)
(emit-string markup-stream element))
((integerp element)
(emit-special-character markup-stream element))
((typep element 'html-entity-token)
(emit-entity markup-stream element))
((match-element element '#t"SCRIPT" '(#t"TYPE") nil)
(emit-script-element markup-stream element))
((or
(match-element element '#t"A" nil '(#t"CLASS" #t"HREF" #t"NAME"))
(match-element element '#t"SPAN" nil '(#t"CLASS"))
(match-element element '#t"VAR" '(#t"CLASS") nil))
(depict-char-style (markup-stream (class-to-character-style element))
(emit-inline-parts markup-stream element)))
((match-element element '#t"CODE" nil '(#t"CLASS"))
(let ((class (attribute-value element '#t"CLASS")))
(if (equal class "terminal-keyword")
(depict-char-style (markup-stream (class-to-character-style element))
(emit-inline-parts markup-stream element))
(progn
(when class
(format *terminal-io* "Ignoring CODE character style ~S~%" class))
(depict-char-style (markup-stream :character-literal)
(emit-inline-parts markup-stream element))))))
((match-element element '#t"SUB" nil '(#t"CLASS"))
(depict-char-style (markup-stream 'sub)
(depict-char-style (markup-stream (class-to-character-style element))
(emit-inline-parts markup-stream element))))
((match-element element '#t"SUP" nil '(#t"CLASS"))
(depict-char-style (markup-stream 'super)
(depict-char-style (markup-stream (class-to-character-style element))
(emit-inline-parts markup-stream element))))
((match-element element '#t"BR" nil nil)
(depict markup-stream :new-line))
(t (let ((inline-style (cdr (assoc (tag-name element) *inline-element-map*))))
(if (and inline-style (endp (attr-values element)))
(depict-char-style (markup-stream inline-style)
(emit-inline-parts markup-stream element))
(progn
(depict markup-stream *missing-marker*)
(format *terminal-io* "Ignoring inline element ~S~%" element)))))))
; Emit the children of the given element as inline elements.
(defun emit-inline-parts (markup-stream element)
(dolist (child (parts element))
(emit-inline-element markup-stream child)))
; Emit the children of the given element as inline elements in a paragraph of the given style.
; However, if some children are paragraph-level elements, emit them as separate paragraphs.
(defun emit-inline-or-paragraph-parts (markup-stream element paragraph-style)
(emit-inline-or-paragraph-elements markup-stream (parts element) paragraph-style ))
(defparameter *paragraph-elements*
'(#t"P" #t"TH" #t"TD" #t"PRE" #t"UL" #t"OL" #t"DIV" #t"HR" #t"TABLE" #t"H1" #t"H2" #t"H3" #t"H4"))
(defun paragraph-element? (element)
(and (typep element 'html-tag-instance)
(member (tag-name element) *paragraph-elements*)))
(defun emit-inline-or-paragraph-elements (markup-stream elements paragraph-style)
(let* ((paragraph-element (member-if #'paragraph-element? elements))
(inline-parts (ldiff elements paragraph-element)))
(when inline-parts
(depict-paragraph (markup-stream paragraph-style)
(dolist (child inline-parts)
(emit-inline-element markup-stream child))))
(when paragraph-element
(emit-paragraph-element markup-stream (car paragraph-element))
(emit-inline-or-paragraph-elements markup-stream (cdr paragraph-element) paragraph-style))))
(defparameter *class-paragraph-styles*
'(("mod-date" . :mod-date)
("grammar-argument" . :grammar-argument)
("indent" . :body-text)
("operator-heading" . :heading4)
("algorithm-stmt" . :algorithm-stmt)))
(defun class-to-paragraph-style (element)
(let ((class (attribute-value element '#t"CLASS")))
(if class
(let ((style (cdr (assoc class *class-paragraph-styles* :test #'equal))))
(or style
(progn
(format *terminal-io* "Ignoring paragraph style ~S~%" class)
:body-text)))
:body-text)))
(defun grammar-rule-child-style (element last)
(and
(match-element element '#t"DIV" '(#t"CLASS") nil)
(let ((class (attribute-value element '#t"CLASS")))
(cond
((equal class "grammar-lhs")
(if last :grammar-lhs-last :grammar-lhs))
((equal class "grammar-rhs")
(if last :grammar-rhs-last :grammar-rhs))
(t nil)))))
(defparameter *divs-containing-divs*
'("indent"))
(defvar *algorithm-level* nil)
(defun emit-div (markup-stream element class)
(cond
((equal class "grammar-rule")
(let ((children (parts element)))
(do ()
((endp children))
(let* ((child (pop children))
(style (grammar-rule-child-style child (endp children))))
(unless style
(format *terminal-io* "Bad grammar-rule child ~S~%" child)
(setq style :body-text))
(depict-paragraph (markup-stream style)
(emit-inline-parts markup-stream child))))))
((equal class "algorithm")
(let ((*algorithm-level* 0))
(emit-paragraph-elements markup-stream element)))
((equal class "algorithm-indent")
(let ((*algorithm-level* 1))
(emit-paragraph-elements markup-stream element)))
((or (equal class "lvl") (equal class "lvl-wide"))
(let ((*algorithm-level* (1+ *algorithm-level*)))
(emit-paragraph-elements markup-stream element)))
((equal class "stmt")
(unless (and *algorithm-level* (< *algorithm-level* 10))
(error "The stmt class can only be used inside an algorithm class"))
(depict-paragraph (markup-stream (find-symbol (format nil "ALGORITHM-~D" *algorithm-level*) :keyword))
(emit-inline-parts markup-stream element)))
((member class *divs-containing-divs* :test #'equal)
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** BEGIN DIV" class))
(emit-paragraph-elements markup-stream element)
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** END DIV" class)))
(t (emit-inline-or-paragraph-parts markup-stream element (class-to-paragraph-style element)))))
(defparameter *paragraph-element-map*
'((#t"H1" . :heading1)
(#t"H2" . :heading2)
(#t"H3" . :heading3)
(#t"H4" . :heading4)))
; Emit the paragraph-level element.
(defun emit-paragraph-element (markup-stream element)
(cond
((or
(match-element element '#t"P" nil '(#t"CLASS"))
(match-element element '#t"TH" nil '(#t"CLASS" #t"COLSPAN" #t"ROWSPAN" #t"NOWRAP" #t"VALIGN" #t"ALIGN"))
(match-element element '#t"TD" nil '(#t"CLASS" #t"COLSPAN" #t"ROWSPAN" #t"NOWRAP" #t"VALIGN" #t"ALIGN")))
(emit-inline-or-paragraph-parts markup-stream element (class-to-paragraph-style element)))
((match-element element '#t"PRE" nil nil)
(depict-paragraph (markup-stream :sample-code)
(let ((*preformatted* t))
(emit-inline-parts markup-stream element))))
((or (match-element element '#t"UL" nil nil)
(match-element element '#t"OL" nil nil))
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** BEGIN LIST"))
(dolist (child (parts element))
(ensure-element child '#t"LI" nil nil)
(emit-inline-or-paragraph-parts markup-stream child :body-text))
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** END LIST")))
((match-element element '#t"DIV" nil '(#t"CLASS"))
(let ((class (attribute-value element '#t"CLASS")))
(if class
(emit-div markup-stream element class)
(emit-paragraph-elements markup-stream element))))
((match-element element '#t"HR" nil nil))
((match-element element '#t"TABLE" nil '(#t"BORDER" #t"CELLSPACING" #t"CELLPADDING"))
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** BEGIN TABLE"))
(emit-paragraph-elements markup-stream element)
(depict-paragraph (markup-stream :body-text)
(depict markup-stream "***** END TABLE")))
((match-element element '#t"THEAD" nil nil)
(emit-paragraph-elements markup-stream element))
((match-element element '#t"TR" nil nil)
(emit-paragraph-elements markup-stream element))
(t (let ((paragraph-style (cdr (assoc (tag-name element) *paragraph-element-map*))))
(if (and paragraph-style (endp (attr-values element)))
(emit-inline-or-paragraph-parts markup-stream element paragraph-style)
(progn
(depict-paragraph (markup-stream :body-text)
(depict markup-stream *missing-marker*))
(format *terminal-io* "Ignoring paragraph element ~S~%" element)))))))
; Emit the children of the given element as paragraph-level elements.
(defun emit-paragraph-elements (markup-stream element)
(dolist (child (parts element))
(emit-paragraph-element markup-stream child)))
(defun emit-html-file (markup-stream element)
(ensure-element element '#t"HTML" nil nil)
(let* ((body (matching-part element '#t"BODY" nil nil))
(body-elements (parts body)))
(when (and body-elements (match-tag-name (first body-elements) '#t"TABLE"))
(setq body-elements (rest body-elements)))
(when (and body-elements (match-tag-name (car (last body-elements)) '#t"TABLE"))
(setq body-elements (butlast body-elements)))
(dolist (body-element body-elements)
(emit-paragraph-element markup-stream body-element))))
(defun translate-html-to-rtf (html-file-name rtf-path title)
(let* ((source-text (file->string html-file-name))
(element (html-parser::simple-parser source-text)))
(depict-rtf-to-local-file
rtf-path
title
#'(lambda (markup-stream)
(emit-html-file markup-stream element))
*rtf-definitions*)))
#|
(setq s (html-parser:file->string "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:js20:formal:index.html"))
(setq p (html-parser::simple-parser s))
(depict-rtf-to-local-file
"HTML-To-RTF/Test.rtf"
"Test"
#'(lambda (markup-stream)
(emit-html-file markup-stream p))
*rtf-definitions*)
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:js20:formal:index.html" "HTML-To-RTF/Test.rtf" "Test")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:js20:introduction:notation.html"
"HTML-To-RTF/Notation.rtf" "Notation")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:es4:core:expressions.html"
"HTML-To-RTF/Expressions.rtf" "Expressions")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Moz:mozilla:js2:semantics:HTML-To-RTF:Expressions.html"
"HTML-To-RTF/Expressions.rtf" "Expressions")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:js20:formal:stages.html"
"HTML-To-RTF/Stages.rtf" "Stages")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:es4:core:expressions.html"
"HTML-To-RTF/Expressions.rtf" "Expressions")
(translate-html-to-rtf "Alemana:Users:waldemar:Mozilla:Docs:mozilla-org:html:js:language:js20:formal:notation.html"
"HTML-To-RTF/FormalNotation.rtf" "Formal Notation")
|#