1109 строки
34 KiB
Diff
1109 строки
34 KiB
Diff
diff --git a/src/gdoc.c b/src/gdoc.c
|
|
index 50cd9bc..3786746 100644
|
|
--- a/src/gdoc.c
|
|
+++ b/src/gdoc.c
|
|
@@ -96,14 +96,15 @@ static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
|
|
|
|
static void CleanNode( TidyDocImpl* doc, Node *node )
|
|
{
|
|
+ Stack *stack = TY_(newStack)(doc, 16);
|
|
Node *child, *next;
|
|
|
|
- if (node->content)
|
|
+ if ( (child = node->content) )
|
|
{
|
|
- for (child = node->content; child != NULL; child = next)
|
|
+ while (child)
|
|
{
|
|
next = child->next;
|
|
-
|
|
+
|
|
if (TY_(nodeIsElement)(child))
|
|
{
|
|
if (nodeIsSTYLE(child))
|
|
@@ -131,11 +132,16 @@ static void CleanNode( TidyDocImpl* doc, Node *node )
|
|
if (child->attributes)
|
|
TY_(DropAttrByName)( doc, child, "class" );
|
|
|
|
- CleanNode(doc, child);
|
|
+ TY_(push)(stack,next);
|
|
+ child = child->content;
|
|
+ continue;
|
|
}
|
|
}
|
|
+ child = next ? next : TY_(pop)(stack);
|
|
}
|
|
}
|
|
+ TY_(freeStack)(stack);
|
|
+ TidyFree(doc->allocator, stack);
|
|
}
|
|
|
|
/* insert meta element to force browser to recognize doc as UTF8 */
|
|
diff --git a/src/lexer.c b/src/lexer.c
|
|
index bc4e50a..d1cae84 100644
|
|
--- a/src/lexer.c
|
|
+++ b/src/lexer.c
|
|
@@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str )
|
|
return 0;
|
|
}
|
|
|
|
-/*
|
|
- node->type is one of these:
|
|
-
|
|
- #define TextNode 1
|
|
- #define StartTag 2
|
|
- #define EndTag 3
|
|
- #define StartEndTag 4
|
|
-*/
|
|
-
|
|
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
|
|
{
|
|
Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
|
|
@@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
|
|
}
|
|
}
|
|
#endif
|
|
- /* this is no good ;=((
|
|
- if (node && doc && doc->lexer) {
|
|
- if (node == doc->lexer->token) {
|
|
- doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
|
|
- }
|
|
- }
|
|
- ----------------- */
|
|
+
|
|
while ( node )
|
|
{
|
|
Node* next = node->next;
|
|
@@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
|
|
return NULL;
|
|
}
|
|
|
|
-/*
|
|
- * local variables:
|
|
- * mode: c
|
|
- * indent-tabs-mode: nil
|
|
- * c-basic-offset: 4
|
|
- * eval: (c-set-offset 'substatement-open 0)
|
|
- * end:
|
|
+
|
|
+/****************************************************************************//*
|
|
+ ** MARK: - Node Stack
|
|
+ ***************************************************************************/
|
|
+
|
|
+
|
|
+/**
|
|
+ * Create a new stack of Node pointers with the given starting capacity (0 is
|
|
+ * raised to 1 so doubling can grow it). The allocator panics on allocation failure.
|
|
+ */
|
|
+Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
|
|
+{
|
|
+    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
|
|
+    stack->top = -1; /* -1 marks the empty stack */
|
|
+    stack->capacity = capacity ? capacity : 1; /* never 0: growStack doubles this value */
|
|
+    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node*));
|
|
+    stack->allocator = doc->allocator;
|
|
+    return stack;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Double the stack's capacity (a zero-capacity stack grows to one slot),
|
|
+ * copying the stored node pointers into the new array. Called automatically by
|
|
+ * push when the stack is full. If allocation fails, the allocator panics the program.
|
|
+ */
|
|
+void TY_(growStack)(Stack *stack)
|
|
+{
|
|
+    uint new_capacity = stack->capacity ? stack->capacity * 2 : 1;
|
|
+    /* The request is in bytes, so scale the slot count by the pointer size. */
|
|
+    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*));
|
|
+
|
|
+    memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) );
|
|
+    TidyFree(stack->allocator, stack->firstNode);
|
|
+
|
|
+    stack->firstNode = firstNode;
|
|
+    stack->capacity = new_capacity;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * True when every slot is occupied, i.e. the number of stored nodes equals capacity.
|
|
+ */
|
|
+Bool TY_(stackFull)(Stack *stack)
|
|
+{
|
|
+    return (unsigned)stack->top + 1u == stack->capacity;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * True when nothing is stored, which is signalled by top holding -1.
|
|
+ */
|
|
+Bool TY_(stackEmpty)(Stack *stack)
|
|
+{
|
|
+    return -1 == stack->top;
|
|
+}
|
|
+
|
|
+
|
|
+/** Push a node onto the stack, growing it first when full; NULL nodes are not stored. */
|
|
+void TY_(push)(Stack *stack, Node *node)
|
|
+{
|
|
+    if (TY_(stackFull)(stack))
|
|
+        TY_(growStack)(stack);
|
|
+
|
|
+    if (node == NULL)
|
|
+        return;
|
|
+    stack->top += 1;
|
|
+    stack->firstNode[stack->top] = node;
|
|
+}
|
|
+
|
|
+
|
|
+/** Remove the top node from the stack and return it; returns NULL when the stack is empty. */
|
|
+Node* TY_(pop)(Stack *stack)
|
|
+{
|
|
+    if (TY_(stackEmpty)(stack))
|
|
+        return NULL;
|
|
+    return stack->firstNode[stack->top--];
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Peek at the stack: return the top node without removing it (NULL if empty).
|
|
*/
|
|
+FUNC_UNUSED Node* TY_(peek)(Stack *stack)
|
|
+{
|
|
+    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top]; /* top is left untouched */
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Release the node array and reset every field; the Stack struct itself is not freed here.
|
|
+ */
|
|
+void TY_(freeStack)(Stack *stack)
|
|
+{
|
|
+    TidyFree( stack->allocator, stack->firstNode );
|
|
+    stack->firstNode = NULL;
|
|
+    stack->allocator = NULL;
|
|
+    stack->capacity = 0;
|
|
+    stack->top = -1;
|
|
+}
|
|
diff --git a/src/lexer.h b/src/lexer.h
|
|
index c181d4b..d9ae113 100644
|
|
--- a/src/lexer.h
|
|
+++ b/src/lexer.h
|
|
@@ -1,33 +1,46 @@
|
|
#ifndef __LEXER_H__
|
|
#define __LEXER_H__
|
|
|
|
-/* lexer.h -- Lexer for html parser
|
|
-
|
|
- (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
|
- See tidy.h for the copyright notice.
|
|
|
|
- Given an input source, it returns a sequence of tokens.
|
|
-
|
|
- GetToken(source) gets the next token
|
|
- UngetToken(source) provides one level undo
|
|
-
|
|
- The tags include an attribute list:
|
|
-
|
|
- - linked list of attribute/value nodes
|
|
- - each node has 2 NULL-terminated strings.
|
|
- - entities are replaced in attribute values
|
|
-
|
|
- white space is compacted if not in preformatted mode
|
|
- If not in preformatted mode then leading white space
|
|
- is discarded and subsequent white space sequences
|
|
- compacted to single space characters.
|
|
-
|
|
- If XmlTags is no then Tag names are folded to upper
|
|
- case and attribute names to lower case.
|
|
-
|
|
- Not yet done:
|
|
- - Doctype subset and marked sections
|
|
-*/
|
|
+/**************************************************************************//**
|
|
+ * @file
|
|
+ * Lexer for HTML and XML Parsers.
|
|
+ *
|
|
+ * Given an input source, it returns a sequence of tokens.
|
|
+ *
|
|
+ * GetToken(source) gets the next token
|
|
+ * UngetToken(source) provides one level undo
|
|
+ *
|
|
+ * The tags include an attribute list:
|
|
+ *
|
|
+ * - linked list of attribute/value nodes
|
|
+ * - each node has 2 NULL-terminated strings.
|
|
+ * - entities are replaced in attribute values
|
|
+ *
|
|
+ * white space is compacted if not in preformatted mode
|
|
+ * If not in preformatted mode then leading white space
|
|
+ * is discarded and subsequent white space sequences
|
|
+ * compacted to single space characters.
|
|
+ *
|
|
+ * If XmlTags is no then Tag names are folded to upper
|
|
+ * case and attribute names to lower case.
|
|
+ *
|
|
+ * Not yet done:
|
|
+ * - Doctype subset and marked sections
|
|
+ *
|
|
+ * @author HTACG, et al (consult git log)
|
|
+ *
|
|
+ * @copyright
|
|
+ * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
|
|
+ * See tidy.h for the copyright notice.
|
|
+ * @par
|
|
+ * All Rights Reserved.
|
|
+ * @par
|
|
+ * See `tidy.h` for the complete license.
|
|
+ *
|
|
+ * @date Additional updates: consult git log
|
|
+ *
|
|
+ ******************************************************************************/
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
@@ -35,8 +48,23 @@ extern "C" {
|
|
|
|
#include "forward.h"
|
|
|
|
-/* lexer character types
|
|
-*/
|
|
+/** @addtogroup internal_api */
|
|
+/** @{ */
|
|
+
|
|
+
|
|
+/***************************************************************************//**
|
|
+ ** @defgroup lexer_h HTML and XML Lexing
|
|
+ **
|
|
+ ** These functions and structures form the internal API for document
|
|
+ ** lexing.
|
|
+ **
|
|
+ ** @{
|
|
+ ******************************************************************************/
|
|
+
|
|
+
|
|
+/**
|
|
+ * Lexer character types.
|
|
+ */
|
|
#define digit 1u
|
|
#define letter 2u
|
|
#define namechar 4u
|
|
@@ -47,8 +75,9 @@ extern "C" {
|
|
#define digithex 128u
|
|
|
|
|
|
-/* node->type is one of these values
|
|
-*/
|
|
+/**
|
|
+ * node->type is one of these values
|
|
+ */
|
|
typedef enum
|
|
{
|
|
RootNode,
|
|
@@ -68,9 +97,9 @@ typedef enum
|
|
} NodeType;
|
|
|
|
|
|
-
|
|
-/* lexer GetToken states
|
|
-*/
|
|
+/**
|
|
+ * Lexer GetToken() states.
|
|
+ */
|
|
typedef enum
|
|
{
|
|
LEX_CONTENT,
|
|
@@ -88,7 +117,10 @@ typedef enum
|
|
LEX_XMLDECL
|
|
} LexerState;
|
|
|
|
-/* ParseDocTypeDecl state constants */
|
|
+
|
|
+/**
|
|
+ * ParseDocTypeDecl state constants.
|
|
+ */
|
|
typedef enum
|
|
{
|
|
DT_INTERMEDIATE,
|
|
@@ -98,67 +130,43 @@ typedef enum
|
|
DT_INTSUBSET
|
|
} ParseDocTypeDeclState;
|
|
|
|
-/* content model shortcut encoding
|
|
-
|
|
- Descriptions are tentative.
|
|
-*/
|
|
+/**
|
|
+ * Content model shortcut encoding.
|
|
+ * Descriptions are tentative.
|
|
+ */
|
|
#define CM_UNKNOWN 0
|
|
-/* Elements with no content. Map to HTML specification. */
|
|
-#define CM_EMPTY (1 << 0)
|
|
-/* Elements that appear outside of "BODY". */
|
|
-#define CM_HTML (1 << 1)
|
|
-/* Elements that can appear within HEAD. */
|
|
-#define CM_HEAD (1 << 2)
|
|
-/* HTML "block" elements. */
|
|
-#define CM_BLOCK (1 << 3)
|
|
-/* HTML "inline" elements. */
|
|
-#define CM_INLINE (1 << 4)
|
|
-/* Elements that mark list item ("LI"). */
|
|
-#define CM_LIST (1 << 5)
|
|
-/* Elements that mark definition list item ("DL", "DT"). */
|
|
-#define CM_DEFLIST (1 << 6)
|
|
-/* Elements that can appear inside TABLE. */
|
|
-#define CM_TABLE (1 << 7)
|
|
-/* Used for "THEAD", "TFOOT" or "TBODY". */
|
|
-#define CM_ROWGRP (1 << 8)
|
|
-/* Used for "TD", "TH" */
|
|
-#define CM_ROW (1 << 9)
|
|
-/* Elements whose content must be protected against white space movement.
|
|
- Includes some elements that can found in forms. */
|
|
-#define CM_FIELD (1 << 10)
|
|
-/* Used to avoid propagating inline emphasis inside some elements
|
|
- such as OBJECT or APPLET. */
|
|
-#define CM_OBJECT (1 << 11)
|
|
-/* Elements that allows "PARAM". */
|
|
-#define CM_PARAM (1 << 12)
|
|
-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
|
-#define CM_FRAMES (1 << 13)
|
|
-/* Heading elements (h1, h2, ...). */
|
|
-#define CM_HEADING (1 << 14)
|
|
-/* Elements with an optional end tag. */
|
|
-#define CM_OPT (1 << 15)
|
|
-/* Elements that use "align" attribute for vertical position. */
|
|
-#define CM_IMG (1 << 16)
|
|
-/* Elements with inline and block model. Used to avoid calling InlineDup. */
|
|
-#define CM_MIXED (1 << 17)
|
|
-/* Elements whose content needs to be indented only if containing one
|
|
- CM_BLOCK element. */
|
|
-#define CM_NO_INDENT (1 << 18)
|
|
-/* Elements that are obsolete (such as "dir", "menu"). */
|
|
-#define CM_OBSOLETE (1 << 19)
|
|
-/* User defined elements. Used to determine how attributes wihout value
|
|
- should be printed. */
|
|
-#define CM_NEW (1 << 20)
|
|
-/* Elements that cannot be omitted. */
|
|
-#define CM_OMITST (1 << 21)
|
|
-
|
|
-/* If the document uses just HTML 2.0 tags and attributes described
|
|
-** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
|
-** If there are proprietary tags and attributes then describe it as
|
|
-** HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
|
-** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
|
-** flavors of Voyager (strict, loose or frameset).
|
|
-*/
|
|
+#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
|
|
+#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
|
|
+#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
|
|
+#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
|
|
+#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
|
|
+#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
|
|
+#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
|
|
+#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
|
|
+#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
|
|
+#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
|
|
+#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */
|
|
+#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
|
|
+#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */
|
|
+#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
|
+#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
|
|
+#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
|
|
+#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
|
|
+#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
|
|
+#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
|
|
+#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
|
|
+#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
|
|
+#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
|
|
+
|
|
+
|
|
+/**
|
|
+ * If the document uses just HTML 2.0 tags and attributes, describe
|
|
+ * it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
|
+ * If there are proprietary tags and attributes then describe it as
|
|
+ * HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
|
+ * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
|
+ * flavors of Voyager (strict, loose or frameset).
|
|
+ */
|
|
|
|
/* unknown */
|
|
#define xxxx 0u
|
|
@@ -220,8 +228,10 @@ typedef enum
|
|
/* all proprietary types */
|
|
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
|
|
|
|
-/* Linked list of class names and styles
|
|
-*/
|
|
+
|
|
+/**
|
|
+ * Linked list of class names and styles
|
|
+ */
|
|
struct _Style;
|
|
typedef struct _Style TagStyle;
|
|
|
|
@@ -234,8 +244,9 @@ struct _Style
|
|
};
|
|
|
|
|
|
-/* Linked list of style properties
|
|
-*/
|
|
+/**
|
|
+ * Linked list of style properties
|
|
+ */
|
|
struct _StyleProp;
|
|
typedef struct _StyleProp StyleProp;
|
|
|
|
@@ -247,11 +258,9 @@ struct _StyleProp
|
|
};
|
|
|
|
|
|
-
|
|
-
|
|
-/* Attribute/Value linked list node
|
|
-*/
|
|
-
|
|
+/**
|
|
+ * Attribute/Value linked list node
|
|
+ */
|
|
struct _AttVal
|
|
{
|
|
AttVal* next;
|
|
@@ -264,93 +273,89 @@ struct _AttVal
|
|
};
|
|
|
|
|
|
-
|
|
-/*
|
|
- Mosaic handles inlines via a separate stack from other elements
|
|
- We duplicate this to recover from inline markup errors such as:
|
|
-
|
|
- <i>italic text
|
|
- <p>more italic text</b> normal text
|
|
-
|
|
- which for compatibility with Mosaic is mapped to:
|
|
-
|
|
- <i>italic text</i>
|
|
- <p><i>more italic text</i> normal text
|
|
-
|
|
- Note that any inline end tag pop's the effect of the current
|
|
- inline start tag, so that </b> pop's <i> in the above example.
|
|
+/**
|
|
+ * Mosaic handles inlines via a separate stack from other elements
|
|
+ * We duplicate this to recover from inline markup errors such as:
|
|
+ * ~~~
|
|
+ * <i>italic text
|
|
+ * <p>more italic text</b> normal text
|
|
+ * ~~~
|
|
+ * which for compatibility with Mosaic is mapped to:
|
|
+ * ~~~
|
|
+ * <i>italic text</i>
|
|
+ * <p><i>more italic text</i> normal text
|
|
+ * ~~~
|
|
+ * Note that any inline end tag pops the effect of the current
|
|
+ * inline start tag, so that `</b>` pops `<i>` in the above example.
|
|
*/
|
|
struct _IStack
|
|
{
|
|
IStack* next;
|
|
- const Dict* tag; /* tag's dictionary definition */
|
|
- tmbstr element; /* name (NULL for text nodes) */
|
|
+ const Dict* tag; /**< tag's dictionary definition */
|
|
+ tmbstr element; /**< name (NULL for text nodes) */
|
|
AttVal* attributes;
|
|
};
|
|
|
|
|
|
-/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
|
|
-** etc. etc.
|
|
-*/
|
|
-
|
|
+/**
|
|
+ * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
|
|
+ */
|
|
struct _Node
|
|
{
|
|
- Node* parent; /* tree structure */
|
|
+ Node* parent; /**< tree structure */
|
|
Node* prev;
|
|
Node* next;
|
|
Node* content;
|
|
Node* last;
|
|
|
|
AttVal* attributes;
|
|
- const Dict* was; /* old tag when it was changed */
|
|
- const Dict* tag; /* tag's dictionary definition */
|
|
+ const Dict* was; /**< old tag when it was changed */
|
|
+ const Dict* tag; /**< tag's dictionary definition */
|
|
|
|
- tmbstr element; /* name (NULL for text nodes) */
|
|
+ tmbstr element; /**< name (NULL for text nodes) */
|
|
|
|
- uint start; /* start of span onto text array */
|
|
- uint end; /* end of span onto text array */
|
|
- NodeType type; /* TextNode, StartTag, EndTag etc. */
|
|
+ uint start; /**< start of span onto text array */
|
|
+ uint end; /**< end of span onto text array */
|
|
+ NodeType type; /**< TextNode, StartTag, EndTag etc. */
|
|
|
|
- uint line; /* current line of document */
|
|
- uint column; /* current column of document */
|
|
+ uint line; /**< current line of document */
|
|
+ uint column; /**< current column of document */
|
|
|
|
- Bool closed; /* true if closed by explicit end tag */
|
|
- Bool implicit; /* true if inferred */
|
|
- Bool linebreak; /* true if followed by a line break */
|
|
+ Bool closed; /**< true if closed by explicit end tag */
|
|
+ Bool implicit; /**< true if inferred */
|
|
+ Bool linebreak; /**< true if followed by a line break */
|
|
};
|
|
|
|
|
|
-/*
|
|
- The following are private to the lexer
|
|
- Use NewLexer() to create a lexer, and
|
|
- FreeLexer() to free it.
|
|
-*/
|
|
-
|
|
+/**
|
|
+ * The following are private to the lexer.
|
|
+ * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
|
|
+ */
|
|
struct _Lexer
|
|
{
|
|
- uint lines; /* lines seen */
|
|
- uint columns; /* at start of current token */
|
|
- Bool waswhite; /* used to collapse contiguous white space */
|
|
- Bool pushed; /* true after token has been pushed back */
|
|
- Bool insertspace; /* when space is moved after end tag */
|
|
- Bool excludeBlocks; /* Netscape compatibility */
|
|
- Bool exiled; /* true if moved out of table */
|
|
- Bool isvoyager; /* true if xmlns attribute on html element */
|
|
- uint versions; /* bit vector of HTML versions */
|
|
- uint doctype; /* version as given by doctype (if any) */
|
|
- uint versionEmitted; /* version of doctype emitted */
|
|
- Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
|
|
- uint txtstart; /* start of current node */
|
|
- uint txtend; /* end of current node */
|
|
- LexerState state; /* state of lexer's finite state machine */
|
|
-
|
|
- Node* token; /* last token returned by GetToken() */
|
|
- Node* itoken; /* last duplicate inline returned by GetToken() */
|
|
- Node* root; /* remember root node of the document */
|
|
- Node* parent; /* remember parent node for CDATA elements */
|
|
-
|
|
- Bool seenEndBody; /* true if a </body> tag has been encountered */
|
|
- Bool seenEndHtml; /* true if a </html> tag has been encountered */
|
|
+ uint lines; /**< lines seen */
|
|
+ uint columns; /**< at start of current token */
|
|
+ Bool waswhite; /**< used to collapse contiguous white space */
|
|
+ Bool pushed; /**< true after token has been pushed back */
|
|
+ Bool insertspace; /**< when space is moved after end tag */
|
|
+ Bool excludeBlocks; /**< Netscape compatibility */
|
|
+ Bool exiled; /**< true if moved out of table */
|
|
+ Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
|
|
+ uint versions; /**< bit vector of HTML versions */
|
|
+ uint doctype; /**< version as given by doctype (if any) */
|
|
+ uint versionEmitted; /**< version of doctype emitted */
|
|
+ Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
|
|
+ uint txtstart; /**< start of current node */
|
|
+ uint txtend; /**< end of current node */
|
|
+ LexerState state; /**< state of lexer's finite state machine */
|
|
+
|
|
+ Node* token; /**< last token returned by GetToken() */
|
|
+ Node* itoken; /**< last duplicate inline returned by GetToken() */
|
|
+ Node* root; /**< remember root node of the document */
|
|
+ Node* parent; /**< remember parent node for CDATA elements */
|
|
+
|
|
+ Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
|
|
+ Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
|
|
|
|
/*
|
|
Lexer character buffer
|
|
@@ -361,33 +366,57 @@ struct _Lexer
|
|
|
|
lexsize must be reset for each file.
|
|
*/
|
|
- tmbstr lexbuf; /* MB character buffer */
|
|
- uint lexlength; /* allocated */
|
|
- uint lexsize; /* used */
|
|
+ tmbstr lexbuf; /**< MB character buffer */
|
|
+ uint lexlength; /**< allocated */
|
|
+ uint lexsize; /**< used */
|
|
|
|
/* Inline stack for compatibility with Mosaic */
|
|
- Node* inode; /* for deferring text node */
|
|
- IStack* insert; /* for inferring inline tags */
|
|
+ Node* inode; /**< for deferring text node */
|
|
+ IStack* insert; /**< for inferring inline tags */
|
|
IStack* istack;
|
|
- uint istacklength; /* allocated */
|
|
- uint istacksize; /* used */
|
|
- uint istackbase; /* start of frame */
|
|
+ uint istacklength; /**< allocated */
|
|
+ uint istacksize; /**< used */
|
|
+ uint istackbase; /**< start of frame */
|
|
|
|
- TagStyle *styles; /* used for cleaning up presentation markup */
|
|
+ TagStyle *styles; /**< used for cleaning up presentation markup */
|
|
|
|
- TidyAllocator* allocator; /* allocator */
|
|
+ TidyAllocator* allocator; /**< allocator */
|
|
};
|
|
|
|
|
|
-/* Lexer Functions
|
|
-*/
|
|
+/**
|
|
+ * modes for GetToken()
|
|
+ *
|
|
+ * MixedContent -- for elements which don't accept PCDATA
|
|
+ * Preformatted -- white space preserved as is
|
|
+ * IgnoreMarkup -- for CDATA elements such as script, style
|
|
+ */
|
|
+typedef enum
|
|
+{
|
|
+ IgnoreWhitespace,
|
|
+ MixedContent, /**< for elements which don't accept PCDATA */
|
|
+ Preformatted, /**< white space preserved as is */
|
|
+ IgnoreMarkup, /**< for CDATA elements such as script, style */
|
|
+ OtherNamespace,
|
|
+ CdataContent
|
|
+} GetTokenMode;
|
|
|
|
-/* choose what version to use for new doctype */
|
|
+
|
|
+/** @name Lexer Functions
|
|
+ * @{
|
|
+ */
|
|
+
|
|
+
|
|
+/**
|
|
+ * Choose what version to use for new doctype
|
|
+ */
|
|
TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
|
|
|
|
-/* everything is allowed in proprietary version of HTML */
|
|
-/* this is handled here rather than in the tag/attr dicts */
|
|
|
|
+/**
|
|
+ * Everything is allowed in proprietary version of HTML.
|
|
+ * This is handled here rather than in the tag/attr dicts
|
|
+ */
|
|
TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
|
|
|
|
TY_PRIVATE Bool TY_(IsWhite)(uint c);
|
|
@@ -399,7 +428,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c);
|
|
TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
|
|
TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
|
|
|
|
-/* Bool IsLower(uint c); */
|
|
TY_PRIVATE Bool TY_(IsUpper)(uint c);
|
|
TY_PRIVATE uint TY_(ToLower)(uint c);
|
|
TY_PRIVATE uint TY_(ToUpper)(uint c);
|
|
@@ -407,60 +435,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c);
|
|
TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
|
|
TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
|
|
|
|
-/* store character c as UTF-8 encoded byte stream */
|
|
+
|
|
+/**
|
|
+ * Store character c as UTF-8 encoded byte stream
|
|
+ */
|
|
TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
|
|
|
|
-/*
|
|
- Used for elements and text nodes
|
|
- element name is NULL for text nodes
|
|
- start and end are offsets into lexbuf
|
|
- which contains the textual content of
|
|
- all elements in the parse tree.
|
|
-
|
|
- parent and content allow traversal
|
|
- of the parse tree in any direction.
|
|
- attributes are represented as a linked
|
|
- list of AttVal nodes which hold the
|
|
- strings for attribute/value pairs.
|
|
+
|
|
+/**
|
|
+ * Used for elements and text nodes.
|
|
+ * - Element name is NULL for text nodes.
|
|
+ * - start and end are offsets into lexbuf,
|
|
+ * which contains the textual content of
|
|
+ * all elements in the parse tree.
|
|
+ * - parent and content allow traversal
|
|
+ * of the parse tree in any direction.
|
|
+ * - attributes are represented as a linked
|
|
+ * list of AttVal nodes which hold the
|
|
+ * strings for attribute/value pairs.
|
|
*/
|
|
TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
|
|
|
|
|
|
-/* used to clone heading nodes when split by an <HR> */
|
|
+/**
|
|
+ * Used to clone heading nodes when split by an `<HR>`
|
|
+ */
|
|
TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
|
|
|
|
-/* free node's attributes */
|
|
+
|
|
+/**
|
|
+ * Free node's attributes
|
|
+ */
|
|
TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
|
|
|
|
-/* doesn't repair attribute list linkage */
|
|
+
|
|
+/**
|
|
+ * Doesn't repair attribute list linkage
|
|
+ */
|
|
TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
|
|
|
|
-/* detach attribute from node */
|
|
+
|
|
+/**
|
|
+ * Detach attribute from node
|
|
+ */
|
|
TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
|
|
|
|
-/* detach attribute from node then free it
|
|
-*/
|
|
+
|
|
+/**
|
|
+ * Detach attribute from node then free it.
|
|
+ */
|
|
TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
|
|
|
|
-/*
|
|
- Free document nodes by iterating through peers and recursing
|
|
- through children. Set next to NULL before calling FreeNode()
|
|
- to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
|
+
|
|
+/**
|
|
+ * Free document nodes by iterating through peers and recursing
|
|
+ * through children. Set `next` to `NULL` before calling `FreeNode()`
|
|
+ * to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
|
*/
|
|
TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
|
|
|
|
+
|
|
TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
|
|
|
|
-/* used for creating preformatted text from Word2000 */
|
|
+
|
|
+/**
|
|
+ * Used for creating preformatted text from Word2000.
|
|
+ */
|
|
TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
|
|
|
|
-/* used for adding a &nbsp; for Word2000 */
|
|
+
|
|
+/**
|
|
+ * Used for adding a `&nbsp;` for Word2000.
|
|
+ */
|
|
TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
|
|
|
|
-TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
|
-/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
|
|
|
|
-/* find element */
|
|
+TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
|
TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
|
|
@@ -468,10 +518,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
|
|
TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
|
|
|
-/* Returns containing block element, if any */
|
|
+
|
|
+/**
|
|
+ * Returns containing block element, if any
|
|
+ */
|
|
TY_PRIVATE Node* TY_(FindContainer)( Node* node );
|
|
|
|
-/* add meta element for Tidy */
|
|
+
|
|
+/**
|
|
+ * Add meta element for Tidy.
|
|
+ */
|
|
TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
|
|
|
TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
|
|
@@ -485,118 +541,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
|
|
TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
|
|
|
|
|
|
-/* fixup doctype if missing */
|
|
+/**
|
|
+ * Fixup doctype if missing.
|
|
+ */
|
|
TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
|
|
|
|
-/* ensure XML document starts with <?xml version="1.0"?> */
|
|
-/* add encoding attribute if not using ASCII or UTF-8 output */
|
|
+
|
|
+/**
|
|
+ * Ensure XML document starts with <?xml version="1.0"?>, and
|
|
+ * add encoding attribute if not using ASCII or UTF-8 output.
|
|
+ */
|
|
TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
|
|
|
|
+
|
|
TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
|
|
|
|
TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
|
|
|
|
-
|
|
-/*
|
|
- modes for GetToken()
|
|
-
|
|
- MixedContent -- for elements which don't accept PCDATA
|
|
- Preformatted -- white space preserved as is
|
|
- IgnoreMarkup -- for CDATA elements such as script, style
|
|
-*/
|
|
-typedef enum
|
|
-{
|
|
- IgnoreWhitespace,
|
|
- MixedContent,
|
|
- Preformatted,
|
|
- IgnoreMarkup,
|
|
- OtherNamespace,
|
|
- CdataContent
|
|
-} GetTokenMode;
|
|
-
|
|
TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
|
|
|
|
TY_PRIVATE void TY_(InitMap)(void);
|
|
|
|
|
|
-/* create a new attribute */
|
|
+/**
|
|
+ * Create a new attribute.
|
|
+ */
|
|
TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
|
|
|
|
-/* create a new attribute with given name and value */
|
|
+
|
|
+/**
|
|
+ * Create a new attribute with given name and value.
|
|
+ */
|
|
TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
|
|
int delim );
|
|
|
|
-/* insert attribute at the end of attribute list of a node */
|
|
+
|
|
+/**
|
|
+ * Insert attribute at the end of attribute list of a node.
|
|
+ */
|
|
TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
|
|
|
|
-/* insert attribute at the start of attribute list of a node */
|
|
+/**
|
|
+ * Insert attribute at the start of attribute list of a node.
|
|
+ */
|
|
TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
|
|
|
|
-/*************************************
|
|
- In-line Stack functions
|
|
-*************************************/
|
|
-
|
|
-
|
|
-/* duplicate attributes */
|
|
-TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
|
|
|
-/*
|
|
- push a copy of an inline node onto stack
|
|
- but don't push if implicit or OBJECT or APPLET
|
|
- (implicit tags are ones generated from the istack)
|
|
+/** @}
|
|
+ * @name Inline Stack Functions
|
|
+ * @{
|
|
+ */
|
|
|
|
- One issue arises with pushing inlines when
|
|
- the tag is already pushed. For instance:
|
|
|
|
- <p><em>text
|
|
- <p><em>more text
|
|
+/**
|
|
+ * Duplicate attributes.
|
|
+ */
|
|
+TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
|
|
|
- Shouldn't be mapped to
|
|
|
|
- <p><em>text</em></p>
|
|
- <p><em><em>more text</em></em>
|
|
-*/
|
|
+/**
|
|
+ * Push a copy of an inline node onto stack, but don't push if
|
|
+ * implicit or OBJECT or APPLET (implicit tags are ones generated
|
|
+ * from the istack).
|
|
+ *
|
|
+ * One issue arises with pushing inlines when the tag is already pushed.
|
|
+ * For instance:
|
|
+ * ~~~
|
|
+ * <p><em>text
|
|
+ * <p><em>more text
|
|
+ * ~~~
|
|
+ * Shouldn't be mapped to
|
|
+ * ~~~
|
|
+ * <p><em>text</em></p>
|
|
+ * <p><em><em>more text</em></em>
|
|
+ * ~~~
|
|
+ */
|
|
TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
|
|
|
|
-/* pop inline stack */
|
|
+
|
|
+/**
|
|
+ * Pop inline stack.
|
|
+ */
|
|
TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
|
|
|
|
+
|
|
TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
|
|
TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
|
|
|
|
-/*
|
|
- This has the effect of inserting "missing" inline
|
|
- elements around the contents of blocklevel elements
|
|
- such as P, TD, TH, DIV, PRE etc. This procedure is
|
|
- called at the start of ParseBlock. when the inline
|
|
- stack is not empty, as will be the case in:
|
|
|
|
- <i><h1>italic heading</h1></i>
|
|
+/**
|
|
+ * This has the effect of inserting "missing" inline elements around the
|
|
+ * contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
|
|
+ * procedure is called at the start of `ParseBlock`, when the inline
|
|
+ * stack is not empty, as will be the case in:
|
|
+ * ~~~
|
|
+ * <i><h1>italic heading</h1></i>
|
|
+ * ~~~
|
|
+ * which is then treated as equivalent to
|
|
+ * ~~~
|
|
+ * <h1><i>italic heading</i></h1>
|
|
+ * ~~~
|
|
+ * This is implemented by setting the lexer into a mode where it gets
|
|
+ * tokens from the inline stack rather than from the input stream.
|
|
+ */
|
|
+TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
|
|
|
- which is then treated as equivalent to
|
|
|
|
- <h1><i>italic heading</i></h1>
|
|
+/**
|
|
+ * Defer duplicates when entering a table or other
|
|
+ * element where the inlines shouldn't be duplicated.
|
|
+ */
|
|
+TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
|
|
|
|
- This is implemented by setting the lexer into a mode
|
|
- where it gets tokens from the inline stack rather than
|
|
- from the input stream.
|
|
-*/
|
|
-TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
|
|
|
-/*
|
|
- defer duplicates when entering a table or other
|
|
- element where the inlines shouldn't be duplicated
|
|
-*/
|
|
-TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
|
|
TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
|
|
|
|
-/* stack manipulation for inline elements */
|
|
+/**
|
|
+ * Stack manipulation for inline elements
|
|
+ */
|
|
TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
|
|
+
|
|
+
|
|
TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
|
|
|
|
+
|
|
+/** @}
|
|
+ * @name Generic stack of nodes.
|
|
+ * @{
|
|
+ */
|
|
+
|
|
+
|
|
+/**
|
|
+ * This typedef represents a stack of addresses to nodes. Tidy uses these to
|
|
+ * try to limit recursion by pushing nodes to a stack when possible instead
|
|
+ * of recursing.
|
|
+ */
|
|
+typedef struct _Stack {
|
|
+ int top; /**< Current top position; -1 when the stack is empty. */
|
|
+ unsigned capacity; /**< Current capacity. Can be expanded. */
|
|
+ Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */
|
|
+ TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
|
|
+} Stack;
|
|
+
|
|
+
|
|
+/**
|
|
+ * Create a new stack with a given starting capacity. If memory allocation
|
|
+ * fails, then the allocator will panic the program automatically.
|
|
+ */
|
|
+TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Increase the stack size. This will be called automatically when the
|
|
+ * current stack is full. If memory allocation fails, then the allocator
|
|
+ * will panic the program automatically.
|
|
+ */
|
|
+TY_PRIVATE void TY_(growStack)(Stack *stack);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Stack is full when top is equal to the last index.
|
|
+ */
|
|
+TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Stack is empty when top is equal to -1
|
|
+ */
|
|
+TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Push an item to the stack.
|
|
+ */
|
|
+TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Pop an item from the stack.
|
|
+ */
|
|
+TY_PRIVATE Node* TY_(pop)(Stack *stack);
|
|
+
|
|
+
|
|
+/**
|
|
+ * Peek at the stack.
|
|
+ */
|
|
+TY_PRIVATE Node* TY_(peek)(Stack *stack);
|
|
+
|
|
+/**
|
|
+ * Frees the stack when done.
|
|
+ */
|
|
+TY_PRIVATE void TY_(freeStack)(Stack *stack);
|
|
+
|
|
+
|
|
+/** @}
|
|
+ */
|
|
+
|
|
+
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
+/** @} end lexer_h group */
|
|
+/** @} end internal_api group */
|
|
+
|
|
#endif /* __LEXER_H__ */
|