diff --git a/SPECS/tidy/CVE-2021-33391.patch b/SPECS/tidy/CVE-2021-33391.patch new file mode 100644 index 0000000000..e36af526e0 --- /dev/null +++ b/SPECS/tidy/CVE-2021-33391.patch @@ -0,0 +1,1108 @@ +diff --git a/src/gdoc.c b/src/gdoc.c +index 50cd9bc..3786746 100644 +--- a/src/gdoc.c ++++ b/src/gdoc.c +@@ -96,14 +96,15 @@ static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode) + + static void CleanNode( TidyDocImpl* doc, Node *node ) + { ++ Stack *stack = TY_(newStack)(doc, 16); + Node *child, *next; + +- if (node->content) ++ if ( (child = node->content) ) + { +- for (child = node->content; child != NULL; child = next) ++ while (child) + { + next = child->next; +- ++ + if (TY_(nodeIsElement)(child)) + { + if (nodeIsSTYLE(child)) +@@ -131,11 +132,16 @@ static void CleanNode( TidyDocImpl* doc, Node *node ) + if (child->attributes) + TY_(DropAttrByName)( doc, child, "class" ); + +- CleanNode(doc, child); ++ TY_(push)(stack,next); ++ child = child->content; ++ continue; + } + } ++ child = next ? 
next : TY_(pop)(stack); + } + } ++ TY_(freeStack)(stack); ++ TidyFree(doc->allocator, stack); + } + + /* insert meta element to force browser to recognize doc as UTF8 */ +diff --git a/src/lexer.c b/src/lexer.c +index bc4e50a..d1cae84 100644 +--- a/src/lexer.c ++++ b/src/lexer.c +@@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str ) + return 0; + } + +-/* +- node->type is one of these: +- +- #define TextNode 1 +- #define StartTag 2 +- #define EndTag 3 +- #define StartEndTag 4 +-*/ +- + Lexer* TY_(NewLexer)( TidyDocImpl* doc ) + { + Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) ); +@@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node ) + } + } + #endif +- /* this is no good ;=(( +- if (node && doc && doc->lexer) { +- if (node == doc->lexer->token) { +- doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer ); +- } +- } +- ----------------- */ ++ + while ( node ) + { + Node* next = node->next; +@@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc) + return NULL; + } + +-/* +- * local variables: +- * mode: c +- * indent-tabs-mode: nil +- * c-basic-offset: 4 +- * eval: (c-set-offset 'substatement-open 0) +- * end: ++ ++/****************************************************************************//* ++ ** MARK: - Node Stack ++ ***************************************************************************/ ++ ++ ++/** ++ * Create a new stack with a given starting capacity (a count of Node pointers). If memory allocation ++ * fails, then the allocator will panic the program automatically. The caller releases it with freeStack() and then frees the Stack struct itself. ++ */ ++Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity) ++{ ++ Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack)); ++ stack->top = -1; ++ stack->capacity = capacity; ++ stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**)); ++ stack->allocator = doc->allocator; ++ return stack; ++} ++ ++ ++/** ++ * Increase the stack size. 
This will be called automatically when the ++ * current stack is full; the capacity (a count of Node pointers) is doubled. If ++ * memory allocation fails, then the allocator will panic the program automatically. ++ */ ++void TY_(growStack)(Stack *stack) ++{ ++ uint new_capacity = stack->capacity * 2; ++ ++ Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node*)); ++ ++ memcpy( firstNode, stack->firstNode, sizeof(Node*) * (stack->top + 1) ); ++ TidyFree(stack->allocator, stack->firstNode); ++ ++ stack->firstNode = firstNode; ++ stack->capacity = new_capacity; ++} ++ ++ ++/** ++ * Stack is full when top is equal to the last index. ++ */ ++Bool TY_(stackFull)(Stack *stack) ++{ ++ return stack->top == stack->capacity - 1; ++} ++ ++ ++/** ++ * Stack is empty when top is equal to -1 ++ */ ++Bool TY_(stackEmpty)(Stack *stack) ++{ ++ return stack->top == -1; ++} ++ ++ ++/** ++ * Push an item to the stack. ++ */ ++void TY_(push)(Stack *stack, Node *node) ++{ ++ if (TY_(stackFull)(stack)) ++ TY_(growStack)(stack); ++ ++ if (node) ++ stack->firstNode[++stack->top] = node; ++} ++ ++ ++/** ++ * Pop an item from the stack. ++ */ ++Node* TY_(pop)(Stack *stack) ++{ ++ return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--]; ++} ++ ++ ++/** ++ * Peek at the top of the stack without removing it. + */ ++FUNC_UNUSED Node* TY_(peek)(Stack *stack) ++{ ++ return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top]; ++} ++ ++/** ++ * Frees the stack's node buffer when done; the Stack struct itself is not freed here. ++ */ ++void TY_(freeStack)(Stack *stack) ++{ ++ TidyFree( stack->allocator, stack->firstNode ); ++ stack->top = -1; ++ stack->capacity = 0; ++ stack->firstNode = NULL; ++ stack->allocator = NULL; ++} +diff --git a/src/lexer.h b/src/lexer.h +index c181d4b..d9ae113 100644 +--- a/src/lexer.h ++++ b/src/lexer.h +@@ -1,33 +1,46 @@ + #ifndef __LEXER_H__ + #define __LEXER_H__ + +-/* lexer.h -- Lexer for html parser +- +- (c) 1998-2008 (W3C) MIT, ERCIM, Keio University +- See tidy.h for the copyright notice. + +- Given an input source, it returns a sequence of tokens.
+- +- GetToken(source) gets the next token +- UngetToken(source) provides one level undo +- +- The tags include an attribute list: +- +- - linked list of attribute/value nodes +- - each node has 2 NULL-terminated strings. +- - entities are replaced in attribute values +- +- white space is compacted if not in preformatted mode +- If not in preformatted mode then leading white space +- is discarded and subsequent white space sequences +- compacted to single space characters. +- +- If XmlTags is no then Tag names are folded to upper +- case and attribute names to lower case. +- +- Not yet done: +- - Doctype subset and marked sections +-*/ ++/**************************************************************************//** ++ * @file ++ * Lexer for HTML and XML Parsers. ++ * ++ * Given an input source, it returns a sequence of tokens. ++ * ++ * GetToken(source) gets the next token ++ * UngetToken(source) provides one level undo ++ * ++ * The tags include an attribute list: ++ * ++ * - linked list of attribute/value nodes ++ * - each node has 2 NULL-terminated strings. ++ * - entities are replaced in attribute values ++ * ++ * white space is compacted if not in preformatted mode ++ * If not in preformatted mode then leading white space ++ * is discarded and subsequent white space sequences ++ * compacted to single space characters. ++ * ++ * If XmlTags is no then Tag names are folded to upper ++ * case and attribute names to lower case. ++ * ++ * Not yet done: ++ * - Doctype subset and marked sections ++ * ++ * @author HTACG, et al (consult git log) ++ * ++ * @copyright ++ * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG. ++ * See tidy.h for the copyright notice. ++ * @par ++ * All Rights Reserved. ++ * @par ++ * See `tidy.h` for the complete license. 
++ * ++ * @date Additional updates: consult git log ++ * ++ ******************************************************************************/ + + #ifdef __cplusplus + extern "C" { +@@ -35,8 +48,23 @@ extern "C" { + + #include "forward.h" + +-/* lexer character types +-*/ ++/** @addtogroup internal_api */ ++/** @{ */ ++ ++ ++/***************************************************************************//** ++ ** @defgroup lexer_h HTML and XML Lexing ++ ** ++ ** These functions and structures form the internal API for document ++ ** lexing. ++ ** ++ ** @{ ++ ******************************************************************************/ ++ ++ ++/** ++ * Lexer character types. ++ */ + #define digit 1u + #define letter 2u + #define namechar 4u +@@ -47,8 +75,9 @@ extern "C" { + #define digithex 128u + + +-/* node->type is one of these values +-*/ ++/** ++ * node->type is one of these values ++ */ + typedef enum + { + RootNode, +@@ -68,9 +97,9 @@ typedef enum + } NodeType; + + +- +-/* lexer GetToken states +-*/ ++/** ++ * Lexer GetToken() states. ++ */ + typedef enum + { + LEX_CONTENT, +@@ -88,7 +117,10 @@ typedef enum + LEX_XMLDECL + } LexerState; + +-/* ParseDocTypeDecl state constants */ ++ ++/** ++ * ParseDocTypeDecl state constants. ++ */ + typedef enum + { + DT_INTERMEDIATE, +@@ -98,67 +130,43 @@ typedef enum + DT_INTSUBSET + } ParseDocTypeDeclState; + +-/* content model shortcut encoding +- +- Descriptions are tentative. +-*/ ++/** ++ * Content model shortcut encoding. ++ * Descriptions are tentative. ++ */ + #define CM_UNKNOWN 0 +-/* Elements with no content. Map to HTML specification. */ +-#define CM_EMPTY (1 << 0) +-/* Elements that appear outside of "BODY". */ +-#define CM_HTML (1 << 1) +-/* Elements that can appear within HEAD. */ +-#define CM_HEAD (1 << 2) +-/* HTML "block" elements. */ +-#define CM_BLOCK (1 << 3) +-/* HTML "inline" elements. */ +-#define CM_INLINE (1 << 4) +-/* Elements that mark list item ("LI"). 
*/ +-#define CM_LIST (1 << 5) +-/* Elements that mark definition list item ("DL", "DT"). */ +-#define CM_DEFLIST (1 << 6) +-/* Elements that can appear inside TABLE. */ +-#define CM_TABLE (1 << 7) +-/* Used for "THEAD", "TFOOT" or "TBODY". */ +-#define CM_ROWGRP (1 << 8) +-/* Used for "TD", "TH" */ +-#define CM_ROW (1 << 9) +-/* Elements whose content must be protected against white space movement. +- Includes some elements that can found in forms. */ +-#define CM_FIELD (1 << 10) +-/* Used to avoid propagating inline emphasis inside some elements +- such as OBJECT or APPLET. */ +-#define CM_OBJECT (1 << 11) +-/* Elements that allows "PARAM". */ +-#define CM_PARAM (1 << 12) +-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +-#define CM_FRAMES (1 << 13) +-/* Heading elements (h1, h2, ...). */ +-#define CM_HEADING (1 << 14) +-/* Elements with an optional end tag. */ +-#define CM_OPT (1 << 15) +-/* Elements that use "align" attribute for vertical position. */ +-#define CM_IMG (1 << 16) +-/* Elements with inline and block model. Used to avoid calling InlineDup. */ +-#define CM_MIXED (1 << 17) +-/* Elements whose content needs to be indented only if containing one +- CM_BLOCK element. */ +-#define CM_NO_INDENT (1 << 18) +-/* Elements that are obsolete (such as "dir", "menu"). */ +-#define CM_OBSOLETE (1 << 19) +-/* User defined elements. Used to determine how attributes wihout value +- should be printed. */ +-#define CM_NEW (1 << 20) +-/* Elements that cannot be omitted. */ +-#define CM_OMITST (1 << 21) +- +-/* If the document uses just HTML 2.0 tags and attributes described +-** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. +-** If there are proprietary tags and attributes then describe it as +-** HTML Proprietary. If it includes the xml-lang or xmlns attributes +-** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the +-** flavors of Voyager (strict, loose or frameset). 
+-*/ ++#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */ ++#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */ ++#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */ ++#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */ ++#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */ ++#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */ ++#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */ ++#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */ ++#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */ ++#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */ ++#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */ ++#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */ ++#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */ ++#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ ++#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */ ++#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */ ++#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */ ++#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */ ++#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */ ++#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */ ++#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */ ++#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. 
*/ ++ ++ ++/** ++ * If the document uses just HTML 2.0 tags and attributes described ++ * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. ++ * If there are proprietary tags and attributes then describe it as ++ * HTML Proprietary. If it includes the xml-lang or xmlns attributes ++ * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the ++ * flavors of Voyager (strict, loose or frameset). ++ */ + + /* unknown */ + #define xxxx 0u +@@ -220,8 +228,10 @@ typedef enum + /* all proprietary types */ + #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) + +-/* Linked list of class names and styles +-*/ ++ ++/** ++ * Linked list of class names and styles ++ */ + struct _Style; + typedef struct _Style TagStyle; + +@@ -234,8 +244,9 @@ struct _Style + }; + + +-/* Linked list of style properties +-*/ ++/** ++ * Linked list of style properties ++ */ + struct _StyleProp; + typedef struct _StyleProp StyleProp; + +@@ -247,11 +258,9 @@ struct _StyleProp + }; + + +- +- +-/* Attribute/Value linked list node +-*/ +- ++/** ++ * Attribute/Value linked list node ++ */ + struct _AttVal + { + AttVal* next; +@@ -264,93 +273,89 @@ struct _AttVal + }; + + +- +-/* +- Mosaic handles inlines via a separate stack from other elements +- We duplicate this to recover from inline markup errors such as: +- +- <i>italic text +-
<p>more italic text</b> normal text +- +- which for compatibility with Mosaic is mapped to: +- +- <i>italic text</i> +-
<p><i>more italic text</i> normal text +- +- Note that any inline end tag pop's the effect of the current +- inline start tag, so that </b> pop's <i> in the above example. ++/** ++ * Mosaic handles inlines via a separate stack from other elements ++ * We duplicate this to recover from inline markup errors such as: ++ * ~~~ ++ * <i>italic text ++ *
<p>more italic text</b> normal text ++ * ~~~ ++ * which for compatibility with Mosaic is mapped to: ++ * ~~~ ++ * <i>italic text</i> ++ *
<p><i>more italic text</i> normal text ++ * ~~~ ++ * Note that any inline end tag pop's the effect of the current ++ * inline start tag, so that `</b>` pop's `<i>` in the above example. + */ + struct _IStack + { + IStack* next; +- const Dict* tag; /* tag's dictionary definition */ +- tmbstr element; /* name (NULL for text nodes) */ ++ const Dict* tag; /**< tag's dictionary definition */ ++ tmbstr element; /**< name (NULL for text nodes) */ + AttVal* attributes; + }; + + +-/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, +-** etc. etc. +-*/ +- ++/** ++ * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. ++ */ + struct _Node + { +- Node* parent; /* tree structure */ ++ Node* parent; /**< tree structure */ + Node* prev; + Node* next; + Node* content; + Node* last; + + AttVal* attributes; +- const Dict* was; /* old tag when it was changed */ +- const Dict* tag; /* tag's dictionary definition */ ++ const Dict* was; /**< old tag when it was changed */ ++ const Dict* tag; /**< tag's dictionary definition */ + +- tmbstr element; /* name (NULL for text nodes) */ ++ tmbstr element; /**< name (NULL for text nodes) */ + +- uint start; /* start of span onto text array */ +- uint end; /* end of span onto text array */ +- NodeType type; /* TextNode, StartTag, EndTag etc. ++ uint start; /**< start of span onto text array */ ++ uint end; /**< end of span onto text array */ ++ NodeType type; /**< TextNode, StartTag, EndTag etc.
*/ + +- uint line; /* current line of document */ +- uint column; /* current column of document */ ++ uint line; /**< current line of document */ ++ uint column; /**< current column of document */ + +- Bool closed; /* true if closed by explicit end tag */ +- Bool implicit; /* true if inferred */ +- Bool linebreak; /* true if followed by a line break */ ++ Bool closed; /**< true if closed by explicit end tag */ ++ Bool implicit; /**< true if inferred */ ++ Bool linebreak; /**< true if followed by a line break */ + }; + + +-/* +- The following are private to the lexer +- Use NewLexer() to create a lexer, and +- FreeLexer() to free it. +-*/ +- ++/** ++ * The following are private to the lexer. ++ * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it. ++ */ + struct _Lexer + { +- uint lines; /* lines seen */ +- uint columns; /* at start of current token */ +- Bool waswhite; /* used to collapse contiguous white space */ +- Bool pushed; /* true after token has been pushed back */ +- Bool insertspace; /* when space is moved after end tag */ +- Bool excludeBlocks; /* Netscape compatibility */ +- Bool exiled; /* true if moved out of table */ +- Bool isvoyager; /* true if xmlns attribute on html element */ +- uint versions; /* bit vector of HTML versions */ +- uint doctype; /* version as given by doctype (if any) */ +- uint versionEmitted; /* version of doctype emitted */ +- Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ +- uint txtstart; /* start of current node */ +- uint txtend; /* end of current node */ +- LexerState state; /* state of lexer's finite state machine */ +- +- Node* token; /* last token returned by GetToken() */ +- Node* itoken; /* last duplicate inline returned by GetToken() */ +- Node* root; /* remember root node of the document */ +- Node* parent; /* remember parent node for CDATA elements */ +- +- Bool seenEndBody; /* true if a