diff --git a/SPECS/tidy/CVE-2021-33391.patch b/SPECS/tidy/CVE-2021-33391.patch new file mode 100644 index 0000000000..e36af526e0 --- /dev/null +++ b/SPECS/tidy/CVE-2021-33391.patch @@ -0,0 +1,1108 @@ +diff --git a/src/gdoc.c b/src/gdoc.c +index 50cd9bc..3786746 100644 +--- a/src/gdoc.c ++++ b/src/gdoc.c +@@ -96,14 +96,15 @@ static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode) + + static void CleanNode( TidyDocImpl* doc, Node *node ) + { ++ Stack *stack = TY_(newStack)(doc, 16); + Node *child, *next; + +- if (node->content) ++ if ( (child = node->content) ) + { +- for (child = node->content; child != NULL; child = next) ++ while (child) + { + next = child->next; +- ++ + if (TY_(nodeIsElement)(child)) + { + if (nodeIsSTYLE(child)) +@@ -131,11 +132,16 @@ static void CleanNode( TidyDocImpl* doc, Node *node ) + if (child->attributes) + TY_(DropAttrByName)( doc, child, "class" ); + +- CleanNode(doc, child); ++ TY_(push)(stack,next); ++ child = child->content ? child->content : TY_(pop)(stack); /* childless element: resume with the pending sibling instead of ending the walk */ ++ continue; + } + } ++ child = next ? 
next : TY_(pop)(stack); + } + } ++ TY_(freeStack)(stack); ++ TidyFree(doc->allocator, stack); + } + + /* insert meta element to force browser to recognize doc as UTF8 */ +diff --git a/src/lexer.c b/src/lexer.c +index bc4e50a..d1cae84 100644 +--- a/src/lexer.c ++++ b/src/lexer.c +@@ -877,15 +877,6 @@ static tmbchar LastChar( tmbstr str ) + return 0; + } + +-/* +- node->type is one of these: +- +- #define TextNode 1 +- #define StartTag 2 +- #define EndTag 3 +- #define StartEndTag 4 +-*/ +- + Lexer* TY_(NewLexer)( TidyDocImpl* doc ) + { + Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) ); +@@ -1545,13 +1536,7 @@ void TY_(FreeNode)( TidyDocImpl* doc, Node *node ) + } + } + #endif +- /* this is no good ;=(( +- if (node && doc && doc->lexer) { +- if (node == doc->lexer->token) { +- doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer ); +- } +- } +- ----------------- */ ++ + while ( node ) + { + Node* next = node->next; +@@ -4462,11 +4447,102 @@ static Node *ParseDocTypeDecl(TidyDocImpl* doc) + return NULL; + } + +-/* +- * local variables: +- * mode: c +- * indent-tabs-mode: nil +- * c-basic-offset: 4 +- * eval: (c-set-offset 'substatement-open 0) +- * end: ++ ++/****************************************************************************//* ++ ** MARK: - Node Stack ++ ***************************************************************************/ ++ ++ ++/** ++ * Create a new stack with a given starting capacity. If memory allocation ++ * fails, then the allocator will panic the program automatically. ++ */ ++Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity) ++{ ++ Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack)); ++ stack->top = -1; ++ stack->capacity = capacity; ++ stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**)); ++ stack->allocator = doc->allocator; ++ return stack; ++} ++ ++ ++/** ++ * Increase the stack size. 
This will be called automatically when the ++ * current stack is full. If memory allocation fails, then the allocator ++ * will panic the program automatically. ++ */ ++void TY_(growStack)(Stack *stack) ++{ ++ uint new_capacity = stack->capacity * 2; ++ ++ Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node**)); /* size in bytes, not in elements */ ++ ++ memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) ); ++ TidyFree(stack->allocator, stack->firstNode); ++ ++ stack->firstNode = firstNode; ++ stack->capacity = new_capacity; ++} ++ ++ ++/** ++ * Stack is full when top is equal to the last index. ++ */ ++Bool TY_(stackFull)(Stack *stack) ++{ ++ return stack->top == stack->capacity - 1; ++} ++ ++ ++/** ++ * Stack is empty when top is equal to -1 ++ */ ++Bool TY_(stackEmpty)(Stack *stack) ++{ ++ return stack->top == -1; ++} ++ ++ ++/** ++ * Push an item to the stack. ++ */ ++void TY_(push)(Stack *stack, Node *node) ++{ ++ if (TY_(stackFull)(stack)) ++ TY_(growStack)(stack); ++ ++ if (node) ++ stack->firstNode[++stack->top] = node; ++} ++ ++ ++/** ++ * Pop an item from the stack. ++ */ ++Node* TY_(pop)(Stack *stack) ++{ ++ return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--]; ++} ++ ++ ++/** ++ * Peek at the top of the stack without removing the item. + */ ++FUNC_UNUSED Node* TY_(peek)(Stack *stack) ++{ ++ return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top]; ++} ++ ++/** ++ * Frees the stack when done. ++ */ ++void TY_(freeStack)(Stack *stack) ++{ ++ TidyFree( stack->allocator, stack->firstNode ); ++ stack->top = -1; ++ stack->capacity = 0; ++ stack->firstNode = NULL; ++ stack->allocator = NULL; ++} +diff --git a/src/lexer.h b/src/lexer.h +index c181d4b..d9ae113 100644 +--- a/src/lexer.h ++++ b/src/lexer.h +@@ -1,33 +1,46 @@ + #ifndef __LEXER_H__ + #define __LEXER_H__ + +-/* lexer.h -- Lexer for html parser +- +- (c) 1998-2008 (W3C) MIT, ERCIM, Keio University +- See tidy.h for the copyright notice. + +- Given an input source, it returns a sequence of tokens. 
+- +- GetToken(source) gets the next token +- UngetToken(source) provides one level undo +- +- The tags include an attribute list: +- +- - linked list of attribute/value nodes +- - each node has 2 NULL-terminated strings. +- - entities are replaced in attribute values +- +- white space is compacted if not in preformatted mode +- If not in preformatted mode then leading white space +- is discarded and subsequent white space sequences +- compacted to single space characters. +- +- If XmlTags is no then Tag names are folded to upper +- case and attribute names to lower case. +- +- Not yet done: +- - Doctype subset and marked sections +-*/ ++/**************************************************************************//** ++ * @file ++ * Lexer for HTML and XML Parsers. ++ * ++ * Given an input source, it returns a sequence of tokens. ++ * ++ * GetToken(source) gets the next token ++ * UngetToken(source) provides one level undo ++ * ++ * The tags include an attribute list: ++ * ++ * - linked list of attribute/value nodes ++ * - each node has 2 NULL-terminated strings. ++ * - entities are replaced in attribute values ++ * ++ * white space is compacted if not in preformatted mode ++ * If not in preformatted mode then leading white space ++ * is discarded and subsequent white space sequences ++ * compacted to single space characters. ++ * ++ * If XmlTags is no then Tag names are folded to upper ++ * case and attribute names to lower case. ++ * ++ * Not yet done: ++ * - Doctype subset and marked sections ++ * ++ * @author HTACG, et al (consult git log) ++ * ++ * @copyright ++ * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG. ++ * See tidy.h for the copyright notice. ++ * @par ++ * All Rights Reserved. ++ * @par ++ * See `tidy.h` for the complete license. 
++ * ++ * @date Additional updates: consult git log ++ * ++ ******************************************************************************/ + + #ifdef __cplusplus + extern "C" { +@@ -35,8 +48,23 @@ extern "C" { + + #include "forward.h" + +-/* lexer character types +-*/ ++/** @addtogroup internal_api */ ++/** @{ */ ++ ++ ++/***************************************************************************//** ++ ** @defgroup lexer_h HTML and XML Lexing ++ ** ++ ** These functions and structures form the internal API for document ++ ** lexing. ++ ** ++ ** @{ ++ ******************************************************************************/ ++ ++ ++/** ++ * Lexer character types. ++ */ + #define digit 1u + #define letter 2u + #define namechar 4u +@@ -47,8 +75,9 @@ extern "C" { + #define digithex 128u + + +-/* node->type is one of these values +-*/ ++/** ++ * node->type is one of these values ++ */ + typedef enum + { + RootNode, +@@ -68,9 +97,9 @@ typedef enum + } NodeType; + + +- +-/* lexer GetToken states +-*/ ++/** ++ * Lexer GetToken() states. ++ */ + typedef enum + { + LEX_CONTENT, +@@ -88,7 +117,10 @@ typedef enum + LEX_XMLDECL + } LexerState; + +-/* ParseDocTypeDecl state constants */ ++ ++/** ++ * ParseDocTypeDecl state constants. ++ */ + typedef enum + { + DT_INTERMEDIATE, +@@ -98,67 +130,43 @@ typedef enum + DT_INTSUBSET + } ParseDocTypeDeclState; + +-/* content model shortcut encoding +- +- Descriptions are tentative. +-*/ ++/** ++ * Content model shortcut encoding. ++ * Descriptions are tentative. ++ */ + #define CM_UNKNOWN 0 +-/* Elements with no content. Map to HTML specification. */ +-#define CM_EMPTY (1 << 0) +-/* Elements that appear outside of "BODY". */ +-#define CM_HTML (1 << 1) +-/* Elements that can appear within HEAD. */ +-#define CM_HEAD (1 << 2) +-/* HTML "block" elements. */ +-#define CM_BLOCK (1 << 3) +-/* HTML "inline" elements. */ +-#define CM_INLINE (1 << 4) +-/* Elements that mark list item ("LI"). 
*/ +-#define CM_LIST (1 << 5) +-/* Elements that mark definition list item ("DL", "DT"). */ +-#define CM_DEFLIST (1 << 6) +-/* Elements that can appear inside TABLE. */ +-#define CM_TABLE (1 << 7) +-/* Used for "THEAD", "TFOOT" or "TBODY". */ +-#define CM_ROWGRP (1 << 8) +-/* Used for "TD", "TH" */ +-#define CM_ROW (1 << 9) +-/* Elements whose content must be protected against white space movement. +- Includes some elements that can found in forms. */ +-#define CM_FIELD (1 << 10) +-/* Used to avoid propagating inline emphasis inside some elements +- such as OBJECT or APPLET. */ +-#define CM_OBJECT (1 << 11) +-/* Elements that allows "PARAM". */ +-#define CM_PARAM (1 << 12) +-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ +-#define CM_FRAMES (1 << 13) +-/* Heading elements (h1, h2, ...). */ +-#define CM_HEADING (1 << 14) +-/* Elements with an optional end tag. */ +-#define CM_OPT (1 << 15) +-/* Elements that use "align" attribute for vertical position. */ +-#define CM_IMG (1 << 16) +-/* Elements with inline and block model. Used to avoid calling InlineDup. */ +-#define CM_MIXED (1 << 17) +-/* Elements whose content needs to be indented only if containing one +- CM_BLOCK element. */ +-#define CM_NO_INDENT (1 << 18) +-/* Elements that are obsolete (such as "dir", "menu"). */ +-#define CM_OBSOLETE (1 << 19) +-/* User defined elements. Used to determine how attributes wihout value +- should be printed. */ +-#define CM_NEW (1 << 20) +-/* Elements that cannot be omitted. */ +-#define CM_OMITST (1 << 21) +- +-/* If the document uses just HTML 2.0 tags and attributes described +-** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. +-** If there are proprietary tags and attributes then describe it as +-** HTML Proprietary. If it includes the xml-lang or xmlns attributes +-** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the +-** flavors of Voyager (strict, loose or frameset). 
+-*/ ++#define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */ ++#define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */ ++#define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */ ++#define CM_BLOCK (1 << 3) /**< HTML "block" elements. */ ++#define CM_INLINE (1 << 4) /**< HTML "inline" elements. */ ++#define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */ ++#define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */ ++#define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */ ++#define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */ ++#define CM_ROW (1 << 9) /**< Used for "TD", "TH" */ ++#define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can found in forms. */ ++#define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */ ++#define CM_PARAM (1 << 12) /**< Elements that allows "PARAM". */ ++#define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ ++#define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */ ++#define CM_OPT (1 << 15) /**< Elements with an optional end tag. */ ++#define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */ ++#define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */ ++#define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */ ++#define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */ ++#define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */ ++#define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. 
*/ ++ ++ ++/** ++ * If the document uses just HTML 2.0 tags and attributes described ++ * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. ++ * If there are proprietary tags and attributes then describe it as ++ * HTML Proprietary. If it includes the xml-lang or xmlns attributes ++ * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the ++ * flavors of Voyager (strict, loose or frameset). ++ */ + + /* unknown */ + #define xxxx 0u +@@ -220,8 +228,10 @@ typedef enum + /* all proprietary types */ + #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) + +-/* Linked list of class names and styles +-*/ ++ ++/** ++ * Linked list of class names and styles ++ */ + struct _Style; + typedef struct _Style TagStyle; + +@@ -234,8 +244,9 @@ struct _Style + }; + + +-/* Linked list of style properties +-*/ ++/** ++ * Linked list of style properties ++ */ + struct _StyleProp; + typedef struct _StyleProp StyleProp; + +@@ -247,11 +258,9 @@ struct _StyleProp + }; + + +- +- +-/* Attribute/Value linked list node +-*/ +- ++/** ++ * Attribute/Value linked list node ++ */ + struct _AttVal + { + AttVal* next; +@@ -264,93 +273,89 @@ struct _AttVal + }; + + +- +-/* +- Mosaic handles inlines via a separate stack from other elements +- We duplicate this to recover from inline markup errors such as: +- +- italic text +-

more italic text normal text +- +- which for compatibility with Mosaic is mapped to: +- +- italic text +-

more italic text normal text +- +- Note that any inline end tag pop's the effect of the current +- inline start tag, so that pop's in the above example. ++/** ++ * Mosaic handles inlines via a separate stack from other elements ++ * We duplicate this to recover from inline markup errors such as: ++ * ~~~ ++ * italic text ++ *

more italic text normal text ++ * ~~~ ++ * which for compatibility with Mosaic is mapped to: ++ * ~~~ ++ * italic text ++ *

more italic text normal text ++ * ~~~ ++ * Note that any inline end tag pop's the effect of the current ++ * inline start tag, so that `` pop's `` in the above example. + */ + struct _IStack + { + IStack* next; +- const Dict* tag; /* tag's dictionary definition */ +- tmbstr element; /* name (NULL for text nodes) */ ++ const Dict* tag; /**< tag's dictionary definition */ ++ tmbstr element; /**< name (NULL for text nodes) */ + AttVal* attributes; + }; + + +-/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, +-** etc. etc. +-*/ +- ++/** ++ * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. ++ */ + struct _Node + { +- Node* parent; /* tree structure */ ++ Node* parent; /**< tree structure */ + Node* prev; + Node* next; + Node* content; + Node* last; + + AttVal* attributes; +- const Dict* was; /* old tag when it was changed */ +- const Dict* tag; /* tag's dictionary definition */ ++ const Dict* was; /**< old tag when it was changed */ ++ const Dict* tag; /**< tag's dictionary definition */ + +- tmbstr element; /* name (NULL for text nodes) */ ++ tmbstr element; /**< name (NULL for text nodes) */ + +- uint start; /* start of span onto text array */ +- uint end; /* end of span onto text array */ +- NodeType type; /* TextNode, StartTag, EndTag etc. */ ++ uint start; /**< start of span onto text array */ ++ uint end; /**< end of span onto text array */ ++ NodeType type; /**< TextNode, StartTag, EndTag etc. 
*/ + +- uint line; /* current line of document */ +- uint column; /* current column of document */ ++ uint line; /**< current line of document */ ++ uint column; /**< current column of document */ + +- Bool closed; /* true if closed by explicit end tag */ +- Bool implicit; /* true if inferred */ +- Bool linebreak; /* true if followed by a line break */ ++ Bool closed; /**< true if closed by explicit end tag */ ++ Bool implicit; /**< true if inferred */ ++ Bool linebreak; /**< true if followed by a line break */ + }; + + +-/* +- The following are private to the lexer +- Use NewLexer() to create a lexer, and +- FreeLexer() to free it. +-*/ +- ++/** ++ * The following are private to the lexer. ++ * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it. ++ */ + struct _Lexer + { +- uint lines; /* lines seen */ +- uint columns; /* at start of current token */ +- Bool waswhite; /* used to collapse contiguous white space */ +- Bool pushed; /* true after token has been pushed back */ +- Bool insertspace; /* when space is moved after end tag */ +- Bool excludeBlocks; /* Netscape compatibility */ +- Bool exiled; /* true if moved out of table */ +- Bool isvoyager; /* true if xmlns attribute on html element */ +- uint versions; /* bit vector of HTML versions */ +- uint doctype; /* version as given by doctype (if any) */ +- uint versionEmitted; /* version of doctype emitted */ +- Bool bad_doctype; /* e.g. 
if html or PUBLIC is missing */ +- uint txtstart; /* start of current node */ +- uint txtend; /* end of current node */ +- LexerState state; /* state of lexer's finite state machine */ +- +- Node* token; /* last token returned by GetToken() */ +- Node* itoken; /* last duplicate inline returned by GetToken() */ +- Node* root; /* remember root node of the document */ +- Node* parent; /* remember parent node for CDATA elements */ +- +- Bool seenEndBody; /* true if a tag has been encountered */ +- Bool seenEndHtml; /* true if a tag has been encountered */ ++ uint lines; /**< lines seen */ ++ uint columns; /**< at start of current token */ ++ Bool waswhite; /**< used to collapse contiguous white space */ ++ Bool pushed; /**< true after token has been pushed back */ ++ Bool insertspace; /**< when space is moved after end tag */ ++ Bool excludeBlocks; /**< Netscape compatibility */ ++ Bool exiled; /**< true if moved out of table */ ++ Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */ ++ uint versions; /**< bit vector of HTML versions */ ++ uint doctype; /**< version as given by doctype (if any) */ ++ uint versionEmitted; /**< version of doctype emitted */ ++ Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */ ++ uint txtstart; /**< start of current node */ ++ uint txtend; /**< end of current node */ ++ LexerState state; /**< state of lexer's finite state machine */ ++ ++ Node* token; /**< last token returned by GetToken() */ ++ Node* itoken; /**< last duplicate inline returned by GetToken() */ ++ Node* root; /**< remember root node of the document */ ++ Node* parent; /**< remember parent node for CDATA elements */ ++ ++ Bool seenEndBody; /**< true if a `` tag has been encountered */ ++ Bool seenEndHtml; /**< true if a `` tag has been encountered */ + + /* + Lexer character buffer +@@ -361,33 +366,57 @@ struct _Lexer + + lexsize must be reset for each file. 
+ */ +- tmbstr lexbuf; /* MB character buffer */ +- uint lexlength; /* allocated */ +- uint lexsize; /* used */ ++ tmbstr lexbuf; /**< MB character buffer */ ++ uint lexlength; /**< allocated */ ++ uint lexsize; /**< used */ + + /* Inline stack for compatibility with Mosaic */ +- Node* inode; /* for deferring text node */ +- IStack* insert; /* for inferring inline tags */ ++ Node* inode; /**< for deferring text node */ ++ IStack* insert; /**< for inferring inline tags */ + IStack* istack; +- uint istacklength; /* allocated */ +- uint istacksize; /* used */ +- uint istackbase; /* start of frame */ ++ uint istacklength; /**< allocated */ ++ uint istacksize; /**< used */ ++ uint istackbase; /**< start of frame */ + +- TagStyle *styles; /* used for cleaning up presentation markup */ ++ TagStyle *styles; /**< used for cleaning up presentation markup */ + +- TidyAllocator* allocator; /* allocator */ ++ TidyAllocator* allocator; /**< allocator */ + }; + + +-/* Lexer Functions +-*/ ++/** ++ * modes for GetToken() ++ * ++ * MixedContent -- for elements which don't accept PCDATA ++ * Preformatted -- white space preserved as is ++ * IgnoreMarkup -- for CDATA elements such as script, style ++ */ ++typedef enum ++{ ++ IgnoreWhitespace, ++ MixedContent, ++ Preformatted, ++ IgnoreMarkup, ++ OtherNamespace, ++ CdataContent ++} GetTokenMode; + +-/* choose what version to use for new doctype */ ++ ++/** @name Lexer Functions ++ * @{ ++ */ ++ ++ ++/** ++ * Choose what version to use for new doctype ++ */ + TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc ); + +-/* everything is allowed in proprietary version of HTML */ +-/* this is handled here rather than in the tag/attr dicts */ + ++/** ++ * Everything is allowed in proprietary version of HTML. 
++ * This is handled here rather than in the tag/attr dicts ++ */ + TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); + + TY_PRIVATE Bool TY_(IsWhite)(uint c); +@@ -399,7 +428,6 @@ TY_PRIVATE Bool TY_(IsNamechar)(uint c); + TY_PRIVATE Bool TY_(IsXMLLetter)(uint c); + TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c); + +-/* Bool IsLower(uint c); */ + TY_PRIVATE Bool TY_(IsUpper)(uint c); + TY_PRIVATE uint TY_(ToLower)(uint c); + TY_PRIVATE uint TY_(ToUpper)(uint c); +@@ -407,60 +435,82 @@ TY_PRIVATE uint TY_(ToUpper)(uint c); + TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc ); + TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc ); + +-/* store character c as UTF-8 encoded byte stream */ ++ ++/** ++ * Store character c as UTF-8 encoded byte stream ++ */ + TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c ); + +-/* +- Used for elements and text nodes +- element name is NULL for text nodes +- start and end are offsets into lexbuf +- which contains the textual content of +- all elements in the parse tree. +- +- parent and content allow traversal +- of the parse tree in any direction. +- attributes are represented as a linked +- list of AttVal nodes which hold the +- strings for attribute/value pairs. ++ ++/** ++ * Used for elements and text nodes. ++ * - Element name is NULL for text nodes. ++ * - start and end are offsets into lexbuf, ++ * which contains the textual content of ++ * all elements in the parse tree. ++ * - parent and content allow traversal ++ * of the parse tree in any direction. ++ * - attributes are represented as a linked ++ * list of AttVal nodes which hold the ++ * strings for attribute/value pairs. + */ + TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer ); + + +-/* used to clone heading nodes when split by an


*/ ++/** ++ * Used to clone heading nodes when split by an `
` ++ */ + TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); + +-/* free node's attributes */ ++ ++/** ++ * Free node's attributes ++ */ + TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); + +-/* doesn't repair attribute list linkage */ ++ ++/** ++ * Doesn't repair attribute list linkage ++ */ + TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); + +-/* detach attribute from node */ ++ ++/** ++ * Detach attribute from node ++ */ + TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr ); + +-/* detach attribute from node then free it +-*/ ++ ++/** ++ * Detach attribute from node then free it. ++ */ + TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); + +-/* +- Free document nodes by iterating through peers and recursing +- through children. Set next to NULL before calling FreeNode() +- to avoid freeing peer nodes. Doesn't patch up prev/next links. ++ ++/** ++ * Free document nodes by iterating through peers and recursing ++ * through children. Set `next` to `NULL` before calling `FreeNode()` ++ * to avoid freeing peer nodes. Doesn't patch up prev/next links. + */ + TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); + ++ + TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer ); + +-/* used for creating preformatted text from Word2000 */ ++ ++/** ++ * Used for creating preformatted text from Word2000. ++ */ + TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer ); + +-/* used for adding a   for Word2000 */ ++ ++/** ++ * Used for adding a   for Word2000. 
++ */ + TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); + +-TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); +-/* TY_PRIVATE void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */ + +-/* find element */ ++TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); + TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc ); + TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc ); + TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc ); +@@ -468,10 +518,16 @@ TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc); + TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc ); + TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc); + +-/* Returns containing block element, if any */ ++ ++/** ++ * Returns containing block element, if any ++ */ + TY_PRIVATE Node* TY_(FindContainer)( Node* node ); + +-/* add meta element for Tidy */ ++ ++/** ++ * Add meta element for Tidy. ++ */ + TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc ); + + TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc ); +@@ -485,118 +541,209 @@ TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ); + TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); + + +-/* fixup doctype if missing */ ++/** ++ * Fixup doctype if missing. ++ */ + TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc ); + +-/* ensure XML document starts with */ +-/* add encoding attribute if not using ASCII or UTF-8 output */ ++ ++/** ++ * Ensure XML document starts with ,and ++ * add encoding attribute if not using ASCII or UTF-8 output. 
++ */ + TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); + ++ + TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); + + TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc ); + +- +-/* +- modes for GetToken() +- +- MixedContent -- for elements which don't accept PCDATA +- Preformatted -- white space preserved as is +- IgnoreMarkup -- for CDATA elements such as script, style +-*/ +-typedef enum +-{ +- IgnoreWhitespace, +- MixedContent, +- Preformatted, +- IgnoreMarkup, +- OtherNamespace, +- CdataContent +-} GetTokenMode; +- + TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); + + TY_PRIVATE void TY_(InitMap)(void); + + +-/* create a new attribute */ ++/** ++ * Create a new attribute. ++ */ + TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc ); + +-/* create a new attribute with given name and value */ ++ ++/** ++ * Create a new attribute with given name and value. ++ */ + TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, + int delim ); + +-/* insert attribute at the end of attribute list of a node */ ++ ++/** ++ * Insert attribute at the end of attribute list of a node. ++ */ + TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); + +-/* insert attribute at the start of attribute list of a node */ ++/** ++ * Insert attribute at the start of attribute list of a node. ++ */ + TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); + +-/************************************* +- In-line Stack functions +-*************************************/ +- +- +-/* duplicate attributes */ +-TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); + +-/* +- push a copy of an inline node onto stack +- but don't push if implicit or OBJECT or APPLET +- (implicit tags are ones generated from the istack) ++/** @} ++ * @name Inline Stack Functions ++ * @{ ++ */ + +- One issue arises with pushing inlines when +- the tag is already pushed. For instance: + +-

text +-

more text ++/** ++ * Duplicate attributes. ++ */ ++TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); + +- Shouldn't be mapped to + +-

text

+-

more text +-*/ ++/** ++ * Push a copy of an inline node onto stack, but don't push if ++ * implicit or OBJECT or APPLET (implicit tags are ones generated ++ * from the istack). ++ * ++ * One issue arises with pushing inlines when the tag is already pushed. ++ * For instance: ++ * ~~~ ++ *

text ++ *

more text ++ * ~~~ ++ * Shouldn't be mapped to ++ * ~~~ ++ *

text

++ *

more text ++ * ~~~ ++ */ + TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node ); + +-/* pop inline stack */ ++ ++/** ++ * Pop inline stack. ++ */ + TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node ); + ++ + TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); + TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); + +-/* +- This has the effect of inserting "missing" inline +- elements around the contents of blocklevel elements +- such as P, TD, TH, DIV, PRE etc. This procedure is +- called at the start of ParseBlock. when the inline +- stack is not empty, as will be the case in: + +-

italic heading

++/** ++ * This has the effect of inserting "missing" inline elements around the ++ * contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This ++ * procedure is called at the start of `ParseBlock`, when the inline ++ * stack is not empty, as will be the case in: ++ * ~~~ ++ *

italic heading

++ * ~~~ ++ * which is then treated as equivalent to ++ * ~~~ ++ *

italic heading

++ * ~~~ ++ * This is implemented by setting the lexer into a mode where it gets ++ * tokens from the inline stack rather than from the input stream. ++ */ ++TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); + +- which is then treated as equivalent to + +-

italic heading

++/** ++ * Defer duplicates when entering a table or other ++ * element where the inlines shouldn't be duplicated. ++ */ ++TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc ); + +- This is implemented by setting the lexer into a mode +- where it gets tokens from the inline stack rather than +- from the input stream. +-*/ +-TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); + +-/* +- defer duplicates when entering a table or other +- element where the inlines shouldn't be duplicated +-*/ +-TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc ); + TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc ); + +-/* stack manipulation for inline elements */ ++/** ++ * Stack manipulation for inline elements ++ */ + TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node ); ++ ++ + TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element ); + ++ ++ ++/** @} ++ * @name Generic stack of nodes. ++ * @{ ++ */ ++ ++ ++/** ++ * This typedef represents a stack of addresses to nodes. Tidy uses these to ++ * try to limit recursion by pushing nodes to a stack when possible instead ++ * of recursing. ++ */ ++typedef struct _Stack { ++ int top; /**< Current top position. */ ++ unsigned capacity; /**< Current capacity. Can be expanded. */ ++ Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */ ++ TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */ ++} Stack; ++ ++ ++/** ++ * Create a new stack with a given starting capacity. If memory allocation ++ * fails, then the allocator will panic the program automatically. ++ */ ++TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity); ++ ++ ++/** ++ * Increase the stack size. This will be called automatically when the ++ * current stack is full. If memory allocation fails, then the allocator ++ * will panic the program automatically. 
++ */ ++TY_PRIVATE void TY_(growStack)(Stack *stack); ++ ++ ++/** ++ * Stack is full when top is equal to the last index. ++ */ ++TY_PRIVATE Bool TY_(stackFull)(Stack *stack); ++ ++ ++/** ++ * Stack is empty when top is equal to -1 ++ */ ++TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack); ++ ++ ++/** ++ * Push an item to the stack. ++ */ ++TY_PRIVATE void TY_(push)(Stack *stack, Node *node); ++ ++ ++/** ++ * Pop an item from the stack. ++ */ ++TY_PRIVATE Node* TY_(pop)(Stack *stack); ++ ++ ++/** ++ * Peek at the stack. ++ */ ++TY_PRIVATE Node* TY_(peek)(Stack *stack); ++ ++/** ++ * Frees the stack when done. ++ */ ++TY_PRIVATE void TY_(freeStack)(Stack *stack); ++ ++ ++/** @} ++ */ ++ ++ + #ifdef __cplusplus + } + #endif + + ++/** @} end lexer_h group */ ++/** @} end internal_api group */ ++ + #endif /* __LEXER_H__ */ diff --git a/SPECS/tidy/tidy.spec b/SPECS/tidy/tidy.spec index f08a787a47..459ebaef5c 100644 --- a/SPECS/tidy/tidy.spec +++ b/SPECS/tidy/tidy.spec @@ -5,12 +5,13 @@ Summary: Utility to clean up and pretty print HTML/XHTML/XML Name: tidy Version: 5.8.0 -Release: 5%{?dist} +Release: 6%{?dist} License: W3C Vendor: Microsoft Corporation Distribution: Mariner URL: https://www.html-tidy.org/ Source0: https://github.com/htacg/%{upname}/archive/%{version}.tar.gz#/%{upname}-%{version}.tar.gz +Patch0: CVE-2021-33391.patch BuildRequires: cmake BuildRequires: gcc BuildRequires: gcc-c++ @@ -92,6 +93,9 @@ rm -fv %{buildroot}%{_libdir}/libtidy.a %{_libdir}/pkgconfig/tidy.pc %changelog +* Mon May 01 2023 Sean Dougherty - 5.8.0-6 +- Backported patch to fix CVE-2021-33391 + * Tue Oct 18 2022 Osama Esmail - 5.8.0-5 - Upgraded from 5.7.28 to 5.8.0 - Changed libtidys.a to libtidy.a