Added support for CDATA sections.

1998-11-12 23:54:37 +00:00 · 1998-11-12 23:54:37 +00:00 · 422c941109
--- a/htmlparser/src/nsDTDUtils.cpp
+++ b/htmlparser/src/nsDTDUtils.cpp
@ -236,7 +236,7 @@ void CTokenRecycler::RecycleToken(CToken* aToken) {

 /**
 * 
- * @update	gess8/4/98
+ * @update	vidur 11/12/98
 * @param 
 * @return
 */
@ -262,6 +262,7 @@ CToken* CTokenRecycler::CreateTokenOfType(eHTMLTokenTypes aType,eHTMLTags aTag,
      case eToken_style:      result=new CStyleToken(); break;
      case eToken_skippedcontent: result=new CSkippedContentToken(aString); break;
      case eToken_instruction:result=new CInstructionToken(); break;
+      case eToken_cdatasection:result=new CCDATASectionToken(); break;
        default:
          break;
    }
--- a/htmlparser/src/nsHTMLTokens.cpp
+++ b/htmlparser/src/nsHTMLTokens.cpp
@ -456,6 +456,110 @@ nsresult CTextToken::Consume(PRUnichar aChar, CScanner& aScanner) {
  return result;
 }

+/*
+ *  Default constructor: takes no arguments and initializes the
+ *  CHTMLToken base class with eHTMLTag_unknown.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ */
+CCDATASectionToken::CCDATASectionToken() : CHTMLToken(eHTMLTag_unknown) {
+}
+
+
+/*
+ *  String-based constructor. Forwards aName to the CHTMLToken base
+ *  constructor, then sets mTypeID to eHTMLTag_unknown.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   aName -- string handed to the CHTMLToken base constructor
+ */
+CCDATASectionToken::CCDATASectionToken(const nsString& aName) : CHTMLToken(aName) {
+  mTypeID=eHTMLTag_unknown;
+}
+
+/*
+ *  Reports the name of this token class.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ *  @return  the constant string "cdatasection"
+ */
+const char*  CCDATASectionToken::GetClassName(void) {
+  return "cdatasection";
+}
+
+/*
+ *  Reports the token type of this class.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ *  @return  eToken_cdatasection
+ */
+PRInt32 CCDATASectionToken::GetTokenType(void) {
+  return eToken_cdatasection;
+}
+
+/*
+ *  Consume marked text from the scanner, through the closing "]]>".
+ *  \r and \r\n are stored as \n (\r\r as \n\n); a ] that is not part of "]]>" is kept.
+ *  @update  vidur 11/12/98
+ *  @param   aChar -- last char consumed from stream; reused as scratch while scanning
+ *  @param   aScanner -- controller of underlying input source
+ *  @return  result of the last ReadUntil/Peek/GetChar call made
+ */
+nsresult CCDATASectionToken::Consume(PRUnichar aChar, CScanner& aScanner) {
+  static    nsAutoString terminals("]\r");
+  nsresult  result=NS_OK;
+  PRBool    done=PR_FALSE;
+
+  while((NS_OK==result) && (!done)) {
+    result=aScanner.ReadUntil(mTextValue,terminals,PR_FALSE,PR_FALSE);
+    if(NS_OK==result) {
+      result=aScanner.Peek(aChar);
+      if(kCR==aChar) {
+        result=aScanner.GetChar(aChar); //strip off the \r
+        result=aScanner.Peek(aChar);    //then see what's next.
+        switch(aChar) {
+          case kCR:
+            result=aScanner.GetChar(aChar); //strip off the \r
+            mTextValue.Append("\n\n");
+            break;
+          case kNewLine:
+             //which means we saw \r\n, which becomes \n
+            result=aScanner.GetChar(aChar); //strip off the \n
+                //now fall through on purpose...
+          default:
+            mTextValue.Append("\n");
+            break;
+        }
+      }
+      else if (kRightSquareBracket==aChar) {
+        result=aScanner.GetChar(aChar); //strip off the ]
+        result=aScanner.Peek(aChar);    //then see what's next.
+        if (kRightSquareBracket==aChar) {
+          result=aScanner.GetChar(aChar);    //strip off the second ]
+          result=aScanner.Peek(aChar);    //then see what's next.
+          if (kGreaterThan==aChar) {
+            result=aScanner.GetChar(aChar); //strip off the >
+            done=PR_TRUE;
+          }
+          else {
+            mTextValue.Append("]");  //the first ] is content; the second may still open "]]>",
+            aScanner.PutBack(PRUnichar(kRightSquareBracket)); //so hand it back and re-scan it.
+          }
+        }
+        else {
+          // This isn't the end of the CDATA section so go on
+          mTextValue.Append("]");
+        }
+      }
+      else done=PR_TRUE;
+    }
+  }
+  return result;
+}
+
 /*
 *  default constructor
 *  
--- a/htmlparser/src/nsHTMLTokens.h
+++ b/htmlparser/src/nsHTMLTokens.h
@ -54,6 +54,7 @@ enum eHTMLTokenTypes {
  eToken_start=1,     eToken_end,     eToken_comment,         eToken_entity,
  eToken_whitespace,  eToken_newline, eToken_text,            eToken_attribute,
  eToken_script,      eToken_style,   eToken_skippedcontent,  eToken_instruction,
+  eToken_cdatasection,
  eToken_last //make sure this stays the last token...
 };

@ -79,7 +80,6 @@ public:
 protected:
 };

-
 /**
 *  This declares start tokens, which always take the form <xxxx>. 
 *	This class also knows how to consume related attributes.
@ -207,6 +207,23 @@ class CTextToken: public CHTMLToken {
 };


+/**
+ *  CDATASection tokens contain raw unescaped text content delimited by
+ *  <![CDATA[ and ]]>. 
+ *  XXX Not really a HTML construct - maybe we need a separation
+ *  
+ *  @update  vidur 11/12/98
+ */
+class CCDATASectionToken : public CHTMLToken {
+public:
+                        CCDATASectionToken();
+                        CCDATASectionToken(const nsString& aString);
+    virtual nsresult    Consume(PRUnichar aChar,CScanner& aScanner); // reads the section text through "]]>"
+    virtual const char* GetClassName(void);  // returns "cdatasection"
+    virtual PRInt32     GetTokenType(void);  // returns eToken_cdatasection
+};
+
+
 /**
 *  Attribute tokens are used to contain attribute key/value
 *  pairs whereever they may occur. Typically, they should
--- a/htmlparser/src/nsParserTypes.h
+++ b/htmlparser/src/nsParserTypes.h
@ -73,7 +73,8 @@ const PRUint32  kRightParen       = ')';
 const PRUint32  kLeftBrace        = '{';
 const PRUint32  kRightBrace       = '}';
 const PRUint32  kQuestionMark     = '?';
-
+const PRUint32  kLeftSquareBracket  = '[';
+const PRUint32  kRightSquareBracket = ']';

 #endif

--- a/htmlparser/src/nsWellFormedDTD.cpp
+++ b/htmlparser/src/nsWellFormedDTD.cpp
@ -473,6 +473,74 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeComment(PRUnichar aChar,CScanner& aScanner,
  return result;
 }

+/*
+ * Consume characters as long as they match the string passed in.
+ * On a mismatch every character read so far is put back, last one first.
+ * XXX The scanner should be able to do this.
+ *
+ *  @update vidur 11/12/98
+ *  @param  aScanner -- input source to read from
+ *  @param  aMatchString -- exact sequence expected next in the input
+ *  @param  aMatch -- out: PR_TRUE on full match, PR_FALSE on mismatch; untouched on error
+ *  @return NS_OK, or the failing GetChar result (chars read so far are then NOT put back)
+ */
+static nsresult
+ConsumeConditional(CScanner& aScanner, 
+                   const nsString& aMatchString,
+                   PRBool& aMatch)
+{
+  nsresult result=NS_OK;
+  PRUnichar matchChar=0;
+  PRInt32 i, count = aMatchString.Length();
+  for (i=0; i < count; i++) {
+    result = aScanner.GetChar(matchChar);
+    if ((NS_OK != result) || (aMatchString.CharAt(i) != matchChar)) {
+      break;
+    }
+  }
+  if (NS_OK == result) {
+    if (i != count) {
+      aScanner.PutBack(matchChar);   // the char actually read at i, not the one we hoped for
+      for (i--; i >= 0; i--) {       // then the matched prefix, in reverse order
+        aScanner.PutBack(aMatchString.CharAt(i));
+      }
+      aMatch = PR_FALSE;
+    }
+    else aMatch = PR_TRUE;
+  }
+  return result;
+}
+
+/**
+ *  This method is called when we see a "<!" sequence. If the input goes
+ *  on with "[CDATA[" that marker is consumed and a cdatasection token is
+ *  created; otherwise a comment token is created.
+ *  XXX "Escaped Content" is not the right term, but I couldn't think
+ *  of a good one.
+ *  
+ *  @update vidur 11/12/98
+ *  @param  aChar: last char read (not used here)
+ *  @param  aScanner: input source that is probed for "[CDATA["
+ *  @param  aToken: receives the new token; left untouched on a scanner error
+ *  @return result of the "[CDATA[" probe
+ */
+NS_IMETHODIMP CWellFormedDTD::ConsumeEscapedContent(PRUnichar aChar,CScanner& aScanner,CToken*& aToken){
+  nsAutoString cdataOpen("[CDATA[");
+  PRBool sawCDATAOpen = PR_FALSE;
+  const nsresult probeResult = ConsumeConditional(aScanner, cdataOpen, sawCDATAOpen);
+
+  if (NS_OK != probeResult) {
+    return probeResult;
+  }
+
+  // Only the token type and tag differ between the two outcomes.
+  const eHTMLTokenTypes tokenType = sawCDATAOpen ? eToken_cdatasection : eToken_comment;
+  const eHTMLTags tokenTag = sawCDATAOpen ? eHTMLTag_unknown : eHTMLTag_comment;
+  aToken=gTokenRecycler.CreateTokenOfType(tokenType,tokenTag,gEmpty);
+
+  return NS_OK;
+}
+
 /**
 *  This method is called just after a newline has been consumed. 
 *  
@ -496,7 +564,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeNewline(PRUnichar aChar,CScanner& aScanner,
 *  and we know we're at the start of some kind of tagged 
 *  element. We don't know yet if it's a tag or a comment.
 *  
- *  @update  gess 5/12/98
+ *  @update  vidur 11/12/98
 *  @param   aChar is the last char read
 *  @param   aScanner is represents our input source
 *  @param   aToken is the out arg holding our new token
@ -520,7 +588,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CTok
        break;

      case kExclamation:
-        aToken=gTokenRecycler.CreateTokenOfType(eToken_comment,eHTMLTag_comment,gEmpty);
+        result = ConsumeEscapedContent(aChar, aScanner, aToken);
        break;

      case kQuestionMark: //it must be an XML processing instruction...
@ -599,7 +667,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeToken(CToken*& aToken){
          case kCR: case kLF:
            result=ConsumeNewline(theChar,*theScanner,aToken);
            break;
-          
+            
          case kNotFound:
            break;
          
@ -699,7 +767,7 @@ PRBool CWellFormedDTD::IsContainer(PRInt32 aTag) const{

 /**
 *  
- *  @update  gess 3/25/98
+ *  @update  vidur 11/12/98
 *  @param   aToken -- token object to be put into content model
 *  @return  0 if all is well; non-zero is an error
 */
@ -716,6 +784,7 @@ NS_IMETHODIMP CWellFormedDTD::HandleToken(CToken* aToken) {
    case eToken_entity:
    case eToken_whitespace:
    case eToken_text:
+    case eToken_cdatasection:
      result=mSink->AddLeaf(theNode); 
      break;

--- a/htmlparser/src/nsWellFormedDTD.h
+++ b/htmlparser/src/nsWellFormedDTD.h
@ -225,6 +225,7 @@ protected:
    NS_IMETHODIMP ConsumeText(const nsString& aString,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeNewline(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeWhitespace(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
+    NS_IMETHODIMP ConsumeEscapedContent(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeComment(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeEntity(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeAttributes(PRUnichar aChar,CScanner& aScanner,CStartToken* aToken);
--- a/parser/htmlparser/src/nsDTDUtils.cpp
+++ b/parser/htmlparser/src/nsDTDUtils.cpp
@ -236,7 +236,7 @@ void CTokenRecycler::RecycleToken(CToken* aToken) {

 /**
 * 
- * @update	gess8/4/98
+ * @update	vidur 11/12/98
 * @param 
 * @return
 */
@ -262,6 +262,7 @@ CToken* CTokenRecycler::CreateTokenOfType(eHTMLTokenTypes aType,eHTMLTags aTag,
      case eToken_style:      result=new CStyleToken(); break;
      case eToken_skippedcontent: result=new CSkippedContentToken(aString); break;
      case eToken_instruction:result=new CInstructionToken(); break;
+      case eToken_cdatasection:result=new CCDATASectionToken(); break;
        default:
          break;
    }
--- a/parser/htmlparser/src/nsHTMLTokens.cpp
+++ b/parser/htmlparser/src/nsHTMLTokens.cpp
@ -456,6 +456,110 @@ nsresult CTextToken::Consume(PRUnichar aChar, CScanner& aScanner) {
  return result;
 }

+/*
+ *  Default constructor: takes no arguments and initializes the
+ *  CHTMLToken base class with eHTMLTag_unknown.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ */
+CCDATASectionToken::CCDATASectionToken() : CHTMLToken(eHTMLTag_unknown) {
+}
+
+
+/*
+ *  String-based constructor. Forwards aName to the CHTMLToken base
+ *  constructor, then sets mTypeID to eHTMLTag_unknown.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   aName -- string handed to the CHTMLToken base constructor
+ */
+CCDATASectionToken::CCDATASectionToken(const nsString& aName) : CHTMLToken(aName) {
+  mTypeID=eHTMLTag_unknown;
+}
+
+/*
+ *  Reports the name of this token class.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ *  @return  the constant string "cdatasection"
+ */
+const char*  CCDATASectionToken::GetClassName(void) {
+  return "cdatasection";
+}
+
+/*
+ *  Reports the token type of this class.
+ *  
+ *  @update  vidur 11/12/98
+ *  @param   none
+ *  @return  eToken_cdatasection
+ */
+PRInt32 CCDATASectionToken::GetTokenType(void) {
+  return eToken_cdatasection;
+}
+
+/*
+ *  Consume marked text from the scanner, through the closing "]]>".
+ *  \r and \r\n are stored as \n (\r\r as \n\n); a ] that is not part of "]]>" is kept.
+ *  @update  vidur 11/12/98
+ *  @param   aChar -- last char consumed from stream; reused as scratch while scanning
+ *  @param   aScanner -- controller of underlying input source
+ *  @return  result of the last ReadUntil/Peek/GetChar call made
+ */
+nsresult CCDATASectionToken::Consume(PRUnichar aChar, CScanner& aScanner) {
+  static    nsAutoString terminals("]\r");
+  nsresult  result=NS_OK;
+  PRBool    done=PR_FALSE;
+
+  while((NS_OK==result) && (!done)) {
+    result=aScanner.ReadUntil(mTextValue,terminals,PR_FALSE,PR_FALSE);
+    if(NS_OK==result) {
+      result=aScanner.Peek(aChar);
+      if(kCR==aChar) {
+        result=aScanner.GetChar(aChar); //strip off the \r
+        result=aScanner.Peek(aChar);    //then see what's next.
+        switch(aChar) {
+          case kCR:
+            result=aScanner.GetChar(aChar); //strip off the \r
+            mTextValue.Append("\n\n");
+            break;
+          case kNewLine:
+             //which means we saw \r\n, which becomes \n
+            result=aScanner.GetChar(aChar); //strip off the \n
+                //now fall through on purpose...
+          default:
+            mTextValue.Append("\n");
+            break;
+        }
+      }
+      else if (kRightSquareBracket==aChar) {
+        result=aScanner.GetChar(aChar); //strip off the ]
+        result=aScanner.Peek(aChar);    //then see what's next.
+        if (kRightSquareBracket==aChar) {
+          result=aScanner.GetChar(aChar);    //strip off the second ]
+          result=aScanner.Peek(aChar);    //then see what's next.
+          if (kGreaterThan==aChar) {
+            result=aScanner.GetChar(aChar); //strip off the >
+            done=PR_TRUE;
+          }
+          else {
+            mTextValue.Append("]");  //the first ] is content; the second may still open "]]>",
+            aScanner.PutBack(PRUnichar(kRightSquareBracket)); //so hand it back and re-scan it.
+          }
+        }
+        else {
+          // This isn't the end of the CDATA section so go on
+          mTextValue.Append("]");
+        }
+      }
+      else done=PR_TRUE;
+    }
+  }
+  return result;
+}
+
 /*
 *  default constructor
 *  
--- a/parser/htmlparser/src/nsHTMLTokens.h
+++ b/parser/htmlparser/src/nsHTMLTokens.h
@ -54,6 +54,7 @@ enum eHTMLTokenTypes {
  eToken_start=1,     eToken_end,     eToken_comment,         eToken_entity,
  eToken_whitespace,  eToken_newline, eToken_text,            eToken_attribute,
  eToken_script,      eToken_style,   eToken_skippedcontent,  eToken_instruction,
+  eToken_cdatasection,
  eToken_last //make sure this stays the last token...
 };

@ -79,7 +80,6 @@ public:
 protected:
 };

-
 /**
 *  This declares start tokens, which always take the form <xxxx>. 
 *	This class also knows how to consume related attributes.
@ -207,6 +207,23 @@ class CTextToken: public CHTMLToken {
 };


+/**
+ *  CDATASection tokens contain raw unescaped text content delimited by
+ *  <![CDATA[ and ]]>. 
+ *  XXX Not really a HTML construct - maybe we need a separation
+ *  
+ *  @update  vidur 11/12/98
+ */
+class CCDATASectionToken : public CHTMLToken {
+public:
+                        CCDATASectionToken();
+                        CCDATASectionToken(const nsString& aString);
+    virtual nsresult    Consume(PRUnichar aChar,CScanner& aScanner); // reads the section text through "]]>"
+    virtual const char* GetClassName(void);  // returns "cdatasection"
+    virtual PRInt32     GetTokenType(void);  // returns eToken_cdatasection
+};
+
+
 /**
 *  Attribute tokens are used to contain attribute key/value
 *  pairs whereever they may occur. Typically, they should
--- a/parser/htmlparser/src/nsParserTypes.h
+++ b/parser/htmlparser/src/nsParserTypes.h
@ -73,7 +73,8 @@ const PRUint32  kRightParen       = ')';
 const PRUint32  kLeftBrace        = '{';
 const PRUint32  kRightBrace       = '}';
 const PRUint32  kQuestionMark     = '?';
-
+const PRUint32  kLeftSquareBracket  = '[';
+const PRUint32  kRightSquareBracket = ']';

 #endif

--- a/parser/htmlparser/src/nsWellFormedDTD.cpp
+++ b/parser/htmlparser/src/nsWellFormedDTD.cpp
@ -473,6 +473,74 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeComment(PRUnichar aChar,CScanner& aScanner,
  return result;
 }

+/*
+ * Consume characters as long as they match the string passed in.
+ * On a mismatch every character read so far is put back, last one first.
+ * XXX The scanner should be able to do this.
+ *
+ *  @update vidur 11/12/98
+ *  @param  aScanner -- input source to read from
+ *  @param  aMatchString -- exact sequence expected next in the input
+ *  @param  aMatch -- out: PR_TRUE on full match, PR_FALSE on mismatch; untouched on error
+ *  @return NS_OK, or the failing GetChar result (chars read so far are then NOT put back)
+ */
+static nsresult
+ConsumeConditional(CScanner& aScanner, 
+                   const nsString& aMatchString,
+                   PRBool& aMatch)
+{
+  nsresult result=NS_OK;
+  PRUnichar matchChar=0;
+  PRInt32 i, count = aMatchString.Length();
+  for (i=0; i < count; i++) {
+    result = aScanner.GetChar(matchChar);
+    if ((NS_OK != result) || (aMatchString.CharAt(i) != matchChar)) {
+      break;
+    }
+  }
+  if (NS_OK == result) {
+    if (i != count) {
+      aScanner.PutBack(matchChar);   // the char actually read at i, not the one we hoped for
+      for (i--; i >= 0; i--) {       // then the matched prefix, in reverse order
+        aScanner.PutBack(aMatchString.CharAt(i));
+      }
+      aMatch = PR_FALSE;
+    }
+    else aMatch = PR_TRUE;
+  }
+  return result;
+}
+
+/**
+ *  This method is called when we see a "<!" sequence. If the input goes
+ *  on with "[CDATA[" that marker is consumed and a cdatasection token is
+ *  created; otherwise a comment token is created.
+ *  XXX "Escaped Content" is not the right term, but I couldn't think
+ *  of a good one.
+ *  
+ *  @update vidur 11/12/98
+ *  @param  aChar: last char read (not used here)
+ *  @param  aScanner: input source that is probed for "[CDATA["
+ *  @param  aToken: receives the new token; left untouched on a scanner error
+ *  @return result of the "[CDATA[" probe
+ */
+NS_IMETHODIMP CWellFormedDTD::ConsumeEscapedContent(PRUnichar aChar,CScanner& aScanner,CToken*& aToken){
+  nsAutoString cdataOpen("[CDATA[");
+  PRBool sawCDATAOpen = PR_FALSE;
+  const nsresult probeResult = ConsumeConditional(aScanner, cdataOpen, sawCDATAOpen);
+
+  if (NS_OK != probeResult) {
+    return probeResult;
+  }
+
+  // Only the token type and tag differ between the two outcomes.
+  const eHTMLTokenTypes tokenType = sawCDATAOpen ? eToken_cdatasection : eToken_comment;
+  const eHTMLTags tokenTag = sawCDATAOpen ? eHTMLTag_unknown : eHTMLTag_comment;
+  aToken=gTokenRecycler.CreateTokenOfType(tokenType,tokenTag,gEmpty);
+
+  return NS_OK;
+}
+
 /**
 *  This method is called just after a newline has been consumed. 
 *  
@ -496,7 +564,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeNewline(PRUnichar aChar,CScanner& aScanner,
 *  and we know we're at the start of some kind of tagged 
 *  element. We don't know yet if it's a tag or a comment.
 *  
- *  @update  gess 5/12/98
+ *  @update  vidur 11/12/98
 *  @param   aChar is the last char read
 *  @param   aScanner is represents our input source
 *  @param   aToken is the out arg holding our new token
@ -520,7 +588,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CTok
        break;

      case kExclamation:
-        aToken=gTokenRecycler.CreateTokenOfType(eToken_comment,eHTMLTag_comment,gEmpty);
+        result = ConsumeEscapedContent(aChar, aScanner, aToken);
        break;

      case kQuestionMark: //it must be an XML processing instruction...
@ -599,7 +667,7 @@ NS_IMETHODIMP CWellFormedDTD::ConsumeToken(CToken*& aToken){
          case kCR: case kLF:
            result=ConsumeNewline(theChar,*theScanner,aToken);
            break;
-          
+            
          case kNotFound:
            break;
          
@ -699,7 +767,7 @@ PRBool CWellFormedDTD::IsContainer(PRInt32 aTag) const{

 /**
 *  
- *  @update  gess 3/25/98
+ *  @update  vidur 11/12/98
 *  @param   aToken -- token object to be put into content model
 *  @return  0 if all is well; non-zero is an error
 */
@ -716,6 +784,7 @@ NS_IMETHODIMP CWellFormedDTD::HandleToken(CToken* aToken) {
    case eToken_entity:
    case eToken_whitespace:
    case eToken_text:
+    case eToken_cdatasection:
      result=mSink->AddLeaf(theNode); 
      break;

--- a/parser/htmlparser/src/nsWellFormedDTD.h
+++ b/parser/htmlparser/src/nsWellFormedDTD.h
@ -225,6 +225,7 @@ protected:
    NS_IMETHODIMP ConsumeText(const nsString& aString,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeNewline(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeWhitespace(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
+    NS_IMETHODIMP ConsumeEscapedContent(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeComment(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeEntity(PRUnichar aChar,CScanner& aScanner,CToken*& aToken);
    NS_IMETHODIMP ConsumeAttributes(PRUnichar aChar,CScanner& aScanner,CStartToken* aToken);