Change our concept of a "tagname" to include all sorts of random chars, like IE does.

That is, allow any char that's not in a short list of terminal chars. Bug 236002, r=choess, sr=peterv
2004-05-01 05:37:25 +00:00 · 2004-05-01 05:37:25 +00:00 · 1d5fa21a52
--- a/htmlparser/src/nsHTMLTokens.cpp
+++ b/htmlparser/src/nsHTMLTokens.cpp
@@ -182,7 +182,7 @@ nsresult CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag
  nsresult result=NS_OK;
  if (aFlag & NS_IPARSER_FLAG_HTML) {
    nsAutoString theSubstr;
-    result=aScanner.GetIdentifier(theSubstr,PR_TRUE);
+    result=aScanner.ReadTagIdentifier(theSubstr);
    mTypeID = (PRInt32)nsHTMLTags::LookupTag(theSubstr);
    // Save the original tag string if this is user-defined or if we
    // are viewing source
@@ -195,7 +195,7 @@ nsresult CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag
    //was written <title_> but since we didn't respect the '_', we only saw <title>. Then 
    //we searched for end title, which never comes (they give </title_>). 

-    result=aScanner.ReadIdentifier(mTextValue,PR_TRUE);  
+    result=aScanner.ReadTagIdentifier(mTextValue);  
    mTypeID = nsHTMLTags::LookupTag(mTextValue);
  }

@@ -284,7 +284,7 @@ nsresult CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag)
  nsresult result = NS_OK;
  if (aFlag & NS_IPARSER_FLAG_HTML) {
    nsAutoString theSubstr;
-    result=aScanner.GetIdentifier(theSubstr,PR_TRUE);
+    result=aScanner.ReadTagIdentifier(theSubstr);
    NS_ENSURE_SUCCESS(result, result);
    
    mTypeID = (PRInt32)nsHTMLTags::LookupTag(theSubstr);
@@ -296,7 +296,7 @@ nsresult CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag)
    }
  }
  else {
-    result = aScanner.ReadIdentifier(mTextValue,PR_TRUE);
+    result = aScanner.ReadTagIdentifier(mTextValue);
    NS_ENSURE_SUCCESS(result, result);

    mTypeID = nsHTMLTags::LookupTag(mTextValue);
@@ -1939,7 +1939,7 @@ CEntityToken::ConsumeEntity(PRUnichar aChar,
        theChar == '_' ||
        theChar == ':') {
        aScanner.GetChar(aChar); // Consume &
-        result=aScanner.ReadIdentifier(aString,PR_TRUE); // Ref. Bug# 23791 - For setting aIgnore to PR_TRUE.
+        result=aScanner.ReadEntityIdentifier(aString);
      }
      else {
        return NS_HTMLTOKENS_NOT_AN_ENTITY;
--- a/htmlparser/src/nsScanner.cpp
+++ b/htmlparser/src/nsScanner.cpp
@@ -733,14 +733,12 @@ nsresult nsScanner::SkipPast(nsString& aValidSet){
 }

 /**
- *  Consume characters until you did not find the terminal char
+ *  Consume characters until you run into whitespace, a backspace ('\b'), a '<', a '>', or a '/'.
 *  
- *  @update  gess 3/25/98
 *  @param   aString - receives new data from stream
- *  @param   aIgnore - If set ignores ':','-','_','.'
 *  @return  error code
 */
-nsresult nsScanner::GetIdentifier(nsString& aString,PRBool allowPunct) {
+nsresult nsScanner::ReadTagIdentifier(nsString& aString) {

  if (!mSlidingBuffer) {
    return kEOF;
@@ -758,26 +756,28 @@ nsresult nsScanner::GetIdentifier(nsString& aString,PRBool allowPunct) {
 
    theChar=*current;
    if(theChar) {
-      found=PR_FALSE;
+      found = PR_TRUE;
      switch(theChar) {
-        case ':':
-        case '_':
-        case '-':
-        case '.':
-          found=allowPunct;
+        case '\n':
+        case '\r':
+        case ' ' :
+        case '\b':
+        case '\t':
+        case '\v':
+        case '<':
+        case '>':
+        case '/':
+          found = PR_FALSE;
          break;
        default:
-          found = ('a'<=theChar && theChar<='z') ||
-                  ('A'<=theChar && theChar<='Z') ||
-                  ('0'<=theChar && theChar<='9');
          break;
      }

      if(!found) {
        // If we the current character isn't a valid character for
-        // the identifier, we're done. Copy the results into
+        // the identifier, we're done. Append the results to
        // the string passed in.
-        CopyUnicodeTo(mCurrentPosition, current, aString);
+        AppendUnicodeTo(mCurrentPosition, current, aString);
        break;
      }
    }
@@ -795,14 +795,13 @@ nsresult nsScanner::GetIdentifier(nsString& aString,PRBool allowPunct) {
 }

 /**
- *  Consume characters until you did not find the terminal char
+ *  Consume characters until you run into a char that's not valid in an
+ *  entity name
 *  
- *  @update  gess 3/25/98
 *  @param   aString - receives new data from stream
- *  @param   allowPunct - If set ignores ':','-','_','.'
 *  @return  error code
 */
-nsresult nsScanner::ReadIdentifier(nsString& aString,PRBool allowPunct) {
+nsresult nsScanner::ReadEntityIdentifier(nsString& aString) {

  if (!mSlidingBuffer) {
    return kEOF;
@@ -823,11 +822,11 @@ nsresult nsScanner::ReadIdentifier(nsString& aString,PRBool allowPunct) {
    if(theChar) {
      found=PR_FALSE;
      switch(theChar) {
-        case ':':
        case '_':
        case '-':
        case '.':
-          found=allowPunct;
+          // Don't allow ':' in entity names.  See bug 23791
+          found = PR_TRUE;
          break;
        default:
          found = ('a'<=theChar && theChar<='z') ||
@@ -855,65 +854,6 @@ nsresult nsScanner::ReadIdentifier(nsString& aString,PRBool allowPunct) {
  return result;
 }

-nsresult nsScanner::ReadIdentifier(nsScannerIterator& aStart,
-                                   nsScannerIterator& aEnd,
-                                   PRBool allowPunct) {
-
-  if (!mSlidingBuffer) {
-    return kEOF;
-  }
-
-  PRUnichar         theChar=0;
-  nsresult          result=Peek(theChar);
-  nsScannerIterator origin, current, end;
-  PRBool            found=PR_FALSE;  
-
-  origin = mCurrentPosition;
-  current = mCurrentPosition;
-  end = mEndPosition;
-
-  while(current != end) {
- 
-    theChar=*current;
-    if(theChar) {
-      found=PR_FALSE;
-      switch(theChar) {
-        case ':':
-        case '_':
-        case '-':
-          found=allowPunct;
-          break;
-        default:
-          if(('a'<=theChar) && (theChar<='z'))
-            found=PR_TRUE;
-          else if(('A'<=theChar) && (theChar<='Z'))
-            found=PR_TRUE;
-          else if(('0'<=theChar) && (theChar<='9'))
-            found=PR_TRUE;
-          break;
-      }
-
-      if(!found) {
-        aStart = mCurrentPosition;
-        aEnd = current;
-        break;
-      }
-    }
-    ++current;
-  }
-  
-  SetPosition(current);
-  if (current == end) {
-    aStart = origin;
-    aEnd = current;
-    return Eof();
-  }
-
-  //DoErrTest(aString);
-
-  return result;
-}
-
 /**
 *  Consume digits 
 *  
--- a/htmlparser/src/nsScanner.h
+++ b/htmlparser/src/nsScanner.h
@@ -187,18 +187,21 @@ class nsScanner {
      nsresult Eof(void);

      /**
-       *  Consume characters until you find the terminal char
+       *  Consume characters until you run into whitespace, a backspace ('\b'), a '<', a '>', or a '/'.
       *  
-       *  @update  gess 3/25/98
-       *  @param   aString receives new data from stream
-       *  @param   addTerminal tells us whether to append terminal to aString
+       *  @param   aString - receives new data from stream
       *  @return  error code
       */
-      nsresult GetIdentifier(nsString& aString,PRBool allowPunct=PR_FALSE);
-      nsresult ReadIdentifier(nsString& aString,PRBool allowPunct=PR_FALSE);
-      nsresult ReadIdentifier(nsScannerIterator& aStart,
-                              nsScannerIterator& aEnd,
-                              PRBool allowPunct=PR_FALSE);
+      nsresult ReadTagIdentifier(nsString& aString);
+
+      /**
+       *  Consume characters until you run into a char that's not valid in an
+       *  entity name
+       *  
+       *  @param   aString - receives new data from stream
+       *  @return  error code
+       */
+      nsresult ReadEntityIdentifier(nsString& aString);
      nsresult ReadNumber(nsString& aString,PRInt32 aBase);
      nsresult ReadWhitespace(nsString& aString, 
                              PRInt32& aNewlinesSkipped);