Bug 43274 ( nsbeta2+ ) - Checkin for Rickg.

r=harishd
2000-06-29 23:04:24 +00:00 · 2000-06-29 23:04:24 +00:00 · 8aa5a892d7
--- a/htmlparser/src/nsParser.cpp
+++ b/htmlparser/src/nsParser.cpp
@ -24,6 +24,8 @@
 #define DEBUG_XMLENCODING
 #define XMLENCODING_PEEKBYTES 64
 //#define TEST_DOCTYPES 
+//#define DISABLE_TRANSITIONAL_MODE
+

 #include "nsParser.h"
 #include "nsIContentSink.h" 
@ -466,6 +468,108 @@ nsDTDMode nsParser::GetParseMode(void){
 }


+
+class CWordTokenizer {
+public:
+  //********************************************************************************
+  // Tokenizes the text of aString that lies between aStartOffset and aMaxOffset.
+  // aStartOffset names the character just BEFORE the text to scan (for instance
+  // the '!' of "<!DOCTYPE"); scanning begins at aStartOffset+1 and never reads
+  // at or beyond aMaxOffset.
+  //********************************************************************************
+  CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
+    mLength=0;
+    mOffset=aStartOffset;
+    mMaxOffset=aMaxOffset;
+    mBuffer=aString.GetUnicode();
+    mEndBuffer=mBuffer+mMaxOffset;
+    mNext=mBuffer+aStartOffset+1;
+  }
+
+  //********************************************************************************
+  // Find the next word in the string.
+  // We define words as:
+  //    1) a run of characters ending at whitespace, '>' or a quote;
+  //    2) quoted substring (mOffset is the opening quote; mLength spans both quotes)
+  //    3) SGML comment -- ... --
+  // Returns offset of the word (also left in mOffset, with its size in mLength),
+  // or -1 (if out of words). Once -1 has been returned, every later call returns -1.
+  //********************************************************************************
+
+  PRInt32 GetNextWord() {
+
+    const PRUnichar *cp=mNext;  //resume right after the previous word
+
+    mLength=0;  //reset this
+    mOffset=-1; //reset this
+
+    //now skip whitespace (and any stray '>')...
+
+    while((cp<mEndBuffer) && IsSeparator(*cp)) {
+      cp++;
+    }
+
+    if(cp<mEndBuffer) {
+
+      const PRUnichar *firstcp=cp; //hang onto this...
+
+      if(kQuote==*cp) {
+        //quoted substring: runs through the closing quote (or to the limit if unterminated)
+        cp++;
+        while((cp<mEndBuffer) && (kQuote!=*cp)) {
+          cp++;
+        }
+        if(cp<mEndBuffer) {
+          cp++; //take the closing quote too
+        }
+      }
+      else if(IsDashPair(cp)) {
+        //SGML comment: runs through the next "--" (or to the limit if unterminated)
+        cp+=2;
+        while((cp<mEndBuffer) && (!IsDashPair(cp))) {
+          cp++;
+        }
+        if(cp<mEndBuffer) {
+          cp+=2; //take the closing dashes too
+        }
+      }
+      else {
+        //plain word: every character is tested, so one-letter words end correctly
+        cp++;
+        while((cp<mEndBuffer) && (!IsSeparator(*cp)) && (kQuote!=*cp)) {
+          cp++;
+        }
+      }
+
+      mLength=cp-firstcp;
+      mOffset=firstcp-mBuffer;
+
+    }
+
+    mNext=cp;  //a separator or quote found here is examined (not dropped) next time
+    return mOffset;
+  }
+
+  PRInt32     mOffset;
+  PRInt32     mMaxOffset;
+  PRInt32     mLength;
+  const PRUnichar*  mBuffer;
+  const PRUnichar*  mEndBuffer;
+
+private:
+
+  static PRBool IsSeparator(PRUnichar aChar) { //whitespace or '>'
+    return (kSpace==aChar) || (kNewLine==aChar) || (kCR==aChar) || (kTab==aChar) || (kGreaterThan==aChar);
+  }
+
+  PRBool IsDashPair(const PRUnichar* aPos) const { //is there a "--" at aPos, inside the limit?
+    return (aPos+1<mEndBuffer) && (kMinus==aPos[0]) && (kMinus==aPos[1]);
+  }
+
+  const PRUnichar*  mNext;  //first character not yet examined
+};
+
+
 /*************************************************************************************************
  First, let's define our modalities:

@ -475,7 +579,7 @@ nsDTDMode nsParser::GetParseMode(void){

  Assume the doctype is in the following form:
    <!DOCTYPE [Top Level Element] [Availability] "[Registration]// [Owner-ID]     //  [Type] [desc-text] // [Language]" "URI|text-identifier"> 
-              [HTML]              [PUBLIC|...]    [+|-]            [W3C|IETF|...]     [DTD]  "..."          [EN]|...]   "..."  
+              [HTML]              [PUBLIC|SYSTEM] [+|-]            [W3C|IETF|...]     [DTD]  "..."          [EN]|...]   "..."  


  Here are the new rules for DTD handling; comments welcome:
@ -502,7 +606,241 @@ nsDTDMode nsParser::GetParseMode(void){
       All other doctypes (<4.0), and documents without a doctype are handled in compatibility-mode.

 *****************************************************************************************************/
- 
+
+static 
+PRBool IsLoosePI(nsString& aBuffer,PRInt32 anOffset,PRInt32 aCount) {
+    //finding any one of these words in the searched window makes the identifier count as loose
+  static const char* const kLooseWords[]={"TRANSITIONAL","LOOSE","FRAMESET","LATIN1","SYMBOLS","SPECIAL"};
+  const PRInt32 theWordCount=sizeof(kLooseWords)/sizeof(kLooseWords[0]);
+
+    //the words are tried in the order listed (anOffset/aCount bound each search); first hit wins
+  for(PRInt32 theWord=0;theWord<theWordCount;theWord++) {
+    if(kNotFound<aBuffer.Find(kLooseWords[theWord],PR_TRUE,anOffset,aCount)) {
+      return PR_TRUE;
+    }
+  }
+
+  return PR_FALSE;
+
+}
+
+/**
+ *  Decide which DTD mode and document type apply to this document by looking
+ *  at its mime type and at the leading markup (XML PI or DOCTYPE) in aBuffer.
+ *  (Each parsercontext can have its own mode).
+ *  
+ *  @update  gess 06/24/00
+ *  @param   aParseMode, aDocType -- out params; both are always assigned.
+ */
+static 
+void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
+  //start from quirks/HTML3; the checks below replace these when they recognize something.
+
+  aParseMode=eDTDMode_quirks;
+  aDocType=eHTML3Text;
+
+
+    //let's eliminate non-HTML as quickly as possible...
+
+  PRInt32 theIndex=aBuffer.Find("?XML",PR_TRUE,0,128);      
+  if(kNotFound!=theIndex) {
+    aParseMode=eDTDMode_strict;
+    if(aMimeType.EqualsWithConversion(kHTMLTextContentType)) {
+      //this is here to prevent a crash if someone gives us an XML document,
+      //but necko tells us it's a text/html mimetype. 
+      aDocType=eHTML4Text;
+      aParseMode=eDTDMode_strict;
+    }
+    else aDocType=eXMLText;
+    return;
+
+  }
+  else if(aMimeType.EqualsWithConversion(kPlainTextContentType)) {
+    aDocType=ePlainText;
+    aParseMode=eDTDMode_quirks;
+    return;
+  }
+  else if(aMimeType.EqualsWithConversion(kRTFTextContentType)) {
+    aDocType=ePlainText;
+    aParseMode=eDTDMode_quirks;
+    return;
+  }
+
+
+  //now let's see if we have HTML or XHTML...
+
+  PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan);
+  
+  if(kNotFound!=theGTPos) {
+
+    const PRUnichar*  theBuffer=aBuffer.GetUnicode();
+    CWordTokenizer theTokenizer(aBuffer,1,theGTPos);
+    PRInt32 theOffset=theTokenizer.GetNextWord();  //try to find ?xml, !doctype, etc...
+
+    if((kNotFound!=theOffset) && (7==theTokenizer.mLength) &&  //whole word, not just a prefix of it
+       (0==nsCRT::strncasecmp(theBuffer+theOffset,"DOCTYPE",7))) {             
+      
+      //Ok -- so assume it's (X)HTML; now figure out the flavor...
+        
+      PRInt32   theIter=0;      //prevent infinite loops...
+      PRBool    done=PR_FALSE;  //set once the iteration cap below is exceeded...
+      PRBool    readSystemID=PR_FALSE;
+      nsDTDMode thePublicID=eDTDMode_quirks;
+      nsDTDMode theSystemID=eDTDMode_unknown;
+
+      theOffset=theTokenizer.GetNextWord();
+
+      while((kNotFound!=theOffset) && (!done)) {  
+
+        PRUnichar theChar=*(theBuffer+theTokenizer.mOffset);
+        if(kQuote==theChar) {
+
+          if(readSystemID) {
+
+            PRInt32 thePrefix=aBuffer.Find("http://www.w3.org/tr/",PR_TRUE,theOffset,5);  //find the prefix
+
+            if(kNotFound!=thePrefix) {
+              thePrefix+=20;
+              if(IsLoosePI(aBuffer,thePrefix,25)) { //find loose.dtd
+                theSystemID=eDTDMode_transitional;
+              }
+              else if(kNotFound!=aBuffer.Find("strict.dtd",PR_TRUE,thePrefix,25)) {  //find strict.dtd
+                theSystemID=eDTDMode_strict;
+              }
+            }
+
+          }
+          
+          else { //the public ID...
+
+            readSystemID=PR_TRUE;
+
+            PRInt32 theDTDPos=aBuffer.Find("//DTD",PR_TRUE,theOffset,theTokenizer.mLength);
+            if(kNotFound!=theDTDPos) {  //Find reports a miss as kNotFound, which is not zero
+
+                //first, let's see if it's XHTML...
+              PRInt32 theMLTagPos=aBuffer.Find("XHTML",PR_TRUE,theOffset,theTokenizer.mLength);  
+              if(kNotFound!=theMLTagPos) {
+                aDocType=eXHTMLText;
+                if(IsLoosePI(aBuffer,theMLTagPos+4,20)) 
+                  thePublicID=eDTDMode_transitional;
+                else thePublicID=eDTDMode_strict;
+              }
+
+              else {
+
+                  //now check for strict ISO/IEC OWNER...
+                if(kNotFound!=aBuffer.Find("15445:1999",PR_FALSE,theOffset,theDTDPos-theTokenizer.mOffset)) {
+                  thePublicID=eDTDMode_strict;  //this ISO/IEC DTD is always strict.
+                  aDocType=eHTML4Text;
+                }
+
+                else {
+
+                    //for W3C DTD's, let's make sure it's HTML...
+                  PRInt32 theMLTagPos=aBuffer.Find("HTML",PR_TRUE,theOffset,theTokenizer.mLength);  
+                  if(kNotFound==theMLTagPos) {
+                    theMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theOffset,theTokenizer.mLength);  
+                  }
+
+                  if(kNotFound!=theMLTagPos) {
+                    //and now check the version number...
+
+                    PRInt32 theVersionPos=aBuffer.FindCharInSet("1234567890",theMLTagPos);
+                    PRInt32 theMajorVersion=3;
+
+                    if((0<=theVersionPos) && (theVersionPos<theGTPos)) {
+                      nsAutoString theNum;
+                      PRInt32 theTerminal=aBuffer.FindCharInSet(" />",theVersionPos+1);
+                      if(kNotFound!=theTerminal) {  //never hand Mid a negative count
+                        aBuffer.Mid(theNum,theVersionPos,theTerminal-theVersionPos);
+                      }
+                      else aBuffer.Mid(theNum,theVersionPos,3);
+                      PRInt32 theErr=0;
+                      theMajorVersion=theNum.ToInteger(&theErr);
+
+                      if((0==theErr) && (3<theMajorVersion) && (theMajorVersion<100)) {
+                        if(IsLoosePI(aBuffer,theVersionPos+2,20)) 
+                          thePublicID=eDTDMode_transitional;
+                        else thePublicID=eDTDMode_strict;
+                        aDocType=eHTML4Text;
+                      }               
+                    } //if
+                  } //if
+                } //else
+                    
+              } //else
+            }
+
+          } //if publicID
+
+        } //if quote
+        
+        else if(kMinus==theChar) {
+          //explicitly skip comments...
+        }
+        
+        else { //handle an id (the whole word has to match, not just its first letters)
+          if((6==theTokenizer.mLength) && (0==nsCRT::strncasecmp(theBuffer+theOffset,"SYSTEM",6))) 
+            readSystemID=PR_TRUE;
+          else if((4==theTokenizer.mLength) && (0==nsCRT::strncasecmp(theBuffer+theOffset,"HTML",4))) 
+            readSystemID=PR_FALSE;
+        }
+
+        theOffset=theTokenizer.GetNextWord();
+        if(++theIter>10) done=PR_TRUE; //prevent infinite loops...
+      } //while
+
+      //reconcile what the public ID and the system ID each told us...
+
+      if(theSystemID==thePublicID) 
+        aParseMode=thePublicID;
+      else if(eDTDMode_unknown==theSystemID){
+        aParseMode=thePublicID;
+        if(eHTML4Text==aDocType) {
+          if (eDTDMode_transitional==thePublicID)
+            aParseMode=eDTDMode_quirks;  //degrade because the systemID is missing.
+        }
+      }
+      else {
+        //ack! The doctype is badly formed (system and public ID's contradict).
+        //mark the mode unknown; the block below picks the fallback for this doctype...
+          aParseMode=eDTDMode_unknown;
+      }
+    } 
+  }
+
+  if(eDTDMode_unknown==aParseMode) {
+      //nothing left to do but fail gracefully...
+    if(eXHTMLText==aDocType) {
+      aParseMode=eDTDMode_transitional;
+    }
+    if(eHTML4Text==aDocType) {
+      aDocType=eHTML3Text;
+      aParseMode=eDTDMode_quirks;
+    }
+  }
+
+#ifdef  DISABLE_TRANSITIONAL_MODE
+
+  /********************************************************************************************
+      The following code is here to deal with a nasty backward compatibility problem. 
+      The composer product emits <doctype HTML 4.0 Transitional> for the documents it creates, 
+      but the documents aren't really compliant. To prevent lots of pages from breaking, we'll 
+      disable proper handling of Transitional doctypes and use quirks mode instead. If lucky, 
+      we'll get to add a pref to allow power users to get the right answer.
+   ********************************************************************************************/
+
+  if(eDTDMode_transitional==aParseMode) {
+    if(eHTML4Text==aDocType)
+      aParseMode=eDTDMode_quirks;
+    else if(eXHTMLText==aDocType)
+      aParseMode=eDTDMode_strict;
+  }
+#endif
+
+
+}
+
 /**
 *  This is called when it's time to find out 
 *  what mode the parser/DTD should run for this document.
@ -512,16 +850,17 @@ nsDTDMode nsParser::GetParseMode(void){
 *  @return  parsermode (define in nsIParser.h)
 */
 static 
-void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
+void DetermineParseMode2(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
  const char* theModeStr= PR_GetEnv("PARSE_MODE");

  aParseMode = eDTDMode_unknown;
-    
+
+  PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan);
+
  PRInt32 theIndex=aBuffer.Find("DOCTYPE",PR_TRUE,0,100);
  if(kNotFound<theIndex) {
  
    //good, we found "DOCTYPE" -- now go find it's end delimiter '>'
-    PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan,theIndex+1);
    PRInt32 theEnd=(kNotFound==theGTPos) ? 512 : MinInt(512,theGTPos);
    PRInt32 theSubIndex=aBuffer.Find("//DTD",PR_TRUE,theIndex+8,theEnd-(theIndex+8));  //skip to the type and desc-text...
    PRInt32 theErr=0;
@ -552,6 +891,8 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&

      if(kNotFound<theSubIndex) {

+        //grab the next word
+
        PRInt32 theHTMLTagPos=aBuffer.Find("HTML",PR_TRUE,theStartPos,theCount);  
        if(kNotFound==theHTMLTagPos) {
          theHTMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theStartPos,theCount);  
@ -860,22 +1201,19 @@ nsresult nsParser::CreateCompatibleDTD(nsIDTD** aDTD,
 #ifdef TEST_DOCTYPES
 static const char* doctypes[] = {

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",
+    //here are the XHTML doctypes we'll treat accordingly...

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" >",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Transitional//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Frameset//EN\">",
  
    //here are a few HTML doctypes we'll treat as strict...

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",
-
-  "<!DOCTYPE HTML PUBLIC PublicID SystemID>",
-  "<!DOCTYPE HTML SYSTEM SystemID>",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

  "<!DOCTYPE \"-//W3C//DTD HTML 4.0//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01//EN\">",
@ -888,17 +1226,13 @@ static const char* doctypes[] = {
  "<!DOCTYPE \"-//SoftQuad Software//DTD HoTMetaL PRO 6.::19990601::extensions to HTML 4.//EN\">", 

  "<!DOCTYPE \"-//W3C//DTD HTML 5.0//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD HTML 6.01 Transitional//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 6.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

-    //here are the XHTML doctypes we'll treat as strict...
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Transitional//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Frameset//EN\">",
  
-    //these we treat as compatible (no quirks if possible)...
+    //these we treat as transitional (unless it's disabled)...

-  "<!DOCTYPE \"-//W3C//DTD HTML Experimental 19960712//EN\">", 
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.1 Frameset//EN\">", 
  "<!DOCTYPE \"-//W3C//DTD HTML 4.0 Transitional//EN\">", 
@ -910,6 +1244,11 @@ static const char* doctypes[] = {
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",

    //these we treat as compatible with quirks... (along with any other we encounter)...
+
+  "<!DOCTYPE \"-//W3C//DTD HTML 6.01 Transitional//EN\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" >",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD HTML Experimental 19960712//EN\">", 
  "<!DOCTYPE \"-//W3O//DTD W3 HTML 3.0//EN//\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML//EN//3.\">", 
  "<!DOCTYPE \"-//W3C//DTD W3 HTML 3.0//EN//\">", 
@ -925,7 +1264,6 @@ static const char* doctypes[] = {
  "<!DOCTYPE \"-//W3C//DTD W3 HTML Strict 3//EN//\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN//3.0\">", 
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",

  "<!DOCTYPE \"HTML\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML//EN\">", 
@ -1021,22 +1359,6 @@ nsresult nsParser::WillBuildModel(nsString& aFilename){
      nsString& theBuffer=mParserContext->mScanner->GetBuffer();
      DetermineParseMode(theBuffer,mParserContext->mDTDMode,mParserContext->mDocType,mParserContext->mMimeType);

-#define DISABLE_TRANSITIONAL_MODE
-#ifdef  DISABLE_TRANSITIONAL_MODE
-
-      /********************************************************************************************
-          The following code is here because to deal with a nasty backward compatibility problem. 
-          The composer product emits <doctype HTML 4.0 Transitional> for the documents it creates, 
-          but the documents aren't really compliant. To prevent lots of pages from breaking, well 
-          disable proper handling of Transitional doctypes and use quirks mode instead. If lucky, 
-          we'll get to add a pref to allow power users to get the right answer.
-       ********************************************************************************************/
-
-      if(eDTDMode_transitional==mParserContext->mDTDMode) {
-        mParserContext->mDTDMode=eDTDMode_quirks;
-      }
-#endif
-
      if(PR_TRUE==FindSuitableDTD(*mParserContext,theBuffer)) {
        mParserContext->mDTD->WillBuildModel( *mParserContext,mSink);
      }//if        
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@ -24,6 +24,8 @@
 #define DEBUG_XMLENCODING
 #define XMLENCODING_PEEKBYTES 64
 //#define TEST_DOCTYPES 
+//#define DISABLE_TRANSITIONAL_MODE
+

 #include "nsParser.h"
 #include "nsIContentSink.h" 
@ -466,6 +468,108 @@ nsDTDMode nsParser::GetParseMode(void){
 }


+
+class CWordTokenizer {
+public:
+  //********************************************************************************
+  // Tokenizes the text of aString that lies between aStartOffset and aMaxOffset.
+  // aStartOffset names the character just BEFORE the text to scan (for instance
+  // the '!' of "<!DOCTYPE"); scanning begins at aStartOffset+1 and never reads
+  // at or beyond aMaxOffset.
+  //********************************************************************************
+  CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
+    mLength=0;
+    mOffset=aStartOffset;
+    mMaxOffset=aMaxOffset;
+    mBuffer=aString.GetUnicode();
+    mEndBuffer=mBuffer+mMaxOffset;
+    mNext=mBuffer+aStartOffset+1;
+  }
+
+  //********************************************************************************
+  // Find the next word in the string.
+  // We define words as:
+  //    1) a run of characters ending at whitespace, '>' or a quote;
+  //    2) quoted substring (mOffset is the opening quote; mLength spans both quotes)
+  //    3) SGML comment -- ... --
+  // Returns offset of the word (also left in mOffset, with its size in mLength),
+  // or -1 (if out of words). Once -1 has been returned, every later call returns -1.
+  //********************************************************************************
+
+  PRInt32 GetNextWord() {
+
+    const PRUnichar *cp=mNext;  //resume right after the previous word
+
+    mLength=0;  //reset this
+    mOffset=-1; //reset this
+
+    //now skip whitespace (and any stray '>')...
+
+    while((cp<mEndBuffer) && IsSeparator(*cp)) {
+      cp++;
+    }
+
+    if(cp<mEndBuffer) {
+
+      const PRUnichar *firstcp=cp; //hang onto this...
+
+      if(kQuote==*cp) {
+        //quoted substring: runs through the closing quote (or to the limit if unterminated)
+        cp++;
+        while((cp<mEndBuffer) && (kQuote!=*cp)) {
+          cp++;
+        }
+        if(cp<mEndBuffer) {
+          cp++; //take the closing quote too
+        }
+      }
+      else if(IsDashPair(cp)) {
+        //SGML comment: runs through the next "--" (or to the limit if unterminated)
+        cp+=2;
+        while((cp<mEndBuffer) && (!IsDashPair(cp))) {
+          cp++;
+        }
+        if(cp<mEndBuffer) {
+          cp+=2; //take the closing dashes too
+        }
+      }
+      else {
+        //plain word: every character is tested, so one-letter words end correctly
+        cp++;
+        while((cp<mEndBuffer) && (!IsSeparator(*cp)) && (kQuote!=*cp)) {
+          cp++;
+        }
+      }
+
+      mLength=cp-firstcp;
+      mOffset=firstcp-mBuffer;
+
+    }
+
+    mNext=cp;  //a separator or quote found here is examined (not dropped) next time
+    return mOffset;
+  }
+
+  PRInt32     mOffset;
+  PRInt32     mMaxOffset;
+  PRInt32     mLength;
+  const PRUnichar*  mBuffer;
+  const PRUnichar*  mEndBuffer;
+
+private:
+
+  static PRBool IsSeparator(PRUnichar aChar) { //whitespace or '>'
+    return (kSpace==aChar) || (kNewLine==aChar) || (kCR==aChar) || (kTab==aChar) || (kGreaterThan==aChar);
+  }
+
+  PRBool IsDashPair(const PRUnichar* aPos) const { //is there a "--" at aPos, inside the limit?
+    return (aPos+1<mEndBuffer) && (kMinus==aPos[0]) && (kMinus==aPos[1]);
+  }
+
+  const PRUnichar*  mNext;  //first character not yet examined
+};
+
+
 /*************************************************************************************************
  First, let's define our modalities:

@ -475,7 +579,7 @@ nsDTDMode nsParser::GetParseMode(void){

  Assume the doctype is in the following form:
    <!DOCTYPE [Top Level Element] [Availability] "[Registration]// [Owner-ID]     //  [Type] [desc-text] // [Language]" "URI|text-identifier"> 
-              [HTML]              [PUBLIC|...]    [+|-]            [W3C|IETF|...]     [DTD]  "..."          [EN]|...]   "..."  
+              [HTML]              [PUBLIC|SYSTEM] [+|-]            [W3C|IETF|...]     [DTD]  "..."          [EN]|...]   "..."  


  Here are the new rules for DTD handling; comments welcome:
@ -502,7 +606,241 @@ nsDTDMode nsParser::GetParseMode(void){
       All other doctypes (<4.0), and documents without a doctype are handled in compatibility-mode.

 *****************************************************************************************************/
- 
+
+static 
+PRBool IsLoosePI(nsString& aBuffer,PRInt32 anOffset,PRInt32 aCount) {
+    //finding any one of these words in the searched window makes the identifier count as loose
+  static const char* const kLooseWords[]={"TRANSITIONAL","LOOSE","FRAMESET","LATIN1","SYMBOLS","SPECIAL"};
+  const PRInt32 theWordCount=sizeof(kLooseWords)/sizeof(kLooseWords[0]);
+
+    //the words are tried in the order listed (anOffset/aCount bound each search); first hit wins
+  for(PRInt32 theWord=0;theWord<theWordCount;theWord++) {
+    if(kNotFound<aBuffer.Find(kLooseWords[theWord],PR_TRUE,anOffset,aCount)) {
+      return PR_TRUE;
+    }
+  }
+
+  return PR_FALSE;
+
+}
+
+/**
+ *  Decide which DTD mode and document type apply to this document by looking
+ *  at its mime type and at the leading markup (XML PI or DOCTYPE) in aBuffer.
+ *  (Each parsercontext can have its own mode).
+ *  
+ *  @update  gess 06/24/00
+ *  @param   aParseMode, aDocType -- out params; both are always assigned.
+ */
+static 
+void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
+  //start from quirks/HTML3; the checks below replace these when they recognize something.
+
+  aParseMode=eDTDMode_quirks;
+  aDocType=eHTML3Text;
+
+
+    //let's eliminate non-HTML as quickly as possible...
+
+  PRInt32 theIndex=aBuffer.Find("?XML",PR_TRUE,0,128);      
+  if(kNotFound!=theIndex) {
+    aParseMode=eDTDMode_strict;
+    if(aMimeType.EqualsWithConversion(kHTMLTextContentType)) {
+      //this is here to prevent a crash if someone gives us an XML document,
+      //but necko tells us it's a text/html mimetype. 
+      aDocType=eHTML4Text;
+      aParseMode=eDTDMode_strict;
+    }
+    else aDocType=eXMLText;
+    return;
+
+  }
+  else if(aMimeType.EqualsWithConversion(kPlainTextContentType)) {
+    aDocType=ePlainText;
+    aParseMode=eDTDMode_quirks;
+    return;
+  }
+  else if(aMimeType.EqualsWithConversion(kRTFTextContentType)) {
+    aDocType=ePlainText;
+    aParseMode=eDTDMode_quirks;
+    return;
+  }
+
+
+  //now let's see if we have HTML or XHTML...
+
+  PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan);
+  
+  if(kNotFound!=theGTPos) {
+
+    const PRUnichar*  theBuffer=aBuffer.GetUnicode();
+    CWordTokenizer theTokenizer(aBuffer,1,theGTPos);
+    PRInt32 theOffset=theTokenizer.GetNextWord();  //try to find ?xml, !doctype, etc...
+
+    if((kNotFound!=theOffset) && (7==theTokenizer.mLength) &&  //whole word, not just a prefix of it
+       (0==nsCRT::strncasecmp(theBuffer+theOffset,"DOCTYPE",7))) {             
+      
+      //Ok -- so assume it's (X)HTML; now figure out the flavor...
+        
+      PRInt32   theIter=0;      //prevent infinite loops...
+      PRBool    done=PR_FALSE;  //set once the iteration cap below is exceeded...
+      PRBool    readSystemID=PR_FALSE;
+      nsDTDMode thePublicID=eDTDMode_quirks;
+      nsDTDMode theSystemID=eDTDMode_unknown;
+
+      theOffset=theTokenizer.GetNextWord();
+
+      while((kNotFound!=theOffset) && (!done)) {  
+
+        PRUnichar theChar=*(theBuffer+theTokenizer.mOffset);
+        if(kQuote==theChar) {
+
+          if(readSystemID) {
+
+            PRInt32 thePrefix=aBuffer.Find("http://www.w3.org/tr/",PR_TRUE,theOffset,5);  //find the prefix
+
+            if(kNotFound!=thePrefix) {
+              thePrefix+=20;
+              if(IsLoosePI(aBuffer,thePrefix,25)) { //find loose.dtd
+                theSystemID=eDTDMode_transitional;
+              }
+              else if(kNotFound!=aBuffer.Find("strict.dtd",PR_TRUE,thePrefix,25)) {  //find strict.dtd
+                theSystemID=eDTDMode_strict;
+              }
+            }
+
+          }
+          
+          else { //the public ID...
+
+            readSystemID=PR_TRUE;
+
+            PRInt32 theDTDPos=aBuffer.Find("//DTD",PR_TRUE,theOffset,theTokenizer.mLength);
+            if(kNotFound!=theDTDPos) {  //Find reports a miss as kNotFound, which is not zero
+
+                //first, let's see if it's XHTML...
+              PRInt32 theMLTagPos=aBuffer.Find("XHTML",PR_TRUE,theOffset,theTokenizer.mLength);  
+              if(kNotFound!=theMLTagPos) {
+                aDocType=eXHTMLText;
+                if(IsLoosePI(aBuffer,theMLTagPos+4,20)) 
+                  thePublicID=eDTDMode_transitional;
+                else thePublicID=eDTDMode_strict;
+              }
+
+              else {
+
+                  //now check for strict ISO/IEC OWNER...
+                if(kNotFound!=aBuffer.Find("15445:1999",PR_FALSE,theOffset,theDTDPos-theTokenizer.mOffset)) {
+                  thePublicID=eDTDMode_strict;  //this ISO/IEC DTD is always strict.
+                  aDocType=eHTML4Text;
+                }
+
+                else {
+
+                    //for W3C DTD's, let's make sure it's HTML...
+                  PRInt32 theMLTagPos=aBuffer.Find("HTML",PR_TRUE,theOffset,theTokenizer.mLength);  
+                  if(kNotFound==theMLTagPos) {
+                    theMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theOffset,theTokenizer.mLength);  
+                  }
+
+                  if(kNotFound!=theMLTagPos) {
+                    //and now check the version number...
+
+                    PRInt32 theVersionPos=aBuffer.FindCharInSet("1234567890",theMLTagPos);
+                    PRInt32 theMajorVersion=3;
+
+                    if((0<=theVersionPos) && (theVersionPos<theGTPos)) {
+                      nsAutoString theNum;
+                      PRInt32 theTerminal=aBuffer.FindCharInSet(" />",theVersionPos+1);
+                      if(kNotFound!=theTerminal) {  //never hand Mid a negative count
+                        aBuffer.Mid(theNum,theVersionPos,theTerminal-theVersionPos);
+                      }
+                      else aBuffer.Mid(theNum,theVersionPos,3);
+                      PRInt32 theErr=0;
+                      theMajorVersion=theNum.ToInteger(&theErr);
+
+                      if((0==theErr) && (3<theMajorVersion) && (theMajorVersion<100)) {
+                        if(IsLoosePI(aBuffer,theVersionPos+2,20)) 
+                          thePublicID=eDTDMode_transitional;
+                        else thePublicID=eDTDMode_strict;
+                        aDocType=eHTML4Text;
+                      }               
+                    } //if
+                  } //if
+                } //else
+                    
+              } //else
+            }
+
+          } //if publicID
+
+        } //if quote
+        
+        else if(kMinus==theChar) {
+          //explicitly skip comments...
+        }
+        
+        else { //handle an id (the whole word has to match, not just its first letters)
+          if((6==theTokenizer.mLength) && (0==nsCRT::strncasecmp(theBuffer+theOffset,"SYSTEM",6))) 
+            readSystemID=PR_TRUE;
+          else if((4==theTokenizer.mLength) && (0==nsCRT::strncasecmp(theBuffer+theOffset,"HTML",4))) 
+            readSystemID=PR_FALSE;
+        }
+
+        theOffset=theTokenizer.GetNextWord();
+        if(++theIter>10) done=PR_TRUE; //prevent infinite loops...
+      } //while
+
+      //reconcile what the public ID and the system ID each told us...
+
+      if(theSystemID==thePublicID) 
+        aParseMode=thePublicID;
+      else if(eDTDMode_unknown==theSystemID){
+        aParseMode=thePublicID;
+        if(eHTML4Text==aDocType) {
+          if (eDTDMode_transitional==thePublicID)
+            aParseMode=eDTDMode_quirks;  //degrade because the systemID is missing.
+        }
+      }
+      else {
+        //ack! The doctype is badly formed (system and public ID's contradict).
+        //mark the mode unknown; the block below picks the fallback for this doctype...
+          aParseMode=eDTDMode_unknown;
+      }
+    } 
+  }
+
+  if(eDTDMode_unknown==aParseMode) {
+      //nothing left to do but fail gracefully...
+    if(eXHTMLText==aDocType) {
+      aParseMode=eDTDMode_transitional;
+    }
+    if(eHTML4Text==aDocType) {
+      aDocType=eHTML3Text;
+      aParseMode=eDTDMode_quirks;
+    }
+  }
+
+#ifdef  DISABLE_TRANSITIONAL_MODE
+
+  /********************************************************************************************
+      The following code is here to deal with a nasty backward compatibility problem. 
+      The composer product emits <doctype HTML 4.0 Transitional> for the documents it creates, 
+      but the documents aren't really compliant. To prevent lots of pages from breaking, we'll 
+      disable proper handling of Transitional doctypes and use quirks mode instead. If lucky, 
+      we'll get to add a pref to allow power users to get the right answer.
+   ********************************************************************************************/
+
+  if(eDTDMode_transitional==aParseMode) {
+    if(eHTML4Text==aDocType)
+      aParseMode=eDTDMode_quirks;
+    else if(eXHTMLText==aDocType)
+      aParseMode=eDTDMode_strict;
+  }
+#endif
+
+
+}
+
 /**
 *  This is called when it's time to find out 
 *  what mode the parser/DTD should run for this document.
@ -512,16 +850,17 @@ nsDTDMode nsParser::GetParseMode(void){
 *  @return  parsermode (define in nsIParser.h)
 */
 static 
-void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
+void DetermineParseMode2(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
  const char* theModeStr= PR_GetEnv("PARSE_MODE");

  aParseMode = eDTDMode_unknown;
-    
+
+  PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan);
+
  PRInt32 theIndex=aBuffer.Find("DOCTYPE",PR_TRUE,0,100);
  if(kNotFound<theIndex) {
  
    //good, we found "DOCTYPE" -- now go find it's end delimiter '>'
-    PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan,theIndex+1);
    PRInt32 theEnd=(kNotFound==theGTPos) ? 512 : MinInt(512,theGTPos);
    PRInt32 theSubIndex=aBuffer.Find("//DTD",PR_TRUE,theIndex+8,theEnd-(theIndex+8));  //skip to the type and desc-text...
    PRInt32 theErr=0;
@ -552,6 +891,8 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&

      if(kNotFound<theSubIndex) {

+        //grab the next word
+
        PRInt32 theHTMLTagPos=aBuffer.Find("HTML",PR_TRUE,theStartPos,theCount);  
        if(kNotFound==theHTMLTagPos) {
          theHTMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theStartPos,theCount);  
@ -860,22 +1201,19 @@ nsresult nsParser::CreateCompatibleDTD(nsIDTD** aDTD,
 #ifdef TEST_DOCTYPES
 static const char* doctypes[] = {

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",
+    //here are the XHTML doctypes we'll treat accordingly...

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" >",
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Transitional//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Frameset//EN\">",
  
    //here are a few HTML doctypes we'll treat as strict...

-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",
-
-  "<!DOCTYPE HTML PUBLIC PublicID SystemID>",
-  "<!DOCTYPE HTML SYSTEM SystemID>",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

  "<!DOCTYPE \"-//W3C//DTD HTML 4.0//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01//EN\">",
@ -888,17 +1226,13 @@ static const char* doctypes[] = {
  "<!DOCTYPE \"-//SoftQuad Software//DTD HoTMetaL PRO 6.::19990601::extensions to HTML 4.//EN\">", 

  "<!DOCTYPE \"-//W3C//DTD HTML 5.0//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD HTML 6.01 Transitional//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 6.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

-    //here are the XHTML doctypes we'll treat as strict...
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Transitional//EN\">",
-  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Frameset//EN\">",
  
-    //these we treat as compatible (no quirks if possible)...
+    //these we treat as transitional (unless it's disabled)...

-  "<!DOCTYPE \"-//W3C//DTD HTML Experimental 19960712//EN\">", 
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.1 Frameset//EN\">", 
  "<!DOCTYPE \"-//W3C//DTD HTML 4.0 Transitional//EN\">", 
@ -910,6 +1244,11 @@ static const char* doctypes[] = {
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",

    //these we treat as compatible with quirks... (along with any other we encounter)...
+
+  "<!DOCTYPE \"-//W3C//DTD HTML 6.01 Transitional//EN\">",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" >",
+  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
+  "<!DOCTYPE \"-//W3C//DTD HTML Experimental 19960712//EN\">", 
  "<!DOCTYPE \"-//W3O//DTD W3 HTML 3.0//EN//\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML//EN//3.\">", 
  "<!DOCTYPE \"-//W3C//DTD W3 HTML 3.0//EN//\">", 
@ -925,7 +1264,6 @@ static const char* doctypes[] = {
  "<!DOCTYPE \"-//W3C//DTD W3 HTML Strict 3//EN//\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN//3.0\">", 
-  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",

  "<!DOCTYPE \"HTML\">", 
  "<!DOCTYPE \"-//IETF//DTD HTML//EN\">", 
@ -1021,22 +1359,6 @@ nsresult nsParser::WillBuildModel(nsString& aFilename){
      nsString& theBuffer=mParserContext->mScanner->GetBuffer();
      DetermineParseMode(theBuffer,mParserContext->mDTDMode,mParserContext->mDocType,mParserContext->mMimeType);

-#define DISABLE_TRANSITIONAL_MODE
-#ifdef  DISABLE_TRANSITIONAL_MODE
-
-      /********************************************************************************************
-          The following code is here because to deal with a nasty backward compatibility problem. 
-          The composer product emits <doctype HTML 4.0 Transitional> for the documents it creates, 
-          but the documents aren't really compliant. To prevent lots of pages from breaking, well 
-          disable proper handling of Transitional doctypes and use quirks mode instead. If lucky, 
-          we'll get to add a pref to allow power users to get the right answer.
-       ********************************************************************************************/
-
-      if(eDTDMode_transitional==mParserContext->mDTDMode) {
-        mParserContext->mDTDMode=eDTDMode_quirks;
-      }
-#endif
-
      if(PR_TRUE==FindSuitableDTD(*mParserContext,theBuffer)) {
        mParserContext->mDTD->WillBuildModel( *mParserContext,mSink);
      }//if