- Bug 243040: Re-escape attribute values which Gecko unescaped for us. Not doing this allowed arbitrary unchecked HTML to be inserted into the result.

- JS quotes in attribute values are now probably possible (but JS is still usually disabled by the allowed-attribute prefs and the javascript: URL blocker) - removed the base64 blocker, which is probably pointless - comment and debug changes - minor code style cleanup. r=akk, sr=jst
2004-06-05 22:47:40 +00:00 · 2004-06-05 22:47:40 +00:00 · a6d55e7b8a
--- a/content/base/src/mozSanitizingSerializer.cpp
+++ b/content/base/src/mozSanitizingSerializer.cpp
@ -36,16 +36,13 @@
 *
 * ***** END LICENSE BLOCK ***** */

+// Removes potentially insecure or offending HTML
+
 /* I used nsPlaintextSerializer as base for this class. I don't understand
   all of the functions in the beginning. Possible that I fail to do
   something or do something useless.
   I am not proud about the implementation here at all.
   Feel free to fix it :-).
-
-   I am moderately concerned about methods to obfuscate HTML, which the
-   parser can decode during execution.
-   E.g. there are these dreaded data: and javascript URLs and
-   base64 encoding (which I don't really understand how it alloies
 */

 #include "mozSanitizingSerializer.h"
@ -58,14 +55,19 @@
 #include "nsContentUtils.h"
 #include "nsReadableUtils.h"
 #include "plstr.h"
-//#include "nsDependentString.h"
 #include "nsIProperties.h"
 #include "nsUnicharUtils.h"
 #include "nsIURI.h"
 #include "nsNetUtil.h"
+#include "nsEscape.h"

 //#define DEBUG_BenB

+static inline PRUnichar* escape(const nsString& source)
+{
+  return nsEscapeHTML2(source.get(), source.Length()); 
+}
+
 /* XXX: |printf|s in some error conditions. They are intended as information
   for the user, because they complain about malformed pref values.
   Not sure, if popping up dialog boxes is the right thing for such code
@ -89,16 +91,18 @@ nsresult NS_NewSanitizingHTMLSerializer(nsIContentSerializer** aSerializer)
 mozSanitizingHTMLSerializer::mozSanitizingHTMLSerializer()
  : mAllowedTags(30) // Just some initial buffer size
 {
-
  mOutputString = nsnull;
 }

 mozSanitizingHTMLSerializer::~mozSanitizingHTMLSerializer()
 {
+#ifdef DEBUG_BenB
+  printf("Output:\n%s\n", NS_LossyConvertUCS2toASCII(*mOutputString).get());
+#endif
  mAllowedTags.Enumerate(ReleaseProperties);
 }

-//<copied from="xpcom/ds/nsProperties.cpp">
+//<copy from="xpcom/ds/nsProperties.cpp">
 PRBool PR_CALLBACK 
 mozSanitizingHTMLSerializer::ReleaseProperties(nsHashKey* key, void* data,
                                               void* closure)
@ -107,7 +111,7 @@ mozSanitizingHTMLSerializer::ReleaseProperties(nsHashKey* key, void* data,
  NS_IF_RELEASE(prop);
  return PR_TRUE;
 }
-//</copied>
+//</copy>

 NS_IMPL_ISUPPORTS4(mozSanitizingHTMLSerializer,
                   nsIContentSerializer,
@ -149,8 +153,8 @@ mozSanitizingHTMLSerializer::Initialize(nsAString* aOutString,
 NS_IMETHODIMP
 mozSanitizingHTMLSerializer::Flush(nsAString& aStr)
 {
-#if DEBUG_BenB
-  printf("Flush: -%s-", NS_LossyConvertUCS2toASCII(aString).get());
+#ifdef DEBUG_BenB
+  printf("Flush: -%s-", NS_LossyConvertUCS2toASCII(aStr).get());
 #endif
  Write(aStr);
  return NS_OK;
@ -166,7 +170,6 @@ mozSanitizingHTMLSerializer::AppendDocumentStart(nsIDOMDocument *aDocument,
 void
 mozSanitizingHTMLSerializer::Write(const nsAString& aString)
 {
-  //  printf("%s", NS_LossyConvertUCS2toASCII(aString).get());
  mOutputString->Append(aString);
 }

@ -360,7 +363,7 @@ mozSanitizingHTMLSerializer::SetTitle(const nsString& aValue)
    // missing </title> tag won't result in everything
    // being eaten up as the title.
    Write(NS_LITERAL_STRING("<title>"));
-    Write(aValue);
+    Write(nsAdoptingString(escape(aValue)));
    Write(NS_LITERAL_STRING("</title>"));
  }
  return NS_OK;
@ -379,7 +382,8 @@ mozSanitizingHTMLSerializer::SetDocumentCharset(nsACString& aCharset)
  Write(NS_LITERAL_STRING("\n<meta http-equiv=\"Context-Type\" content=\"text/html; charset=")
        /* Danger: breaking the line within the string literal, like
           "foo"\n"bar", breaks win32! */
-        + NS_ConvertASCIItoUCS2(aCharset) + NS_LITERAL_STRING("\">\n"));
+        + nsAdoptingString(escape(NS_ConvertASCIItoUCS2(aCharset)))
+        + NS_LITERAL_STRING("\">\n"));
  return NS_OK;
 }

@ -529,7 +533,7 @@ mozSanitizingHTMLSerializer::DoAddLeaf(PRInt32 aTag,
  if (type == eHTMLTag_whitespace ||
      type == eHTMLTag_newline)
  {
-    Write(aText);
+    Write(aText); // sure to be safe?
  }
  else if (type == eHTMLTag_text)
  {
@ -538,13 +542,13 @@ mozSanitizingHTMLSerializer::DoAddLeaf(PRInt32 aTag,
      Write(text);
    else
      Write(NS_LITERAL_STRING(TEXT_REMOVED)); // Does not happen (yet)
-    if (NS_FAILED(rv))
-      return rv;
+    NS_ENSURE_SUCCESS(rv, rv);
  }
  else if (type == eHTMLTag_entity)
  {
    Write(NS_LITERAL_STRING("&"));
-    Write(aText);
+    Write(aText); // sure to be safe?
+    Write(NS_LITERAL_STRING(";"));
    // using + operator here might give an infinitive loop, see above.
  }
  else
@ -562,19 +566,7 @@ mozSanitizingHTMLSerializer::DoAddLeaf(PRInt32 aTag,
 nsresult
 mozSanitizingHTMLSerializer::SanitizeTextNode(nsString& aText /*inout*/)
 {
-  nsString& text = aText; // alias
-
-  PRInt32 pos;
-  if ((pos = text.Find("base64")) != kNotFound)
-    // Probably useless, not sure, but perfhit
-    //    return NS_ERROR_ILLEGAL_VALUE; -- this is too extreme
-    text.Insert(NS_LITERAL_STRING(TEXT_BREAKER), pos + 1);
-         /* Insert some other text after the first char of the problematic
-            text, so we prevent the processing by Gecko.
-            No idea, if that is needed, but better do it than being sorry.
-            Somebody who knows all the dangers and how they are reflected
-            in Gecko please jump in. */
-
+  aText.Adopt(escape(aText));
  return NS_OK;
 }

@ -584,14 +576,14 @@ mozSanitizingHTMLSerializer::SanitizeTextNode(nsString& aText /*inout*/)
   unwanted / dangerous URLs appear in the document
   (like javascript: and data:).

-   Pass the value as |value| arg. It will be modified in-place.
+   Pass the value as |aValue| arg. It will be modified in-place.

   If the value is not allowed at all, we return with NS_ERROR_ILLEGAL_VALUE.
-   In that case, do not use the |value|, but output nothing.
+   In that case, do not use the |aValue|, but output nothing.
 */
 nsresult
 mozSanitizingHTMLSerializer::SanitizeAttrValue(nsHTMLTag aTag,
-                                               const nsAString& attr_name,
+                                               const nsAString& anAttrName,
                                               nsString& aValue /*inout*/)
 {
  /* First, cut the attribute to 1000 chars.
@ -599,42 +591,42 @@ mozSanitizingHTMLSerializer::SanitizeAttrValue(nsHTMLTag aTag,
     considering that we don't support any JS. The longest attributes
     I can think of are URLs, and URLs with 1000 chars are likely to be
     bogus, too. */
-  nsAutoString value(Substring(aValue, 0, 1000));
-  //value.Truncate(1000); //-- this cuts half of the document !!?!!
+  aValue = Substring(aValue, 0, 1000);
+  //aValue.Truncate(1000); //-- this cuts half of the document !!?!!

-#ifdef DEBUG_BenB
-          printf("7: %s\n", ToNewUTF8String(value));
-#endif
-
-  value.StripChars("\"'");  /* This will break javascript attributes,
-                               but who wants javascript in
-                               sanitized HTML? */
+  aValue.Adopt(escape(aValue));

  /* Check some known bad stuff. Add more!
     I don't care too much, if it happens to trigger in some innocent cases
     (like <img alt="Statistical data: Mortage rates and newspapers">) -
     security first. */
-  if (value.Find("javascript:") != kNotFound ||
-      value.Find("data:") != kNotFound ||
-      value.Find("base64") != kNotFound)
+  if (aValue.Find("javascript:") != kNotFound ||
+      aValue.Find("data:") != kNotFound ||
+      aValue.Find("base64") != kNotFound)
    return NS_ERROR_ILLEGAL_VALUE;

  // Check img src scheme
  if (aTag == eHTMLTag_img && 
-      attr_name.Equals(NS_LITERAL_STRING("src"), nsCaseInsensitiveStringComparator()))
+      anAttrName.Equals(NS_LITERAL_STRING("src"),
+                        nsCaseInsensitiveStringComparator()))
  {
    nsresult rv;
-    nsCOMPtr<nsIIOService> ioService;
-    ioService = do_GetIOService(&rv);
-    if (NS_FAILED(rv)) return rv;
+    nsCOMPtr<nsIIOService> ioService = do_GetIOService(&rv);
+    NS_ENSURE_SUCCESS(rv, rv);
    nsCAutoString scheme;
-    rv = ioService->ExtractScheme(NS_LossyConvertUCS2toASCII(value), scheme);
-    if (NS_FAILED(rv)) return rv;
+    rv = ioService->ExtractScheme(NS_LossyConvertUCS2toASCII(aValue), scheme);
+    NS_ENSURE_SUCCESS(rv, rv);

    if (!scheme.Equals("cid", nsCaseInsensitiveCStringComparator()))
      return NS_ERROR_ILLEGAL_VALUE;
  }

+#ifdef DEBUG_BenB
+  printf("attribute value for %s: -%s-\n",
+         NS_LossyConvertUCS2toASCII(anAttrName).get(),
+         NS_LossyConvertUCS2toASCII(aValue).get());
+#endif
+
  return NS_OK;
 }

@ -707,9 +699,6 @@ nsresult
 mozSanitizingHTMLSerializer::ParsePrefs(const nsAString& aPref)
 {
  char* pref = ToNewCString(aPref);
-#ifdef DEBUG_BenB
-  printf("pref: -%s-\n", pref);
-#endif
  char* tags_lasts;
  for (char* iTag = PL_strtok_r(pref, " ", &tags_lasts);
       iTag;
@ -734,16 +723,10 @@ mozSanitizingHTMLSerializer::ParseTagPref(const nsCAutoString& tagpref)
    return NS_ERROR_OUT_OF_MEMORY;

  // Parsing tag
-#ifdef DEBUG_BenB
-  printf("Processing tag pref -%s-\n", tagpref.get());
-#endif
  PRInt32 bracket = tagpref.Find("(");
  nsCAutoString tag = tagpref;
  if (bracket != kNotFound)
    tag.Truncate(bracket);
-#ifdef DEBUG_BenB
-  printf( "Tag -%s-\n", tag.get());
-#endif
  if (tag.Equals(""))
  {
    printf(" malformed pref: %s\n", tagpref.get());
@ -754,14 +737,6 @@ mozSanitizingHTMLSerializer::ParseTagPref(const nsCAutoString& tagpref)
  NS_ConvertASCIItoUCS2 tag_widestr(tag);
  PRInt32 tag_id;
  parserService->HTMLStringTagToId(tag_widestr, &tag_id);
-#ifdef DEBUG_BenB
-  printf(" Have tag %d\n", tag_id);
-  const PRUnichar* tag_back;
-  parserService->HTMLIdToStringTag(tag_id, &tag_back);
-  printf(" Equals -%s-\n", tag_back
-                           ? NS_ConvertUCS2toUTF8(tag_back).get()
-                           : "");
-#endif
  if (tag_id == eHTMLTag_userdefined ||
      tag_id == eHTMLTag_unknown)
  {
@ -799,18 +774,12 @@ mozSanitizingHTMLSerializer::ParseTagPref(const nsCAutoString& tagpref)
    attrList.Append(Substring(tagpref,
                              bracket + 1,
                              tagpref.Length() - 2 - bracket));
-#ifdef DEBUG_BenB
-    printf(" Attr list: -%s-\n", attrList.get());
-#endif
    char* attrs_lasts;
    for (char* iAttr = PL_strtok_r(attrList.BeginWriting(),
                                   ",", &attrs_lasts);
         iAttr;
         iAttr = PL_strtok_r(NULL, ",", &attrs_lasts))
    {
-#ifdef DEBUG_BenB
-      printf(" Processing attr -%s-\n", iAttr);
-#endif
      attr_bag->Set(iAttr, 0);
    }