Bug 368998: when normalizing hostnames, we don't properly escape non-alphanumerics

patch: move host encoding to c++ (url classifier utils component) r=bryner
2007-03-05 05:58:05 +00:00 · 2007-03-05 05:58:05 +00:00 · e7ba49093b
--- a/toolkit/components/build/nsToolkitCompsModule.cpp
+++ b/toolkit/components/build/nsToolkitCompsModule.cpp
@ -91,7 +91,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsTypeAheadFind)
 NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService,
                                         nsUrlClassifierDBService::GetInstance)
 NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierStreamUpdater)
-NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierUtils)
+NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsUrlClassifierUtils, Init)
 #endif

 #ifdef MOZ_FEEDS
--- a/toolkit/components/url-classifier/content/enchash-decrypter.js
+++ b/toolkit/components/url-classifier/content/enchash-decrypter.js
@ -53,36 +53,6 @@
 //
 // TODO: accommodate other kinds of perl-but-not-javascript qualifiers

-/**
- * A fast, bit-vector map for ascii characters.
- *
- * Internally stores 256 bits in an array of 8 ints.
- * Does quick bit-flicking to lookup needed characters.
- */
-
-/**
- * @param Takes 8 ints to initialize the character map
- */
-function Charmap() {
-  if (arguments.length != 8) {
-    throw G_Error("charmap ctor requires 8 int args");
-  }
-  this.map_ = [];
-  for (var i = 0; i < 8; ++i) {
-    this.map_.push(arguments[i]);
-  }
-}
-
-/**
- * Do a quick lookup to see if the letter is in the map.
- * @param chr String of length 1 (ascii)
- * @return Boolean true if the letter is in the map
- */
-Charmap.prototype.contains = function(chr) {
-  var val = chr.charCodeAt(0);
-  return !!(this.map_[val >> 5] & (1 << (val & 31)));
-}
-
 /**
 * This thing knows how to generate lookup keys and decrypt values found in
 * a table of type enchash.
@ -94,10 +64,6 @@ function PROT_EnchashDecrypter() {
  this.base64_ = new G_Base64();
  this.streamCipher_ = Cc["@mozilla.org/security/streamcipher;1"]
                       .createInstance(Ci.nsIStreamCipher);
-  // Everything but alpha numerics, - and .
-  this.escapeCharmap_ = new Charmap(
-    0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
-    0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
 }

 PROT_EnchashDecrypter.DATABASE_SALT = "oU3q.72p";
@ -141,38 +107,6 @@ PROT_EnchashDecrypter.prototype.lastNChars_ = function(str, n) {
  return str.substr(n);
 }

-/**
- * We have to have our own hex-decoder because decodeURIComponent
- * expects UTF-8 (so it will barf on invalid UTF-8 sequences).
- *
- * @param str String to decode
- * 
- * @returns The decoded string
- */
-PROT_EnchashDecrypter.prototype.hexDecode_ = function(str) {
-  var output = [];
-
-  var i = 0;
-  while (i < str.length) {
-    var c = str.charAt(i);
-  
-    if (c == "%" && i + 2 < str.length) {
-
-      var asciiVal = Number("0x" + str.charAt(i + 1) + str.charAt(i + 2));
-      
-      if (!isNaN(asciiVal)) {
-        i += 2;
-        c = String.fromCharCode(asciiVal);
-      }
-    }
-    
-    output[output.length] = c;
-    ++i;
-  }
-  
-  return output.join("");
-}
-
 /**
 * Translate a plaintext enchash value into regular expressions
 *
@ -237,7 +171,7 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
    return "";
  }

-  var unescaped = this.hexDecode_(asciiHost);
+  var unescaped = unescape(asciiHost);

  unescaped = unescaped.replace(this.REs_.FIND_DODGY_CHARS_GLOBAL, "")
              .replace(this.REs_.FIND_END_DOTS_GLOBAL, "")
@ -248,7 +182,9 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
    unescaped = temp;

  // Escape everything that's not alphanumeric, hyphen, or dot.
-  var escaped = this.escapeString_(unescaped);
+  var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
+                 .getService(Ci.nsIUrlClassifierUtils);
+  var escaped = urlUtils.escapeHostname(unescaped);

  if (opt_maxDots) {
    // Limit the number of dots
@ -272,27 +208,6 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
  return escaped;
 }

-/**
- * URL escapes everything except alphanumerics, - and . (dot).  Specifically,
- * escape everything in the escapeCharmap_ defined in the constructor.  This
- * is a little different than escape, encodeURIComponent, and encodeURI.
- */
-PROT_EnchashDecrypter.prototype.escapeString_ = function(unescaped) {
-  var escaped = '';
-  for (var i = 0; i < unescaped.length; ++i) {
-    if (this.escapeCharmap_.contains(unescaped[i])) {
-      var c = unescaped.charCodeAt(i).toString(16);
-      if (c.length == 1) {
-        c = '0' + c;
-      }
-      escaped += '%' + c;
-    } else {
-      escaped += unescaped[i];
-    }
-  }
-  return escaped;
-}
-
 PROT_EnchashDecrypter.prototype.parseIPAddress_ = function(host) {
  if (host.length <= 15) {

--- a/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl
+++ b/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl
@ -39,7 +39,7 @@
 * Some utility methods used by the url classifier.
 */

-[scriptable, uuid(9afd3add-eadc-409f-a187-e3bf60e47290)]
+[scriptable, uuid(89ea43b0-a23f-4db2-8d23-6d90dc55f67a)]
 interface nsIUrlClassifierUtils : nsISupports
 {
  /**
@ -54,4 +54,12 @@ interface nsIUrlClassifierUtils : nsISupports
   *          then specially url-encoded)
   */
  ACString canonicalizeURL(in ACString url);
+
+  /**
+   * When canonicalizing hostnames, the final step is to URL-escape everything
+   * that is not alphanumeric, a hyphen, or a dot.  The existing methods (escape,
+   * encodeURIComponent and encodeURI) are close, but not exactly what we want,
+   * so we write our own function to do this.
+   */
+  ACString escapeHostname(in ACString hostname);
 };
--- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp
+++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp
@ -44,13 +44,26 @@ static char int_to_hex_digit(PRInt32 i)
  return NS_STATIC_CAST(char, ((i < 10) ? (i + '0') : ((i - 10) + 'A')));
 }

-
-nsUrlClassifierUtils::nsUrlClassifierUtils()
+nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nsnull)
 {
 }

+nsresult
+nsUrlClassifierUtils::Init()
+{
+  // Everything but alphanumerics, '-' and '.'
+  mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
+                               0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+  if (!mEscapeCharmap)
+    return NS_ERROR_OUT_OF_MEMORY;
+  return NS_OK;
+}
+
 NS_IMPL_ISUPPORTS1(nsUrlClassifierUtils, nsIUrlClassifierUtils)

+/////////////////////////////////////////////////////////////////////////////
+// nsIUrlClassifierUtils
+
 /* ACString canonicalizeURL (in ACString url); */
 NS_IMETHODIMP
 nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retval)
@ -65,6 +78,30 @@ nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retv
  return NS_OK;
 }

+NS_IMETHODIMP
+nsUrlClassifierUtils::EscapeHostname(const nsACString & hostname,
+                                     nsACString & _retval)
+{
+  const char* curChar = hostname.BeginReading();
+  const char* end = hostname.EndReading();
+  while (curChar != end) {
+    unsigned char c = NS_STATIC_CAST(unsigned char, *curChar);
+    if (mEscapeCharmap->Contains(c)) {
+      _retval.Append('%');
+      _retval.Append(int_to_hex_digit(c / 16));
+      _retval.Append(int_to_hex_digit(c % 16));
+    } else {
+      _retval.Append(*curChar);
+    }
+    ++curChar;
+  }
+  
+  return NS_OK;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// non-interface methods
+
 // This function will encode all "special" characters in typical url
 // encoding, that is %hh where h is a valid hex digit.  See the comment in
 // the header file for details.
--- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h
+++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h
@ -37,14 +37,48 @@
 #ifndef nsUrlClassifierUtils_h_
 #define nsUrlClassifierUtils_h_

+#include "nsAutoPtr.h"
 #include "nsIUrlClassifierUtils.h"

 class nsUrlClassifierUtils : public nsIUrlClassifierUtils
 {
+private:
+  /**
+   * A fast, bit-vector map for ascii characters.
+   *
+   * Internally stores 256 bits in an array of 8 ints.
+   * Does quick bit-flicking to lookup needed characters.
+   */
+  class Charmap
+  {
+  public:
+    Charmap(PRUint32 b0, PRUint32 b1, PRUint32 b2, PRUint32 b3,
+            PRUint32 b4, PRUint32 b5, PRUint32 b6, PRUint32 b7)
+    {
+      mMap[0] = b0; mMap[1] = b1; mMap[2] = b2; mMap[3] = b3;
+      mMap[4] = b4; mMap[5] = b5; mMap[6] = b6; mMap[7] = b7;
+    }
+
+    /**
+     * Do a quick lookup to see if the letter is in the map.
+     */
+    PRBool Contains(unsigned char c) const
+    {
+      return mMap[c >> 5] & (1 << (c & 31));
+    }
+
+  private:
+    // Store the 256 bits in an array of 8 32-bit ints.
+    PRUint32 mMap[8];
+  };
+
+
 public:
  nsUrlClassifierUtils();
  ~nsUrlClassifierUtils() {}

+  nsresult Init();
+
  NS_DECL_ISUPPORTS
  NS_DECL_NSIURLCLASSIFIERUTILS

@ -62,6 +96,8 @@ private:

  // Function to tell if we should encode a character.
  PRBool ShouldURLEscape(const unsigned char c) const;
+
+  nsAutoPtr<Charmap> mEscapeCharmap;
 };

 #endif // nsUrlClassifierUtils_h_
--- a/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml
+++ b/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml
@ -235,29 +235,43 @@ for (var key in testing) {
     "parseIPAddress broken on " + key + "(got: " + l.parseIPAddress_(key));
 }

-// Test escapeString (bug 368998)
+// Test escapeHostname (bug 368998)
 testing = {
  "asdf!@#$a": "asdf%21%40%23%24a",
  "AB CD 12354": "AB%20CD%2012354",
-  "\1\2\3\4\112\177": "%01%02%03%04J%7f",
-  "<>.AS/-+": "%3c%3e.AS%2f-%2b"
+  "\1\2\3\4\112\177": "%01%02%03%04J%7F",
+  "<>.AS/-+": "%3C%3E.AS%2F-%2B"
 };
+var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
+               .getService(Ci.nsIUrlClassifierUtils);
 for (var key in testing) {
-  var out = l.escapeString_(key);
+  var out = urlUtils.escapeHostname(key);
  ok(out === testing[key],
     "escapeString broken on " + key + " (got: " + out + ")");
 }

-// escapeCharmap_ should be true for non-alphanumeric, non-hyphen, and
-// non-dot chars
+// Test a really long url (~130k).  getCanonicalHost takes about 55ms
+// on my 2.8ghz machine.
+var long_string = "x";
+for (var i = 0; i < 17; ++i) {
+  long_string += long_string;
+}
+var long_hostname_url = "http://" + long_string + "/foo";
+var startTime = Date.now();
+var out = l.getCanonicalHost(long_hostname_url);
+var endTime = Date.now();
+ok(out == long_string, "getCanonicalHost on long string (" +
+                       (endTime - startTime) + "ms)");
+
+// Verify that each character is escaped properly.
 for (var i = 0; i < 256; ++i) {
  var chr = String.fromCharCode(i);
  if ( (chr.toLowerCase() >= 'a' && chr.toLowerCase() <= 'z') ||
       (chr >= '0' && chr <= '9') ||
       '.' == chr || '-' == chr) {
-    ok(!l.escapeCharmap_.contains(chr), 'failed on ' + i);
+    ok(urlUtils.escapeHostname(chr).length == 1, 'failed on ' + i);
  } else {
-    ok(l.escapeCharmap_.contains(chr), 'failed on ' + i);
+    ok(urlUtils.escapeHostname(chr).length == 3, 'failed on ' + i);
  }
 }

@ -320,6 +334,14 @@ for (var key in testing) {
     "getCanonicalUrl broken on: " + key + "(got: " + l.getCanonicalUrl(key) + ")");
 }

+// Test for a really long url.  This 130k url takes about 80ms
+// on my 2.8ghz machine.
+startTime = Date.now();
+out = l.getCanonicalUrl(long_hostname_url);
+endTime = Date.now();
+ok(out == long_hostname_url, "getCanonicalUrl on long string (" +
+                       (endTime - startTime) + "ms)");
+
 // Test getlookupkey
 var testing = {};
 testing["www.google.com"] = "AF5638A09FDDDAFF5B7A6013B1BE69A9";