From c92d15f89f7afc6b93f0f6e55a6611a73ebce664 Mon Sep 17 00:00:00 2001
From: "gavin%gavinsharp.com" <gavin%gavinsharp.com>
Date: Wed, 17 May 2006 01:37:49 +0000
Subject: [PATCH] Bug 335878: sherlock files that use non-standard character
 encodings aren't displayed correctly, r=mconnor

---
 browser/components/search/nsSearchService.js | 382 +++++++++++--------
 1 file changed, 225 insertions(+), 157 deletions(-)

diff --git a/browser/components/search/nsSearchService.js b/browser/components/search/nsSearchService.js
index 51f23cfa611..eb91e7d4118 100755
--- a/browser/components/search/nsSearchService.js
+++ b/browser/components/search/nsSearchService.js
@@ -79,6 +79,8 @@ const ICON_DATAURL_PREFIX = "data:image/x-icon;base64,";
 // Supported extensions for Sherlock plugin icons
 const SHERLOCK_ICON_EXTENSIONS = [".gif", ".png", ".jpg", ".jpeg"];
 
+const NEW_LINES = /(\r\n|\r|\n)/;
+
 // Set an arbitrary cap on the maximum icon size. Without this, large icons can
 // cause big delays when loading them at startup.
 const MAX_ICON_SIZE   = 10000;
@@ -253,19 +255,21 @@ function b64(aBytes) {
   return out;
 }
 
-function iconLoadListener(aChannel, aEngine) {
+function loadListener(aChannel, aEngine, aCallback) {
   this._countRead = 0;
   this._channel = aChannel;
   this._bytes = [],
   this._engine = aEngine;
+  this._callback = aCallback;
 }
-iconLoadListener.prototype = {
+loadListener.prototype = {
+  _callback: null,
   _channel: null,
   _countRead: 0,
   _engine: null,
   _stream: null,
 
-  QueryInterface: function SRCH_iconLoad_QI(aIID) {
+  QueryInterface: function SRCH_loadQI(aIID) {
     if (aIID.equals(Ci.nsISupports)           ||
         aIID.equals(Ci.nsIRequestObserver)    ||
         aIID.equals(Ci.nsIStreamListener)     ||
@@ -281,40 +285,29 @@ iconLoadListener.prototype = {
   },
 
   // nsIRequestObserver
-  onStartRequest: function SRCH_iconLoadStartR(aRequest, aContext) {
-    LOG("iconLoadListener: Starting icon request.");
+  onStartRequest: function SRCH_loadStartR(aRequest, aContext) {
+    LOG("loadListener: Starting request: " + aRequest.name);
     this._stream = Cc["@mozilla.org/binaryinputstream;1"].
                    createInstance(Ci.nsIBinaryInputStream);
   },
 
-  onStopRequest: function SRCH_iconLoadStopR(aRequest, aContext, aStatusCode) {
-    LOG("iconLoadListener: Stopping icon request.");
-    var httpChannel = this._channel.QueryInterface(Ci.nsIHttpChannel);
-    if ((httpChannel && httpChannel.requestSucceeded) &&
-        Components.isSuccessCode(aStatusCode)         &&
-        this._countRead > 0) {
-
-      if (this._countRead < MAX_ICON_SIZE) {
-        var str = b64(this._bytes);
-        this._engine._iconURI = makeURI(ICON_DATAURL_PREFIX + str);
-
-        // The engine might not have a file yet, if it's being downloaded,
-        // because the request for the engine file itself (_onLoad) may not yet
-        // have occured. In that case, this change will be written to file when
-        // _onLoad is called.
-        if (this._engine._file)
-          this._engine._serializeToFile();
-        notifyAction(this._engine, SEARCH_ENGINE_CHANGED);
-      }
+  onStopRequest: function SRCH_loadStopR(aRequest, aContext, aStatusCode) {
+    LOG("loadListener: Stopping request: " + aRequest.name);
+    if (Components.isSuccessCode(aStatusCode) && this._countRead > 0)
+      this._callback(this._bytes, this._engine);
+    else {
+      LOG("loadListener: request failed!");
+      // send null so the callback can deal with the failure
+      this._callback(null, this._engine);
     }
     this._channel = null;
     this._engine  = null;
   },
 
   // nsIStreamListener
-  onDataAvailable: function SRCH_iconLoadDAvailable(aRequest, aContext,
-                                                    aInputStream, aOffset,
-                                                    aCount) {
+  onDataAvailable: function SRCH_loadDAvailable(aRequest, aContext,
+                                                aInputStream, aOffset,
+                                                aCount) {
     this._stream.setInputStream(aInputStream);
 
     // Get a byte array of the data
@@ -323,13 +316,13 @@ iconLoadListener.prototype = {
   },
 
   // nsIChannelEventSink
-  onChannelRedirect: function SRCH_iconLoadCRedirect(aOldChannel, aNewChannel,
-                                                     aFlags) {
+  onChannelRedirect: function SRCH_loadCRedirect(aOldChannel, aNewChannel,
+                                                 aFlags) {
     this._channel = aNewChannel;
   },
 
   // nsIInterfaceRequestor
-  getInterface: function SRCH_iconLoad_GI(aIID) {
+  getInterface: function SRCH_load_GI(aIID) {
     return this.QueryInterface(aIID);
   },
 
@@ -409,57 +402,140 @@ function getDir(aKey) {
   return dir;
 }
 
-// This isn't a full list - this is just copied over from
-// nsInternetSearchService to maintain backwards compat with Firefox 1.0.x
-const kCharsetCodes = [];
-kCharsetCodes[0] = "x-mac-roman";
-kCharsetCodes[6] = "x-mac-greek";
-kCharsetCodes[35] = "x-mac-turkish";
-kCharsetCodes[513] = "ISO-8859-1";
-kCharsetCodes[514] = "ISO-8859-2";
-kCharsetCodes[517] = "ISO-8859-5";
-kCharsetCodes[518] = "ISO-8859-6";
-kCharsetCodes[519] = "ISO-8859-7";
-kCharsetCodes[520] = "ISO-8859-8";
-kCharsetCodes[521] = "ISO-8859-9";
-kCharsetCodes[1049] = "IBM864";
-kCharsetCodes[1280] = "windows-1252";
-kCharsetCodes[1281] = "windows-1250";
-kCharsetCodes[1282] = "windows-1251";
-kCharsetCodes[1283] = "windows-1253";
-kCharsetCodes[1284] = "windows-1254";
-kCharsetCodes[1285] = "windows-1255";
-kCharsetCodes[1286] = "windows-1256";
-kCharsetCodes[1536] = "us-ascii";
-kCharsetCodes[1584] = "GB2312";
-kCharsetCodes[1585] = "x-gbk";
-kCharsetCodes[1600] = "EUC-KR";
-kCharsetCodes[2080] = "ISO-2022-JP";
-kCharsetCodes[2096] = "ISO-2022-CN";
-kCharsetCodes[2112] = "ISO-2022-KR";
-kCharsetCodes[2336] = "EUC-JP";
-kCharsetCodes[2352] = "GB2312";
-kCharsetCodes[2353] = "x-euc-tw";
-kCharsetCodes[2368] = "EUC-KR";
-kCharsetCodes[2561] = "Shift_JIS";
-kCharsetCodes[2562] = "KOI8-R";
-kCharsetCodes[2563] = "Big5";
-kCharsetCodes[2565] = "HZ-GB-2312";
-
 /**
- * Gets a character set name from a given code.
- * @param aCode
- *        One of the codes from the kCharsetCodes table, representing the
- *        requested charset.
- * @returns the requested character set name, or the default character set name
- *          if it doesn't exist.
+ * The following two functions are essentially copied from
+ * nsInternetSearchService. They are required for backwards compatibility.
  */
-function getCharSetFromCode(aCode) {
-  if (kCharsetCodes[aCode])
-    return kCharsetCodes[aCode];
+function queryCharsetFromCode(aCode) {
+  const codes = [];
+  codes[0] = "x-mac-roman";
+  codes[6] = "x-mac-greek";
+  codes[35] = "x-mac-turkish";
+  codes[513] = "ISO-8859-1";
+  codes[514] = "ISO-8859-2";
+  codes[517] = "ISO-8859-5";
+  codes[518] = "ISO-8859-6";
+  codes[519] = "ISO-8859-7";
+  codes[520] = "ISO-8859-8";
+  codes[521] = "ISO-8859-9";
+  codes[1049] = "IBM864";
+  codes[1280] = "windows-1252";
+  codes[1281] = "windows-1250";
+  codes[1282] = "windows-1251";
+  codes[1283] = "windows-1253";
+  codes[1284] = "windows-1254";
+  codes[1285] = "windows-1255";
+  codes[1286] = "windows-1256";
+  codes[1536] = "us-ascii";
+  codes[1584] = "GB2312";
+  codes[1585] = "x-gbk";
+  codes[1600] = "EUC-KR";
+  codes[2080] = "ISO-2022-JP";
+  codes[2096] = "ISO-2022-CN";
+  codes[2112] = "ISO-2022-KR";
+  codes[2336] = "EUC-JP";
+  codes[2352] = "GB2312";
+  codes[2353] = "x-euc-tw";
+  codes[2368] = "EUC-KR";
+  codes[2561] = "Shift_JIS";
+  codes[2562] = "KOI8-R";
+  codes[2563] = "Big5";
+  codes[2565] = "HZ-GB-2312";
+  
+  if (codes[aCode])
+    return codes[aCode];
 
   return getLocalizedPref("intl.charset.default", DEFAULT_QUERY_CHARSET);
 }
+function fileCharsetFromCode(aCode) {
+  const codes = [
+    "x-mac-roman",           // 0
+    "Shift_JIS",             // 1
+    "Big5",                  // 2
+    "EUC-KR",                // 3
+    "X-MAC-ARABIC",          // 4
+    "X-MAC-HEBREW",          // 5
+    "X-MAC-GREEK",           // 6
+    "X-MAC-CYRILLIC",        // 7
+    "X-MAC-DEVANAGARI" ,     // 9
+    "X-MAC-GURMUKHI",        // 10
+    "X-MAC-GUJARATI",        // 11
+    "X-MAC-ORIYA",           // 12
+    "X-MAC-BENGALI",         // 13
+    "X-MAC-TAMIL",           // 14
+    "X-MAC-TELUGU",          // 15
+    "X-MAC-KANNADA",         // 16
+    "X-MAC-MALAYALAM",       // 17
+    "X-MAC-SINHALESE",       // 18
+    "X-MAC-BURMESE",         // 19
+    "X-MAC-KHMER",           // 20
+    "X-MAC-THAI",            // 21
+    "X-MAC-LAOTIAN",         // 22
+    "X-MAC-GEORGIAN",        // 23
+    "X-MAC-ARMENIAN",        // 24
+    "GB2312",                // 25
+    "X-MAC-TIBETAN",         // 26
+    "X-MAC-MONGOLIAN",       // 27
+    "X-MAC-ETHIOPIC",        // 28
+    "X-MAC-CENTRALEURROMAN", // 29
+    "X-MAC-VIETNAMESE",      // 30
+    "X-MAC-EXTARABIC"        // 31
+  ];
+  // Sherlock files have always defaulted to x-mac-roman, so do that here too
+  return codes[aCode] || codes[0];
+}
+
+/**
+ * Returns a string interpretation of aBytes using aCharset, or null on
+ * failure.
+ */
+function bytesToString(aBytes, aCharset) {
+  var converter = Cc["@mozilla.org/intl/scriptableunicodeconverter"].
+                  createInstance(Ci.nsIScriptableUnicodeConverter);
+  LOG("bytesToString: converting using charset: " + aCharset);
+
+  try {
+    converter.charset = aCharset;
+    return converter.convertFromByteArray(aBytes, aBytes.length);
+  } catch (ex) {}
+
+  return null;
+}
+
+/**
+ * Converts an array of bytes representing a Sherlock file into an array of
+ * lines representing the useful data from the file.
+ */
+function sherlockBytesToData(aBytes) {
+  // Sherlock files can specify the file encoding they use in the file
+  // itself, using the sourceTextEncoding attribute. We read only ASCII
+  // bytes here to see if we need to reinterpret the byte stream.
+
+  // XXX If convertFromByteArray provided a way to ignore or replace
+  // invalid byte sequences, we could use it here and avoid doing this
+  // ourselves.
+  var asciiBytes = aBytes.filter(function (n) {return !(0x80 & n);});
+  var asciiString = String.fromCharCode.apply(null, asciiBytes);
+  asciiString = asciiString.split(NEW_LINES).filter(isUsefulLine)
+                           .join("\n");
+
+  // Look for the sourceTextEncoding attribute. It's value should be an
+  // integer that maps to one of the encodings in fileCharsetFromCode.
+  const sourceTextEncoding = /sourceTextEncoding\s*=['"](\d)['"]/i;
+  var sourceTE = sourceTextEncoding.exec(asciiString);
+  if (sourceTE && sourceTE.length > 1)
+    charset = fileCharsetFromCode(sourceTE[1]);
+  else
+    charset = fileCharsetFromCode(/* get the default */);
+
+  var dataString = bytesToString(aBytes, charset);
+  ENSURE(dataString, "_onLoad: Couldn't convert byte array!",
+         Cr.NS_ERROR_FAILURE);
+
+  // Split the string into lines, and filter out comments and
+  // whitespace-only lines
+  return dataString.split(NEW_LINES).filter(isUsefulLine);
+}
 
 /**
  * Wrapper for nsIPrefBranch::getComplexValue.
@@ -528,9 +604,6 @@ function getBoolPref(aName, aDefault) {
  * @param aName
  *        A name to "sanitize". Can be an empty string, in which case a random
  *        8 character filename will be produced.
- * @param aExt
- *        A file extension to use for the file. If not provided, defaults to
- *        XML_FILE_EXT.
  * @returns A nsIFile object in the user's search engines directory with a
  *          unique sanitized name.
  */
@@ -560,8 +633,9 @@ function sanitizeName(aName) {
 
   if (!name) {
     // Our input had no valid characters - use a random name
+    var cl = chars.length - 1;
     for (var i = 0; i < 8; ++i)
-      name += chars.charAt(Math.round(Math.random() * (chars.length - 1)));
+      name += chars.charAt(Math.round(Math.random() * cl));
   }
 
   return name;
@@ -789,9 +863,6 @@ Engine.prototype = {
   _file: null,
   // Whether the engine is hidden from the user.
   _hidden: null,
-  // The XMLHTTPRequest object used to download the engine.
-  // (null for engines loaded from disk)
-  _req: null,
   // The engine's name.
   _name: null,
   // The engine type. See engine types (TYPE_) defined above.
@@ -831,20 +902,12 @@ Engine.prototype = {
         this._data = doc.documentElement;
         break;
       case SEARCH_DATA_TEXT:
-        fileInStream.QueryInterface(Ci.nsILineInputStream);
+        var binaryInStream = Cc["@mozilla.org/binaryinputstream;1"].
+                             createInstance(Ci.nsIBinaryInputStream);
+        binaryInStream.setInputStream(fileInStream);
 
-        var line = { value: "" };
-        var more = false;
-        var lines = [];
-
-        do {
-          more = fileInStream.readLine(line);
-          // Filter out comments and whitespace-only lines
-          if (isUsefulLine(line.value))
-            lines.push(line.value);
-        } while (more);
-
-        this._data = lines;
+        var bytes = binaryInStream.readByteArray(binaryInStream.available());
+        this._data = sherlockBytesToData(bytes);
 
         break;
       default:
@@ -868,49 +931,14 @@ Engine.prototype = {
                 Cr.NS_ERROR_UNEXPECTED);
 
     LOG("_initFromURI: Downloading engine from: \"" + this._uri.spec + "\".");
-    var mimeType = "";
-    switch (this._dataType) {
-      case SEARCH_DATA_XML:
-        mimeType = "text/xml";
-        break;
-      case SEARCH_DATA_TEXT:
-        mimeType = "text/plain";
-        break;
-      default:
-        ERROR("Bogus engine _dataType: \"" + this._dataType + "\"",
-              Cr.NS_ERROR_UNEXPECTED);
-    }
 
-    this._req = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].
-                createInstance(Ci.nsIXMLHttpRequest);
-    this._req.open("GET", this._uri.spec, true);
-    this._req.overrideMimeType(mimeType);
-    this._req.setRequestHeader("Cache-Control", "no-cache");
+    var ios = Cc["@mozilla.org/network/io-service;1"].
+              getService(Ci.nsIIOService);
+    var chan = ios.newChannelFromURI(this._uri);
 
-    var self = this;
-    this._req.send(null);
-    this._req.onerror = function (event) { self._onError(event); };
-    this._req.onload  = function (event) { self._onLoad(event);  };
-  },
-
-  /**
-   * Handle an error during the load of an engine by prompting the user to
-   * notify him that the load failed.
-   */
-  _onError: function SRCH_ENG_onError(aEvent) {
-    var sbs = Cc["@mozilla.org/intl/stringbundle;1"].
-              getService(Ci.nsIStringBundleService);
-    var searchBundle = sbs.createBundle(SEARCH_BUNDLE);
-    var brandBundle = sbs.createBundle(BRAND_BUNDLE);
-    var brandName = brandBundle.GetStringFromName("brandShortName");
-    var title = searchBundle.GetStringFromName("error_loading_engine_title");
-    var text = searchBundle.formatStringFromName("error_loading_engine_msg",
-                                                 [brandName, this._location],
-                                                 2);
-
-    var ww = Cc["@mozilla.org/embedcomp/window-watcher;1"].
-             getService(Ci.nsIWindowWatcher);
-    ww.getNewPrompter(null).alert(title, text);
+    var listener = new loadListener(chan, this, this._onLoad);
+    chan.notificationCallbacks = listener;
+    chan.asyncOpen(listener, null);
   },
 
   /**
@@ -918,45 +946,65 @@ Engine.prototype = {
    * triggers parsing of the data. The engine is then flushed to disk. Notifies
    * the search service once initialization is complete.
    */
-  _onLoad: function SRCH_ENG_onLoad(aEvent) {
+  _onLoad: function SRCH_ENG_onLoad(aBytes, aEngine) {
+    /**
+     * Handle an error during the load of an engine by prompting the user to
+     * notify him that the load failed.
+     */
+    function onError() {
+      var sbs = Cc["@mozilla.org/intl/stringbundle;1"].
+                getService(Ci.nsIStringBundleService);
+      var searchBundle = sbs.createBundle(SEARCH_BUNDLE);
+      var brandBundle = sbs.createBundle(BRAND_BUNDLE);
+      var brandName = brandBundle.GetStringFromName("brandShortName");
+      var title = searchBundle.GetStringFromName("error_loading_engine_title");
+      var text = searchBundle.formatStringFromName("error_loading_engine_msg",
+                                                   [brandName, aEngine._location],
+                                                   2);
 
-    var httpChannel = this._req.channel.QueryInterface(Ci.nsIHttpChannel);
-    if (this._req.readyState != 4 || !httpChannel.requestSucceeded) {
-      this._onError();
-      LOG("_onLoad: Request for " + this._location + " failed!");
+      var ww = Cc["@mozilla.org/embedcomp/window-watcher;1"].
+               getService(Ci.nsIWindowWatcher);
+      ww.getNewPrompter(null).alert(title, text);
+    }
+
+    if (!aBytes) {
+      onError();
       return;
     }
 
-    switch (this._dataType) {
+    switch (aEngine._dataType) {
       case SEARCH_DATA_XML:
-        this._data = this._req.responseXML.documentElement;
+        var dataString = bytesToString(aBytes, "UTF-8");
+        ENSURE(dataString, "_onLoad: Couldn't convert byte array!",
+               Cr.NS_ERROR_FAILURE);
+        var parser = Cc["@mozilla.org/xmlextras/domparser;1"].
+                     createInstance(Ci.nsIDOMParser);
+        var doc = parser.parseFromString(dataString, "text/xml");
+        aEngine._data = doc.documentElement;
         break;
       case SEARCH_DATA_TEXT:
-        this._data = this._req.responseText.split(/(\r\n|\n\r|\r|\n)/);
-
-        // Filter out comments and whitespace-only lines.
-        this._data.filter(isUsefulLine);
+        aEngine._data = sherlockBytesToData(aBytes);
         break;
       default:
-        this._onError();
+        onError();
         LOG("_onLoad: Bogus engine _dataType: \"" + this._dataType + "\"");
         return;
     }
     try {
       // Initialize the engine from the obtained data
-      this._initFromData();
+      aEngine._initFromData();
     } catch (ex) {
-      // Report an error to the user
       LOG("_onLoad: Failed to init engine!\n" + ex);
-      this._onError();
+      // Report an error to the user
+      onError();
       return;
     }
 
     // Write the engine to file
-    this._serializeToFile();
+    aEngine._serializeToFile();
 
     // Notify the search service of the sucessful load
-    notifyAction(this, SEARCH_ENGINE_LOADED);
+    notifyAction(aEngine, SEARCH_ENGINE_LOADED);
   },
 
   /**
@@ -993,7 +1041,27 @@ Engine.prototype = {
           var ios = Cc["@mozilla.org/network/io-service;1"].
                     getService(Ci.nsIIOService);
           var chan = ios.newChannelFromURI(uri);
-          var listener = new iconLoadListener(chan, this);
+
+          function iconLoadCallback(aByteArray, aEngine) {
+            if (!aByteArray || aByteArray.length > MAX_ICON_SIZE) {
+              LOG("iconLoadCallback: engine too large!");
+              return;
+            }
+
+            var str = b64(aByteArray);
+            aEngine._iconURI = makeURI(ICON_DATAURL_PREFIX + str);
+
+            // The engine might not have a file yet, if it's being downloaded,
+            // because the request for the engine file itself (_onLoad) may not
+            // yet be complete. In that case, this change will be written to
+            // file when _onLoad is called.
+            if (aEngine._file)
+              aEngine._serializeToFile();
+
+            notifyAction(aEngine, SEARCH_ENGINE_CHANGED);
+          }
+
+          var listener = new loadListener(chan, this, iconLoadCallback);
           chan.notificationCallbacks = listener;
           chan.asyncOpen(listener, null);
         }
@@ -1383,7 +1451,7 @@ Engine.prototype = {
     this._name = searchSection["name"] || err("Missing name!");
     this._description = searchSection["description"] || "";
     this._queryCharset = searchSection["querycharset"] ||
-                            getCharSetFromCode(searchSection["queryencoding"]);
+                         queryCharsetFromCode(searchSection["queryencoding"]);
 
     // XXX should this really fall back to GET?
     var method = (searchSection["method"] || "GET").toUpperCase();
@@ -1609,7 +1677,7 @@ Engine.prototype = {
   get queryCharset() {
     if (this._queryCharset)
       return this._queryCharset;
-    return this._queryCharset = getCharSetFromCode(/* get the default */);
+    return this._queryCharset = queryCharsetFromCode(/* get the default */);
   },
 
   addParam: function SRCH_ENG_addParam(aName, aValue) {