Bug 338252: sourceTextEncoding isn't always properly parsed (fixes some Sherlock plugins with non standard character encodings), r=mconnor

2006-05-29 18:48:42 +00:00 · 2006-05-29 18:48:42 +00:00 · 9c114f257f
--- a/browser/components/search/nsSearchService.js
+++ b/browser/components/search/nsSearchService.js
@ -505,31 +505,19 @@ function bytesToString(aBytes, aCharset) {
 /**
 * Converts an array of bytes representing a Sherlock file into an array of
 * lines representing the useful data from the file.
+ *
+ * @param aBytes
+ *        The array of bytes representing the Sherlock file.
+ * @param aCharsetCode
+ *        An integer value representing a character set code to be passed to
+ *        fileCharsetFromCode, or null for the default Sherlock encoding.
 */
-function sherlockBytesToData(aBytes) {
-  // Sherlock files can specify the file encoding they use in the file
-  // itself, using the sourceTextEncoding attribute. We read only ASCII
-  // bytes here to see if we need to reinterpret the byte stream.
-
-  // XXX If convertFromByteArray provided a way to ignore or replace
-  // invalid byte sequences, we could use it here and avoid doing this
-  // ourselves.
-  var asciiBytes = aBytes.filter(function (n) {return !(0x80 & n);});
-  var asciiString = String.fromCharCode.apply(null, asciiBytes);
-  asciiString = asciiString.split(NEW_LINES).filter(isUsefulLine)
-                           .join("\n");
-
-  // Look for the sourceTextEncoding attribute. It's value should be an
-  // integer that maps to one of the encodings in fileCharsetFromCode.
-  const sourceTextEncoding = /sourceTextEncoding\s*=['"](\d)['"]/i;
-  var sourceTE = sourceTextEncoding.exec(asciiString);
-  if (sourceTE && sourceTE.length > 1)
-    charset = fileCharsetFromCode(sourceTE[1]);
-  else
-    charset = fileCharsetFromCode(/* get the default */);
+function sherlockBytesToLines(aBytes, aCharsetCode) {
+  // fileCharsetFromCode returns the default encoding if aCharsetCode is null
+  var charset = fileCharsetFromCode(aCharsetCode);

  var dataString = bytesToString(aBytes, charset);
-  ENSURE(dataString, "_onLoad: Couldn't convert byte array!",
+  ENSURE(dataString, "sherlockBytesToLines: Couldn't convert byte array!",
         Cr.NS_ERROR_FAILURE);

  // Split the string into lines, and filter out comments and
@ -850,7 +838,7 @@ function Engine(aLocation, aSourceDataType, aIsReadOnly) {
 Engine.prototype = {
  // The engine's alias.
  _alias: null,
-  // The data describing the engine. Is either an array of lines, for Sherlock
+  // The data describing the engine. Is either an array of bytes, for Sherlock
  // files, or an XML document element, for XML plugins.
  _data: null,
  // The engine's data type. See data types (DATA_) defined above.
@ -907,7 +895,7 @@ Engine.prototype = {
        binaryInStream.setInputStream(fileInStream);

        var bytes = binaryInStream.readByteArray(binaryInStream.available());
-        this._data = sherlockBytesToData(bytes);
+        this._data = bytes;

        break;
      default:
@ -983,7 +971,7 @@ Engine.prototype = {
        aEngine._data = doc.documentElement;
        break;
      case SEARCH_DATA_TEXT:
-        aEngine._data = sherlockBytesToData(aBytes);
+        aEngine._data = aBytes;
        break;
      default:
        onError();
@ -1445,7 +1433,34 @@ Engine.prototype = {
      throw Cr.NS_ERROR_FAILURE;
    }

-    var searchSection = getSection(this._data, "search");
+    // First try converting our byte array using the default Sherlock encoding.
+    // If this fails, or if we find a sourceTextEncoding attribute, we need to
+    // reconvert the byte array using the specified encoding.
+    var sherlockLines, searchSection, sourceTextEncoding;
+    try {
+      sherlockLines = sherlockBytesToLines(this._data);
+      searchSection = getSection(sherlockLines, "search");
+      sourceTextEncoding = parseInt(searchSection["sourcetextencoding"]);
+      if (sourceTextEncoding) {
+        // Re-convert the bytes using the found sourceTextEncoding
+        sherlockLines = sherlockBytesToLines(this._data, sourceTextEncoding);
+        searchSection = getSection(sherlockLines, "search");
+      }
+    } catch (ex) {
+      // The conversion using the default charset failed. Remove any non-ascii
+      // bytes and try to find a sourceTextEncoding.
+      var asciiBytes = this._data.filter(function (n) {return !(0x80 & n);});
+      var asciiString = String.fromCharCode.apply(null, asciiBytes);
+      sherlockLines = asciiString.split(NEW_LINES).filter(isUsefulLine);
+      searchSection = getSection(sherlockLines, "search");
+      sourceTextEncoding = parseInt(searchSection["sourcetextencoding"]);
+      if (sourceTextEncoding) {
+        sherlockLines = sherlockBytesToLines(this._data, sourceTextEncoding);
+        searchSection = getSection(sherlockLines, "search");
+      } else
+        ERROR("Couldn't find a working charset", Cr.NS_ERROR_FAILURE);
+    }
+
    LOG("_parseAsSherlock: Search section:\n" + searchSection.toSource());

    this._name = searchSection["name"] || err("Missing name!");
@ -1457,7 +1472,7 @@ Engine.prototype = {
    var method = (searchSection["method"] || "GET").toUpperCase();
    var template = searchSection["action"] || err("Missing action!");

-    var inputs        = getInputs(this._data);
+    var inputs = getInputs(sherlockLines);
    LOG("_parseAsSherlock: Inputs:\n" + inputs.toSource());

    var url = null;