Use ruleClassify() instead of lwcaClassify(), and format output of ruleClassify() to be appropriate for the front-end.

2015-01-16 15:50:13 -05:00 · 2015-01-16 15:50:13 -05:00 · 8d39c18b62
--- a/data/interests/interestsWorker.js
+++ b/data/interests/interestsWorker.js
@ -5,7 +5,6 @@
 "use strict";

 importScripts("tokenizerFactory.js");
-importScripts("lwca_refined.js");

 function InterestsWorkerError(message) {
    this.name = "InterestsWorkerError";
@ -22,7 +21,6 @@ InterestsWorkerError.prototype.constructor = InterestsWorkerError;
 let gNamespace = null;
 let gRegionCode = null;
 let gTokenizer = null;
-let gLWCAClassifier = null;
 let gInterestsData = null;

 // XXX The original splitter doesn't apply to chinese:
@ -31,8 +29,6 @@ const kSplitter = /[\s-]+/;

 // bootstrap the worker with data and models
 function bootstrap(aMessageData) {
-  gLWCAClassifier = new LWCAClassifier(aMessageData);
-
  // expects : {interestsData, interestsDataType, interestsUrlStopwords, workerRegionCode}
  gRegionCode = aMessageData.workerRegionCode;

@ -124,6 +120,46 @@ function parseVisit(host, baseDomain, path, title, url, options) {
  return words;
 }

+function formatClassificationResults(cats) {
+  let formattedClassification = {};
+  if (cats.length == 0) {
+    return [{"category": "uncategorized", "subcat": "dummy"}];
+  }
+
+  // populate formattedClassification object with top and sub categories
+  for (let i = 0; i < cats.length; i++) {
+    let cat = cats[i];
+    if (cat.indexOf("/") == -1) {
+      // This is a top category
+      if (!formattedClassification[cat]) {
+        formattedClassification[cat] = "general_" + i;
+      }
+    }
+    else {
+      // this is a subcategory
+      let chunks = cat.split("/");
+      let topCat = chunks[0];
+      let subCat = chunks[1];
+      if (!formattedClassification[topCat] ||
+          formattedClassification[topCat].indexOf("general") != -1) {
+        formattedClassification[topCat] = subCat + "_" + i;
+      }
+    }
+  }
+
+  // The final result is an ordered list of {category: <>, subcat: <>}.
+  let finalResult = new Array(Object.keys(formattedClassification).length);
+  for (let category in formattedClassification) {
+    let split = formattedClassification[category].split("_");
+    let subcat = split[0];
+    let index = split[1];
+    finalResult[index] = {"category": category, "subcat": subcat};
+  }
+
+
+  return finalResult;
+}
+
 // classify a page using rules
 function ruleClassify({host, baseDomain, path, title, url}) {
  let interests = [];
@ -180,19 +216,6 @@ function ruleClassify({host, baseDomain, path, title, url}) {
  return interestFinalizer(interests);
 }

-function lwcaClassify({url, title}) {
-  try {
-    if (url && title && gNamespace == "58-cat") {
-      let classification = gLWCAClassifier.classify(url, title);
-      let subcat = classification[1].split("/")[0];
-      return {"category": [classification[0]], "subcat": subcat};
-    }
-  } catch (ex) {
-    console.log(ex);
-  }
-  return [];
-}
-
 // Figure out which interests are associated to the document
 function getInterestsForDocument(aMessageData) {

@ -214,22 +237,11 @@ function getInterestsForDocument(aMessageData) {
  aMessageData.message = "InterestsForDocument";
  aMessageData.namespace = gNamespace;

-  // we need to submit 3 messages
-  // - for rule classification
-  // - for keyword classification
-  // - for combined classification
-  let interests = [];
-  let results = [];
+  // Submitting a msg for rule classification
  try {
-    interests = lwcaClassify(aMessageData);
-    if (Object.keys(interests).length > 0) {
-      results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
-    }
-
-    interests = ruleClassify(aMessageData);
-    results.push({type: "rules", interests: dedupeInterests(interests)});
-
-    let rulesWorked = interests.length > 0;
+    let interests = ruleClassify(aMessageData);
+    let formatted = formatClassificationResults(interests);
+    let results = [{type: "rules", interests: formatted}];

    aMessageData.results = results;
    self.postMessage(aMessageData);
--- a/lib/WorkerFactory.js
+++ b/lib/WorkerFactory.js
@ -28,11 +28,11 @@ const INTEREST_LOCALES = {
    modelNames: ["41-cat"],
  },
  "en-US": {
-    mainTaxonomyModel: "edrules",
+    mainTaxonomyModel: "dfr_rules",
    rankersDef: [
      {type: "rules", namespace: "edrules"}
    ],
-    modelNames: ["58-cat", "edrules", "edrules_extended"],
+    modelNames: ["58-cat", "edrules", "edrules_extended", "dfr_rules"],
  },
 };

--- a/lib/streams/dailyInterestsSpout.js
+++ b/lib/streams/dailyInterestsSpout.js
@ -83,16 +83,17 @@ let DailyInterestsSpout = {
          });

          results.forEach(item => {
-            let {type, interests, subcat} = item;
-            if (type != "lwca") {
-              return;
+            let {type, interests} = item;
+            let index = 0;
+            while (interests[index] === null) {
+              index++; // Finding first non-null index.
            }
-            interests.forEach(interest => {
-              if (interest == "uncategorized" || interest.indexOf("__") > -1) return;
+            let category = interests[index].category;
+            let subcat = interests[index].subcat;
+            if (category == "uncategorized" || category.indexOf("__") > -1) return;
              Object.keys(dateVisits).forEach(date => {
-                this._addDomain(host, dateVisits[date], visitIDs[date][0], interest);
-                this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, interest, subcat);
-              });
+              this._addDomain(host, dateVisits[date], visitIDs[date][0], category);
+              this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, category, subcat);
            });
          });
        }
--- a/lib/streams/interestDashboardDataProcessorBolt.js
+++ b/lib/streams/interestDashboardDataProcessorBolt.js
@ -174,7 +174,7 @@ let InterestDashboardDataProcessorBolt = {
          this.debugReport = this.debugReport.concat(message.errorData);
          DataProcessorHelper.initChartInStorage("interestDashboardData", this.storage);
          this.storage.chartData.interestDashboardData.sortedDomains = {"all": [], "byInterest": {}};
-          let interestDashboardTypeNamespace = message.chartData["lwca"]["58-cat"];
+          let interestDashboardTypeNamespace = message.chartData.rules.dfr_rules;

          this.debugReport.push("Processing data for pie chart");
          let chartData = [];
--- a/lib/streams/spiderDataProcessorBolt.js
+++ b/lib/streams/spiderDataProcessorBolt.js
@ -114,7 +114,7 @@ let SpiderDataProcessorBolt = {

      ingest: function _HSB_ingest(message) {
        DataProcessorHelper.initChartInStorage("spiderData", this.storage);
-        let data = message.chartData.lwca["58-cat"];
+        let data = message.chartData.rules.dfr_rules;
        let categories = data.categories;

        this._mainNodes = [{"id": 0,