Use ruleClassify() instead of lwcaClassify(), and format output of ruleClassify() to be appropriate for the front-end.

This commit is contained in:
Marina Samuel 2015-01-16 15:50:13 -05:00
Родитель 52a2b9d3b3
Коммит 8d39c18b62
5 изменённых файлов: 57 добавлений и 44 удалений

Просмотреть файл

@ -5,7 +5,6 @@
"use strict";
importScripts("tokenizerFactory.js");
importScripts("lwca_refined.js");
function InterestsWorkerError(message) {
this.name = "InterestsWorkerError";
@ -22,7 +21,6 @@ InterestsWorkerError.prototype.constructor = InterestsWorkerError;
let gNamespace = null;
let gRegionCode = null;
let gTokenizer = null;
let gLWCAClassifier = null;
let gInterestsData = null;
// XXX The original splitter doesn't apply to chinese:
@ -31,8 +29,6 @@ const kSplitter = /[\s-]+/;
// bootstrap the worker with data and models
function bootstrap(aMessageData) {
gLWCAClassifier = new LWCAClassifier(aMessageData);
// expects : {interestsData, interestsDataType, interestsUrlStopwords, workerRegionCode}
gRegionCode = aMessageData.workerRegionCode;
@ -124,6 +120,46 @@ function parseVisit(host, baseDomain, path, title, url, options) {
return words;
}
/**
 * Convert a flat list of classified interests into the list of
 * {category, subcat} records returned to the caller.
 *
 * Each entry of `cats` is either a top-level category ("arts") or a
 * "top/sub" path ("arts/video_games"). At most one record is produced per
 * top-level category:
 *   - a bare top-level category yields the placeholder subcat "general";
 *   - the first "top/sub" entry seen for a category supplies its subcat and
 *     replaces a placeholder recorded earlier; later subcategories of the
 *     same top-level category are ignored.
 *
 * Each record is stored at the position (in `cats`) of the entry that
 * supplied it, so the returned array can contain holes when several entries
 * share a top-level category.
 *
 * @param {string[]} cats - interests in the order they were produced.
 * @returns {Array<{category: string, subcat: string}>} the formatted records;
 *          a single {"uncategorized", "dummy"} record when `cats` is empty.
 */
function formatClassificationResults(cats) {
  if (cats.length === 0) {
    return [{"category": "uncategorized", "subcat": "dummy"}];
  }

  // top category -> {subcat, index, isPlaceholder}.
  // Entries are kept as structured objects rather than an encoded
  // "subcat_index" string, so subcategory names containing "_" or the word
  // "general" are handled correctly. The map has no prototype, so category
  // names such as "constructor" cannot collide with inherited properties.
  let byTopCategory = Object.create(null);

  for (let i = 0; i < cats.length; i++) {
    let cat = cats[i];

    if (cat.indexOf("/") === -1) {
      // Top-level category: record a placeholder unless we already have
      // something (placeholder or real subcategory) for it.
      if (!byTopCategory[cat]) {
        byTopCategory[cat] = {subcat: "general", index: i, isPlaceholder: true};
      }
      continue;
    }

    // Subcategory: only the first two path segments are used.
    let chunks = cat.split("/");
    let topCat = chunks[0];
    let current = byTopCategory[topCat];
    if (!current || current.isPlaceholder) {
      byTopCategory[topCat] = {subcat: chunks[1], index: i, isPlaceholder: false};
    }
  }

  // The final result is an ordered list of {category: <>, subcat: <>},
  // positioned by the index of the entry that produced each record.
  let topCategories = Object.keys(byTopCategory);
  let finalResult = new Array(topCategories.length);
  for (let k = 0; k < topCategories.length; k++) {
    let category = topCategories[k];
    let entry = byTopCategory[category];
    finalResult[entry.index] = {"category": category, "subcat": entry.subcat};
  }
  return finalResult;
}
// classify a page using rules
function ruleClassify({host, baseDomain, path, title, url}) {
let interests = [];
@ -180,19 +216,6 @@ function ruleClassify({host, baseDomain, path, title, url}) {
return interestFinalizer(interests);
}
// Classify a page with the LWCA classifier.
// Only runs when both url and title are present and the active namespace is
// "58-cat"; returns {category: [<top category>], subcat: <first path segment
// of the classifier's second result>}. In every other case, including when
// the classifier throws (the exception is logged), an empty array is returned.
function lwcaClassify({url, title}) {
  try {
    if (!url || !title || gNamespace != "58-cat") {
      return [];
    }
    let lwcaResult = gLWCAClassifier.classify(url, title);
    let subcatSegments = lwcaResult[1].split("/");
    return {"category": [lwcaResult[0]], "subcat": subcatSegments[0]};
  } catch (ex) {
    console.log(ex);
    return [];
  }
}
// Figure out which interests are associated to the document
function getInterestsForDocument(aMessageData) {
@ -214,22 +237,11 @@ function getInterestsForDocument(aMessageData) {
aMessageData.message = "InterestsForDocument";
aMessageData.namespace = gNamespace;
// we need to submit 3 messages
// - for rule classification
// - for keyword classification
// - for combined classification
let interests = [];
let results = [];
// Submitting a msg for rule classification
try {
interests = lwcaClassify(aMessageData);
if (Object.keys(interests).length > 0) {
results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
}
interests = ruleClassify(aMessageData);
results.push({type: "rules", interests: dedupeInterests(interests)});
let rulesWorked = interests.length > 0;
let interests = ruleClassify(aMessageData);
let formatted = formatClassificationResults(interests);
let results = [{type: "rules", interests: formatted}];
aMessageData.results = results;
self.postMessage(aMessageData);

Просмотреть файл

@ -28,11 +28,11 @@ const INTEREST_LOCALES = {
modelNames: ["41-cat"],
},
"en-US": {
mainTaxonomyModel: "edrules",
mainTaxonomyModel: "dfr_rules",
rankersDef: [
{type: "rules", namespace: "edrules"}
],
modelNames: ["58-cat", "edrules", "edrules_extended"],
modelNames: ["58-cat", "edrules", "edrules_extended", "dfr_rules"],
},
};

Просмотреть файл

@ -83,16 +83,17 @@ let DailyInterestsSpout = {
});
results.forEach(item => {
let {type, interests, subcat} = item;
if (type != "lwca") {
return;
let {type, interests} = item;
let index = 0;
while (interests[index] === null) {
index++; // Finding first non-null index.
}
interests.forEach(interest => {
if (interest == "uncategorized" || interest.indexOf("__") > -1) return;
let category = interests[index].category;
let subcat = interests[index].subcat;
if (category == "uncategorized" || category.indexOf("__") > -1) return;
Object.keys(dateVisits).forEach(date => {
this._addDomain(host, dateVisits[date], visitIDs[date][0], interest);
this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, interest, subcat);
});
this._addDomain(host, dateVisits[date], visitIDs[date][0], category);
this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, category, subcat);
});
});
}

Просмотреть файл

@ -174,7 +174,7 @@ let InterestDashboardDataProcessorBolt = {
this.debugReport = this.debugReport.concat(message.errorData);
DataProcessorHelper.initChartInStorage("interestDashboardData", this.storage);
this.storage.chartData.interestDashboardData.sortedDomains = {"all": [], "byInterest": {}};
let interestDashboardTypeNamespace = message.chartData["lwca"]["58-cat"];
let interestDashboardTypeNamespace = message.chartData.rules.dfr_rules;
this.debugReport.push("Processing data for pie chart");
let chartData = [];

Просмотреть файл

@ -114,7 +114,7 @@ let SpiderDataProcessorBolt = {
ingest: function _HSB_ingest(message) {
DataProcessorHelper.initChartInStorage("spiderData", this.storage);
let data = message.chartData.lwca["58-cat"];
let data = message.chartData.rules.dfr_rules;
let categories = data.categories;
this._mainNodes = [{"id": 0,