Use ruleClassify() instead of lwcaClassify(), and format output of ruleClassify() to be appropriate for the front-end.
This commit is contained in:
Родитель
52a2b9d3b3
Коммит
8d39c18b62
|
@ -5,7 +5,6 @@
|
|||
"use strict";
|
||||
|
||||
importScripts("tokenizerFactory.js");
|
||||
importScripts("lwca_refined.js");
|
||||
|
||||
function InterestsWorkerError(message) {
|
||||
this.name = "InterestsWorkerError";
|
||||
|
@ -22,7 +21,6 @@ InterestsWorkerError.prototype.constructor = InterestsWorkerError;
|
|||
let gNamespace = null;
|
||||
let gRegionCode = null;
|
||||
let gTokenizer = null;
|
||||
let gLWCAClassifier = null;
|
||||
let gInterestsData = null;
|
||||
|
||||
// XXX The original splitter doesn't apply to chinese:
|
||||
|
@ -31,8 +29,6 @@ const kSplitter = /[\s-]+/;
|
|||
|
||||
// bootstrap the worker with data and models
|
||||
function bootstrap(aMessageData) {
|
||||
gLWCAClassifier = new LWCAClassifier(aMessageData);
|
||||
|
||||
// expects : {interestsData, interestsDataType, interestsUrlStopwords, workerRegionCode}
|
||||
gRegionCode = aMessageData.workerRegionCode;
|
||||
|
||||
|
@ -124,6 +120,46 @@ function parseVisit(host, baseDomain, path, title, url, options) {
|
|||
return words;
|
||||
}
|
||||
|
||||
/**
 * Convert the flat list of interest strings produced by rule classification
 * into the list of {category, subcat} records the front-end consumes.
 *
 * Each input entry is either a top-level category ("sports") or a
 * "top/sub" path ("sports/football"). Every top-level category appears at
 * most once in the output:
 *   - a bare top-level category gets the placeholder subcat "general";
 *   - the first "top/sub" entry seen for a category supplies its subcat and
 *     replaces a "general" placeholder recorded earlier;
 *   - later "top/sub" entries for the same category are ignored.
 *
 * Output order follows the position of the input entry that supplied each
 * category's final subcat. The returned array is dense (no holes), and
 * subcategory names are kept verbatim, including any "_" characters.
 *
 * @param {string[]} cats - interests, e.g. ["sports", "sports/football"].
 * @returns {{category: string, subcat: string}[]} ordered, de-duplicated
 *          records; a single uncategorized/dummy record when there is
 *          nothing to format.
 */
function formatClassificationResults(cats) {
  if (!cats || cats.length === 0) {
    return [{"category": "uncategorized", "subcat": "dummy"}];
  }

  // A Map (rather than a plain object) keeps category names such as
  // "constructor" from colliding with Object.prototype members.
  const byTopCategory = new Map();

  for (let i = 0; i < cats.length; i++) {
    const cat = cats[i];

    if (cat.indexOf("/") === -1) {
      // Top-level category: only record a placeholder if nothing is known yet.
      if (!byTopCategory.has(cat)) {
        byTopCategory.set(cat, {subcat: "general", position: i, isPlaceholder: true});
      }
      continue;
    }

    // "top/sub[/...]": only the first two path segments are used.
    const chunks = cat.split("/");
    const topCat = chunks[0];
    const subCat = chunks[1];
    const known = byTopCategory.get(topCat);

    // A real subcategory wins over a placeholder, but never over an
    // earlier real subcategory.
    if (!known || known.isPlaceholder) {
      byTopCategory.set(topCat, {subcat: subCat, position: i, isPlaceholder: false});
    }
  }

  // Keep subcat and position as separate fields instead of packing them into
  // one "_"-delimited string, so underscores in names cannot corrupt them.
  const ranked = [];
  byTopCategory.forEach((info, category) => {
    ranked.push({category: category, subcat: info.subcat, position: info.position});
  });
  ranked.sort((a, b) => a.position - b.position);

  // Emit a dense, ordered list; input positions are not reused as indices
  // because that leaves holes when several inputs share a top category.
  return ranked.map(entry => ({"category": entry.category, "subcat": entry.subcat}));
}
|
||||
|
||||
// classify a page using rules
|
||||
function ruleClassify({host, baseDomain, path, title, url}) {
|
||||
let interests = [];
|
||||
|
@ -180,19 +216,6 @@ function ruleClassify({host, baseDomain, path, title, url}) {
|
|||
return interestFinalizer(interests);
|
||||
}
|
||||
|
||||
// Classify a page with the LWCA classifier.
// Only runs when both url and title are present and the worker namespace is
// "58-cat". On success returns {category: [<top category>], subcat: <first
// path segment of the classifier's second result>}. In every other case,
// including any exception (which is logged), returns an empty array.
function lwcaClassify({url, title}) {
  let outcome = [];
  try {
    const eligible = url && title && gNamespace == "58-cat";
    if (eligible) {
      const verdict = gLWCAClassifier.classify(url, title);
      const firstSegment = verdict[1].split("/")[0];
      outcome = {"category": [verdict[0]], "subcat": firstSegment};
    }
  } catch (ex) {
    console.log(ex);
  }
  return outcome;
}
|
||||
|
||||
// Figure out which interests are associated to the document
|
||||
function getInterestsForDocument(aMessageData) {
|
||||
|
||||
|
@ -214,22 +237,11 @@ function getInterestsForDocument(aMessageData) {
|
|||
aMessageData.message = "InterestsForDocument";
|
||||
aMessageData.namespace = gNamespace;
|
||||
|
||||
// we need to submit 3 messages
|
||||
// - for rule classification
|
||||
// - for keyword classification
|
||||
// - for combined classification
|
||||
let interests = [];
|
||||
let results = [];
|
||||
// Submitting a msg for rule classification
|
||||
try {
|
||||
interests = lwcaClassify(aMessageData);
|
||||
if (Object.keys(interests).length > 0) {
|
||||
results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
|
||||
}
|
||||
|
||||
interests = ruleClassify(aMessageData);
|
||||
results.push({type: "rules", interests: dedupeInterests(interests)});
|
||||
|
||||
let rulesWorked = interests.length > 0;
|
||||
let interests = ruleClassify(aMessageData);
|
||||
let formatted = formatClassificationResults(interests);
|
||||
let results = [{type: "rules", interests: formatted}];
|
||||
|
||||
aMessageData.results = results;
|
||||
self.postMessage(aMessageData);
|
||||
|
|
|
@ -28,11 +28,11 @@ const INTEREST_LOCALES = {
|
|||
modelNames: ["41-cat"],
|
||||
},
|
||||
"en-US": {
|
||||
mainTaxonomyModel: "edrules",
|
||||
mainTaxonomyModel: "dfr_rules",
|
||||
rankersDef: [
|
||||
{type: "rules", namespace: "edrules"}
|
||||
],
|
||||
modelNames: ["58-cat", "edrules", "edrules_extended"],
|
||||
modelNames: ["58-cat", "edrules", "edrules_extended", "dfr_rules"],
|
||||
},
|
||||
};
|
||||
|
||||
|
|
|
@ -83,16 +83,17 @@ let DailyInterestsSpout = {
|
|||
});
|
||||
|
||||
results.forEach(item => {
|
||||
let {type, interests, subcat} = item;
|
||||
if (type != "lwca") {
|
||||
return;
|
||||
let {type, interests} = item;
|
||||
let index = 0;
|
||||
while (interests[index] === null) {
|
||||
index++; // Finding first non-null index.
|
||||
}
|
||||
interests.forEach(interest => {
|
||||
if (interest == "uncategorized" || interest.indexOf("__") > -1) return;
|
||||
let category = interests[index].category;
|
||||
let subcat = interests[index].subcat;
|
||||
if (category == "uncategorized" || category.indexOf("__") > -1) return;
|
||||
Object.keys(dateVisits).forEach(date => {
|
||||
this._addDomain(host, dateVisits[date], visitIDs[date][0], interest);
|
||||
this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, interest, subcat);
|
||||
});
|
||||
this._addDomain(host, dateVisits[date], visitIDs[date][0], category);
|
||||
this._storeInterest(host, visitIDs[date], date, dateVisits[date], namespace, type, category, subcat);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
@ -174,7 +174,7 @@ let InterestDashboardDataProcessorBolt = {
|
|||
this.debugReport = this.debugReport.concat(message.errorData);
|
||||
DataProcessorHelper.initChartInStorage("interestDashboardData", this.storage);
|
||||
this.storage.chartData.interestDashboardData.sortedDomains = {"all": [], "byInterest": {}};
|
||||
let interestDashboardTypeNamespace = message.chartData["lwca"]["58-cat"];
|
||||
let interestDashboardTypeNamespace = message.chartData.rules.dfr_rules;
|
||||
|
||||
this.debugReport.push("Processing data for pie chart");
|
||||
let chartData = [];
|
||||
|
|
|
@ -114,7 +114,7 @@ let SpiderDataProcessorBolt = {
|
|||
|
||||
ingest: function _HSB_ingest(message) {
|
||||
DataProcessorHelper.initChartInStorage("spiderData", this.storage);
|
||||
let data = message.chartData.lwca["58-cat"];
|
||||
let data = message.chartData.rules.dfr_rules;
|
||||
let categories = data.categories;
|
||||
|
||||
this._mainNodes = [{"id": 0,
|
||||
|
|
Загрузка…
Ссылка в новой задаче