Merge pull request #22 from mozilla/lwca-cleanup

Closes Bug 1110234 - Remove unused pieces of lwca_refined and move classify() call to the interest worker.
This commit is contained in:
Ed Lee 2014-12-11 17:24:34 -08:00
Parent d12e0c6828 e01f7cb051
Commit f0c8a015e2
13 changed files with 306 additions and 1002 deletions

View file

@ -6,6 +6,7 @@
importScripts("tokenizerFactory.js");
importScripts("naiveBayesClassifier.js");
importScripts("lwca_refined.js");
function InterestsWorkerError(message) {
this.name = "InterestsWorkerError";
@ -23,6 +24,7 @@ let gNamespace = null;
let gRegionCode = null;
let gTokenizer = null;
let gClassifier = null;
let gLWCAClassifier = null;
let gInterestsData = null;
// XXX The original splitter doesn't apply to chinese:
@ -31,6 +33,8 @@ const kSplitter = /[\s-]+/;
// bootstrap the worker with data and models
function bootstrap(aMessageData) {
gLWCAClassifier = new LWCAClassifier(aMessageData);
// expects : {interestsData, interestsDataType, interestsClassifierModel, interestsUrlStopwords, workerRegionCode}
gRegionCode = aMessageData.workerRegionCode;
@ -162,6 +166,19 @@ function textClassify({url, title}) {
return [];
}
function lwcaClassify({url, title}) {
try {
if (url && title && gNamespace == "58-cat") {
let classification = gLWCAClassifier.classify(url, title);
let subcat = classification[1].split("/")[0];
return {"category": [classification[0]], "subcat": subcat};
}
} catch (ex) {
console.log(ex);
}
return [];
}
// Figure out which interests are associated to the document
function getInterestsForDocument(aMessageData) {
@ -191,6 +208,11 @@ function getInterestsForDocument(aMessageData) {
let results = [];
let combinedInterests = [];
try {
interests = lwcaClassify(aMessageData);
if (Object.keys(interests).length > 0) {
results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
}
interests = ruleClassify(aMessageData);
results.push({type: "rules", interests: dedupeInterests(interests)});

View file

@ -1,170 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* This worker is responsible for any extensive processing required
* for LWCA. This includes computations for persistentTitleChunks
* and queryVariables.
*/
function parseUri(str) {
// parseUri 1.2.2
// (c) Steven Levithan <stevenlevithan.com>
// MIT License
// http://blog.stevenlevithan.com/archives/parseuri
var o = parseUri.options,
m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
uri = {},
i = 14;
while (i--) uri[o.key[i]] = m[i] || "";
uri[o.q.name] = {};
uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
if ($1) uri[o.q.name][$1] = $2;
});
return uri;
};
parseUri.options = {
strictMode: false,
key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
q: {
name: "queryKey",
parser: /(?:^|&)([^&=]*)=?([^&]*)/g
},
parser: {
strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
}
};
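//For concreteness, parseUri yields roughly the following for a typical
//URL (illustrative values, not part of the original file):
//  parseUri("http://example.com/search/results.html?q=hello+world#top")
//  -> protocol: "http", host: "example.com", path: "/search/results.html",
//     queryKey: {q: "hello+world"}
//query values stay encoded, which is why spaceFinder below looks for
//"%20", "+" or whitespace inside them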
function processHistoryEntry({visit, timestamp, qv}) {
let domain_titles = {};
let spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
let url = parseUri(visit[0])
let domain = url.host
//scan components
for (let var_name in url.queryKey) {
if (spaceFinder.test(url.queryKey[var_name])) {
//Note: the following spaghetti is why you use a decent language like python
//with sets/defaultdicts
if (qv.hasOwnProperty(domain) == false) {
qv[domain] = {}
}
if (qv[domain].hasOwnProperty(var_name) == false) {
qv[domain][var_name] = 0
}
qv[domain][var_name] += 1
}
}
//sort title
if (domain_titles.hasOwnProperty(domain) == false) {
domain_titles[domain] = []
}
if (visit[1] != null) {
domain_titles[domain].push(visit[1])
}
if (visit[2] > timestamp) {
timestamp = visit[2] //timestamp is now last item loaded
}
self.postMessage({
"message": "visitProcessComplete",
"qv": qv,
"domain_titles": domain_titles,
"timestamp": timestamp,
"totalEntries": visit[3]
});
}
function longestCommonNgramSuffix(s1, s2) {
//Does what it says on the tin
s1 = s1.split(" ")
s2 = s2.split(" ")
let min_len = s1.length < s2.length ? s1.length : s2.length
let result = false
for (let a = 1; a < min_len + 1; a++) {
if (s1[s1.length - a] != s2[s2.length - a]) {
result = s1.slice(s1.length - a + 1)
break
}
}
if (result == false) {
return false
} else if (result == []) {
return false
} else {
return result.join(" ")
}
}
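//Worked example (titles hypothetical):
//  longestCommonNgramSuffix("Apple unveils gadget - BBC News",
//                           "Markets rally - BBC News")
//compares word-by-word from the end: "News", "BBC" and "-" match, the
//fourth word differs, so it returns the shared suffix "- BBC News"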
function sortDescendingByElementLength(first, second) {
//sorting function to sort a list of strings
return second.length - first.length
}
function computePTC({domain_titles}) {
let ptc = {};
let titleCount = 1;
//now for processing
for (let domain in domain_titles) {
let suffixes = {}
let titles = domain_titles[domain]
for (let x = 0; x < titles.length; x++) {
for (let y = x + 1; y < titles.length; y++) {
if (titles[x] != titles[y]) {
let lcns = longestCommonNgramSuffix(titles[x], titles[y])
if (lcns != false) {
if (suffixes.hasOwnProperty(lcns) == false) {
suffixes[lcns] = 0
}
suffixes[lcns] += 1
}
}
}
self.postMessage({
"message": "titleAnalyzed",
"domainCount": titleCount
});
titleCount++;
}
//eliminate those that only appear once
let to_add = [];
for (let suffix in suffixes) {
let count = suffixes[suffix]
if (count > 1) {
to_add.push(suffix)
}
}
//to_add must be sorted in descending order of length
//as largest matches should be eliminated first
to_add = to_add.sort(sortDescendingByElementLength)
ptc[domain] = to_add
}
//now remove anything empty
let to_delete = []
for (let x in ptc) {
if (ptc[x].length == 0) {
to_delete.push(x)
}
}
for (let x of to_delete) {
delete ptc[x]
}
self.postMessage({
"message": "computedPTC",
"ptc": ptc
});
}
self.onmessage = function({data}) {
self[data.command](data.payload);
};
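The dispatcher above means the owning code drives this worker with plain {command, payload} messages. A minimal sketch of the round trip (titles hypothetical):

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
worker.onmessage = ({data}) => {
  if (data.message == "computedPTC") console.log(data.ptc);
  // -> {"news.bbc.co.uk": ["- BBC News"]}; the suffix survives because
  //    it recurs across more than one pair of titles
};
worker.postMessage({
  command: "computePTC",
  payload: {domain_titles: {"news.bbc.co.uk": [
    "Apple unveils gadget - BBC News",
    "Markets rally - BBC News",
    "Election results - BBC News"]}}
});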

View file

@ -0,0 +1,192 @@
//LWCA refined
//2014-09-08 mruttley
//Refined version of LWCA algorithm/process
//How to use? Simply:
// > let lwca = new LWCAClassifier()
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
// >>> ['computers', 0.75]
let verbose = false
function LWCAClassifier({domain_rules, host_rules, path_rules, words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
// Main handler class
this.domain_rules = domain_rules;
this.host_rules = host_rules;
this.path_rules = path_rules;
this.words_tree = words_tree;
this.ignore_words = ignore_words;
this.ignore_domains = ignore_domains;
this.ignore_exts = ignore_exts;
this.bad_domain_specific = bad_domain_specific;
//Initialize various processors
if (verbose) console.log("Initializing...")
//build vk-tree
vk_tree = {}
for (let top_level of Object.keys(this.words_tree)) {
for (let sub_level of Object.keys(this.words_tree[top_level])) {
for (let kw of this.words_tree[top_level][sub_level]) {
vk_tree[kw] = [top_level, sub_level]
}
}
}
//build bad_domains, bad_ext, bad_chunk
let bad_domains = {}
let bad_exts = {}
let bad_chunks = {}
for (let domain_name of Object.keys(this.bad_domain_specific)){
let domain_name_chunks = domain_name.split(".")
bad_domains[domain_name_chunks[0]] = 1
bad_exts[domain_name_chunks[1]] = 1
for (let chunk of this.bad_domain_specific[domain_name]) {
bad_chunks[chunk] = 1
}
}
//New classifier
this.classify = function(url, title) {
if (verbose) console.log(url)
url = url.toLowerCase()
//check domains, hosts and paths for exact matches
//first check domain
domain = url.split("://")[1].split("/")[0]
domain_chunks = domain.split('.')
rule_mapping = false
for (let i in domain_chunks) {
fragment = domain_chunks.slice(i).join(".")
if (this.host_rules.hasOwnProperty(fragment)) {
rule_mapping = this.host_rules[fragment]; break
}
if (this.domain_rules.hasOwnProperty(fragment)) {
rule_mapping = this.domain_rules[fragment]; break
}
}
domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)
// http://www2.palmbeachpost.com/classifieds/catpage5.html
// /classifieds/catpage5.html
// [, classifieds, catpage5.html]
// [classifieds, catpage5.html]
for (let i in domain_and_path) {
path_fragment = domain_and_path.slice(i).join('/')
for (let j in domain_chunks) {
domain_fragment = domain_chunks.slice(j).join('.')
full_fragment = domain_fragment + "/" + path_fragment
if (this.path_rules.hasOwnProperty(full_fragment)) {
rule_mapping = this.path_rules[full_fragment]; break
}
}
}
if (rule_mapping != false) {
//is it top level already?
if (this.words_tree.hasOwnProperty(rule_mapping)) {
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
return [rule_mapping, "general"]
}else{
if (vk_tree.hasOwnProperty(rule_mapping)) {
vk_tree_mapping = vk_tree[rule_mapping]
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
return vk_tree_mapping
//return [vk_tree_mapping[0], rule_mapping]
}
}
}
//tokenize the url
url = url.match(wordFinder)
if (verbose) console.log(url)
bad_domain = 0
bad_ext = 0
bad_chunk = 0
ignore_domain = 0
ignore_ext = 0
scores = {} //top level & sub_level counts
for (let chunk of url) {
if (this.ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
if (this.ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
if (this.ignore_words.hasOwnProperty(chunk)) continue
if (vk_tree.hasOwnProperty(chunk)) {
mapping = vk_tree[chunk]
if (scores.hasOwnProperty(mapping[0]) == false) {
scores[mapping[0]] = {}
}
if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
scores[mapping[0]][mapping[1]] = 0
}
scores[mapping[0]][mapping[1]] += 1
}
}
if (verbose) console.log(scores)
if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score
//convert to list of top levels
sl = []
sub_level_strings = {}
for (let top_level of Object.keys(scores)) {
sub_level_count = 0
subcats = []
for (let sub_level of Object.keys(scores[top_level])) {
subcats.push(sub_level)
sub_level_count += scores[top_level][sub_level]
}
sl.push([top_level, sub_level_count])
sub_level_strings[top_level] = subcats.join("/")
}
sl = sl.sort(sortDescendingBySecondElement)
//if just one item then return that
if (sl.length == 1) {
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
return [sl[0][0], sub_level_strings[sl[0][0]]]
}
//if the top 2 are the same, return uncategorized
if (sl[0][1] == sl[1][1]) {
return ['uncategorized', 'dummy']
}else{ //else if there is a top item, return it
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
return [sl[0][0], sub_level_strings[sl[0][0]]]
}
}
}
let wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences
function sortDescendingBySecondElement(first, second) {
//function to be used in sort(some_function)
//does what it says on the tin
first = first[1]
second = second[1]
//a comparator must return a number, so take the numeric difference;
//positive when `second` is larger, which sorts bigger counts first
return second - first
}
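The constructor is now fully data-driven: callers no longer need a ChromeWorker or an init() round trip, they just pass the rule tables in. In the add-on these come from words.js and rules.js; the values below are hypothetical:

let lwca = new LWCAClassifier({
  domain_rules: {"autoblog.com": "cars"},
  host_rules: {},
  path_rules: {},
  words_tree: {"cars": {"general": ["cars"]}},
  ignore_words: {},
  ignore_domains: {},
  ignore_exts: {},
  bad_domain_specific: {}
});
lwca.classify("http://www.autoblog.com/reviews", "Drive honda");
// -> ["cars", "general"]: "autoblog.com" matches domain_rules, and since
//    "cars" is a top level of words_tree the subcategory defaults to "general"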

View file

@ -21,7 +21,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {InterestDashboardDataProcessorBolt} = require("streams/interestDashboardDataProcessorBolt");
const {DateUtils} = require("DateUtils");
const {LWCAClassifier} = require("lwca_refined");
const {UrlClassifier} = require("UrlClassifier");
const {computeInterestsFromHosts} = require("Utils");
@ -32,13 +31,6 @@ const {data} = require("sdk/self");
const kDefaultResubmitHistoryDays = 30;
function Controller(options={}) {
let self = this;
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
Task.spawn(function*() {
self._lwcaClassifier = new LWCAClassifier(worker);
yield self._lwcaClassifier.init();
});
let historyDaysToResubmit = options.historyDays || kDefaultResubmitHistoryDays;
this._workerFactory = new WorkerFactory();
this._historyDaysToResubmit = historyDaysToResubmit;
@ -141,7 +133,7 @@ Controller.prototype = {
this._processingHistoryPromise = Task.spawn(function() {
let startDay = DateUtils.today() - daysAgo;
let lastTimeStamp = this.storage.lastTimeStamp || 0;
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp, this._lwcaClassifier);
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp);
yield this._currentReader.resubmitHistory({startDay: startDay});
this.storage.lastTimeStamp = this._currentReader.getLastTimeStamp();
if (flush) {

View file

@ -22,8 +22,7 @@ Cu.import("resource://gre/modules/NetUtil.jsm");
const MS_PER_DAY = 86400000;
function HistoryReader(workers, streamObjects, lastTimeStamp = 0, lwcaClassifier, storageBackend) {
this._lwcaClassifier = lwcaClassifier;
function HistoryReader(workers, streamObjects, lastTimeStamp = 0, storageBackend) {
this._workers = workers;
this._ResubmitRecentHistoryLastTimeStamp = lastTimeStamp;
this._streamObjects = streamObjects;
@ -112,21 +111,10 @@ HistoryReader.prototype = {
_handleInterestsResults: function I__handleInterestsResults(aData) {
if (aData.messageId == "resubmit") {
// LWCA classification.
try {
if (aData.url && aData.title && aData.namespace == "58-cat") {
if (!shouldSkip(aData.url)) {
let classification = this._lwcaClassifier.classify(aData.url, aData.title);
let subcat = classification[1].split("/")[0];
aData.results.push({"type": "lwca", "interests": [classification[0]], "subcat": subcat});
}
}
} catch (ex) {
console.log(ex);
}
// save classification results in _interestsWorkersData array until all have responded
this._interestsWorkersData.push(aData);
if (!shouldSkip(aData.url)) {
this._interestsWorkersData.push(aData);
}
// decrement url count and check if we have seen them all
this._ResubmitRecentHistoryUrlCount.interests--;
if (this._ResubmitRecentHistoryUrlCount.interests == 0) {

View file

@ -91,6 +91,8 @@ WorkerFactory.prototype = {
scriptLoader.loadSubScript(data.url("models/" + this._localeCode + "/" + modelName + "/textModel.json"));
// use the same url stop words
scriptLoader.loadSubScript(data.url("models/urlStopwords.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.postMessage({
@ -101,7 +103,15 @@ WorkerFactory.prototype = {
interestsDataType: "dfr",
interestsData: interestsData,
interestsClassifierModel: interestsClassifierModel,
interestsUrlStopwords: interestsUrlStopwords
interestsUrlStopwords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});
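The two new loadSubScript calls evaluate words.js and rules.js in the factory's scope; that is where the bare identifiers in the extended payload come from, and the worker's bootstrap() hands the whole message to new LWCAClassifier(aMessageData). Roughly (the split of variables between the two files is assumed here):

// words.js / rules.js define globals such as:
//   var domain_rules = {...}, host_rules = {...}, path_rules = {...},
//       words_tree = {...}, ignore_words = {...}, ignore_domains = {...},
//       ignore_exts = {...}, bad_domain_specific = {...}
// After loadSubScript they are plain globals in this scope, so the
// postMessage above simply forwards them into the worker.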

View file

@ -1,767 +0,0 @@
//LWCA refined
//2014-09-08 mruttley
//Refined version of LWCA algorithm/process
//Three stages:
// - Pre-processing
// - Classification
// - Post-processing
//How to use? Simply:
// > var lwca = new LWCAClassifier()
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
// >>> ['computers', 0.75]
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
Cu.import("resource://gre/modules/Task.jsm");
var preprocessingProgressPercent = 0 //global variable to indicate how far in the pre processing the user is
var verbose = false
function LWCAClassifier(worker) {
// Main handler class
//Initialize various processors
if (verbose) console.log("Initializing...")
let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
//it also checks if it needs to be updated etc
//build vk-tree
vk_tree = {}
for (let top_level of Object.keys(words_tree)) {
for (let sub_level of Object.keys(words_tree[top_level])) {
for (let kw of words_tree[top_level][sub_level]) {
vk_tree[kw] = [top_level, sub_level]
}
}
}
//build bad_domains, bad_ext, bad_chunk
bad_domains = {}
bad_exts = {}
bad_chunks = {}
for (let domain_name of Object.keys(bad_domain_specific)){
domain_name_chunks = domain_name.split(".")
bad_domains[domain_name_chunks[0]] = 1
bad_exts[domain_name_chunks[1]] = 1
for (let chunk of bad_domain_specific[domain_name]) {
bad_chunks[chunk] = 1
}
}
//New classifier
this.classify = function(url, title) {
if (verbose) console.log(url)
url = url.toLowerCase()
//check domains, hosts and paths for exact matches
//first check domain
domain = url.split("://")[1].split("/")[0]
domain_chunks = domain.split('.')
rule_mapping = false
for (let i in domain_chunks) {
fragment = domain_chunks.slice(i).join(".")
if (host_rules.hasOwnProperty(fragment)) {
rule_mapping = host_rules[fragment]; break
}
if (domain_rules.hasOwnProperty(fragment)) {
rule_mapping = domain_rules[fragment]; break
}
}
domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)
// http://www2.palmbeachpost.com/classifieds/catpage5.html
// /classifieds/catpage5.html
// [, classifieds, catpage5.html]
// [classifieds, catpage5.html]
for (let i in domain_and_path) {
path_fragment = domain_and_path.slice(i).join('/')
for (let j in domain_chunks) {
domain_fragment = domain_chunks.slice(j).join('.')
full_fragment = domain_fragment + "/" + path_fragment
if (path_rules.hasOwnProperty(full_fragment)) {
rule_mapping = path_rules[full_fragment]; break
}
}
}
if (rule_mapping != false) {
//is it top level already?
if (words_tree.hasOwnProperty(rule_mapping)) {
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
return [rule_mapping, "general"]
}else{
if (vk_tree.hasOwnProperty(rule_mapping)) {
vk_tree_mapping = vk_tree[rule_mapping]
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
return vk_tree_mapping
//return [vk_tree_mapping[0], rule_mapping]
}
}
}
//tokenize the url
url = url.match(wordFinder)
if (verbose) console.log(url)
bad_domain = 0
bad_ext = 0
bad_chunk = 0
ignore_domain = 0
ignore_ext = 0
scores = {} //top level & sub_level counts
for (let chunk of url) {
if (ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
if (ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
if (ignore_words.hasOwnProperty(chunk)) continue
if (vk_tree.hasOwnProperty(chunk)) {
mapping = vk_tree[chunk]
if (scores.hasOwnProperty(mapping[0]) == false) {
scores[mapping[0]] = {}
}
if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
scores[mapping[0]][mapping[1]] = 0
}
scores[mapping[0]][mapping[1]] += 1
}
}
if (verbose) console.log(scores)
if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score
//convert to list of top levels
sl = []
sub_level_strings = {}
for (let top_level of Object.keys(scores)) {
sub_level_count = 0
subcats = []
for (let sub_level of Object.keys(scores[top_level])) {
subcats.push(sub_level)
sub_level_count += scores[top_level][sub_level]
}
sl.push([top_level, sub_level_count])
sub_level_strings[top_level] = subcats.join("/")
}
sl = sl.sort(sortDescendingBySecondElement)
//if just one item then return that
if (sl.length == 1) {
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
return [sl[0][0], sub_level_strings[sl[0][0]]]
}
//if the top 2 are the same, return uncategorized
if (sl[0][1] == sl[1][1]) {
return ['uncategorized', 'dummy']
}else{ //else if there is a top item, return it
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
return [sl[0][0], sub_level_strings[sl[0][0]]]
}
}
this.init = function() {
return Task.spawn(function*() {
yield cdb.init();
});
};
}
// Pre-processors
function spotDefinites(url, title) {
//function to spot a definite classification
//e.g. "real estate" is definitely real estate
let definites = {
"real estate": "real estate", //TODO: moarr
}
for (let definiteMatch in definites) {
if (title.indexOf(definiteMatch) != -1) {
return [definites[definiteMatch], 'general']
}
}
return false //false if nothing found
}
function ComponentDatabase(worker, create_objects = true) {
//creates a database of known query variables and persistent title components
//initialization
this._worker = worker;
this._worker.addEventListener("message", this, false);
this._worker.addEventListener("error", this, false);
this.queryVariables = {}
this.persistentTitleChunks = {}
this.meta = {
'timestamp': 0
}
this.init = function() {
return Task.spawn(function*() {
//////// temporarily decoupled
////////
//if (verbose) console.log("Began the init function in Cdb")
//let ts =
// yield this.find_start_and_end();
//if (ts['start'] == 0) {
// //nothing ever made before
// if (verbose) console.log('Nothing found in local directory, so scanning the whole history')
// this.scan(ts['start'], ts['end']);
//} else {
// //something made before, so load it
// if (verbose) console.log('Found cdb in local directory, importing')
// yield this.load_component_database();
//
// //fill in the rest
// this.scan(ts['start'], ts['end']);
// if (verbose) console.log('loaded existing cdb from disc')
//}
}.bind(this));
};
this.find_start_and_end = function() {
return Task.spawn(function*() {
//where to start and end the scanning (if any)
//mostly a copy of get_history
let options = historyService.getNewQueryOptions(); //make a blank query
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
let query = historyService.getNewQuery();
let result = historyService.executeQuery(query, options);
let cont = result.root;
cont.containerOpen = true;
let latest_timestamp = cont.getChild(0).time; //this is the last url that the user visited, which is the 'end'
cont.containerOpen = false;
let lm = yield this.load_meta(); //find last url visited's id
if (lm == false) {
if (verbose) console.log('Could not find any meta information. Everything needs to be scanned. Please create a component database first')
return {
'start': 0,
'end': latest_timestamp
}
} else {
if (verbose) console.log('Found meta information on disc (ts: ' + this.meta['timestamp'] + ")")
return {
'start': this.meta['timestamp'],
'end': latest_timestamp
} //start and ending timestamps of whatever needs to be updated
}
}.bind(this));
};
this._handleVisitProcessComplete = function(msgData) {
this._qv = msgData.qv;
for (let domain in msgData.domain_titles) {
//sort title
if (this._domain_titles.hasOwnProperty(domain) == false) {
this._domain_titles[domain] = []
}
this._totalTitles += msgData.domain_titles[domain].length;
this._domain_titles[domain] = this._domain_titles[domain].concat(msgData.domain_titles[domain]);
}
this.meta['timestamp'] = msgData.timestamp;
this._processNextHistoryEvent();
if (this._historyProgressCallback) {
this._historyProgressCallback("historyProgress", this._history_total, msgData.totalEntries);
}
};
this._handleAnalyzedTitle = function(msgData) {
if (this._titleProgressCallback) {
this._titleProgressCallback("titleProgress", msgData.domainCount, this._totalTitles);
}
}
this._handleComputedPTC = function(msgData) {
let ptc = msgData.ptc;
if (this._start != 0) {
//merge the new stuff with the old stuff
//first query variables
for (let domain in this._qv) {
if (this.queryVariables.hasOwnProperty(domain) == false) {
this.queryVariables[domain] = {}
}
for (let v in this._qv[domain]) {
if (this.queryVariables[domain].hasOwnProperty(v) == false) {
this.queryVariables[domain][v] = 1
}
}
}
//then title components
for (let domain in ptc) {
if (this.persistentTitleChunks.hasOwnProperty(domain) == false) {
this.persistentTitleChunks[domain] = {}
}
for (let v of ptc[domain]) {
if (this.persistentTitleChunks[domain].hasOwnProperty(v) == false) {
this.persistentTitleChunks[domain][v] = 1
}
}
}
if (verbose) console.log('loaded existing cdb from disc')
} else {
this.queryVariables = this._qv;
this.persistentTitleChunks = ptc;
}
this._callback();
this.save() //now save everything
};
this.handleEvent = function(aEvent) {
let eventType = aEvent.type;
if (eventType == "message") {
let msgData = aEvent.data;
switch (msgData.message) {
case "visitProcessComplete":
this._handleVisitProcessComplete(msgData);
break;
case "computedPTC":
this._handleComputedPTC(msgData);
break;
case "titleAnalyzed":
this._handleAnalyzedTitle(msgData);
break;
}
} else if (eventType == "error") {
//TODO:handle error
console.log(aEvent.message);
}
};
this._processNextHistoryEvent = function() {
try {
let nextVisit = this._history.next();
this._history_total += 1;
this._worker.postMessage({
command: "processHistoryEntry",
payload: {
"visit": nextVisit,
"timestamp": this.meta['timestamp'],
"qv": this._qv
}
});
} catch (ex if ex instanceof StopIteration) {
if (verbose) console.log("Total history items loaded: " + this._history_total);
if (verbose) console.log("Finding common suffixes in " + Object.keys(this._domain_titles).length + " domains ");
this._worker.postMessage({
command: "computePTC",
payload: {
"domain_titles": this._domain_titles,
}
});
}
};
this.scan = function(start, end) {
this._history = getHistory(start, end);
this._history_total = 0;
this._start = start;
this._end = end;
this._qv = {}; //query variables
this._ptc = {}; //persistent title components
this._domain_titles = {};
this._totalTitles = 0;
this._processNextHistoryEvent(start, end);
}
this.load_meta = function() {
return Task.spawn(function*() {
if (verbose) console.log("load_meta function called")
//load meta
let decoder = new TextDecoder();
/////////DEBUGGING
let meta_location = OS.Path.join(OS.Constants.Path.profileDir, "meta.json");
console.log("Meta should be stored at: " + meta_location)
let meta_exists =
yield OS.File.exists(meta_location);
if (meta_exists) {
console.log("Meta file exists");
} else {
console.log("Meta does not exist");
return false;
}
///////////////////
try {
let array =
yield OS.File.read(meta_location);
if (verbose) console.log('onSuccess for meta loading called')
let info = decoder.decode(array);
let data = JSON.parse(info)
if (verbose) console.log('meta data found was: ' + JSON.stringify(data))
this.meta = data
return true //loads meta information into an object with timestamp and id
} catch (ex) {
if (verbose) console.log("Meta was not found")
return false //file doesn't exist
}
}.bind(this));
};
this.load_component_database = function() {
return Task.spawn(function*() {
//loads the component database if it exists, else returns false
let decoder = new TextDecoder();
try {
let array =
yield OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"));
let info = decoder.decode(array);
info = JSON.parse(info)
this.queryVariables = info['queryVariables']
this.persistentTitleChunks = info['persistentTitleChunks']
return true
} catch (ex) {
return false //file doesn't exist
}
}.bind(this));
};
this.save = function() {
return Task.spawn(function*() {
//assumes that both cdb and meta have been created
let encoder = new TextEncoder();
let meta_enc = encoder.encode(JSON.stringify(this.meta));
let cdb_enc = encoder.encode(JSON.stringify({
'queryVariables': this.queryVariables,
'persistentTitleChunks': this.persistentTitleChunks
}));
//save meta
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"), meta_enc, {
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "meta.json.tmp")
});
//save component database
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"), cdb_enc, {
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "cdb.json.tmp")
});
}.bind(this));
};
}
function removePersistentTitleChunks(url, title, cdb) {
//Removes common title endings such as " - Google Search" using the component database
let domain = getDomain(url)
if (cdb.hasOwnProperty(domain)) {
for (let suffix of cdb[domain]) {
if (title.toLowerCase().endsWith(suffix.toLowerCase())) {
//chop suffix from end
title = title.slice(0, title.length - suffix.length)
break
}
}
}
return title
}
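//Worked example (cdb contents hypothetical):
//  removePersistentTitleChunks("http://www.google.com/search?q=firefox",
//                              "Mozilla Firefox - Google Search",
//                              {"www.google.com": [" - Google Search"]})
//getDomain() yields "www.google.com"; the title ends with the stored
//suffix, so it is chopped off: returns "Mozilla Firefox"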
function removeDomainNames(url, title) {
//tries to remove the domain name (or aspects of it) from the title
//if this reduces the title to nothing, then just leave them in
url = parseUri(url)
url = url.host.split(".")
title = title.toLowerCase().match(wordFinder)
let new_title = []
let removed = []
for (let token of title) {
if (url.indexOf(token) == -1) {
new_title.push(token)
}
}
if (new_title.length == 0) {
return title.join(" ")
} else {
return new_title.join(" ")
}
}
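//Worked example (visit hypothetical):
//  removeDomainNames("http://www.bbc.com/news/some-article",
//                    "BBC News - Apple unveils gadget")
//host tokens: ["www", "bbc", "com"]; title tokens (lowercased words of 3+
//letters): ["bbc", "news", "apple", "unveils", "gadget"]
//"bbc" matches a host token and is dropped: "news apple unveils gadget"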
// Classification
function cosineSimilarity(text, category_keywords, category_magnitude) {
//calculates the cosine similarity between the two arguments
//expects text to be an array of strings
//expects category_keywords to be an object of string: int
//returns a float
//create vector
let vector = {} //object of word: [text count, category count]
for (let word of text) {
if (vector.hasOwnProperty(word) == false) {
if (category_keywords.hasOwnProperty(word) == false) {
vector[word] = [1, 0]
} else {
vector[word] = [1, category_keywords[word]]
}
} else {
vector[word][0] += 1
}
}
//calculate dot product
let dot_product = 0
let text_vector_magnitude = 0
for (let word in vector) {
dot_product += (vector[word][0] * vector[word][1])
text_vector_magnitude += Math.pow(vector[word][0], 2)
}
let denominator = Math.sqrt(text_vector_magnitude) * category_magnitude
if (denominator != 0) {
return dot_product / denominator
}
return 0
}
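//Worked example of the arithmetic (numbers hypothetical):
//  text = ["apple", "apple", "gadget"]
//  category_keywords = {apple: 3}, category_magnitude = 3
//  vector = {apple: [2, 3], gadget: [1, 0]}
//  dot_product = 2*3 + 1*0 = 6
//  text magnitude = sqrt(2^2 + 1^2) = sqrt(5)
//  similarity = 6 / (sqrt(5) * 3) ≈ 0.89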
// Post processing
function augmentRepeatWords(results) {
//Adds 1 to the score of any result containing a repeated word
wordCounts = {}
for (i = 0; i < results.length; i++) {
tokens = results[i][0].toLowerCase().match(wordFinder)
for (let token of tokens) {
if (wordCounts.hasOwnProperty(token) == false) {
wordCounts[token] = 0
}
wordCounts[token] += 1
}
}
//now go through again and find the repeats
for (i = 0; i < results.length; i++) {
tokens = results[i][0].toLowerCase().match(wordFinder)
for (let token of tokens) {
if (wordCounts[token] > 1) { //must be a repeat
results[i][1] += 1
}
}
}
return results
}
function augmentQueries(url, results, queryDatabase) {
//Tries to spot any search queries in the url
//Doubles the score of anything that contains a search query word
if (verbose) console.log("URL: " + url)
let queries = [] //a list of strings
url = parseUri(url) //
if (queryDatabase.hasOwnProperty(url.host)) { //if the domain is in the db
if (verbose) console.log("Domain: " + url.host + " is in the database")
if (verbose) console.log("There are " + Object.keys(url.queryKey).length + " keys in the url")
for (let variable in url.queryKey) { //iterate through url get variables
if (queryDatabase[url.host].hasOwnProperty(variable)) { //if in the db
query = unescape(url.queryKey[variable]) //append to list
queries.concat(query.match(wordFinder))
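//NOTE: concat() returns a new array rather than mutating `queries`,
//so as written the matched words are discarded and `queries` stays empty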
}
}
}
//now find any result that contains a query word
if (queries.length > 0) {
for (let result in results) {
if (verbose) console.log("Iterating through results")
for (let word of queries) {
if (results[result][0].indexOf(word) != -1) {
results[result][1] *= 2 //double the score
}
}
}
}
return results
}
// Auxiliary functions, matchers, options etc
const {data} = require("sdk/self"); //not quite sure why this is necessary
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
function getDomain(url) {
//returns the (sub)domain of a url
//subdomains are treated as different entities to top level urls
if (url.indexOf("://") != -1) {
url = url.split("://")[1]
if (url.indexOf("/") != -1) {
url = url.split("/")[0]
}
if (url.indexOf("?") != -1) {
url = url.split("?")[0]
}
} else {
return false
}
return url
}
function getHistory(start, end) {
//Generator that yields the most recent history urls one by one
//Returned in the form [url, title, timestamp]
//make a blank query
let options = historyService.getNewQueryOptions();
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
let query = historyService.getNewQuery();
query.beginTime = start;
query.endTime = end;
let result = historyService.executeQuery(query, options);
//open up the results
let cont = result.root;
cont.containerOpen = true;
//yield whatever there is
for (let i = 0; i < cont.childCount; i++) {
let node = cont.getChild(i);
yield [node.uri, node.title, node.time, cont.childCount];
}
//close the results container
cont.containerOpen = false;
}
function parseUri(str) {
// parseUri 1.2.2
// (c) Steven Levithan <stevenlevithan.com>
// MIT License
// http://blog.stevenlevithan.com/archives/parseuri
var o = parseUri.options,
m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
uri = {},
i = 14;
while (i--) uri[o.key[i]] = m[i] || "";
uri[o.q.name] = {};
uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
if ($1) uri[o.q.name][$1] = $2;
});
return uri;
};
parseUri.options = {
strictMode: false,
key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
q: {
name: "queryKey",
parser: /(?:^|&)([^&=]*)=?([^&]*)/g
},
parser: {
strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
}
};
String.prototype.endsWith = function(suffix) {
//http://stackoverflow.com/a/2548133/849354
return this.indexOf(suffix, this.length - suffix.length) !== -1;
};
var wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences
var spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
//bizarrely, if spaceFinder is declared in the way wordFinder is (two args), it returns an error. Oh JS...
function sortDescendingBySecondElement(first, second) {
//function to be used in sort(some_function)
//does what it says on the tin
first = first[1]
second = second[1]
if (first == second) {
return 0
} else {
if (first > second) {
return false
} else {
return true
}
}
}
function sortDescendingByElementLength(first, second) {
//sorting function to sort a list of strings
return second.length - first.length
}
function loadClassifications() {
//returns an id to iab mapping
//loads meta information into an object with timestamp and id
let decoder = new TextDecoder();
let promise = OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"));
promise = promise.then(
function onSuccess(array) {
let info = decoder.decode(array);
info = JSON.parse(info)
//now expand it
//create an id-to-text version of the mapping
id_to_text = {}
for (let iab in info['mapping']) {
id = info['mapping'][iab]
id_to_text[id] = iab
}
//need id to text version of iab
for (let visitid in info['classifications']) {
mapping_id = info['classifications'][visitid]
info['classifications'][visitid] = id_to_text[mapping_id]
}
return info['classifications']
},
function onFailure() {
return false //file doesn't exist
}
);
}
//for the extension main.js to access
exports.LWCAClassifier = LWCAClassifier
exports.ComponentDatabase = ComponentDatabase

View file

@ -46,7 +46,9 @@ exports.testUtils = {
return false;
},
getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords, listener, regionCode}) {
getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords,
listener, regionCode, domain_rules, host_rules, path_rules,
words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.addEventListener("message", listener, false);
worker.addEventListener("error", listener, false);
@ -58,7 +60,15 @@ exports.testUtils = {
interestsDataType: "dfr",
interestsData: domainRules,
interestsClassifierModel: textModel,
interestsUrlStopwords: urlStopWords
interestsUrlStopwords: urlStopWords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});
return worker;

View file

@ -22,7 +22,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {DailyInterestsSpout} = require("streams/dailyInterestsSpout");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {getPlacesHostForURI, getBaseDomain} = require("Utils");
const {LWCAClassifier} = require("lwca_refined");
const test = require("sdk/test");
const {data} = require("sdk/self");
@ -63,11 +62,7 @@ exports["test read all"] = function test_readAll(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});
let assertDeferred = oldPromise.defer();
@ -100,12 +95,8 @@ exports["test read from given timestamp"] = function test_readFromGivenTimestamp
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});
let assertDeferred = oldPromise.defer();
@ -143,12 +134,8 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});
let assertDeferred = oldPromise.defer();
@ -167,7 +154,7 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
// now set chunksize to 1 and read from same id
storageBackend = {};
streamObjects = initStream(storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20, chunkSize: 1});
assertDeferred = oldPromise.defer();
@ -200,11 +187,7 @@ exports["test accumulation"] = function test_Accumulation(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});
let assertDeferred = oldPromise.defer();
@ -244,11 +227,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);
let processDeferred;
@ -298,7 +277,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
}
});
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);
let promise = historyReader.resubmitHistory({startDay: today-61});
let cycles = 0;
while (true) {
@ -309,7 +288,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
if (lastTimeStamp == theVeryLastTimeStamp) {
break;
}
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, storageBackend);
promise = historyReader.resubmitHistory({startDay: today-61});
cycles ++;
}
@ -362,11 +341,7 @@ exports["test tldCounter"] = function test_TldCounter(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20},1);
assert.deepEqual(storageBackend.tldCounter,
{"au":{"mysql.au":1,"facebook.au":1},

View file

@ -8,6 +8,7 @@
const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@ -64,13 +65,26 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let worker = testUtils.getWorker({
namespace: "test-Matching",
regionCode: 'zh-CN',
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});
Task.spawn(function() {

View file

@ -8,6 +8,7 @@
const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@ -170,12 +171,24 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let worker = testUtils.getWorker({
namespace: "test-Matching",
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});
Task.spawn(function() {

View file

@ -54,12 +54,24 @@ exports["test edrules text"] = function test_edrules_text(assert, done) {
} // end of handleEvent
};
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let worker = testUtils.getWorker({
namespace: "test-edrules-text",
listener: workerTester,
domainRules: null,
textModel: interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});
Task.spawn(function() {
@ -166,12 +178,24 @@ exports["test text classifier"] = function test_text_classification(assert, done
} // end of handleEvent
};
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let worker = testUtils.getWorker({
namespace: "test-text-classifier",
listener: workerTester,
domainRules: null,
textModel: riggedMatchTests.interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});
Task.spawn(function() {

View file

@ -24,7 +24,8 @@ exports["test interest classifier"] = function test_UrlClassifier(assert, done)
let results = yield urlClassifier.classifyPage("http://www.autoblog.com/","Drive honda");
assert.equal(Object.keys(results).length, workers.length);
assert.deepEqual(results["58-cat"].results,
[{"type":"rules","interests":["cars"]},
[{"type": "lwca", interests: ["uncategorized"], subcat: "dummy"},
{"type":"rules","interests":["cars"]},
{"type":"keywords","interests":[]},
{"type":"combined","interests":["cars"]}
]);
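The added first element mirrors the push order in getInterestsForDocument: the LWCA result now precedes the rules result, and a page whose tokens match no LWCA keywords falls through to the classifier's default:

// produced by lwcaClassify() when classify() returns ["uncategorized", "dummy"]:
// {type: "lwca", interests: ["uncategorized"], subcat: "dummy"}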