Merge pull request #22 from mozilla/lwca-cleanup
Closes Bug 1110234 - Remove unused pieces of lwca_refined and move classify() call to the interest worker.
This commit is contained in:
Commit
f0c8a015e2
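For orientation: after this change the LWCA classification happens inside interestsWorker.js, so the per-namespace results array returned by the interests worker now leads with an "lwca" entry next to the existing rule/keyword/combined entries. A minimal sketch of that shape, taken from the updated urlClassifier test at the bottom of this diff (the category values are just that test's expectation, not a guaranteed classification):

// Approximate contents of results["58-cat"].results after this change (per the updated test):
let results = [
  {type: "lwca",     interests: ["uncategorized"], subcat: "dummy"},
  {type: "rules",    interests: ["cars"]},
  {type: "keywords", interests: []},
  {type: "combined", interests: ["cars"]}
];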
@@ -6,6 +6,7 @@

importScripts("tokenizerFactory.js");
importScripts("naiveBayesClassifier.js");
importScripts("lwca_refined.js");

function InterestsWorkerError(message) {
this.name = "InterestsWorkerError";
@@ -23,6 +24,7 @@ let gNamespace = null;
let gRegionCode = null;
let gTokenizer = null;
let gClassifier = null;
let gLWCAClassifier = null;
let gInterestsData = null;

// XXX The original splitter doesn't apply to chinese:
@@ -31,6 +33,8 @@ const kSplitter = /[\s-]+/;

// bootstrap the worker with data and models
function bootstrap(aMessageData) {
gLWCAClassifier = new LWCAClassifier(aMessageData);

// expects : {interestsData, interestsDataType, interestsClassifierModel, interestsUrlStopwords, workerRegionCode}
gRegionCode = aMessageData.workerRegionCode;

@@ -162,6 +166,19 @@ function textClassify({url, title}) {
return [];
}

function lwcaClassify({url, title}) {
try {
if (url && title && gNamespace == "58-cat") {
let classification = gLWCAClassifier.classify(url, title);
let subcat = classification[1].split("/")[0];
return {"category": [classification[0]], "subcat": subcat};
}
} catch (ex) {
console.log(ex);
}
return [];
}

// Figure out which interests are associated to the document
function getInterestsForDocument(aMessageData) {

@@ -191,6 +208,11 @@ function getInterestsForDocument(aMessageData) {
let results = [];
let combinedInterests = [];
try {
interests = lwcaClassify(aMessageData);
if (Object.keys(interests).length > 0) {
results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
}

interests = ruleClassify(aMessageData);
results.push({type: "rules", interests: dedupeInterests(interests)});

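The new worker-side lwcaClassify() only wraps the classifier output into the object that getInterestsForDocument() pushes into results. A small illustration of that wrapping, assuming the classifier returns a [top_level, "sub1/sub2"] pair as lwca_refined.js below does (the category names here are made up for the example):

// Hypothetical classifier output, e.g. from gLWCAClassifier.classify(url, title):
let classification = ["sports", "golf/tennis"];
let subcat = classification[1].split("/")[0];                       // "golf"
let payload = {"category": [classification[0]], "subcat": subcat};  // {category: ["sports"], subcat: "golf"}
// getInterestsForDocument() then records this as
// {type: "lwca", interests: ["sports"], subcat: "golf"};
// on any failure lwcaClassify() falls back to returning [], which the caller skips.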
@@ -1,170 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This worker is responsible for any extensive processing required
 * for LWCA. This includes computations for persistentTitleChunks
 * and queryVariables.
 */
function parseUri(str) {
  // parseUri 1.2.2
  // (c) Steven Levithan <stevenlevithan.com>
  // MIT License
  // http://blog.stevenlevithan.com/archives/parseuri
  var o = parseUri.options,
    m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
    uri = {},
    i = 14;

  while (i--) uri[o.key[i]] = m[i] || "";

  uri[o.q.name] = {};
  uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
    if ($1) uri[o.q.name][$1] = $2;
  });

  return uri;
};

parseUri.options = {
  strictMode: false,
  key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
  q: {
    name: "queryKey",
    parser: /(?:^|&)([^&=]*)=?([^&]*)/g
  },
  parser: {
    strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
    loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
  }
};

function processHistoryEntry({visit, timestamp, qv}) {
  let domain_titles = {};
  let spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
  let url = parseUri(visit[0])
  let domain = url.host

  //scan components
  for (let var_name in url.queryKey) {
    if (spaceFinder.test(url.queryKey[var_name])) {
      //Note: the following spaghetti is why you use a decent language like python
      //with sets/defaultdicts
      if (qv.hasOwnProperty(domain) == false) {
        qv[domain] = {}
      }
      if (qv[domain].hasOwnProperty(var_name) == false) {
        qv[domain][var_name] = 0
      }
      qv[domain][var_name] += 1
    }
  }

  //sort title
  if (domain_titles.hasOwnProperty(domain) == false) {
    domain_titles[domain] = []
  }

  if (visit[1] != null) {
    domain_titles[domain].push(visit[1])
  }
  if (visit[2] > timestamp) {
    timestamp = visit[2] //timestamp is now last item loaded
  }
  self.postMessage({
    "message": "visitProcessComplete",
    "qv": qv,
    "domain_titles": domain_titles,
    "timestamp": timestamp,
    "totalEntries": visit[3]
  });
}

function longestCommonNgramSuffix(s1, s2) {
  //Does what it says on the tin
  s1 = s1.split(" ")
  s2 = s2.split(" ")
  let min_len = s1.length < s2.length ? s1.length : s2.length

  let result = false
  for (let a = 1; a < min_len + 1; a++) {
    if (s1[s1.length - a] != s2[s2.length - a]) {
      result = s1.slice(s1.length - a + 1)
      break
    }
  }

  if (result == false) {
    return false
  } else if (result == []) {
    return false
  } else {
    return result.join(" ")
  }
}

function sortDescendingByElementLength(first, second) {
  //sorting function to sort a list of strings
  return second.length - first.length
}

function computePTC({domain_titles}) {
  let ptc = {};
  let titleCount = 1;
  //now for processing
  for (let domain in domain_titles) {
    let suffixes = {}
    let titles = domain_titles[domain]
    for (let x = 0; x < titles.length; x++) {
      for (let y = x + 1; y < titles.length; y++) {
        if (titles[x] != titles[y]) {
          let lcns = longestCommonNgramSuffix(titles[x], titles[y])
          if (lcns != false) {
            if (suffixes.hasOwnProperty(lcns) == false) {
              suffixes[lcns] = 0
            }
            suffixes[lcns] += 1
          }
        }
      }
      self.postMessage({
        "message": "titleAnalyzed",
        "domainCount": titleCount
      });
      titleCount++;
    }
    //eliminate those that only appear once
    let to_add = [];
    for (let suffix in suffixes) {
      let count = suffixes[suffix]
      if (count > 1) {
        to_add.push(suffix)
      }
    }
    //to_add must be sorted in descending order of length
    //as largest matches should be eliminated first
    to_add = to_add.sort(sortDescendingByElementLength)
    ptc[domain] = to_add
  }

  //now remove anything empty
  let to_delete = []
  for (let x in ptc) {
    if (ptc[x].length == 0) {
      to_delete.push(x)
    }
  }
  for (let x of to_delete) {
    delete ptc[x]
  }

  self.postMessage({
    "message": "computedPTC",
    "ptc": ptc
  });
}

self.onmessage = function({data}) {
  self[data.command](data.payload);
};

@@ -0,0 +1,192 @@
//LWCA refined
//2014-09-08 mruttley
//Refined version of LWCA algorithm/process

//How to use? Simply:
// > let lwca = new LWCAClassifier()
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
// >>> ['computers', 0.75]

let verbose = false

function LWCAClassifier({domain_rules, host_rules, path_rules, words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
  // Main handler class
  this.domain_rules = domain_rules;
  this.host_rules = host_rules;
  this.path_rules = path_rules;
  this.words_tree = words_tree;
  this.ignore_words = ignore_words;
  this.ignore_domains = ignore_domains;
  this.ignore_exts = ignore_exts;
  this.bad_domain_specific = bad_domain_specific;

  //Initialize various processors
  if (verbose) console.log("Initializing...")

  //build vk-tree
  vk_tree = {}
  for (let top_level of Object.keys(this.words_tree)) {
    for (let sub_level of Object.keys(this.words_tree[top_level])) {
      for (let kw of this.words_tree[top_level][sub_level]) {
        vk_tree[kw] = [top_level, sub_level]
      }
    }
  }

  //build bad_domains, bad_ext, bad_chunk
  let bad_domains = {}
  let bad_exts = {}
  let bad_chunks = {}

  for (let domain_name of Object.keys(this.bad_domain_specific)){
    let domain_name_chunks = domain_name.split(".")
    bad_domains[domain_name_chunks[0]] = 1
    bad_exts[domain_name_chunks[1]] = 1
    for (let chunk of this.bad_domain_specific[domain_name]) {
      bad_chunks[chunk] = 1
    }
  }

  //New classifier
  this.classify = function(url, title) {

    if (verbose) console.log(url)
    url = url.toLowerCase()

    //check domains, hosts and paths for exact matches
    //first check domain
    domain = url.split("://")[1].split("/")[0]
    domain_chunks = domain.split('.')
    rule_mapping = false
    for (let i in domain_chunks) {
      fragment = domain_chunks.slice(i).join(".")
      if (this.host_rules.hasOwnProperty(fragment)) {
        rule_mapping = this.host_rules[fragment]; break
      }
      if (this.domain_rules.hasOwnProperty(fragment)) {
        rule_mapping = this.domain_rules[fragment]; break
      }
    }

    domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)

    // http://www2.palmbeachpost.com/classifieds/catpage5.html
    // /classifieds/catpage5.html
    // [, classifieds, catpage5.html]
    // [classifieds, catpage5.html]

    for (let i in domain_and_path) {
      path_fragment = domain_and_path.slice(i).join('/')
      for (let j in domain_chunks) {
        domain_fragment = domain_chunks.slice(j).join('.')
        full_fragment = domain_fragment + "/" + path_fragment
        if (this.path_rules.hasOwnProperty(full_fragment)) {
          rule_mapping = this.path_rules[full_fragment]; break
        }
      }
    }

    if (rule_mapping != false) {
      //is it top level already?
      if (this.words_tree.hasOwnProperty(rule_mapping)) {
        if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
        return [rule_mapping, "general"]
      }else{
        if (vk_tree.hasOwnProperty(rule_mapping)) {
          vk_tree_mapping = vk_tree[rule_mapping]
          if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
          return vk_tree_mapping
          //return [vk_tree_mapping[0], rule_mapping]
        }
      }
    }

    //tokenize the url
    url = url.match(wordFinder)

    if (verbose) console.log(url)

    bad_domain = 0
    bad_ext = 0
    bad_chunk = 0
    ignore_domain = 0
    ignore_ext = 0

    scores = {} //top level & sub_level counts

    for (let chunk of url) {

      if (this.ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
      if (this.ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
      if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
      if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
      if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
      if (this.ignore_words.hasOwnProperty(chunk)) continue

      if (vk_tree.hasOwnProperty(chunk)) {
        mapping = vk_tree[chunk]
        if (scores.hasOwnProperty(mapping[0]) == false) {
          scores[mapping[0]] = {}
        }
        if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
          scores[mapping[0]][mapping[1]] = 0
        }
        scores[mapping[0]][mapping[1]] += 1
      }
    }

    if (verbose) console.log(scores)

    if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
    if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
    if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score

    //convert to list of top levels
    sl = []
    sub_level_strings = {}
    for (let top_level of Object.keys(scores)) {
      sub_level_count = 0
      subcats = []
      for (let sub_level of Object.keys(scores[top_level])) {
        subcats.push(sub_level)
        sub_level_count += scores[top_level][sub_level]
      }
      sl.push([top_level, sub_level_count])
      sub_level_strings[top_level] = subcats.join("/")
    }
    sl = sl.sort(sortDescendingBySecondElement)

    //if just one item then return that
    if (sl.length == 1) {
      if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
      return [sl[0][0], sub_level_strings[sl[0][0]]]
    }

    //if the top 2 are the same, return uncategorized
    if (sl[0][1] == sl[1][1]) {
      return ['uncategorized', 'dummy']
    }else{ //else if there is a top item, return it
      if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
      return [sl[0][0], sub_level_strings[sl[0][0]]]
    }

  }
}

let wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences

function sortDescendingBySecondElement(first, second) {
  //function to be used in sort(some_function)
  //does what it says on the tin
  first = first[1]
  second = second[1]
  if (first == second) {
    return 0
  } else {
    if (first > second) {
      return false
    } else {
      return true
    }
  }
}

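The constructor inverts words_tree (top level -> sub level -> keyword list) into vk_tree (keyword -> [top level, sub level]), which is what classify() scores URL tokens against. A toy illustration of that inversion, with made-up category names and keywords (the real tables come from words.js):

// Hypothetical input shape; the actual data is loaded from words.js.
let words_tree = {
  "sports":    {"golf": ["fairway", "putter"], "tennis": ["racquet"]},
  "computers": {"hardware": ["gpu"]}
};

// Same inversion as in the LWCAClassifier constructor above:
let vk_tree = {};
for (let top_level of Object.keys(words_tree)) {
  for (let sub_level of Object.keys(words_tree[top_level])) {
    for (let kw of words_tree[top_level][sub_level]) {
      vk_tree[kw] = [top_level, sub_level];
    }
  }
}
// vk_tree => {fairway: ["sports","golf"], putter: ["sports","golf"],
//             racquet: ["sports","tennis"], gpu: ["computers","hardware"]}

One caveat worth noting: sortDescendingBySecondElement() returns booleans, but Array.prototype.sort() expects a negative/zero/positive number, so the descending order is not strictly guaranteed across engines. A conventional numeric comparator would look like the following sketch (a suggested fix, not part of this commit):

function sortDescendingBySecondElementFixed(first, second) {
  return second[1] - first[1]; // larger counts sort first
}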
@@ -21,7 +21,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {InterestDashboardDataProcessorBolt} = require("streams/interestDashboardDataProcessorBolt");
const {DateUtils} = require("DateUtils");
const {LWCAClassifier} = require("lwca_refined");
const {UrlClassifier} = require("UrlClassifier");
const {computeInterestsFromHosts} = require("Utils");

@@ -32,13 +31,6 @@ const {data} = require("sdk/self");
const kDefaultResubmitHistoryDays = 30;

function Controller(options={}) {
let self = this;
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
Task.spawn(function*() {
self._lwcaClassifier = new LWCAClassifier(worker);
yield self._lwcaClassifier.init();
});

let historyDaysToResubmit = options.historyDays || kDefaultResubmitHistoryDays;
this._workerFactory = new WorkerFactory();
this._historyDaysToResubmit = historyDaysToResubmit;
@@ -141,7 +133,7 @@ Controller.prototype = {
this._processingHistoryPromise = Task.spawn(function() {
let startDay = DateUtils.today() - daysAgo;
let lastTimeStamp = this.storage.lastTimeStamp || 0;
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp, this._lwcaClassifier);
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp);
yield this._currentReader.resubmitHistory({startDay: startDay});
this.storage.lastTimeStamp = this._currentReader.getLastTimeStamp();
if (flush) {

@@ -22,8 +22,7 @@ Cu.import("resource://gre/modules/NetUtil.jsm");

const MS_PER_DAY = 86400000;

function HistoryReader(workers, streamObjects, lastTimeStamp = 0, lwcaClassifier, storageBackend) {
this._lwcaClassifier = lwcaClassifier;
function HistoryReader(workers, streamObjects, lastTimeStamp = 0, storageBackend) {
this._workers = workers;
this._ResubmitRecentHistoryLastTimeStamp = lastTimeStamp;
this._streamObjects = streamObjects;
@@ -112,21 +111,10 @@ HistoryReader.prototype = {

_handleInterestsResults: function I__handleInterestsResults(aData) {
if (aData.messageId == "resubmit") {
// LWCA classification.
try {
if (aData.url && aData.title && aData.namespace == "58-cat") {
if (!shouldSkip(aData.url)) {
let classification = this._lwcaClassifier.classify(aData.url, aData.title);
let subcat = classification[1].split("/")[0];
aData.results.push({"type": "lwca", "interests": [classification[0]], "subcat": subcat});
}
}
} catch (ex) {
console.log(ex);
}

// save classification results in _interestsWorkersData array untill all have responded
this._interestsWorkersData.push(aData);
if (!shouldSkip(aData.url)) {
this._interestsWorkersData.push(aData);
}
// decrement url count and check if we have seen them all
this._ResubmitRecentHistoryUrlCount.interests--;
if (this._ResubmitRecentHistoryUrlCount.interests == 0) {

@@ -91,6 +91,8 @@ WorkerFactory.prototype = {
scriptLoader.loadSubScript(data.url("models/" + this._localeCode + "/" + modelName + "/textModel.json"));
// use the same url stop words
scriptLoader.loadSubScript(data.url("models/urlStopwords.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.postMessage({
@@ -101,7 +103,15 @@ WorkerFactory.prototype = {
interestsDataType: "dfr",
interestsData: interestsData,
interestsClassifierModel: interestsClassifierModel,
interestsUrlStopwords: interestsUrlStopwords
interestsUrlStopwords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});

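These eight extra fields, loaded from words.js and rules.js on the main thread, are consumed by the bootstrap() hunk at the top of this diff, which hands the whole message to the classifier constructor. A condensed sketch of that contract between the two sides (only the LWCA-related fields are shown; the other bootstrap fields are elided with "..."):

// main thread (WorkerFactory): the words.js / rules.js globals are forwarded as-is
worker.postMessage({ /* ...existing bootstrap fields... */
  domain_rules, host_rules, path_rules, words_tree,
  ignore_words, ignore_domains, ignore_exts, bad_domain_specific });

// worker (interestsWorker.js): the destructuring LWCAClassifier constructor in
// lwca_refined.js picks those fields straight out of the message object
function bootstrap(aMessageData) {
  gLWCAClassifier = new LWCAClassifier(aMessageData);
  // ...rest of bootstrap unchanged...
}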
@ -1,767 +0,0 @@
|
|||
//LWCA refined
|
||||
//2014-09-08 mruttley
|
||||
//Refined version of LWCA algorithm/process
|
||||
|
||||
//Three stages:
|
||||
// - Pre-processing
|
||||
// - Classification
|
||||
// - Post-processing
|
||||
|
||||
//How to use? Simply:
|
||||
// > var lwca = new LWCAClassifier()
|
||||
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
|
||||
// >>> ['computers', 0.75]
|
||||
|
||||
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
|
||||
Cu.import("resource://gre/modules/Task.jsm");
|
||||
|
||||
var preprocessingProgressPercent = 0 //global variable to indicate how far in the pre processing the user is
|
||||
var verbose = false
|
||||
|
||||
function LWCAClassifier(worker) {
|
||||
// Main handler class
|
||||
|
||||
//Initialize various processors
|
||||
if (verbose) console.log("Initializing...")
|
||||
|
||||
let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
|
||||
//it also checks if it needs to be updated etc
|
||||
|
||||
//build vk-tree
|
||||
vk_tree = {}
|
||||
for (let top_level of Object.keys(words_tree)) {
|
||||
for (let sub_level of Object.keys(words_tree[top_level])) {
|
||||
for (let kw of words_tree[top_level][sub_level]) {
|
||||
vk_tree[kw] = [top_level, sub_level]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//build bad_domains, bad_ext, bad_chunk
|
||||
bad_domains = {}
|
||||
bad_exts = {}
|
||||
bad_chunks = {}
|
||||
|
||||
for (let domain_name of Object.keys(bad_domain_specific)){
|
||||
domain_name_chunks = domain_name.split(".")
|
||||
bad_domains[domain_name_chunks[0]] = 1
|
||||
bad_exts[domain_name_chunks[1]] = 1
|
||||
for (let chunk of bad_domain_specific[domain_name]) {
|
||||
bad_chunks[chunk] = 1
|
||||
}
|
||||
}
|
||||
|
||||
//New classifier
|
||||
this.classify = function(url, title) {
|
||||
|
||||
if (verbose) console.log(url)
|
||||
url = url.toLowerCase()
|
||||
|
||||
//check domains, hosts and paths for exact matches
|
||||
//first check domain
|
||||
domain = url.split("://")[1].split("/")[0]
|
||||
domain_chunks = domain.split('.')
|
||||
rule_mapping = false
|
||||
for (let i in domain_chunks) {
|
||||
fragment = domain_chunks.slice(i).join(".")
|
||||
if (host_rules.hasOwnProperty(fragment)) {
|
||||
rule_mapping = host_rules[fragment]; break
|
||||
}
|
||||
if (domain_rules.hasOwnProperty(fragment)) {
|
||||
rule_mapping = domain_rules[fragment]; break
|
||||
}
|
||||
}
|
||||
|
||||
domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)
|
||||
|
||||
// http://www2.palmbeachpost.com/classifieds/catpage5.html
|
||||
// /classifieds/catpage5.html
|
||||
// [, classifieds, catpage5.html]
|
||||
// [classifieds, catpage5.html]
|
||||
|
||||
for (let i in domain_and_path) {
|
||||
path_fragment = domain_and_path.slice(i).join('/')
|
||||
for (let j in domain_chunks) {
|
||||
domain_fragment = domain_chunks.slice(j).join('.')
|
||||
full_fragment = domain_fragment + "/" + path_fragment
|
||||
if (path_rules.hasOwnProperty(full_fragment)) {
|
||||
rule_mapping = path_rules[full_fragment]; break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rule_mapping != false) {
|
||||
//is it top level already?
|
||||
if (words_tree.hasOwnProperty(rule_mapping)) {
|
||||
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
|
||||
return [rule_mapping, "general"]
|
||||
}else{
|
||||
if (vk_tree.hasOwnProperty(rule_mapping)) {
|
||||
vk_tree_mapping = vk_tree[rule_mapping]
|
||||
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
|
||||
return vk_tree_mapping
|
||||
//return [vk_tree_mapping[0], rule_mapping]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//tokenize the url
|
||||
url = url.match(wordFinder)
|
||||
|
||||
if (verbose) console.log(url)
|
||||
|
||||
bad_domain = 0
|
||||
bad_ext = 0
|
||||
bad_chunk = 0
|
||||
ignore_domain = 0
|
||||
ignore_ext = 0
|
||||
|
||||
scores = {} //top level & sub_level counts
|
||||
|
||||
for (let chunk of url) {
|
||||
|
||||
if (ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
|
||||
if (ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
|
||||
if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
|
||||
if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
|
||||
if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
|
||||
if (ignore_words.hasOwnProperty(chunk)) continue
|
||||
|
||||
if (vk_tree.hasOwnProperty(chunk)) {
|
||||
mapping = vk_tree[chunk]
|
||||
if (scores.hasOwnProperty(mapping[0]) == false) {
|
||||
scores[mapping[0]] = {}
|
||||
}
|
||||
if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
|
||||
scores[mapping[0]][mapping[1]] = 0
|
||||
}
|
||||
scores[mapping[0]][mapping[1]] += 1
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose) console.log(scores)
|
||||
|
||||
if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
|
||||
if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
|
||||
if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score
|
||||
|
||||
//convert to list of top levels
|
||||
sl = []
|
||||
sub_level_strings = {}
|
||||
for (let top_level of Object.keys(scores)) {
|
||||
sub_level_count = 0
|
||||
subcats = []
|
||||
for (let sub_level of Object.keys(scores[top_level])) {
|
||||
subcats.push(sub_level)
|
||||
sub_level_count += scores[top_level][sub_level]
|
||||
}
|
||||
sl.push([top_level, sub_level_count])
|
||||
sub_level_strings[top_level] = subcats.join("/")
|
||||
}
|
||||
sl = sl.sort(sortDescendingBySecondElement)
|
||||
|
||||
//if just one item then return that
|
||||
if (sl.length == 1) {
|
||||
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
|
||||
return [sl[0][0], sub_level_strings[sl[0][0]]]
|
||||
}
|
||||
|
||||
//if the top 2 are the same, return uncategorized
|
||||
if (sl[0][1] == sl[1][1]) {
|
||||
return ['uncategorized', 'dummy']
|
||||
}else{ //else if there is a top item, return it
|
||||
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
|
||||
return [sl[0][0], sub_level_strings[sl[0][0]]]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
this.init = function() {
|
||||
return Task.spawn(function*() {
|
||||
yield cdb.init();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
// Pre-processors
|
||||
|
||||
function spotDefinites(url, title) {
|
||||
//function to spot a definite classification
|
||||
//e.g. "real estate" is definitely real estate
|
||||
|
||||
let definites = {
|
||||
"real estate": "real estate", //TODO: moarr
|
||||
}
|
||||
|
||||
for (let definiteMatch in definites) {
|
||||
if (title.indexOf(definiteMatch) != -1) {
|
||||
return [definites[definiteMatch], 'general']
|
||||
}
|
||||
}
|
||||
|
||||
return false //false if nothing found
|
||||
}
|
||||
|
||||
function ComponentDatabase(worker, create_objects = true) {
|
||||
//creates a database of known query variables and persistent title components
|
||||
|
||||
//initialization
|
||||
this._worker = worker;
|
||||
this._worker.addEventListener("message", this, false);
|
||||
this._worker.addEventListener("error", this, false);
|
||||
|
||||
this.queryVariables = {}
|
||||
this.persistentTitleChunks = {}
|
||||
this.meta = {
|
||||
'timestamp': 0
|
||||
}
|
||||
|
||||
this.init = function() {
|
||||
return Task.spawn(function*() {
|
||||
//////// temporarily decoupled
|
||||
|
||||
////////
|
||||
//if (verbose) console.log("Began the init function in Cdb")
|
||||
//let ts =
|
||||
// yield this.find_start_and_end();
|
||||
//if (ts['start'] == 0) {
|
||||
// //nothing ever made before
|
||||
// if (verbose) console.log('Nothing found in local directory, so scanning the whole history')
|
||||
// this.scan(ts['start'], ts['end']);
|
||||
//} else {
|
||||
// //something made before, so load it
|
||||
// if (verbose) console.log('Found cdb in local directory, importing')
|
||||
// yield this.load_component_database();
|
||||
//
|
||||
// //fill in the rest
|
||||
// this.scan(ts['start'], ts['end']);
|
||||
// if (verbose) console.log('loaded existing cdb from disc')
|
||||
//}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.find_start_and_end = function() {
|
||||
return Task.spawn(function*() {
|
||||
//where to start and end the scanning (if any)
|
||||
|
||||
//mostly a copy of get_history
|
||||
let options = historyService.getNewQueryOptions(); //make a blank query
|
||||
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
|
||||
let query = historyService.getNewQuery();
|
||||
let result = historyService.executeQuery(query, options);
|
||||
let cont = result.root;
|
||||
cont.containerOpen = true;
|
||||
let latest_timestamp = cont.getChild(0).time; //this is the last url that the user visited, which is the 'end'
|
||||
cont.containerOpen = false;
|
||||
|
||||
|
||||
let lm = yield this.load_meta(); //find last url visited's id
|
||||
if (lm == false) {
|
||||
if (verbose) console.log('Could not find any meta information. Everything needs to be scanned. Please create a component database first')
|
||||
return {
|
||||
'start': 0,
|
||||
'end': latest_timestamp
|
||||
}
|
||||
} else {
|
||||
if (verbose) console.log('Found meta information on disc (ts: ' + this.meta['timestamp'] + ")")
|
||||
return {
|
||||
'start': this.meta['timestamp'],
|
||||
'end': latest_timestamp
|
||||
} //start and ending timestamps of whatever needs to be updated
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this._handleVisitProcessComplete = function(msgData) {
|
||||
this._qv = msgData.qv;
|
||||
for (let domain in msgData.domain_titles) {
|
||||
//sort title
|
||||
if (this._domain_titles.hasOwnProperty(domain) == false) {
|
||||
this._domain_titles[domain] = []
|
||||
}
|
||||
this._totalTitles += msgData.domain_titles[domain].length;
|
||||
this._domain_titles[domain] = this._domain_titles[domain].concat(msgData.domain_titles[domain]);
|
||||
}
|
||||
|
||||
this.meta['timestamp'] = msgData.timestamp;
|
||||
this._processNextHistoryEvent();
|
||||
if (this._historyProgressCallback) {
|
||||
this._historyProgressCallback("historyProgress", this._history_total, msgData.totalEntries);
|
||||
}
|
||||
};
|
||||
|
||||
this._handleAnalyzedTitle = function(msgData) {
|
||||
if (this._titleProgressCallback) {
|
||||
this._titleProgressCallback("titleProgress", msgData.domainCount, this._totalTitles);
|
||||
}
|
||||
}
|
||||
|
||||
this._handleComputedPTC = function(msgData) {
|
||||
let ptc = msgData.ptc;
|
||||
|
||||
if (this._start != 0) {
|
||||
//merge the new stuff with the old stuff
|
||||
//first query variables
|
||||
for (let domain in this._qv) {
|
||||
if (this.queryVariables.hasOwnProperty(domain) == false) {
|
||||
this.queryVariables[domain] = {}
|
||||
}
|
||||
for (let v in this._qv[domain]) {
|
||||
if (this.queryVariables[domain].hasOwnProperty(v) == false) {
|
||||
this.queryVariables[domain][v] = 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//then title components
|
||||
for (let domain in ptc) {
|
||||
if (this.persistentTitleChunks.hasOwnProperty(domain) == false) {
|
||||
this.persistentTitleChunks[domain] = {}
|
||||
}
|
||||
for (let v of ptc[domain]) {
|
||||
if (this.persistentTitleChunks[domain].hasOwnProperty(v) == false) {
|
||||
this.persistentTitleChunks[domain][v] = 1
|
||||
}
|
||||
}
|
||||
}
|
||||
if (verbose) console.log('loaded existing cdb from disc')
|
||||
} else {
|
||||
this.queryVariables = this._qv;
|
||||
this.persistentTitleChunks = ptc;
|
||||
}
|
||||
this._callback();
|
||||
this.save() //now save everything
|
||||
};
|
||||
|
||||
this.handleEvent = function(aEvent) {
|
||||
let eventType = aEvent.type;
|
||||
if (eventType == "message") {
|
||||
let msgData = aEvent.data;
|
||||
switch (msgData.message) {
|
||||
case "visitProcessComplete":
|
||||
this._handleVisitProcessComplete(msgData);
|
||||
break;
|
||||
case "computedPTC":
|
||||
this._handleComputedPTC(msgData);
|
||||
break;
|
||||
case "titleAnalyzed":
|
||||
this._handleAnalyzedTitle(msgData);
|
||||
break;
|
||||
}
|
||||
} else if (eventType == "error") {
|
||||
//TODO:handle error
|
||||
console.log(aEvent.message);
|
||||
}
|
||||
};
|
||||
|
||||
this._processNextHistoryEvent = function() {
|
||||
try {
|
||||
let nextVisit = this._history.next();
|
||||
this._history_total += 1;
|
||||
|
||||
this._worker.postMessage({
|
||||
command: "processHistoryEntry",
|
||||
payload: {
|
||||
"visit": nextVisit,
|
||||
"timestamp": this.meta['timestamp'],
|
||||
"qv": this._qv
|
||||
}
|
||||
});
|
||||
} catch (ex if ex instanceof StopIteration) {
|
||||
if (verbose) console.log("Total history items loaded: " + this._history_total);
|
||||
if (verbose) console.log("Finding common suffixes in " + Object.keys(this._domain_titles).length + " domains ");
|
||||
|
||||
this._worker.postMessage({
|
||||
command: "computePTC",
|
||||
payload: {
|
||||
"domain_titles": this._domain_titles,
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
this.scan = function(start, end) {
|
||||
this._history = getHistory(start, end);
|
||||
this._history_total = 0;
|
||||
this._start = start;
|
||||
this._end = end;
|
||||
this._qv = {}; //query variables
|
||||
this._ptc = {}; //persistent title components
|
||||
this._domain_titles = {};
|
||||
this._totalTitles = 0;
|
||||
this._processNextHistoryEvent(start, end);
|
||||
}
|
||||
|
||||
this.load_meta = function() {
|
||||
return Task.spawn(function*() {
|
||||
if (verbose) console.log("load_meta function called")
|
||||
//load meta
|
||||
let decoder = new TextDecoder();
|
||||
|
||||
/////////DEBUGGING
|
||||
let meta_location = OS.Path.join(OS.Constants.Path.profileDir, "meta.json");
|
||||
console.log("Meta should be stored at: " + meta_location)
|
||||
|
||||
let meta_exists =
|
||||
yield OS.File.exists(meta_location);
|
||||
if (meta_exists) {
|
||||
console.log("Meta file exists");
|
||||
} else {
|
||||
console.log("Meta does not exist");
|
||||
return false;
|
||||
}
|
||||
///////////////////
|
||||
|
||||
try {
|
||||
let array =
|
||||
yield OS.File.read(meta_location);
|
||||
if (verbose) console.log('onSuccess for meta loading called')
|
||||
let info = decoder.decode(array);
|
||||
let data = JSON.parse(info)
|
||||
if (verbose) console.log('meta data found was: ' + JSON.stringify(data))
|
||||
this.meta = data
|
||||
return true //loads meta information into an object with timestamp and id
|
||||
} catch (ex) {
|
||||
if (verbose) console.log("Meta was not found")
|
||||
return false //file doesn't exist
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.load_component_database = function() {
|
||||
return Task.spawn(function*() {
|
||||
//loads the component database if it exists, else returns false
|
||||
let decoder = new TextDecoder();
|
||||
try {
|
||||
let array =
|
||||
yield OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"));
|
||||
let info = decoder.decode(array);
|
||||
info = JSON.parse(info)
|
||||
this.queryVariables = info['queryVariables']
|
||||
this.persistentTitleChunks = info['persistentTitleChunks']
|
||||
return true
|
||||
} catch (ex) {
|
||||
return false //file doesn't exist
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.save = function() {
|
||||
return Task.spawn(function*() {
|
||||
//assumes that both cdb and meta have been created
|
||||
let encoder = new TextEncoder();
|
||||
let meta_enc = encoder.encode(JSON.stringify(this.meta));
|
||||
let cdb_enc = encoder.encode(JSON.stringify({
|
||||
'queryVariables': this.queryVariables,
|
||||
'persistentTitleChunks': this.persistentTitleChunks
|
||||
}));
|
||||
//save meta
|
||||
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"), meta_enc, {
|
||||
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "meta.json.tmp")
|
||||
});
|
||||
//save component database
|
||||
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"), cdb_enc, {
|
||||
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "cdb.json.tmp")
|
||||
});
|
||||
}.bind(this));
|
||||
};
|
||||
}
|
||||
|
||||
function removePersistentTitleChunks(url, title, cdb) {
|
||||
//Removes common title endings such as " - Google Search" using the component database
|
||||
|
||||
let domain = getDomain(url)
|
||||
if (cdb.hasOwnProperty(domain)) {
|
||||
for (let suffix of cdb[domain]) {
|
||||
if (title.toLowerCase().endsWith(suffix.toLowerCase())) {
|
||||
//chop suffix from end
|
||||
title = title.slice(0, title.length - suffix.length)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return title
|
||||
}
|
||||
|
||||
function removeDomainNames(url, title) {
|
||||
//tries to remove the domain name (or aspects of it) from the title
|
||||
//if this reduces the title to nothing, then just leave them in
|
||||
url = parseUri(url)
|
||||
url = url.host.split(".")
|
||||
title = title.toLowerCase().match(wordFinder)
|
||||
|
||||
let new_title = []
|
||||
let removed = []
|
||||
|
||||
for (let token of title) {
|
||||
if (url.indexOf(token) == -1) {
|
||||
new_title.push(token)
|
||||
}
|
||||
}
|
||||
|
||||
if (new_title.length == 0) {
|
||||
return title.join(" ")
|
||||
} else {
|
||||
return new_title.join(" ")
|
||||
}
|
||||
}
|
||||
|
||||
// Classification
|
||||
|
||||
function cosineSimilarity(text, category_keywords, category_magnitude) {
|
||||
//calculates the cosine similarity between the two arguments
|
||||
//expects text to be an array of strings
|
||||
//expects category_keywords to be an object of string: int
|
||||
//returns a float
|
||||
|
||||
//create vector
|
||||
let vector = {} //object of word: [text count, category count]
|
||||
for (let word of text) {
|
||||
if (vector.hasOwnProperty(word) == false) {
|
||||
if (category_keywords.hasOwnProperty(word) == false) {
|
||||
vector[word] = [1, 0]
|
||||
} else {
|
||||
vector[word] = [1, category_keywords[word]]
|
||||
}
|
||||
} else {
|
||||
vector[word][0] += 1
|
||||
}
|
||||
}
|
||||
|
||||
//calculate dot product
|
||||
|
||||
let dot_product = 0
|
||||
let text_vector_magnitude = 0
|
||||
|
||||
for (let word in vector) {
|
||||
dot_product += (vector[word][0] * vector[word][1])
|
||||
text_vector_magnitude += Math.pow(vector[word][0], 2)
|
||||
}
|
||||
|
||||
let denominator = Math.sqrt(text_vector_magnitude) * category_magnitude
|
||||
|
||||
if (denominator != 0) {
|
||||
return dot_product / denominator
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
// Post processing
|
||||
|
||||
function augmentRepeatWords(results) {
|
||||
//Adds 1 to the score of any result containing a repeated word
|
||||
|
||||
wordCounts = {}
|
||||
for (i = 0; i < results.length; i++) {
|
||||
tokens = results[i][0].toLowerCase().match(wordFinder)
|
||||
for (let token of tokens) {
|
||||
if (wordCounts.hasOwnProperty(token) == false) {
|
||||
wordCounts[token] = 0
|
||||
}
|
||||
wordCounts[token] += 1
|
||||
}
|
||||
}
|
||||
|
||||
//now go through again and find the repeats
|
||||
for (i = 0; i < results.length; i++) {
|
||||
tokens = results[i][0].toLowerCase().match(wordFinder)
|
||||
for (let token of tokens) {
|
||||
if (wordCounts[token] > 1) { //must be a repeat
|
||||
results[i][1] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
function augmentQueries(url, results, queryDatabase) {
|
||||
//Tries to spot any search queries in the url
|
||||
//Doubles the score of anything that contains a search query word
|
||||
|
||||
if (verbose) console.log("URL: " + url)
|
||||
|
||||
let queries = [] //a list of strings
|
||||
url = parseUri(url) //
|
||||
|
||||
if (queryDatabase.hasOwnProperty(url.host)) { //if the domain is in the db
|
||||
if (verbose) console.log("Domain: " + url.host + " is in the database")
|
||||
if (verbose) console.log("There are " + Object.keys(url.queryKey).length + " keys in the url")
|
||||
for (let variable in url.queryKey) { //iterate through url get variables
|
||||
if (queryDatabase[url.host].hasOwnProperty(variable)) { //if in the db
|
||||
query = unescape(url.queryKey[variable]) //append to list
|
||||
queries.concat(query.match(wordFinder))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//now find any result that contains a query word
|
||||
if (queries.length > 0) {
|
||||
for (let result in results) {
|
||||
if (verbose) console.log("Iterating through results")
|
||||
for (let word of queries) {
|
||||
if (results[result][0].indexOf(word) != -1) {
|
||||
results[result][1] *= 2 //double the score
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// Auxiliary functions, matchers, options etc
|
||||
|
||||
const {data} = require("sdk/self"); //not quite sure why this is necessary
|
||||
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
|
||||
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
|
||||
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
|
||||
scriptLoader.loadSubScript(data.url("words.js"));
|
||||
scriptLoader.loadSubScript(data.url("rules.js"));
|
||||
|
||||
function getDomain(url) {
|
||||
//returns the (sub)domain of a url
|
||||
//subdomains are treated as different entities to top level urls
|
||||
if (url.indexOf("://") != -1) {
|
||||
url = url.split("://")[1]
|
||||
if (url.indexOf("/") != -1) {
|
||||
url = url.split("/")[0]
|
||||
}
|
||||
if (url.indexOf("?") != -1) {
|
||||
url = url.split("?")[0]
|
||||
}
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
function getHistory(start, end) {
|
||||
//Generator that yields the most recent history urls one by one
|
||||
//Returned in the form [url, title, timestamp]
|
||||
|
||||
//make a blank query
|
||||
let options = historyService.getNewQueryOptions();
|
||||
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
|
||||
let query = historyService.getNewQuery();
|
||||
query.beginTime = start;
|
||||
query.endTime = end;
|
||||
let result = historyService.executeQuery(query, options);
|
||||
|
||||
//open up the results
|
||||
let cont = result.root;
|
||||
cont.containerOpen = true;
|
||||
|
||||
//yield whatever there is
|
||||
for (let i = 0; i < cont.childCount; i++) {
|
||||
let node = cont.getChild(i);
|
||||
yield [node.uri, node.title, node.time, cont.childCount];
|
||||
}
|
||||
|
||||
//close the results container
|
||||
cont.containerOpen = false;
|
||||
}
|
||||
|
||||
function parseUri(str) {
|
||||
// parseUri 1.2.2
|
||||
// (c) Steven Levithan <stevenlevithan.com>
|
||||
// MIT License
|
||||
// http://blog.stevenlevithan.com/archives/parseuri
|
||||
var o = parseUri.options,
|
||||
m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
|
||||
uri = {},
|
||||
i = 14;
|
||||
|
||||
while (i--) uri[o.key[i]] = m[i] || "";
|
||||
|
||||
uri[o.q.name] = {};
|
||||
uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
|
||||
if ($1) uri[o.q.name][$1] = $2;
|
||||
});
|
||||
|
||||
return uri;
|
||||
};
|
||||
|
||||
parseUri.options = {
|
||||
strictMode: false,
|
||||
key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
|
||||
q: {
|
||||
name: "queryKey",
|
||||
parser: /(?:^|&)([^&=]*)=?([^&]*)/g
|
||||
},
|
||||
parser: {
|
||||
strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
|
||||
loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
|
||||
}
|
||||
};
|
||||
|
||||
String.prototype.endsWith = function(suffix) {
|
||||
//http://stackoverflow.com/a/2548133/849354
|
||||
return this.indexOf(suffix, this.length - suffix.length) !== -1;
|
||||
};
|
||||
|
||||
var wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences
|
||||
var spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
|
||||
//bizarrely, if spaceFinder is declared in the way wordFinder is (two args), it returns an error. Oh JS...
|
||||
|
||||
function sortDescendingBySecondElement(first, second) {
|
||||
//function to be used in sort(some_function)
|
||||
//does what it says on the tin
|
||||
first = first[1]
|
||||
second = second[1]
|
||||
if (first == second) {
|
||||
return 0
|
||||
} else {
|
||||
if (first > second) {
|
||||
return false
|
||||
} else {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function sortDescendingByElementLength(first, second) {
|
||||
//sorting function to sort a list of strings
|
||||
return second.length - first.length
|
||||
}
|
||||
|
||||
function loadClassifications() {
|
||||
//returns an id to iab mapping
|
||||
//loads meta information into an object with timestamp and id
|
||||
let decoder = new TextDecoder();
|
||||
let promise = OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"));
|
||||
promise = promise.then(
|
||||
function onSuccess(array) {
|
||||
let info = decoder.decode(array);
|
||||
info = JSON.parse(info)
|
||||
|
||||
//now expand it
|
||||
//create an id-to-text version of the mapping
|
||||
id_to_text = {}
|
||||
for (let iab in info['mapping']) {
|
||||
id = info['mapping'][iab]
|
||||
id_to_text[id] = iab
|
||||
}
|
||||
|
||||
//need id to text version of iab
|
||||
for (let visitid in info['classifications']) {
|
||||
mapping_id = info['classifications'][visitid]
|
||||
info['classifications'][visitid] = id_to_text[mapping_id]
|
||||
}
|
||||
|
||||
return info['classifications']
|
||||
|
||||
},
|
||||
function onFailure() {
|
||||
return false //file doesn't exist
|
||||
}
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
//for the extension main.js to access
|
||||
exports.LWCAClassifier = LWCAClassifier
|
||||
exports.ComponentDatabase = ComponentDatabase
|
|
@@ -46,7 +46,9 @@ exports.testUtils = {
return false;
},

getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords, listener, regionCode}) {
getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords,
listener, regionCode, domain_rules, host_rules, path_rules,
words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.addEventListener("message", listener, false);
worker.addEventListener("error", listener, false);
@@ -58,7 +60,15 @@ exports.testUtils = {
interestsDataType: "dfr",
interestsData: domainRules,
interestsClassifierModel: textModel,
interestsUrlStopwords: urlStopWords
interestsUrlStopwords: urlStopWords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});
return worker;

@@ -22,7 +22,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {DailyInterestsSpout} = require("streams/dailyInterestsSpout");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {getPlacesHostForURI, getBaseDomain} = require("Utils");
const {LWCAClassifier} = require("lwca_refined");
const test = require("sdk/test");
const {data} = require("sdk/self");

@@ -63,11 +62,7 @@ exports["test read all"] = function test_readAll(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -100,12 +95,8 @@ exports["test read from given timestamp"] = function test_readFromGivenTimestamp
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -143,12 +134,8 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -167,7 +154,7 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
// now set chunksize to 1 and read from same id
storageBackend = {};
streamObjects = initStream(storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20, chunkSize: 1});

assertDeferred = oldPromise.defer();
@@ -200,11 +187,7 @@ exports["test accumulation"] = function test_Accumulation(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -244,11 +227,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);

let processDeferred;

@@ -298,7 +277,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
}
});

historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);
let promise = historyReader.resubmitHistory({startDay: today-61});
let cycles = 0;
while (true) {
@@ -309,7 +288,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
if (lastTimeStamp == theVeryLastTimeStamp) {
break;
}
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, storageBackend);
promise = historyReader.resubmitHistory({startDay: today-61});
cycles ++;
}
@@ -362,11 +341,7 @@ exports["test tldCounter"] = function test_TldCounter(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20},1);
assert.deepEqual(storageBackend.tldCounter,
{"au":{"mysql.au":1,"facebook.au":1},

@@ -8,6 +8,7 @@

const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@@ -64,13 +65,26 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};


let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-Matching",
regionCode: 'zh-CN',
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -8,6 +8,7 @@

const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@@ -170,12 +171,24 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-Matching",
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -54,12 +54,24 @@ exports["test edrules text"] = function test_edrules_text(assert, done) {
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-edrules-text",
listener: workerTester,
domainRules: null,
textModel: interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {
@@ -166,12 +178,24 @@ exports["test text classifier"] = function test_text_classification(assert, done
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-text-classifier",
listener: workerTester,
domainRules: null,
textModel: riggedMatchTests.interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -24,7 +24,8 @@ exports["test interest classifier"] = function test_UrlClassifier(assert, done)
let results = yield urlClassifier.classifyPage("http://www.autoblog.com/","Drive honda");
assert.equal(Object.keys(results).length, workers.length);
assert.deepEqual(results["58-cat"].results,
[{"type":"rules","interests":["cars"]},
[{"type": "lwca", interests: ["uncategorized"], subcat: "dummy"},
{"type":"rules","interests":["cars"]},
{"type":"keywords","interests":[]},
{"type":"combined","interests":["cars"]}
]);