@@ -27,8 +27,6 @@ function LWCAClassifier(worker) {

    let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
    //it also checks if it needs to be updated etc

    let ce = new ClassificationEngine(false) //set this flag to "true" if you want titles with countries to be matched (see the function for more details)

    //build vk-tree
    vk_tree = {}
    for (let top_level of Object.keys(words_tree)) {
@@ -178,90 +176,6 @@ function LWCAClassifier(worker) {
    }

    //Handle requests
    //this.classify = function(url, title) {

    //pre process
    //title = title.toLowerCase()
    //if (verbose) console.log("Pre processing")

    // //shortcuts
    //let sd = spotDefinites(url, title)
    //if (sd) {
    // if (verbose) console.log("Spotted definite match")
    // return sd
    //}

    //cleaning
    //if (verbose) console.log("title before cleaning: " + title)
    //title = removePersistentTitleChunks(url, title, cdb.persistentTitleChunks) //returns a string
    //if (verbose) console.log("removed persistents: " + title)
    //let chunks = getURLChunks(url)
    //title = chunks + " " + title
    //if (verbose) console.log("added url chunks: <" + chunks + ">")
    //title = removeDomainNames(url, title) //try to remove domain names
    //if (verbose) console.log("removed domain names: " + title)

    //classify
    //if (verbose) console.log("Classifying")

    //if (verbose) console.log("Payload size is: " + Object.keys(payload).length)
    //if (verbose) console.log("DomainRules size is: " + Object.keys(domainRules).length)

    //cosine similarity
    //let scores = ce.classify(url, title)
    //if (verbose) console.log("scores: " + scores)

    //if (scores.length == 0) {
    // return ['uncategorized', 'dummy']
    //}

    //post process
    //if (verbose) console.log("Post processing")
    //
    //if (verbose) console.log('looking for domain scores')
    //let domain_scores = augmentDomainMatchers(url, title, scores)
    //if (domain_scores != scores) {
    // if (verbose) console.log('adjusted!')
    // return domain_scores.sort(sortDescendingBySecondElement)[0]
    //}

    //if that didn't change anything, the last resort is using queries and repeats
    //if (verbose) console.log("trying query augmentation")
    //scores = augmentQueries(url, scores, cdb.queryVariables)
    //
    ////remove any scores with a similarity of less than 0.25
    //let scores_filtered = []
    //for (let s of scores) {
    // if (s[1] >= 0.25) {
    //  scores_filtered.push(s)
    // }
    //}
    //scores = scores_filtered
    //
    //if (verbose) console.log('scores: ' + scores)
    // //console.log("trying repeat word augmentation")
    // //scores = augmentRepeatWords(scores)
    // //console.log('scores: ' + scores)
    //
    ////convert from Wiki to IAB v2
    //scores = convertWikiToIAB(scores)
    //
    ////finish up
    //if (verbose) console.log("Finishing up")
    //return scores
    //}

    this.init = function() {
        return Task.spawn(function*() {
            yield cdb.init();
@@ -593,31 +507,6 @@ function removeDomainNames(url, title) {
    }
}

function getURLChunks(url) {
    //gets chunks from the URL
    //this will have filters such as only words that exist in mozcat or something
    //but that requires a huge amount of edit-distance work

    url = url.toLowerCase()
    //domain = getDomain(url)
    //url = url.split(domain)[1] //remove domain itself
    //url = url.split("?")[0] //eliminate query variables

    url = url.match(wordFinder) || [] //match may return null if the url has no word characters
    if (verbose) console.log('url chunks found in word finder: ' + url)

    let useful_words = []
    for (let word of url) {
        if (word in mozcat_words) {
            useful_words.push(word)
        }
    }

    if (verbose) console.log('url chunks left after mozcat_words: ' + useful_words)

    return useful_words.join(" ")
}
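
//Example (hypothetical input): for "http://site.com/recipes/pasta?q=1",
//wordFinder yields words like ['http', 'site', 'com', 'recipes', 'pasta', 'q'];
//if only 'recipes' and 'pasta' appear in mozcat_words, this returns "recipes pasta".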
// Classification
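
//The engine scores a cleaned title against a category's keyword weights with
//cosine similarity: sim(t, c) = (t . c) / (|t| * |c|). A minimal commented
//sketch of that idea (a hypothetical stand-alone version, not the elided
//implementation below; cosineSim, text_counts etc. are illustrative names):
//  function cosineSim(text_counts, category_keywords, category_magnitude) {
//      let dot = 0, text_magnitude = 0
//      for (let word in text_counts) {
//          text_magnitude += Math.pow(text_counts[word], 2)
//          if (category_keywords.hasOwnProperty(word)) {
//              dot += text_counts[word] * category_keywords[word]
//          }
//      }
//      text_magnitude = Math.sqrt(text_magnitude)
//      return text_magnitude == 0 ? 0 : dot / (text_magnitude * category_magnitude)
//  }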

function cosineSimilarity(text, category_keywords, category_magnitude) {
@@ -659,84 +548,6 @@ function cosineSimilarity(text, category_keywords, category_magnitude) {
    return 0
}

function ClassificationEngine(world = false) {
    //a class that can classify a visit
    //There is one option, a flag called "world". There are many titles where the only
    //useful thing that can be matched is the fact that they contain "china" or "argentina".
    //These fall into the <world> top level category in the tree. They will later be used
    //for plotting things on a map, but for the moment I think we can sacrifice some
    //recall for the higher precision this attains.
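    //e.g. with world == false, a wiki category whose IAB mapping sits under
    //tree['world'] (say 'argentina', a hypothetical entry) is excluded from the
    //index, so a title like "holidays in argentina" cannot match on the country
    //name alone.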

    //initializer and world remover
    let categories = []
    if (world === true) {
        categories = Object.keys(new_mappings)
    } else {
        for (let k of Object.keys(new_mappings)) {
            if (tree['world'].indexOf(new_mappings[k]) == -1) {
                categories.push(k)
            }
        }
    }

    if (verbose) console.log('<world> is set to ' + world + " and the inverse index will use " + categories.length + " categories out of " + Object.keys(new_mappings).length + " total")

    //build inverse index and magnitudes
    this.id_to_article = {}
    this.inverse_index = {}
    this.magnitudes = {} //note that magnitudes are based on article ids, not category names

    for (let index = 0; index < categories.length; index++) {
        let category = categories[index]
        let keywords = payload[category]
        let magnitude = 0

        this.id_to_article[index] = category
        for (let k in keywords) {
            if (this.inverse_index.hasOwnProperty(k) == false) {
                this.inverse_index[k] = [index]
            } else {
                this.inverse_index[k].push(index)
            }
            magnitude += Math.pow(keywords[k], 2)
        }

        magnitude = Math.sqrt(magnitude) //precalculate magnitude square roots
        this.magnitudes[index] = magnitude
    }
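
    //For illustration only (hypothetical category names and values), after this
    //loop the structures look roughly like:
    //  this.id_to_article = {0: 'association_football', 1: 'basketball', ...}
    //  this.inverse_index = {'goal': [0], 'court': [1], 'league': [0, 1], ...}
    //  this.magnitudes    = {0: 57.3, 1: 42.1, ...}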

    //classifier
    this.classify = function(url, title) {
        title = title.toLowerCase().match(wordFinder) || [] //match may return null for titles with no words
        let matches = []

        let articles = {} //a set of candidate articles worth looking at, auto-deduped
        for (let keyword of title) {
            if (this.inverse_index.hasOwnProperty(keyword)) {
                for (let article of this.inverse_index[keyword]) {
                    articles[article] = true //effectively a set union of the posting lists for every title keyword
                }
            }
        }

        let scores = [] //classify against each candidate category
        for (let article_number in articles) {
            let category = this.id_to_article[article_number]
            let words = payload[category]
            let similarity = cosineSimilarity(title, words, this.magnitudes[article_number])
            if (similarity != 0) {
                scores.push([category, similarity])
            }
        }

        scores = scores.sort(sortDescendingBySecondElement)
        return scores.slice(0, 10)
    }
}
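
//Example usage (hypothetical title and illustrative scores):
//  let engine = new ClassificationEngine(false)
//  engine.classify("http://example.com/nba", "nba finals game seven highlights")
//  // => e.g. [['basketball', 0.61], ['national_basketball_association', 0.44], ...]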

// Post processing

function augmentRepeatWords(results) {
@@ -766,163 +577,6 @@ function augmentRepeatWords(results) {
    return results
}

function augmentDomainMatchers(url, title, results) {
    //grab the domain's classifications and boost results that have matching word lemmas/stems

    //typically anything called society or reference is a bad classification
    let ignore = {
        'society': true,
        'reference': true,
        'uncategorized': true,
        '__news_counter': true,
        'marketing': true,
    }

    let class_maps = {
        'history': ['histor'],
        'sports': ['sport', 'gam'],
        'computers': ['comput', 'tech', 'algorithm', 'model'],
        'science': ['theor', 'hypothes', 'species', 'scien'],
        'shopping': ['store', 'shop', 'brand', 'outlet', 'inc', 'ltd', 'compan'],
        'news': ['the ', 'daily', 'morning', 'times', 'new'],
        'health': ['diet', 'health'],
        'hobby': ['interest', 'coin', 'stamp', 'hobb'],
        'cuisine': ['cuisine', 'culinary', 'food', 'sauce', 'method', 'cook', 'technique', 'style'],
        'travel': ['city', 'travel', 'rout', 'hotel', 'town', 'countr', 'state', 'region'],
        'education': ['school', 'education', 'class', 'university', 'college', 'campus'],
        'family': ['parent', 'famil', 'child', 'matern', 'father', 'mother', 'pat', 'mat', 'sister', 'brother', 'pregnan'],
        'finance': ['bank', 'financ', 'institut', 'loan', 'rate', 'tax'],
        'business': ['compan', 'inc', 'ltd', 'business'],
        'video-games': ['gam', 'video', 'computer', 'system', 'console'],
        'fashion': ['brand', 'design', 'fashion'],
        'tv': ['telev', 'tv', 'show', 'series', 'episode', 'season', 'character', 'act', 'theme'],
        'movies': ['film', 'movie', 'direct', 'act', 'prod', 'cinem', 'studio', 'set'],
        'technology': ['tech', 'digit', 'elec'],
        'food': ['recipe', 'restaurant', 'bar', 'cuisine', 'food', 'sauce', 'cook', 'technique', 'style'],
        'women': ['wom', 'fem'],
        'government': ['gov', 'admin', 'dept', 'nationa', 'polic'],
        'discounts': ['coupon', 'discount'],
        'consumer-electronics': ['model', 'brand', 'series', 'inc'],
        'arts': ['artist', 'paint', 'direct'],
        'politics': ['gov', 'polit', 'polic', 'law', 'charter', 'treat', 'part', 'coalition', 'bill', 'usc', 'parl', 'tax', 'camp'],
        'music': ['music', 'band', 'album', 'single', 'side', 'release', 'song', 'sing', 'lyric', 'genre', 'style'],
        'banking': ['bank', 'financ', 'institut', 'account', 'credit', 'debit'],
        'drinks': ['drink', 'ingredient'],
        'religion': ['religi', 'church', 'temple', 'congregat'],
        'cars': ['car', 'model', 'engin', 'moto', 'auto'],
        'outdoors': ['range', 'rout'],
        'reading': ['read', 'book', 'novel', 'ser', 'auth'],
        'games': ['game', 'lotter'],
        'home': ['home', 'style'],
        'career': ['career', 'job', 'pro'],
        'weather': ['hurr', 'season'],
        'photography': ['style'],
        'entertainment': ['entertain'],
        'blogging': ['blog'],
        'reviews': ['review'],
        'image-sharing': ['imag', 'shar'],
        'relationship': ['relation'],
        'clothes': ['brand', 'cloth', 'design', 'fashion'],
        'shoes': ['shoe', 'foot'],
        'email': ['mail'],
        'law': ['law', 'bill', 'treat', 'armis', 'cease', 'peace', 'legal', 'camp'],
        'real-estate': ['real', 'estate', 'zone', 'house', 'apart'],
        'radio': ['radio', 'channel', 'station'],
        'men': ['male', 'man', 'masc', 'men'],
        'pets': ['spec', 'breed', 'type', 'animal', 'pet'],
        'maps': ['map', 'chart', 'cart', 'projec'],
        'writing': ['author', 'book', 'series', 'issue', 'style', 'writ'],
        'motorcycles': ['bike', 'motor'],
        'dance': ['danc'],
    }

    url = parseUri(url)
    title = title.toLowerCase()
    //have to iteratively check if bits of the url are in domainRules
    //e.g. http://something.search.domain.com should first search for everything,
    //then search.domain.com, then domain.com
    //no point in searching for just .com

    let domain = url.host.split(".")
    for (let dot_count in domain) {
        let key = domain.slice(dot_count).join(".")
        if (domainRules.hasOwnProperty(key)) {
            //found an entry in domainRules

            //For example:
            // "engadget.com" : {
            //     "topics robots" : "science",
            //     "imac" : "computers",
            //     "__ANY" : [
            //         "technology",
            //         "shopping",
            //         "consumer-electronics"
            //     ],
            //     "review" : "reviews",
            //     "tag nintendowiiu" : "video-games"
            // },

            let category_matchers = domainRules[key]
            let decision = false
            let keys = Object.keys(category_matchers).sort()

            //iterate through all keys, __ANY comes last, to see if one matches
            for (let k of keys) {
                if (k != "__ANY") {
                    let tokens = k.split(" ")
                    let match_count = 0

                    for (let token of tokens) {
                        if (title.indexOf(token) != -1) {
                            match_count += 1
                        }
                    }

                    if (match_count == tokens.length) {
                        decision = category_matchers[k]
                        if (verbose) console.log("Exact token match found")
                        break
                    }
                }
            }

            //check if a decision was made
            if (decision == false) {
                if (category_matchers.hasOwnProperty("__ANY")) { //if not, look at __ANY
                    if (verbose) console.log("No exact title token match found, so going with __ANY, which is: " + category_matchers['__ANY'])
                    decision = category_matchers['__ANY']
                } else {
                    return results //if there's still nothing, just return the original results from the argument
                }
            }

            //now try to rerank results based on components
            if (typeof decision === "string") { //decision could be 1 or more categories, make it consistent
                decision = [decision]
            }

            //now iterate through the decision categories and add 1 to each result
            //category that contains the stems
            for (let category of decision) {
                if (class_maps.hasOwnProperty(category)) {
                    for (let i = 0; i < results.length; i++) {
                        for (let stem of class_maps[category]) {
                            if (results[i][0].toLowerCase().indexOf(stem) != -1) {
                                results[i][1] += 1
                                break
                            }
                        }
                    }
                }
            }
            break
        }
    }
    return results
}
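
//Worked example (hypothetical): if domainRules maps the host to 'movies' and
//results are [['Film_production', 0.4], ['Banks', 0.5]], the stem 'film'
//matches the first entry, so results become [['Film_production', 1.4],
//['Banks', 0.5]] and the film category now sorts first.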

function augmentQueries(url, results, queryDatabase) {
    //Tries to spot any search queries in the url
    //Doubles the score of anything that contains a search query word
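    //A minimal sketch of the idea (the actual body is elided from this hunk):
    //pull query-variable values out of the url using queryDatabase, split them
    //into words, then for each result whose category name contains one of those
    //words, set result[1] *= 2.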
@@ -959,112 +613,14 @@ function augmentQueries(url, results, queryDatabase) {
    return results
}

function convertWikiToIAB(results, level = "top") {
    //converts a set of wiki categories to IAB categories
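    //Example (hypothetical category names and mappings): results like
    //  [['premier_league', 0.8], ['fifa_world_cup', 0.6], ['bundesliga', 0.5]]
    //might all map to the top level 'sports' with sub level 'soccer', giving
    //  counts = {sports: 3}, mappings = {sports: {soccer: 3}}
    //and a return value of ['sports', 'soccer'].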

    let counts = {}
    let mappings = {}

    for (let result of results) { //get frequencies per top level
        let wiki_cat_name = result[0].toLowerCase()
        let iab_mapping = new_mappings[wiki_cat_name]
        if (verbose) console.log('checking wiki: ' + wiki_cat_name + ' which has IAB mapping: ' + iab_mapping)

        let top_level = 0
        let sub_level = 0

        //is the IAB mapping already top level?
        if (tree.hasOwnProperty(iab_mapping)) {
            top_level = iab_mapping
            sub_level = "general"
        } else {
            for (let tlcat of Object.keys(tree)) {
                if (tree[tlcat].indexOf(iab_mapping) != -1) {
                    top_level = tlcat
                    sub_level = iab_mapping
                }
            }
        }

        if (verbose) console.log('TL: ' + top_level + " SL: " + sub_level)

        if (top_level != 0) {
            if (mappings.hasOwnProperty(top_level) == false) {
                mappings[top_level] = {}
            }
            if (mappings[top_level].hasOwnProperty(sub_level) == false) {
                mappings[top_level][sub_level] = 0
            }
            mappings[top_level][sub_level] += 1

            if (counts.hasOwnProperty(top_level) == false) {
                counts[top_level] = 1
            } else {
                counts[top_level] += 1
            }
        }
    }

    if (verbose) console.log('counts: ' + JSON.stringify(counts))
    if (verbose) console.log('mapping counts: ' + JSON.stringify(mappings))

    //if there's nothing
    if (Object.keys(counts).length == 0) {
        return ['uncategorized', 'dummy']
    }

    //get top item
    let counts_list = [] //convert to list
    let total = 0
    for (let key in counts) {
        counts_list.push([key, counts[key]]);
        total += counts[key]
    }
    counts_list.sort(sortDescendingBySecondElement)
    let top = counts_list[0]

    //check if the match is strong enough
    if (top[1] >= 0.3 * total) { //at least 30% of the matches
        if (top[1] > 1) { //as long as it's not just, say, 3 categories each with a count of 1

            let to_return = [top[0]] //name of the top level

            let subs_list = [] //convert to list
            for (let key in mappings[top[0]]) {
                subs_list.push([key, mappings[top[0]][key]])
            }
            subs_list.sort(sortDescendingBySecondElement)
            let sub_list_names = [] //i wish there were list comprehensions...
            for (let x of subs_list) {
                sub_list_names.push(x[0])
            }

            //concatenate the different sub level cats
            let sub_levels = sub_list_names.join("/")
            to_return.push(sub_levels)

            return to_return
        } else {
            return ['uncategorized', 'dummy']
        }
    } else {
        return ['uncategorized', 'dummy']
    }
}

// Auxiliary functions, matchers, options etc

const {data} = require("sdk/self"); //not quite sure why this is necessary
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("domainRules.json"));
scriptLoader.loadSubScript(data.url("new_mappings.json"));
scriptLoader.loadSubScript(data.url("mozcat_heirarchy.json"));
scriptLoader.loadSubScript(data.url("mozcat_words.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let payload = JSON.parse(data.load("payload.json"));

function getDomain(url) {
    //returns the (sub)domain of a url
@@ -1172,49 +728,6 @@ function sortDescendingByElementLength(first, second) {
    return second.length - first.length
}

//Classification persistence on disc

function saveClassifications(visit_id_to_iab_lower) {
    //creates an id-iab mapping for brevity
    //saves that, and a mapping of visit id to classification id
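
    //For illustration only (hypothetical ids), the object written to disc looks like:
    //  {
    //      'mapping': {'sports': 0, 'soccer': 1, ...},
    //      'classifications': {'12345': 1, ...}
    //  }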

    //create tree mapping using mozcat heirarchy
    let iab_ids = {}
    let count = 0
    for (let top_level in tree) {
        iab_ids[top_level] = count
        count += 1
        for (let subcat of tree[top_level]) { //tree values are arrays, so iterate with "of", not "in"
            iab_ids[subcat] = count
            count += 1
        }
    }

    //map classifications
    let classifications = {}
    for (let visit_id in visit_id_to_iab_lower) {
        let iab = visit_id_to_iab_lower[visit_id]
        let mapping = iab_ids[iab]
        classifications[visit_id] = mapping
    }

    //now put everything together
    let everything = {
        'mapping': iab_ids,
        'classifications': classifications
    }

    //now save
    let encoder = new TextEncoder();
    let array = encoder.encode(JSON.stringify(everything)); //stringify first; encode() expects a string
    let promise = OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "classifications.json"), array, {
        tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "classifications.json.tmp")
    });
}

function loadClassifications() {
    //returns an id to iab mapping
    //loads meta information into an object with timestamp and id