Removing empty/unused json files and associated code.

This commit is contained in:
Marina Samuel 2014-11-18 14:44:55 -05:00
Родитель 15b7b23aa2
Коммит ceb587f613
6 изменённых файлов: 0 добавлений и 493 удалений

Просмотреть файл

@ -1 +0,0 @@
let domainRules = {} // empty placeholder table (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
let tree = {} // empty placeholder category tree (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
let mozcat_words = {} // empty placeholder word set (whole file removed by this commit as unused)

Просмотреть файл

@ -1,2 +0,0 @@
let new_mappings = {} // empty placeholder mapping (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
{}

Просмотреть файл

@ -27,8 +27,6 @@ function LWCAClassifier(worker) {
let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
//it also checks if it needs to be updated etc
let ce = new ClassificationEngine(false) // set this flag to "true" if you want titles with countries to be matched (see function for more details)
//build vk-tree
vk_tree = {}
for (let top_level of Object.keys(words_tree)) {
@ -178,90 +176,6 @@ function LWCAClassifier(worker) {
}
//Handle requests
//this.classify = function(url, title) {
//pre process
//title = title.toLowerCase()
//if (verbose) console.log("Pre processing")
// //shortcuts
//let sd = spotDefinites(url, title)
//if (sd) {
// if (verbose) console.log("Spotted definite match")
// return sd
//}
//cleaning
//if (verbose) console.log("title before cleaning: " + title)
//title = removePersistentTitleChunks(url, title, cdb.persistentTitleChunks) //returns a string
//if (verbose) console.log("removed persistents: " + title)
//let chunks = getURLChunks(url)
//title = chunks + " " + title
//if (verbose) console.log("added url chunks: <" + chunks + ">")
//title = removeDomainNames(url, title) //try to remove domain names
//if (verbose) console.log("removed domain names: " + title)
//classify
//if (verbose) console.log("Classifying")
//if (verbose) console.log("Payload size is: " + Object.keys(payload).length)
//if (verbose) console.log("DomainRules size is: " + Object.keys(domainRules).length)
//cosine similarity
//let scores = ce.classify(url, title)
//if (verbose) console.log("scores: " + scores)
//if (scores.length == 0) {
// return ['uncategorized', 'dummy']
//}
//post process
//if (verbose) console.log("Post processing")
//
//if (verbose) console.log('looking for domain scores')
//let domain_scores = augmentDomainMatchers(url, title, scores)
//if (domain_scores != scores) {
// if (verbose) console.log('adjusted!')
// return domain_scores.sort(sortDescendingBySecondElement)[0]
//}
//if that didn't change anything, last resort is using queries and repeats
//if (verbose) console.log("trying query augmentation")
//scores = augmentQueries(url, scores, cdb.queryVariables)
//
////remove any scores with a similarity of less than 0.3
//let scores_filtered = []
//for (let s of scores) {
// if (s[1] >= 0.25) {
// scores_filtered.push(s)
// }
//}
//scores = scores_filtered
//
//if (verbose) console.log('scores: ' + scores)
// //console.log("trying repeat word augmentation")
// //scores = augmentRepeatWords(scores)
// //console.log('scores: ' + scores)
//
////convert from Wiki to IAB v2
//scores = convertWikiToIAB(scores)
//
////finish up
//if (verbose) console.log("Finishing up")
//return scores
//}
this.init = function() {
return Task.spawn(function*() {
yield cdb.init();
@ -593,31 +507,6 @@ function removeDomainNames(url, title) {
}
}
// Extracts words from a URL and keeps only those that appear in the
// mozcat_words set; returns them space-joined ("" when none are found).
// Relies on file-level globals: wordFinder (word-matching regex),
// mozcat_words (word lookup table), verbose (debug-logging flag).
function getURLChunks(url) {
  //gets chunks from the URL
  //this will have filters such as only words that exist in mozcat or something
  //but that requires a huge amount of edit distance stuff
  url = url.toLowerCase()
  //domain = getDomain(url)
  //url = url.split(domain)[1] //remove domain itself
  //url = url.split("?")[0] //eliminate query variables
  url = url.match(wordFinder)
  if (url == null) {
    //match() returns null when no word matches; previously this crashed
    //the for...of loop below with a TypeError
    return ""
  }
  if (verbose) console.log('url chunks found in word finder: ' + url)
  let useful_words = []
  for (let word of url) {
    if (word in mozcat_words) {
      useful_words.push(word)
    }
  }
  if (verbose) console.log('url chunks left after mozcat_words: ' + useful_words)
  return useful_words.join(" ")
}
// Classification
function cosineSimilarity(text, category_keywords, category_magnitude) {
@ -659,84 +548,6 @@ function cosineSimilarity(text, category_keywords, category_magnitude) {
return 0
}
//a class that can classify a visit
//
//Builds an inverse index (keyword -> article ids) over the per-category
//keyword payload, precomputes each category's vector magnitude, and exposes
//classify(url, title) which scores the title words against every candidate
//category with cosineSimilarity and returns up to the 10 best
//[category, similarity] pairs, best first.
//
//There's one option, a flag called "world". Basically there are lots of titles where the only useful
//thing that can be matched to is the fact that it contains "china" or "argentina". These fall into the
// <world> top level category in the tree. They will be later used for plotting things on a map, but at the
//moment I think we can sacrifice some recall for the higher precision that this will attain.
//
//Depends on file-level globals: new_mappings, tree, payload, wordFinder,
//cosineSimilarity, sortDescendingBySecondElement, verbose.
function ClassificationEngine(world = false) {
  //initializer and world remover
  let categories = []
  if (world === true) {
    categories = Object.keys(new_mappings)
  } else {
    //drop every category whose IAB mapping lives under the <world> top level
    for (let k of Object.keys(new_mappings)) {
      if (tree['world'].indexOf(new_mappings[k]) == -1) {
        categories.push(k)
      }
    }
  }
  if (verbose) console.log('<world> is set to ' + world + " and the inverse index will use " + categories.length + " categories out of " + Object.keys(new_mappings).length + " total")
  //build inverse index and magnitudes
  this.id_to_article = {}
  this.inverse_index = {}
  this.magnitudes = {} //note that magnitudes are based on article ids, not category names
  for (let index = 0; index < categories.length; index++) {
    let category = categories[index]
    let keywords = payload[category]
    let magnitude = 0
    this.id_to_article[index] = category
    for (let k in keywords) {
      if (this.inverse_index.hasOwnProperty(k) == false) {
        this.inverse_index[k] = [index]
      } else {
        this.inverse_index[k].push(index)
      }
      magnitude += Math.pow(keywords[k], 2)
    }
    magnitude = Math.sqrt(magnitude) //precalculate magnitude square roots
    this.magnitudes[index] = magnitude
  }
  //classifier
  this.classify = function(url, title) {
    title = title.toLowerCase().match(wordFinder)
    if (title == null) {
      //match() returns null when the title contains no words; previously the
      //for...of loop below crashed with a TypeError in that case
      return []
    }
    let articles = {} // a set of articles worth looking at, auto-deduped
    for (let keyword of title) {
      if (this.inverse_index.hasOwnProperty(keyword)) {
        for (let article of this.inverse_index[keyword]) {
          articles[article] = true //effectively the set intersection
        }
      }
    }
    let scores = [] //classify against each category
    for (let article_number in articles) {
      let category = this.id_to_article[article_number]
      let words = payload[category]
      let similarity = cosineSimilarity(title, words, this.magnitudes[article_number])
      if (similarity != 0) {
        scores.push([category, similarity])
      }
    }
    scores = scores.sort(sortDescendingBySecondElement)
    return scores.slice(0, 10)
  }
}
// Post processing
function augmentRepeatWords(results) {
@ -766,163 +577,6 @@ function augmentRepeatWords(results) {
return results
}
// grab domain classifications and multiply those that have
// matching word lemmas/stems
//
// Looks the url's host up in domainRules (trying progressively shorter
// domain suffixes), picks a category decision either from an exact
// title-token rule or from the "__ANY" fallback entry, then adds 1 to the
// score of every result whose category name contains one of the decision
// category's word stems (class_maps). The `results` array of
// [category, score] pairs is mutated in place and also returned.
// Depends on file-level globals: domainRules, parseUri, verbose.
function augmentDomainMatchers(url, title, results) {
  //word stems per category used to match result-category names below
  let class_maps = {
    'history': ['histor'],
    'sports': ['sport', 'gam'],
    'computers': ['comput', 'tech', 'algorithm', 'model'],
    'science': ['theor', 'hypothes', 'species', 'scien'],
    'shopping': ['store', 'shop', 'brand', 'outlet', 'inc', 'ltd', 'compan'],
    'news': ['the ', 'daily', 'morning', 'times', 'new'],
    'health': ['diet', 'health'],
    'hobby': ['interest', 'coin', 'stamp', 'hobb'],
    'cuisine': ['cuisine', 'culinary', 'food', 'sauce', 'method', 'cook', 'technique', 'style'],
    'travel': ['city', 'travel', 'rout', 'hotel', 'town', 'countr', 'state', 'region'],
    'education': ['school', 'education', 'class', 'university', 'college', 'campus'],
    'family': ['parent', 'famil', 'child', 'matern', 'father', 'mother', 'pat', 'mat', 'sister', 'brother', 'pregnan'],
    'finance': ['bank', 'financ', 'institut', 'loan', 'rate', 'tax'],
    'business': ['compan', 'inc', 'ltd', 'business'],
    'video-games': ['gam', 'video', 'computer', 'system', 'console'],
    'fashion': ['brand', 'design', 'fashion'],
    'tv': ['telev', 'tv', 'show', 'series', 'episode', 'season', 'character', 'act', 'theme'],
    'movies': ['film', 'movie', 'direct', 'act', 'prod', 'cinem', 'studio', 'set'],
    'technology': ['tech', 'digit', 'elec'],
    'food': ['recipe', 'restaurant', 'bar', 'cuisine', 'food', 'sauce', 'cook', 'technique', 'style'],
    'women': ['wom', 'fem'],
    'government': ['gov', 'admin', 'dept', 'nationa', 'polic'],
    'discounts': ['coupon', 'discount'],
    'consumer-electronics': ['model', 'brand', 'series', 'inc'],
    'arts': ['artist', 'paint', 'direct'],
    'politics': ['gov', 'polit', 'polic', 'law', 'charter', 'treat', 'part', 'coalition', 'bill', 'usc', 'parl', 'tax', 'camp'],
    'music': ['music', 'band', 'album', 'single', 'side', 'release', 'song', 'sing', 'lyric', 'genre', 'style'],
    'banking': ['bank', 'financ', 'institut', 'account', 'credit', 'debit'],
    'drinks': ['drink', 'ingredient'],
    'religion': ['religi', 'church', 'temple', 'congregat'],
    'cars': ['car', 'model', 'engin', 'moto', 'auto'],
    'outdoors': ['range', 'rout'],
    'reading': ['read', 'book', 'novel', 'ser', 'auth'],
    'games': ['game', 'lotter'],
    'home': ['home', 'style'],
    'career': ['career', 'job', 'pro'],
    'weather': ['hurr', 'season'],
    'photography': ['style'],
    'entertainment': ['entertain'],
    'blogging': ['blog'],
    'reviews': ['review'],
    'image-sharing': ['imag', 'shar'],
    'relationship': ['relation'],
    'clothes': ['brand', 'cloth', 'design', 'fashion'],
    'shoes': ['shoe', 'foot'],
    'email': ['mail'],
    'law': ['law', 'bill', 'treat', 'armis', 'cease', 'peace', 'legal', 'camp'],
    'real-estate': ['real', 'estate', 'zone', 'house', 'apart'],
    'radio': ['radio', 'channel', 'station'],
    'men': ['male', 'man', 'masc', 'men'],
    'pets': ['spec', 'breed', 'type', 'animal', 'pet'],
    'maps': ['map', 'chart', 'cart', 'projec'],
    'writing': ['author', 'book', 'series', 'issue', 'style', 'writ'],
    'motorcycles': ['bike', 'motor'],
    'dance': ['danc'],
  }
  url = parseUri(url)
  title = title.toLowerCase()
  //have to basically iteratively check if bits of the url are in domainRules
  //e.g. http://something.search.domain.com should first search for everything,
  //then search.domain.com, then domain.com
  //no point in searching for just .com
  let domain = url.host.split(".") //was an implicit global
  for (let dot_count = 0; dot_count < domain.length; dot_count++) {
    let key = domain.slice(dot_count).join(".")
    if (domainRules.hasOwnProperty(key)) {
      //found an entry in domainRules
      //For example:
      // "engadget.com" : {
      //   "topics robots" : "science",
      //   "imac" : "computers",
      //   "__ANY" : [
      //     "technology",
      //     "shopping",
      //     "consumer-electronics"
      //   ],
      //   "review" : "reviews",
      //   "tag nintendowiiu" : "video-games"
      // },
      let category_matchers = domainRules[key]
      let decision = false
      let keys = Object.keys(category_matchers).sort()
      //iterate through all keys, __ANY comes last to see if one matches
      //bug fix: the original used `for...in` over the keys ARRAY, which
      //iterated indices "0", "1", ... instead of rule keys, so an exact
      //token match could never be found
      for (let k of keys) {
        if (k != "__ANY") {
          let tokens = k.split(" ")
          let match_count = 0
          for (let token of tokens) {
            if (title.indexOf(token) != -1) {
              match_count += 1
            }
          }
          if (match_count == tokens.length) { //all tokens of the rule occur in the title
            decision = category_matchers[k]
            if (verbose) console.log("Exact token match found")
            break
          }
        }
      }
      //check if decision was made
      if (decision == false) {
        if (category_matchers.hasOwnProperty("__ANY")) { //if not, look at __ANY
          if (verbose) console.log("No exact title token match found, so going with __ANY, which is: " + category_matchers['__ANY'])
          decision = category_matchers['__ANY']
        } else {
          return results //if there's still nothing, just return the original results from the argument
        }
      }
      //now try and rerank results based on components
      if (typeof decision === "string") { //decision could be 1 or more categories, make it consistent
        decision = [decision]
      }
      //now iterate through the decision categories and add 1 to each result
      //category that contains the stems
      for (let category of decision) {
        if (class_maps.hasOwnProperty(category)) {
          for (let i = 0; i < results.length; i++) { //was an implicit global i
            for (let stem of class_maps[category]) {
              if (results[i][0].toLowerCase().indexOf(stem) != -1) {
                results[i][1] += 1
                break
              }
            }
          }
        }
      }
      break
    }
  }
  return results
}
function augmentQueries(url, results, queryDatabase) {
//Tries to spot any search queries in the url
//Doubles the score of anything that contains a search query word
@ -959,112 +613,14 @@ function augmentQueries(url, results, queryDatabase) {
return results
}
//converts a set of wiki categories to IAB categories
//
//results: list of [wiki_category, score] pairs. Each wiki category is mapped
//via new_mappings to an IAB category, which is then located in the two-level
//`tree` (top level -> array of sub levels). Returns [top_level, "sub/levels"]
//when a single top level accounts for at least 30% of the mapped results and
//has more than one vote; otherwise returns ['uncategorized', 'dummy'].
//NOTE(review): the `level` parameter is currently unused; kept for interface
//compatibility with callers.
//Depends on file-level globals: new_mappings, tree, sortDescendingBySecondElement, verbose.
function convertWikiToIAB(results, level = "top") {
  let counts = {}
  let mappings = {}
  for (let result of results) { //get frequencies per top level
    let wiki_cat_name = result[0].toLowerCase()
    let iab_mapping = new_mappings[wiki_cat_name]
    if (verbose) console.log('checking wiki: ' + wiki_cat_name + ' which has IAB mapping: ' + iab_mapping)
    let top_level = 0 //was an implicit global
    let sub_level = 0 //was an implicit global
    //is the IAB mapping already top level?
    if (tree.hasOwnProperty(iab_mapping)) {
      top_level = iab_mapping
      sub_level = "general"
    } else {
      //otherwise search the sub-level arrays for it
      for (let tlcat of Object.keys(tree)) {
        if (tree[tlcat].indexOf(iab_mapping) != -1) {
          top_level = tlcat
          sub_level = iab_mapping
        }
      }
    }
    if (verbose) console.log('TL: ' + top_level + " SL: " + sub_level)
    if (top_level != 0) { //only count results that actually mapped somewhere
      if (mappings.hasOwnProperty(top_level) == false) {
        mappings[top_level] = {}
      }
      if (mappings[top_level].hasOwnProperty(sub_level) == false) {
        mappings[top_level][sub_level] = 0
      }
      mappings[top_level][sub_level] += 1
      if (counts.hasOwnProperty(top_level) == false) {
        counts[top_level] = 1
      } else {
        counts[top_level] += 1
      }
    }
  }
  if (verbose) console.log('counts: ' + JSON.stringify(counts))
  if (verbose) console.log('mapping counts: ' + JSON.stringify(mappings))
  //if there's nothing
  if (Object.keys(counts).length == 0) {
    return ['uncategorized', 'dummy']
  }
  //get top item
  let counts_list = [] //was an implicit global
  let total = 0 //was an implicit global
  for (let key in counts) {
    counts_list.push([key, counts[key]]);
    total += counts[key]
  } //convert to list
  counts_list.sort(sortDescendingBySecondElement)
  let top = counts_list[0] //was an implicit global
  //check if the match is strong enough
  if (top[1] >= 0.3 * total) { //at least 30% of the matches
    if (top[1] > 1) { //as long as its not just 3 all with a score of 1
      let to_return = [top[0]] //name of the top level
      let subs_list = [] //convert to list
      for (let key in mappings[top[0]]) {
        subs_list.push([key, mappings[top[0]][key]])
      }
      subs_list.sort(sortDescendingBySecondElement)
      let sub_list_names = [] //i wish there were list comprehensions...
      for (let x of subs_list) {
        sub_list_names.push(x[0])
      }
      //concatenate the different sub level cats
      let sub_levels = sub_list_names.join("/")
      to_return.push(sub_levels)
      return to_return
    } else {
      return ['uncategorized', 'dummy']
    }
  } else {
    return ['uncategorized', 'dummy']
  }
}
// Auxiliary functions, matchers, options etc
const {data} = require("sdk/self"); //not quite sure why this is necessary
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("domainRules.json"));
scriptLoader.loadSubScript(data.url("new_mappings.json"));
scriptLoader.loadSubScript(data.url("mozcat_heirarchy.json"));
scriptLoader.loadSubScript(data.url("mozcat_words.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let payload = JSON.parse(data.load("payload.json"));
function getDomain(url) {
//returns the (sub)domain of a url
@ -1172,49 +728,6 @@ function sortDescendingByElementLength(first, second) {
return second.length - first.length
}
//Classification persistence on disc
//creates an id-iab mapping for brevity
//saves that, and a mapping of visit id to classification id
//
//visit_id_to_iab_lower: object mapping visit ids to lowercase IAB category
//names. Writes classifications.json (atomically) into the profile directory
//and returns the OS.File.writeAtomic promise so callers can await completion.
function saveClassifications(visit_id_to_iab_lower) {
  //create tree mapping using mozcat heirarchy
  let iab_ids = {} //was an implicit global
  let count = 0 //was an implicit global
  for (let top_level in tree) {
    iab_ids[top_level] = count
    count += 1
    //bug fix: tree values are arrays of subcategory names (see the
    //tree['world'].indexOf usage elsewhere in this file); the original
    //`for...in` iterated array INDICES so iab_ids was keyed by "0","1",...
    for (let subcat of tree[top_level]) {
      iab_ids[subcat] = count
      count += 1
    }
  }
  //map classifications
  let classifications = {} //was an implicit global
  for (let visit_id in visit_id_to_iab_lower) {
    let iab = visit_id_to_iab_lower[visit_id]
    classifications[visit_id] = iab_ids[iab]
  }
  //now put everything together
  let everything = {
    'mapping': iab_ids,
    'classifications': classifications
  }
  //now save
  //bug fix: encode() must be given a string; encoding the object directly
  //would have written "[object Object]" instead of JSON
  let encoder = new TextEncoder();
  let array = encoder.encode(JSON.stringify(everything));
  return OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "classifications.json"), array, {
    tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "classifications.json.tmp")
  });
}
function loadClassifications() {
//returns an id to iab mapping
//loads meta information into an object with timestamp and id