Removing empty/unused json files and associated code.

This commit is contained in:
Marina Samuel 2014-11-18 14:44:55 -05:00
Родитель 15b7b23aa2
Коммит ceb587f613
6 изменённых файлов: 0 добавлений и 493 удалений

Просмотреть файл

@ -1 +0,0 @@
let domainRules = {} // empty placeholder table (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
let tree = {} // empty placeholder category tree (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
let mozcat_words = {} // empty placeholder word set (whole file removed by this commit as unused)

Просмотреть файл

@ -1,2 +0,0 @@
let new_mappings = {} // empty placeholder mapping (whole file removed by this commit as unused)

Просмотреть файл

@ -1 +0,0 @@
{}

Просмотреть файл

@ -27,8 +27,6 @@ function LWCAClassifier(worker) {
let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
//it also checks if it needs to be updated etc
let ce = new ClassificationEngine(false) // set this flag to "true" if you want titles with countries to be matched (see function for more details)
//build vk-tree
vk_tree = {}
for (let top_level of Object.keys(words_tree)) {
@ -178,90 +176,6 @@ function LWCAClassifier(worker) {
}
//Handle requests
//this.classify = function(url, title) {
//pre process
//title = title.toLowerCase()
//if (verbose) console.log("Pre processing")
// //shortcuts
//let sd = spotDefinites(url, title)
//if (sd) {
// if (verbose) console.log("Spotted definite match")
// return sd
//}
//cleaning
//if (verbose) console.log("title before cleaning: " + title)
//title = removePersistentTitleChunks(url, title, cdb.persistentTitleChunks) //returns a string
//if (verbose) console.log("removed persistents: " + title)
//let chunks = getURLChunks(url)
//title = chunks + " " + title
//if (verbose) console.log("added url chunks: <" + chunks + ">")
//title = removeDomainNames(url, title) //try to remove domain names
//if (verbose) console.log("removed domain names: " + title)
//classify
//if (verbose) console.log("Classifying")
//if (verbose) console.log("Payload size is: " + Object.keys(payload).length)
//if (verbose) console.log("DomainRules size is: " + Object.keys(domainRules).length)
//cosine similarity
//let scores = ce.classify(url, title)
//if (verbose) console.log("scores: " + scores)
//if (scores.length == 0) {
// return ['uncategorized', 'dummy']
//}
//post process
//if (verbose) console.log("Post processing")
//
//if (verbose) console.log('looking for domain scores')
//let domain_scores = augmentDomainMatchers(url, title, scores)
//if (domain_scores != scores) {
// if (verbose) console.log('adjusted!')
// return domain_scores.sort(sortDescendingBySecondElement)[0]
//}
//if that didn't change anything, last resort is using queries and repeats
//if (verbose) console.log("trying query augmentation")
//scores = augmentQueries(url, scores, cdb.queryVariables)
//
////remove any scores with a similarity of less than 0.3
//let scores_filtered = []
//for (let s of scores) {
// if (s[1] >= 0.25) {
// scores_filtered.push(s)
// }
//}
//scores = scores_filtered
//
//if (verbose) console.log('scores: ' + scores)
// //console.log("trying repeat word augmentation")
// //scores = augmentRepeatWords(scores)
// //console.log('scores: ' + scores)
//
////convert from Wiki to IAB v2
//scores = convertWikiToIAB(scores)
//
////finish up
//if (verbose) console.log("Finishing up")
//return scores
//}
this.init = function() {
return Task.spawn(function*() {
yield cdb.init();
@ -593,31 +507,6 @@ function removeDomainNames(url, title) {
}
}
// Extracts words from a URL and keeps only those that appear in the
// mozcat_words set; returns them space-joined ("" when none are found).
// Relies on file-level globals: wordFinder (word-matching regex),
// mozcat_words (word lookup table), verbose (debug-logging flag).
function getURLChunks(url) {
  //gets chunks from the URL
  //this will have filters such as only words that exist in mozcat or something
  //but that requires a huge amount of edit distance stuff
  url = url.toLowerCase()
  //domain = getDomain(url)
  //url = url.split(domain)[1] //remove domain itself
  //url = url.split("?")[0] //eliminate query variables
  url = url.match(wordFinder)
  if (url == null) {
    //match() returns null when no word matches; previously this crashed
    //the for...of loop below with a TypeError
    return ""
  }
  if (verbose) console.log('url chunks found in word finder: ' + url)
  let useful_words = []
  for (let word of url) {
    if (word in mozcat_words) {
      useful_words.push(word)
    }
  }
  if (verbose) console.log('url chunks left after mozcat_words: ' + useful_words)
  return useful_words.join(" ")
}
// Classification
function cosineSimilarity(text, category_keywords, category_magnitude) {
@ -659,84 +548,6 @@ function cosineSimilarity(text, category_keywords, category_magnitude) {
return 0
}
//a class that can classify a visit
//
//Builds an inverse index (keyword -> article ids) over the per-category
//keyword payload, precomputes each category's vector magnitude, and exposes
//classify(url, title) which scores the title words against every candidate
//category with cosineSimilarity and returns up to the 10 best
//[category, similarity] pairs, best first.
//
//There's one option, a flag called "world". Basically there are lots of titles where the only useful
//thing that can be matched to is the fact that it contains "china" or "argentina". These fall into the
// <world> top level category in the tree. They will be later used for plotting things on a map, but at the
//moment I think we can sacrifice some recall for the higher precision that this will attain.
//
//Depends on file-level globals: new_mappings, tree, payload, wordFinder,
//cosineSimilarity, sortDescendingBySecondElement, verbose.
function ClassificationEngine(world = false) {
  //initializer and world remover
  let categories = []
  if (world === true) {
    categories = Object.keys(new_mappings)
  } else {
    //drop every category whose IAB mapping lives under the <world> top level
    for (let k of Object.keys(new_mappings)) {
      if (tree['world'].indexOf(new_mappings[k]) == -1) {
        categories.push(k)
      }
    }
  }
  if (verbose) console.log('<world> is set to ' + world + " and the inverse index will use " + categories.length + " categories out of " + Object.keys(new_mappings).length + " total")
  //build inverse index and magnitudes
  this.id_to_article = {}
  this.inverse_index = {}
  this.magnitudes = {} //note that magnitudes are based on article ids, not category names
  for (let index = 0; index < categories.length; index++) {
    let category = categories[index]
    let keywords = payload[category]
    let magnitude = 0
    this.id_to_article[index] = category
    for (let k in keywords) {
      if (this.inverse_index.hasOwnProperty(k) == false) {
        this.inverse_index[k] = [index]
      } else {
        this.inverse_index[k].push(index)
      }
      magnitude += Math.pow(keywords[k], 2)
    }
    magnitude = Math.sqrt(magnitude) //precalculate magnitude square roots
    this.magnitudes[index] = magnitude
  }
  //classifier
  this.classify = function(url, title) {
    title = title.toLowerCase().match(wordFinder)
    if (title == null) {
      //match() returns null when the title contains no words; previously the
      //for...of loop below crashed with a TypeError in that case
      return []
    }
    let articles = {} // a set of articles worth looking at, auto-deduped
    for (let keyword of title) {
      if (this.inverse_index.hasOwnProperty(keyword)) {
        for (let article of this.inverse_index[keyword]) {
          articles[article] = true //effectively the set intersection
        }
      }
    }
    let scores = [] //classify against each category
    for (let article_number in articles) {
      let category = this.id_to_article[article_number]
      let words = payload[category]
      let similarity = cosineSimilarity(title, words, this.magnitudes[article_number])
      if (similarity != 0) {
        scores.push([category, similarity])
      }
    }
    scores = scores.sort(sortDescendingBySecondElement)
    return scores.slice(0, 10)
  }
}
// Post processing
function augmentRepeatWords(results) {
@ -766,163 +577,6 @@ function augmentRepeatWords(results) {
return results
}
// grab domain classifications and multiply those that have
// matching word lemmas/stems
//
// Looks the url's host up in domainRules (trying progressively shorter
// domain suffixes), picks a category decision either from an exact
// title-token rule or from the "__ANY" fallback entry, then adds 1 to the
// score of every result whose category name contains one of the decision
// category's word stems (class_maps). The `results` array of
// [category, score] pairs is mutated in place and also returned.
// Depends on file-level globals: domainRules, parseUri, verbose.
function augmentDomainMatchers(url, title, results) {
  //word stems per category used to match result-category names below
  let class_maps = {
    'history': ['histor'],
    'sports': ['sport', 'gam'],
    'computers': ['comput', 'tech', 'algorithm', 'model'],
    'science': ['theor', 'hypothes', 'species', 'scien'],
    'shopping': ['store', 'shop', 'brand', 'outlet', 'inc', 'ltd', 'compan'],
    'news': ['the ', 'daily', 'morning', 'times', 'new'],
    'health': ['diet', 'health'],
    'hobby': ['interest', 'coin', 'stamp', 'hobb'],
    'cuisine': ['cuisine', 'culinary', 'food', 'sauce', 'method', 'cook', 'technique', 'style'],
    'travel': ['city', 'travel', 'rout', 'hotel', 'town', 'countr', 'state', 'region'],
    'education': ['school', 'education', 'class', 'university', 'college', 'campus'],
    'family': ['parent', 'famil', 'child', 'matern', 'father', 'mother', 'pat', 'mat', 'sister', 'brother', 'pregnan'],
    'finance': ['bank', 'financ', 'institut', 'loan', 'rate', 'tax'],
    'business': ['compan', 'inc', 'ltd', 'business'],
    'video-games': ['gam', 'video', 'computer', 'system', 'console'],
    'fashion': ['brand', 'design', 'fashion'],
    'tv': ['telev', 'tv', 'show', 'series', 'episode', 'season', 'character', 'act', 'theme'],
    'movies': ['film', 'movie', 'direct', 'act', 'prod', 'cinem', 'studio', 'set'],
    'technology': ['tech', 'digit', 'elec'],
    'food': ['recipe', 'restaurant', 'bar', 'cuisine', 'food', 'sauce', 'cook', 'technique', 'style'],
    'women': ['wom', 'fem'],
    'government': ['gov', 'admin', 'dept', 'nationa', 'polic'],
    'discounts': ['coupon', 'discount'],
    'consumer-electronics': ['model', 'brand', 'series', 'inc'],
    'arts': ['artist', 'paint', 'direct'],
    'politics': ['gov', 'polit', 'polic', 'law', 'charter', 'treat', 'part', 'coalition', 'bill', 'usc', 'parl', 'tax', 'camp'],
    'music': ['music', 'band', 'album', 'single', 'side', 'release', 'song', 'sing', 'lyric', 'genre', 'style'],
    'banking': ['bank', 'financ', 'institut', 'account', 'credit', 'debit'],
    'drinks': ['drink', 'ingredient'],
    'religion': ['religi', 'church', 'temple', 'congregat'],
    'cars': ['car', 'model', 'engin', 'moto', 'auto'],
    'outdoors': ['range', 'rout'],
    'reading': ['read', 'book', 'novel', 'ser', 'auth'],
    'games': ['game', 'lotter'],
    'home': ['home', 'style'],
    'career': ['career', 'job', 'pro'],
    'weather': ['hurr', 'season'],
    'photography': ['style'],
    'entertainment': ['entertain'],
    'blogging': ['blog'],
    'reviews': ['review'],
    'image-sharing': ['imag', 'shar'],
    'relationship': ['relation'],
    'clothes': ['brand', 'cloth', 'design', 'fashion'],
    'shoes': ['shoe', 'foot'],
    'email': ['mail'],
    'law': ['law', 'bill', 'treat', 'armis', 'cease', 'peace', 'legal', 'camp'],
    'real-estate': ['real', 'estate', 'zone', 'house', 'apart'],
    'radio': ['radio', 'channel', 'station'],
    'men': ['male', 'man', 'masc', 'men'],
    'pets': ['spec', 'breed', 'type', 'animal', 'pet'],
    'maps': ['map', 'chart', 'cart', 'projec'],
    'writing': ['author', 'book', 'series', 'issue', 'style', 'writ'],
    'motorcycles': ['bike', 'motor'],
    'dance': ['danc'],
  }
  url = parseUri(url)
  title = title.toLowerCase()
  //have to basically iteratively check if bits of the url are in domainRules
  //e.g. http://something.search.domain.com should first search for everything,
  //then search.domain.com, then domain.com
  //no point in searching for just .com
  let domain = url.host.split(".") //was an implicit global
  for (let dot_count = 0; dot_count < domain.length; dot_count++) {
    let key = domain.slice(dot_count).join(".")
    if (domainRules.hasOwnProperty(key)) {
      //found an entry in domainRules
      //For example:
      // "engadget.com" : {
      //   "topics robots" : "science",
      //   "imac" : "computers",
      //   "__ANY" : [
      //     "technology",
      //     "shopping",
      //     "consumer-electronics"
      //   ],
      //   "review" : "reviews",
      //   "tag nintendowiiu" : "video-games"
      // },
      let category_matchers = domainRules[key]
      let decision = false
      let keys = Object.keys(category_matchers).sort()
      //iterate through all keys, __ANY comes last to see if one matches
      //bug fix: the original used `for...in` over the keys ARRAY, which
      //iterated indices "0", "1", ... instead of rule keys, so an exact
      //token match could never be found
      for (let k of keys) {
        if (k != "__ANY") {
          let tokens = k.split(" ")
          let match_count = 0
          for (let token of tokens) {
            if (title.indexOf(token) != -1) {
              match_count += 1
            }
          }
          if (match_count == tokens.length) { //all tokens of the rule occur in the title
            decision = category_matchers[k]
            if (verbose) console.log("Exact token match found")
            break
          }
        }
      }
      //check if decision was made
      if (decision == false) {
        if (category_matchers.hasOwnProperty("__ANY")) { //if not, look at __ANY
          if (verbose) console.log("No exact title token match found, so going with __ANY, which is: " + category_matchers['__ANY'])
          decision = category_matchers['__ANY']
        } else {
          return results //if there's still nothing, just return the original results from the argument
        }
      }
      //now try and rerank results based on components
      if (typeof decision === "string") { //decision could be 1 or more categories, make it consistent
        decision = [decision]
      }
      //now iterate through the decision categories and add 1 to each result
      //category that contains the stems
      for (let category of decision) {
        if (class_maps.hasOwnProperty(category)) {
          for (let i = 0; i < results.length; i++) { //was an implicit global i
            for (let stem of class_maps[category]) {
              if (results[i][0].toLowerCase().indexOf(stem) != -1) {
                results[i][1] += 1
                break
              }
            }
          }
        }
      }
      break
    }
  }
  return results
}
function augmentQueries(url, results, queryDatabase) {
//Tries to spot any search queries in the url
//Doubles the score of anything that contains a search query word
@ -959,112 +613,14 @@ function augmentQueries(url, results, queryDatabase) {
return results
}
//converts a set of wiki categories to IAB categories
//
//results: list of [wiki_category, score] pairs. Each wiki category is mapped
//via new_mappings to an IAB category, which is then located in the two-level
//`tree` (top level -> array of sub levels). Returns [top_level, "sub/levels"]
//when a single top level accounts for at least 30% of the mapped results and
//has more than one vote; otherwise returns ['uncategorized', 'dummy'].
//NOTE(review): the `level` parameter is currently unused; kept for interface
//compatibility with callers.
//Depends on file-level globals: new_mappings, tree, sortDescendingBySecondElement, verbose.
function convertWikiToIAB(results, level = "top") {
  let counts = {}
  let mappings = {}
  for (let result of results) { //get frequencies per top level
    let wiki_cat_name = result[0].toLowerCase()
    let iab_mapping = new_mappings[wiki_cat_name]
    if (verbose) console.log('checking wiki: ' + wiki_cat_name + ' which has IAB mapping: ' + iab_mapping)
    let top_level = 0 //was an implicit global
    let sub_level = 0 //was an implicit global
    //is the IAB mapping already top level?
    if (tree.hasOwnProperty(iab_mapping)) {
      top_level = iab_mapping
      sub_level = "general"
    } else {
      //otherwise search the sub-level arrays for it
      for (let tlcat of Object.keys(tree)) {
        if (tree[tlcat].indexOf(iab_mapping) != -1) {
          top_level = tlcat
          sub_level = iab_mapping
        }
      }
    }
    if (verbose) console.log('TL: ' + top_level + " SL: " + sub_level)
    if (top_level != 0) { //only count results that actually mapped somewhere
      if (mappings.hasOwnProperty(top_level) == false) {
        mappings[top_level] = {}
      }
      if (mappings[top_level].hasOwnProperty(sub_level) == false) {
        mappings[top_level][sub_level] = 0
      }
      mappings[top_level][sub_level] += 1
      if (counts.hasOwnProperty(top_level) == false) {
        counts[top_level] = 1
      } else {
        counts[top_level] += 1
      }
    }
  }
  if (verbose) console.log('counts: ' + JSON.stringify(counts))
  if (verbose) console.log('mapping counts: ' + JSON.stringify(mappings))
  //if there's nothing
  if (Object.keys(counts).length == 0) {
    return ['uncategorized', 'dummy']
  }
  //get top item
  let counts_list = [] //was an implicit global
  let total = 0 //was an implicit global
  for (let key in counts) {
    counts_list.push([key, counts[key]]);
    total += counts[key]
  } //convert to list
  counts_list.sort(sortDescendingBySecondElement)
  let top = counts_list[0] //was an implicit global
  //check if the match is strong enough
  if (top[1] >= 0.3 * total) { //at least 30% of the matches
    if (top[1] > 1) { //as long as its not just 3 all with a score of 1
      let to_return = [top[0]] //name of the top level
      let subs_list = [] //convert to list
      for (let key in mappings[top[0]]) {
        subs_list.push([key, mappings[top[0]][key]])
      }
      subs_list.sort(sortDescendingBySecondElement)
      let sub_list_names = [] //i wish there were list comprehensions...
      for (let x of subs_list) {
        sub_list_names.push(x[0])
      }
      //concatenate the different sub level cats
      let sub_levels = sub_list_names.join("/")
      to_return.push(sub_levels)
      return to_return
    } else {
      return ['uncategorized', 'dummy']
    }
  } else {
    return ['uncategorized', 'dummy']
  }
}
// Auxiliary functions, matchers, options etc
const {data} = require("sdk/self"); //not quite sure why this is necessary
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("domainRules.json"));
scriptLoader.loadSubScript(data.url("new_mappings.json"));
scriptLoader.loadSubScript(data.url("mozcat_heirarchy.json"));
scriptLoader.loadSubScript(data.url("mozcat_words.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));
let payload = JSON.parse(data.load("payload.json"));
function getDomain(url) {
//returns the (sub)domain of a url
@ -1172,49 +728,6 @@ function sortDescendingByElementLength(first, second) {
return second.length - first.length
}
//Classification persistence on disc
//creates an id-iab mapping for brevity
//saves that, and a mapping of visit id to classification id
//
//visit_id_to_iab_lower: object mapping visit ids to lowercase IAB category
//names. Writes classifications.json (atomically) into the profile directory
//and returns the OS.File.writeAtomic promise so callers can await completion.
function saveClassifications(visit_id_to_iab_lower) {
  //create tree mapping using mozcat heirarchy
  let iab_ids = {} //was an implicit global
  let count = 0 //was an implicit global
  for (let top_level in tree) {
    iab_ids[top_level] = count
    count += 1
    //bug fix: tree values are arrays of subcategory names (see the
    //tree['world'].indexOf usage elsewhere in this file); the original
    //`for...in` iterated array INDICES so iab_ids was keyed by "0","1",...
    for (let subcat of tree[top_level]) {
      iab_ids[subcat] = count
      count += 1
    }
  }
  //map classifications
  let classifications = {} //was an implicit global
  for (let visit_id in visit_id_to_iab_lower) {
    let iab = visit_id_to_iab_lower[visit_id]
    classifications[visit_id] = iab_ids[iab]
  }
  //now put everything together
  let everything = {
    'mapping': iab_ids,
    'classifications': classifications
  }
  //now save
  //bug fix: encode() must be given a string; encoding the object directly
  //would have written "[object Object]" instead of JSON
  let encoder = new TextEncoder();
  let array = encoder.encode(JSON.stringify(everything));
  return OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "classifications.json"), array, {
    tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "classifications.json.tmp")
  });
}
function loadClassifications() {
//returns an id to iab mapping
//loads meta information into an object with timestamp and id