Merge pull request #22 from mozilla/lwca-cleanup
Closes Bug 1110234 - Remove unused pieces of lwca_refined and move classify() call to the interest worker.
This commit is contained in:
Commit
f0c8a015e2
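For orientation: after this change the LWCA classification happens inside interestsWorker.js, so the per-namespace results array returned by the interests worker now leads with an "lwca" entry next to the existing rule/keyword/combined entries. A minimal sketch of that shape, taken from the updated urlClassifier test at the bottom of this diff (the category values are just that test's expectation, not a guaranteed classification):

// Approximate contents of results["58-cat"].results after this change (per the updated test):
let results = [
  {type: "lwca",     interests: ["uncategorized"], subcat: "dummy"},
  {type: "rules",    interests: ["cars"]},
  {type: "keywords", interests: []},
  {type: "combined", interests: ["cars"]}
];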
@@ -6,6 +6,7 @@

importScripts("tokenizerFactory.js");
importScripts("naiveBayesClassifier.js");
importScripts("lwca_refined.js");

function InterestsWorkerError(message) {
this.name = "InterestsWorkerError";
@@ -23,6 +24,7 @@ let gNamespace = null;
let gRegionCode = null;
let gTokenizer = null;
let gClassifier = null;
let gLWCAClassifier = null;
let gInterestsData = null;

// XXX The original splitter doesn't apply to chinese:
@@ -31,6 +33,8 @@ const kSplitter = /[\s-]+/;

// bootstrap the worker with data and models
function bootstrap(aMessageData) {
gLWCAClassifier = new LWCAClassifier(aMessageData);

// expects : {interestsData, interestsDataType, interestsClassifierModel, interestsUrlStopwords, workerRegionCode}
gRegionCode = aMessageData.workerRegionCode;

@@ -162,6 +166,19 @@ function textClassify({url, title}) {
return [];
}

function lwcaClassify({url, title}) {
try {
if (url && title && gNamespace == "58-cat") {
let classification = gLWCAClassifier.classify(url, title);
let subcat = classification[1].split("/")[0];
return {"category": [classification[0]], "subcat": subcat};
}
} catch (ex) {
console.log(ex);
}
return [];
}

// Figure out which interests are associated to the document
function getInterestsForDocument(aMessageData) {

@@ -191,6 +208,11 @@ function getInterestsForDocument(aMessageData) {
let results = [];
let combinedInterests = [];
try {
interests = lwcaClassify(aMessageData);
if (Object.keys(interests).length > 0) {
results.push({type: "lwca", interests: interests.category, subcat: interests.subcat});
}

interests = ruleClassify(aMessageData);
results.push({type: "rules", interests: dedupeInterests(interests)});

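The new worker-side lwcaClassify() only wraps the classifier output into the object that getInterestsForDocument() pushes into results. A small illustration of that wrapping, assuming the classifier returns a [top_level, "sub1/sub2"] pair as lwca_refined.js below does (the category names here are made up for the example):

// Hypothetical classifier output, e.g. from gLWCAClassifier.classify(url, title):
let classification = ["sports", "golf/tennis"];
let subcat = classification[1].split("/")[0];                       // "golf"
let payload = {"category": [classification[0]], "subcat": subcat};  // {category: ["sports"], subcat: "golf"}
// getInterestsForDocument() then records this as
// {type: "lwca", interests: ["sports"], subcat: "golf"};
// on any failure lwcaClassify() falls back to returning [], which the caller skips.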
@@ -1,170 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This worker is responsible for any extensive processing required
 * for LWCA. This includes computations for persistentTitleChunks
 * and queryVariables.
 */
function parseUri(str) {
  // parseUri 1.2.2
  // (c) Steven Levithan <stevenlevithan.com>
  // MIT License
  // http://blog.stevenlevithan.com/archives/parseuri
  var o = parseUri.options,
    m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
    uri = {},
    i = 14;

  while (i--) uri[o.key[i]] = m[i] || "";

  uri[o.q.name] = {};
  uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
    if ($1) uri[o.q.name][$1] = $2;
  });

  return uri;
};

parseUri.options = {
  strictMode: false,
  key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
  q: {
    name: "queryKey",
    parser: /(?:^|&)([^&=]*)=?([^&]*)/g
  },
  parser: {
    strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
    loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
  }
};

function processHistoryEntry({visit, timestamp, qv}) {
  let domain_titles = {};
  let spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
  let url = parseUri(visit[0])
  let domain = url.host

  //scan components
  for (let var_name in url.queryKey) {
    if (spaceFinder.test(url.queryKey[var_name])) {
      //Note: the following spaghetti is why you use a decent language like python
      //with sets/defaultdicts
      if (qv.hasOwnProperty(domain) == false) {
        qv[domain] = {}
      }
      if (qv[domain].hasOwnProperty(var_name) == false) {
        qv[domain][var_name] = 0
      }
      qv[domain][var_name] += 1
    }
  }

  //sort title
  if (domain_titles.hasOwnProperty(domain) == false) {
    domain_titles[domain] = []
  }

  if (visit[1] != null) {
    domain_titles[domain].push(visit[1])
  }
  if (visit[2] > timestamp) {
    timestamp = visit[2] //timestamp is now last item loaded
  }
  self.postMessage({
    "message": "visitProcessComplete",
    "qv": qv,
    "domain_titles": domain_titles,
    "timestamp": timestamp,
    "totalEntries": visit[3]
  });
}

function longestCommonNgramSuffix(s1, s2) {
  //Does what it says on the tin
  s1 = s1.split(" ")
  s2 = s2.split(" ")
  let min_len = s1.length < s2.length ? s1.length : s2.length

  let result = false
  for (let a = 1; a < min_len + 1; a++) {
    if (s1[s1.length - a] != s2[s2.length - a]) {
      result = s1.slice(s1.length - a + 1)
      break
    }
  }

  if (result == false) {
    return false
  } else if (result == []) {
    return false
  } else {
    return result.join(" ")
  }
}

function sortDescendingByElementLength(first, second) {
  //sorting function to sort a list of strings
  return second.length - first.length
}

function computePTC({domain_titles}) {
  let ptc = {};
  let titleCount = 1;
  //now for processing
  for (let domain in domain_titles) {
    let suffixes = {}
    let titles = domain_titles[domain]
    for (let x = 0; x < titles.length; x++) {
      for (let y = x + 1; y < titles.length; y++) {
        if (titles[x] != titles[y]) {
          let lcns = longestCommonNgramSuffix(titles[x], titles[y])
          if (lcns != false) {
            if (suffixes.hasOwnProperty(lcns) == false) {
              suffixes[lcns] = 0
            }
            suffixes[lcns] += 1
          }
        }
      }
      self.postMessage({
        "message": "titleAnalyzed",
        "domainCount": titleCount
      });
      titleCount++;
    }
    //eliminate those that only appear once
    let to_add = [];
    for (let suffix in suffixes) {
      let count = suffixes[suffix]
      if (count > 1) {
        to_add.push(suffix)
      }
    }
    //to_add must be sorted in descending order of length
    //as largest matches should be eliminated first
    to_add = to_add.sort(sortDescendingByElementLength)
    ptc[domain] = to_add
  }

  //now remove anything empty
  let to_delete = []
  for (let x in ptc) {
    if (ptc[x].length == 0) {
      to_delete.push(x)
    }
  }
  for (let x of to_delete) {
    delete ptc[x]
  }

  self.postMessage({
    "message": "computedPTC",
    "ptc": ptc
  });
}

self.onmessage = function({data}) {
  self[data.command](data.payload);
};

@@ -0,0 +1,192 @@
//LWCA refined
//2014-09-08 mruttley
//Refined version of LWCA algorithm/process

//How to use? Simply:
// > let lwca = new LWCAClassifier()
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
// >>> ['computers', 0.75]

let verbose = false

function LWCAClassifier({domain_rules, host_rules, path_rules, words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
  // Main handler class
  this.domain_rules = domain_rules;
  this.host_rules = host_rules;
  this.path_rules = path_rules;
  this.words_tree = words_tree;
  this.ignore_words = ignore_words;
  this.ignore_domains = ignore_domains;
  this.ignore_exts = ignore_exts;
  this.bad_domain_specific = bad_domain_specific;

  //Initialize various processors
  if (verbose) console.log("Initializing...")

  //build vk-tree
  vk_tree = {}
  for (let top_level of Object.keys(this.words_tree)) {
    for (let sub_level of Object.keys(this.words_tree[top_level])) {
      for (let kw of this.words_tree[top_level][sub_level]) {
        vk_tree[kw] = [top_level, sub_level]
      }
    }
  }

  //build bad_domains, bad_ext, bad_chunk
  let bad_domains = {}
  let bad_exts = {}
  let bad_chunks = {}

  for (let domain_name of Object.keys(this.bad_domain_specific)){
    let domain_name_chunks = domain_name.split(".")
    bad_domains[domain_name_chunks[0]] = 1
    bad_exts[domain_name_chunks[1]] = 1
    for (let chunk of this.bad_domain_specific[domain_name]) {
      bad_chunks[chunk] = 1
    }
  }

  //New classifier
  this.classify = function(url, title) {

    if (verbose) console.log(url)
    url = url.toLowerCase()

    //check domains, hosts and paths for exact matches
    //first check domain
    domain = url.split("://")[1].split("/")[0]
    domain_chunks = domain.split('.')
    rule_mapping = false
    for (let i in domain_chunks) {
      fragment = domain_chunks.slice(i).join(".")
      if (this.host_rules.hasOwnProperty(fragment)) {
        rule_mapping = this.host_rules[fragment]; break
      }
      if (this.domain_rules.hasOwnProperty(fragment)) {
        rule_mapping = this.domain_rules[fragment]; break
      }
    }

    domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)

    // http://www2.palmbeachpost.com/classifieds/catpage5.html
    // /classifieds/catpage5.html
    // [, classifieds, catpage5.html]
    // [classifieds, catpage5.html]

    for (let i in domain_and_path) {
      path_fragment = domain_and_path.slice(i).join('/')
      for (let j in domain_chunks) {
        domain_fragment = domain_chunks.slice(j).join('.')
        full_fragment = domain_fragment + "/" + path_fragment
        if (this.path_rules.hasOwnProperty(full_fragment)) {
          rule_mapping = this.path_rules[full_fragment]; break
        }
      }
    }

    if (rule_mapping != false) {
      //is it top level already?
      if (this.words_tree.hasOwnProperty(rule_mapping)) {
        if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
        return [rule_mapping, "general"]
      }else{
        if (vk_tree.hasOwnProperty(rule_mapping)) {
          vk_tree_mapping = vk_tree[rule_mapping]
          if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
          return vk_tree_mapping
          //return [vk_tree_mapping[0], rule_mapping]
        }
      }
    }

    //tokenize the url
    url = url.match(wordFinder)

    if (verbose) console.log(url)

    bad_domain = 0
    bad_ext = 0
    bad_chunk = 0
    ignore_domain = 0
    ignore_ext = 0

    scores = {} //top level & sub_level counts

    for (let chunk of url) {

      if (this.ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
      if (this.ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
      if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
      if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
      if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
      if (this.ignore_words.hasOwnProperty(chunk)) continue

      if (vk_tree.hasOwnProperty(chunk)) {
        mapping = vk_tree[chunk]
        if (scores.hasOwnProperty(mapping[0]) == false) {
          scores[mapping[0]] = {}
        }
        if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
          scores[mapping[0]][mapping[1]] = 0
        }
        scores[mapping[0]][mapping[1]] += 1
      }
    }

    if (verbose) console.log(scores)

    if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
    if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
    if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score

    //convert to list of top levels
    sl = []
    sub_level_strings = {}
    for (let top_level of Object.keys(scores)) {
      sub_level_count = 0
      subcats = []
      for (let sub_level of Object.keys(scores[top_level])) {
        subcats.push(sub_level)
        sub_level_count += scores[top_level][sub_level]
      }
      sl.push([top_level, sub_level_count])
      sub_level_strings[top_level] = subcats.join("/")
    }
    sl = sl.sort(sortDescendingBySecondElement)

    //if just one item then return that
    if (sl.length == 1) {
      if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
      return [sl[0][0], sub_level_strings[sl[0][0]]]
    }

    //if the top 2 are the same, return uncategorized
    if (sl[0][1] == sl[1][1]) {
      return ['uncategorized', 'dummy']
    }else{ //else if there is a top item, return it
      if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
      return [sl[0][0], sub_level_strings[sl[0][0]]]
    }

  }
}

let wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences

function sortDescendingBySecondElement(first, second) {
  //function to be used in sort(some_function)
  //does what it says on the tin
  first = first[1]
  second = second[1]
  if (first == second) {
    return 0
  } else {
    if (first > second) {
      return false
    } else {
      return true
    }
  }
}

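The constructor inverts words_tree (top level -> sub level -> keyword list) into vk_tree (keyword -> [top level, sub level]), which is what classify() scores URL tokens against. A toy illustration of that inversion, with made-up category names and keywords (the real tables come from words.js):

// Hypothetical input shape; the actual data is loaded from words.js.
let words_tree = {
  "sports":    {"golf": ["fairway", "putter"], "tennis": ["racquet"]},
  "computers": {"hardware": ["gpu"]}
};

// Same inversion as in the LWCAClassifier constructor above:
let vk_tree = {};
for (let top_level of Object.keys(words_tree)) {
  for (let sub_level of Object.keys(words_tree[top_level])) {
    for (let kw of words_tree[top_level][sub_level]) {
      vk_tree[kw] = [top_level, sub_level];
    }
  }
}
// vk_tree => {fairway: ["sports","golf"], putter: ["sports","golf"],
//             racquet: ["sports","tennis"], gpu: ["computers","hardware"]}

One caveat worth noting: sortDescendingBySecondElement() returns booleans, but Array.prototype.sort() expects a negative/zero/positive number, so the descending order is not strictly guaranteed across engines. A conventional numeric comparator would look like the following sketch (a suggested fix, not part of this commit):

function sortDescendingBySecondElementFixed(first, second) {
  return second[1] - first[1]; // larger counts sort first
}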
@@ -21,7 +21,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {InterestDashboardDataProcessorBolt} = require("streams/interestDashboardDataProcessorBolt");
const {DateUtils} = require("DateUtils");
const {LWCAClassifier} = require("lwca_refined");
const {UrlClassifier} = require("UrlClassifier");
const {computeInterestsFromHosts} = require("Utils");

@@ -32,13 +31,6 @@ const {data} = require("sdk/self");
const kDefaultResubmitHistoryDays = 30;

function Controller(options={}) {
let self = this;
let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
Task.spawn(function*() {
self._lwcaClassifier = new LWCAClassifier(worker);
yield self._lwcaClassifier.init();
});

let historyDaysToResubmit = options.historyDays || kDefaultResubmitHistoryDays;
this._workerFactory = new WorkerFactory();
this._historyDaysToResubmit = historyDaysToResubmit;
@@ -141,7 +133,7 @@ Controller.prototype = {
this._processingHistoryPromise = Task.spawn(function() {
let startDay = DateUtils.today() - daysAgo;
let lastTimeStamp = this.storage.lastTimeStamp || 0;
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp, this._lwcaClassifier);
this._currentReader = new HistoryReader(this._workers, this._streamObjects, lastTimeStamp);
yield this._currentReader.resubmitHistory({startDay: startDay});
this.storage.lastTimeStamp = this._currentReader.getLastTimeStamp();
if (flush) {

@@ -22,8 +22,7 @@ Cu.import("resource://gre/modules/NetUtil.jsm");

const MS_PER_DAY = 86400000;

function HistoryReader(workers, streamObjects, lastTimeStamp = 0, lwcaClassifier, storageBackend) {
this._lwcaClassifier = lwcaClassifier;
function HistoryReader(workers, streamObjects, lastTimeStamp = 0, storageBackend) {
this._workers = workers;
this._ResubmitRecentHistoryLastTimeStamp = lastTimeStamp;
this._streamObjects = streamObjects;
@@ -112,21 +111,10 @@ HistoryReader.prototype = {

_handleInterestsResults: function I__handleInterestsResults(aData) {
if (aData.messageId == "resubmit") {
// LWCA classification.
try {
if (aData.url && aData.title && aData.namespace == "58-cat") {
if (!shouldSkip(aData.url)) {
let classification = this._lwcaClassifier.classify(aData.url, aData.title);
let subcat = classification[1].split("/")[0];
aData.results.push({"type": "lwca", "interests": [classification[0]], "subcat": subcat});
}
}
} catch (ex) {
console.log(ex);
}

// save classification results in _interestsWorkersData array untill all have responded
this._interestsWorkersData.push(aData);
if (!shouldSkip(aData.url)) {
this._interestsWorkersData.push(aData);
}
// decrement url count and check if we have seen them all
this._ResubmitRecentHistoryUrlCount.interests--;
if (this._ResubmitRecentHistoryUrlCount.interests == 0) {

@@ -91,6 +91,8 @@ WorkerFactory.prototype = {
scriptLoader.loadSubScript(data.url("models/" + this._localeCode + "/" + modelName + "/textModel.json"));
// use the same url stop words
scriptLoader.loadSubScript(data.url("models/urlStopwords.json"));
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.postMessage({
@@ -101,7 +103,15 @@ WorkerFactory.prototype = {
interestsDataType: "dfr",
interestsData: interestsData,
interestsClassifierModel: interestsClassifierModel,
interestsUrlStopwords: interestsUrlStopwords
interestsUrlStopwords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});

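These eight extra fields, loaded from words.js and rules.js on the main thread, are consumed by the bootstrap() hunk at the top of this diff, which hands the whole message to the classifier constructor. A condensed sketch of that contract between the two sides (only the LWCA-related fields are shown; the other bootstrap fields are elided with "..."):

// main thread (WorkerFactory): the words.js / rules.js globals are forwarded as-is
worker.postMessage({ /* ...existing bootstrap fields... */
  domain_rules, host_rules, path_rules, words_tree,
  ignore_words, ignore_domains, ignore_exts, bad_domain_specific });

// worker (interestsWorker.js): the destructuring LWCAClassifier constructor in
// lwca_refined.js picks those fields straight out of the message object
function bootstrap(aMessageData) {
  gLWCAClassifier = new LWCAClassifier(aMessageData);
  // ...rest of bootstrap unchanged...
}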
@ -1,767 +0,0 @@
|
|||
//LWCA refined
|
||||
//2014-09-08 mruttley
|
||||
//Refined version of LWCA algorithm/process
|
||||
|
||||
//Three stages:
|
||||
// - Pre-processing
|
||||
// - Classification
|
||||
// - Post-processing
|
||||
|
||||
//How to use? Simply:
|
||||
// > var lwca = new LWCAClassifier()
|
||||
// > lwca.classify("http://www.bbc.com/some_very_interesting_article", "Apple reveals shiny new gadget")
|
||||
// >>> ['computers', 0.75]
|
||||
|
||||
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
|
||||
Cu.import("resource://gre/modules/Task.jsm");
|
||||
|
||||
var preprocessingProgressPercent = 0 //global variable to indicate how far in the pre processing the user is
|
||||
var verbose = false
|
||||
|
||||
function LWCAClassifier(worker) {
|
||||
// Main handler class
|
||||
|
||||
//Initialize various processors
|
||||
if (verbose) console.log("Initializing...")
|
||||
|
||||
let cdb = new ComponentDatabase(worker); //objects that help match title components and query variables
|
||||
//it also checks if it needs to be updated etc
|
||||
|
||||
//build vk-tree
|
||||
vk_tree = {}
|
||||
for (let top_level of Object.keys(words_tree)) {
|
||||
for (let sub_level of Object.keys(words_tree[top_level])) {
|
||||
for (let kw of words_tree[top_level][sub_level]) {
|
||||
vk_tree[kw] = [top_level, sub_level]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//build bad_domains, bad_ext, bad_chunk
|
||||
bad_domains = {}
|
||||
bad_exts = {}
|
||||
bad_chunks = {}
|
||||
|
||||
for (let domain_name of Object.keys(bad_domain_specific)){
|
||||
domain_name_chunks = domain_name.split(".")
|
||||
bad_domains[domain_name_chunks[0]] = 1
|
||||
bad_exts[domain_name_chunks[1]] = 1
|
||||
for (let chunk of bad_domain_specific[domain_name]) {
|
||||
bad_chunks[chunk] = 1
|
||||
}
|
||||
}
|
||||
|
||||
//New classifier
|
||||
this.classify = function(url, title) {
|
||||
|
||||
if (verbose) console.log(url)
|
||||
url = url.toLowerCase()
|
||||
|
||||
//check domains, hosts and paths for exact matches
|
||||
//first check domain
|
||||
domain = url.split("://")[1].split("/")[0]
|
||||
domain_chunks = domain.split('.')
|
||||
rule_mapping = false
|
||||
for (let i in domain_chunks) {
|
||||
fragment = domain_chunks.slice(i).join(".")
|
||||
if (host_rules.hasOwnProperty(fragment)) {
|
||||
rule_mapping = host_rules[fragment]; break
|
||||
}
|
||||
if (domain_rules.hasOwnProperty(fragment)) {
|
||||
rule_mapping = domain_rules[fragment]; break
|
||||
}
|
||||
}
|
||||
|
||||
domain_and_path = url.split(domain)[1].split('?')[0].split('/').slice(1)
|
||||
|
||||
// http://www2.palmbeachpost.com/classifieds/catpage5.html
|
||||
// /classifieds/catpage5.html
|
||||
// [, classifieds, catpage5.html]
|
||||
// [classifieds, catpage5.html]
|
||||
|
||||
for (let i in domain_and_path) {
|
||||
path_fragment = domain_and_path.slice(i).join('/')
|
||||
for (let j in domain_chunks) {
|
||||
domain_fragment = domain_chunks.slice(j).join('.')
|
||||
full_fragment = domain_fragment + "/" + path_fragment
|
||||
if (path_rules.hasOwnProperty(full_fragment)) {
|
||||
rule_mapping = path_rules[full_fragment]; break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rule_mapping != false) {
|
||||
//is it top level already?
|
||||
if (words_tree.hasOwnProperty(rule_mapping)) {
|
||||
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [rule_mapping, "general"])
|
||||
return [rule_mapping, "general"]
|
||||
}else{
|
||||
if (vk_tree.hasOwnProperty(rule_mapping)) {
|
||||
vk_tree_mapping = vk_tree[rule_mapping]
|
||||
if(verbose) console.log('Used maxchunk to classify ' + url + " as " + [top_level, rule_mapping])
|
||||
return vk_tree_mapping
|
||||
//return [vk_tree_mapping[0], rule_mapping]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//tokenize the url
|
||||
url = url.match(wordFinder)
|
||||
|
||||
if (verbose) console.log(url)
|
||||
|
||||
bad_domain = 0
|
||||
bad_ext = 0
|
||||
bad_chunk = 0
|
||||
ignore_domain = 0
|
||||
ignore_ext = 0
|
||||
|
||||
scores = {} //top level & sub_level counts
|
||||
|
||||
for (let chunk of url) {
|
||||
|
||||
if (ignore_domains.hasOwnProperty(chunk)) ignore_domain += 1
|
||||
if (ignore_exts.hasOwnProperty(chunk)) ignore_ext += 1
|
||||
if (bad_domains.hasOwnProperty(chunk)) bad_domain += 1
|
||||
if (bad_exts.hasOwnProperty(chunk)) bad_ext += 1
|
||||
if (bad_chunks.hasOwnProperty(chunk)) bad_chunk += 1
|
||||
if (ignore_words.hasOwnProperty(chunk)) continue
|
||||
|
||||
if (vk_tree.hasOwnProperty(chunk)) {
|
||||
mapping = vk_tree[chunk]
|
||||
if (scores.hasOwnProperty(mapping[0]) == false) {
|
||||
scores[mapping[0]] = {}
|
||||
}
|
||||
if (scores[mapping[0]].hasOwnProperty(mapping[1]) == false) {
|
||||
scores[mapping[0]][mapping[1]] = 0
|
||||
}
|
||||
scores[mapping[0]][mapping[1]] += 1
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose) console.log(scores)
|
||||
|
||||
if (ignore_domain + ignore_ext >= 2) return ['uncategorized', 'dummy']
|
||||
if (bad_domain + bad_chunk + bad_ext >= 3) return ['uncategorized', 'dummy'] //check that it's not a bad combination
|
||||
if (Object.keys(scores).length == 0) return ['uncategorized', 'dummy'] //or there's no score
|
||||
|
||||
//convert to list of top levels
|
||||
sl = []
|
||||
sub_level_strings = {}
|
||||
for (let top_level of Object.keys(scores)) {
|
||||
sub_level_count = 0
|
||||
subcats = []
|
||||
for (let sub_level of Object.keys(scores[top_level])) {
|
||||
subcats.push(sub_level)
|
||||
sub_level_count += scores[top_level][sub_level]
|
||||
}
|
||||
sl.push([top_level, sub_level_count])
|
||||
sub_level_strings[top_level] = subcats.join("/")
|
||||
}
|
||||
sl = sl.sort(sortDescendingBySecondElement)
|
||||
|
||||
//if just one item then return that
|
||||
if (sl.length == 1) {
|
||||
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
|
||||
return [sl[0][0], sub_level_strings[sl[0][0]]]
|
||||
}
|
||||
|
||||
//if the top 2 are the same, return uncategorized
|
||||
if (sl[0][1] == sl[1][1]) {
|
||||
return ['uncategorized', 'dummy']
|
||||
}else{ //else if there is a top item, return it
|
||||
if (verbose) console.log([sl[0][0], sub_level_strings[sl[0][0]]])
|
||||
return [sl[0][0], sub_level_strings[sl[0][0]]]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
this.init = function() {
|
||||
return Task.spawn(function*() {
|
||||
yield cdb.init();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
// Pre-processors
|
||||
|
||||
function spotDefinites(url, title) {
|
||||
//function to spot a definite classification
|
||||
//e.g. "real estate" is definitely real estate
|
||||
|
||||
let definites = {
|
||||
"real estate": "real estate", //TODO: moarr
|
||||
}
|
||||
|
||||
for (let definiteMatch in definites) {
|
||||
if (title.indexOf(definiteMatch) != -1) {
|
||||
return [definites[definiteMatch], 'general']
|
||||
}
|
||||
}
|
||||
|
||||
return false //false if nothing found
|
||||
}
|
||||
|
||||
function ComponentDatabase(worker, create_objects = true) {
|
||||
//creates a database of known query variables and persistent title components
|
||||
|
||||
//initialization
|
||||
this._worker = worker;
|
||||
this._worker.addEventListener("message", this, false);
|
||||
this._worker.addEventListener("error", this, false);
|
||||
|
||||
this.queryVariables = {}
|
||||
this.persistentTitleChunks = {}
|
||||
this.meta = {
|
||||
'timestamp': 0
|
||||
}
|
||||
|
||||
this.init = function() {
|
||||
return Task.spawn(function*() {
|
||||
//////// temporarily decoupled
|
||||
|
||||
////////
|
||||
//if (verbose) console.log("Began the init function in Cdb")
|
||||
//let ts =
|
||||
// yield this.find_start_and_end();
|
||||
//if (ts['start'] == 0) {
|
||||
// //nothing ever made before
|
||||
// if (verbose) console.log('Nothing found in local directory, so scanning the whole history')
|
||||
// this.scan(ts['start'], ts['end']);
|
||||
//} else {
|
||||
// //something made before, so load it
|
||||
// if (verbose) console.log('Found cdb in local directory, importing')
|
||||
// yield this.load_component_database();
|
||||
//
|
||||
// //fill in the rest
|
||||
// this.scan(ts['start'], ts['end']);
|
||||
// if (verbose) console.log('loaded existing cdb from disc')
|
||||
//}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.find_start_and_end = function() {
|
||||
return Task.spawn(function*() {
|
||||
//where to start and end the scanning (if any)
|
||||
|
||||
//mostly a copy of get_history
|
||||
let options = historyService.getNewQueryOptions(); //make a blank query
|
||||
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
|
||||
let query = historyService.getNewQuery();
|
||||
let result = historyService.executeQuery(query, options);
|
||||
let cont = result.root;
|
||||
cont.containerOpen = true;
|
||||
let latest_timestamp = cont.getChild(0).time; //this is the last url that the user visited, which is the 'end'
|
||||
cont.containerOpen = false;
|
||||
|
||||
|
||||
let lm = yield this.load_meta(); //find last url visited's id
|
||||
if (lm == false) {
|
||||
if (verbose) console.log('Could not find any meta information. Everything needs to be scanned. Please create a component database first')
|
||||
return {
|
||||
'start': 0,
|
||||
'end': latest_timestamp
|
||||
}
|
||||
} else {
|
||||
if (verbose) console.log('Found meta information on disc (ts: ' + this.meta['timestamp'] + ")")
|
||||
return {
|
||||
'start': this.meta['timestamp'],
|
||||
'end': latest_timestamp
|
||||
} //start and ending timestamps of whatever needs to be updated
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this._handleVisitProcessComplete = function(msgData) {
|
||||
this._qv = msgData.qv;
|
||||
for (let domain in msgData.domain_titles) {
|
||||
//sort title
|
||||
if (this._domain_titles.hasOwnProperty(domain) == false) {
|
||||
this._domain_titles[domain] = []
|
||||
}
|
||||
this._totalTitles += msgData.domain_titles[domain].length;
|
||||
this._domain_titles[domain] = this._domain_titles[domain].concat(msgData.domain_titles[domain]);
|
||||
}
|
||||
|
||||
this.meta['timestamp'] = msgData.timestamp;
|
||||
this._processNextHistoryEvent();
|
||||
if (this._historyProgressCallback) {
|
||||
this._historyProgressCallback("historyProgress", this._history_total, msgData.totalEntries);
|
||||
}
|
||||
};
|
||||
|
||||
this._handleAnalyzedTitle = function(msgData) {
|
||||
if (this._titleProgressCallback) {
|
||||
this._titleProgressCallback("titleProgress", msgData.domainCount, this._totalTitles);
|
||||
}
|
||||
}
|
||||
|
||||
this._handleComputedPTC = function(msgData) {
|
||||
let ptc = msgData.ptc;
|
||||
|
||||
if (this._start != 0) {
|
||||
//merge the new stuff with the old stuff
|
||||
//first query variables
|
||||
for (let domain in this._qv) {
|
||||
if (this.queryVariables.hasOwnProperty(domain) == false) {
|
||||
this.queryVariables[domain] = {}
|
||||
}
|
||||
for (let v in this._qv[domain]) {
|
||||
if (this.queryVariables[domain].hasOwnProperty(v) == false) {
|
||||
this.queryVariables[domain][v] = 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//then title components
|
||||
for (let domain in ptc) {
|
||||
if (this.persistentTitleChunks.hasOwnProperty(domain) == false) {
|
||||
this.persistentTitleChunks[domain] = {}
|
||||
}
|
||||
for (let v of ptc[domain]) {
|
||||
if (this.persistentTitleChunks[domain].hasOwnProperty(v) == false) {
|
||||
this.persistentTitleChunks[domain][v] = 1
|
||||
}
|
||||
}
|
||||
}
|
||||
if (verbose) console.log('loaded existing cdb from disc')
|
||||
} else {
|
||||
this.queryVariables = this._qv;
|
||||
this.persistentTitleChunks = ptc;
|
||||
}
|
||||
this._callback();
|
||||
this.save() //now save everything
|
||||
};
|
||||
|
||||
this.handleEvent = function(aEvent) {
|
||||
let eventType = aEvent.type;
|
||||
if (eventType == "message") {
|
||||
let msgData = aEvent.data;
|
||||
switch (msgData.message) {
|
||||
case "visitProcessComplete":
|
||||
this._handleVisitProcessComplete(msgData);
|
||||
break;
|
||||
case "computedPTC":
|
||||
this._handleComputedPTC(msgData);
|
||||
break;
|
||||
case "titleAnalyzed":
|
||||
this._handleAnalyzedTitle(msgData);
|
||||
break;
|
||||
}
|
||||
} else if (eventType == "error") {
|
||||
//TODO:handle error
|
||||
console.log(aEvent.message);
|
||||
}
|
||||
};
|
||||
|
||||
this._processNextHistoryEvent = function() {
|
||||
try {
|
||||
let nextVisit = this._history.next();
|
||||
this._history_total += 1;
|
||||
|
||||
this._worker.postMessage({
|
||||
command: "processHistoryEntry",
|
||||
payload: {
|
||||
"visit": nextVisit,
|
||||
"timestamp": this.meta['timestamp'],
|
||||
"qv": this._qv
|
||||
}
|
||||
});
|
||||
} catch (ex if ex instanceof StopIteration) {
|
||||
if (verbose) console.log("Total history items loaded: " + this._history_total);
|
||||
if (verbose) console.log("Finding common suffixes in " + Object.keys(this._domain_titles).length + " domains ");
|
||||
|
||||
this._worker.postMessage({
|
||||
command: "computePTC",
|
||||
payload: {
|
||||
"domain_titles": this._domain_titles,
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
this.scan = function(start, end) {
|
||||
this._history = getHistory(start, end);
|
||||
this._history_total = 0;
|
||||
this._start = start;
|
||||
this._end = end;
|
||||
this._qv = {}; //query variables
|
||||
this._ptc = {}; //persistent title components
|
||||
this._domain_titles = {};
|
||||
this._totalTitles = 0;
|
||||
this._processNextHistoryEvent(start, end);
|
||||
}
|
||||
|
||||
this.load_meta = function() {
|
||||
return Task.spawn(function*() {
|
||||
if (verbose) console.log("load_meta function called")
|
||||
//load meta
|
||||
let decoder = new TextDecoder();
|
||||
|
||||
/////////DEBUGGING
|
||||
let meta_location = OS.Path.join(OS.Constants.Path.profileDir, "meta.json");
|
||||
console.log("Meta should be stored at: " + meta_location)
|
||||
|
||||
let meta_exists =
|
||||
yield OS.File.exists(meta_location);
|
||||
if (meta_exists) {
|
||||
console.log("Meta file exists");
|
||||
} else {
|
||||
console.log("Meta does not exist");
|
||||
return false;
|
||||
}
|
||||
///////////////////
|
||||
|
||||
try {
|
||||
let array =
|
||||
yield OS.File.read(meta_location);
|
||||
if (verbose) console.log('onSuccess for meta loading called')
|
||||
let info = decoder.decode(array);
|
||||
let data = JSON.parse(info)
|
||||
if (verbose) console.log('meta data found was: ' + JSON.stringify(data))
|
||||
this.meta = data
|
||||
return true //loads meta information into an object with timestamp and id
|
||||
} catch (ex) {
|
||||
if (verbose) console.log("Meta was not found")
|
||||
return false //file doesn't exist
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.load_component_database = function() {
|
||||
return Task.spawn(function*() {
|
||||
//loads the component database if it exists, else returns false
|
||||
let decoder = new TextDecoder();
|
||||
try {
|
||||
let array =
|
||||
yield OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"));
|
||||
let info = decoder.decode(array);
|
||||
info = JSON.parse(info)
|
||||
this.queryVariables = info['queryVariables']
|
||||
this.persistentTitleChunks = info['persistentTitleChunks']
|
||||
return true
|
||||
} catch (ex) {
|
||||
return false //file doesn't exist
|
||||
}
|
||||
}.bind(this));
|
||||
};
|
||||
|
||||
this.save = function() {
|
||||
return Task.spawn(function*() {
|
||||
//assumes that both cdb and meta have been created
|
||||
let encoder = new TextEncoder();
|
||||
let meta_enc = encoder.encode(JSON.stringify(this.meta));
|
||||
let cdb_enc = encoder.encode(JSON.stringify({
|
||||
'queryVariables': this.queryVariables,
|
||||
'persistentTitleChunks': this.persistentTitleChunks
|
||||
}));
|
||||
//save meta
|
||||
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"), meta_enc, {
|
||||
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "meta.json.tmp")
|
||||
});
|
||||
//save component database
|
||||
yield OS.File.writeAtomic(OS.Path.join(OS.Constants.Path.profileDir, "cdb.json"), cdb_enc, {
|
||||
tmpPath: OS.Path.join(OS.Constants.Path.profileDir, "cdb.json.tmp")
|
||||
});
|
||||
}.bind(this));
|
||||
};
|
||||
}
|
||||
|
||||
function removePersistentTitleChunks(url, title, cdb) {
|
||||
//Removes common title endings such as " - Google Search" using the component database
|
||||
|
||||
let domain = getDomain(url)
|
||||
if (cdb.hasOwnProperty(domain)) {
|
||||
for (let suffix of cdb[domain]) {
|
||||
if (title.toLowerCase().endsWith(suffix.toLowerCase())) {
|
||||
//chop suffix from end
|
||||
title = title.slice(0, title.length - suffix.length)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return title
|
||||
}
|
||||
|
||||
function removeDomainNames(url, title) {
|
||||
//tries to remove the domain name (or aspects of it) from the title
|
||||
//if this reduces the title to nothing, then just leave them in
|
||||
url = parseUri(url)
|
||||
url = url.host.split(".")
|
||||
title = title.toLowerCase().match(wordFinder)
|
||||
|
||||
let new_title = []
|
||||
let removed = []
|
||||
|
||||
for (let token of title) {
|
||||
if (url.indexOf(token) == -1) {
|
||||
new_title.push(token)
|
||||
}
|
||||
}
|
||||
|
||||
if (new_title.length == 0) {
|
||||
return title.join(" ")
|
||||
} else {
|
||||
return new_title.join(" ")
|
||||
}
|
||||
}
|
||||
|
||||
// Classification
|
||||
|
||||
function cosineSimilarity(text, category_keywords, category_magnitude) {
|
||||
//calculates the cosine similarity between the two arguments
|
||||
//expects text to be an array of strings
|
||||
//expects category_keywords to be an object of string: int
|
||||
//returns a float
|
||||
|
||||
//create vector
|
||||
let vector = {} //object of word: [text count, category count]
|
||||
for (let word of text) {
|
||||
if (vector.hasOwnProperty(word) == false) {
|
||||
if (category_keywords.hasOwnProperty(word) == false) {
|
||||
vector[word] = [1, 0]
|
||||
} else {
|
||||
vector[word] = [1, category_keywords[word]]
|
||||
}
|
||||
} else {
|
||||
vector[word][0] += 1
|
||||
}
|
||||
}
|
||||
|
||||
//calculate dot product
|
||||
|
||||
let dot_product = 0
|
||||
let text_vector_magnitude = 0
|
||||
|
||||
for (let word in vector) {
|
||||
dot_product += (vector[word][0] * vector[word][1])
|
||||
text_vector_magnitude += Math.pow(vector[word][0], 2)
|
||||
}
|
||||
|
||||
let denominator = Math.sqrt(text_vector_magnitude) * category_magnitude
|
||||
|
||||
if (denominator != 0) {
|
||||
return dot_product / denominator
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
// Post processing
|
||||
|
||||
function augmentRepeatWords(results) {
|
||||
//Adds 1 to the score of any result containing a repeated word
|
||||
|
||||
wordCounts = {}
|
||||
for (i = 0; i < results.length; i++) {
|
||||
tokens = results[i][0].toLowerCase().match(wordFinder)
|
||||
for (let token of tokens) {
|
||||
if (wordCounts.hasOwnProperty(token) == false) {
|
||||
wordCounts[token] = 0
|
||||
}
|
||||
wordCounts[token] += 1
|
||||
}
|
||||
}
|
||||
|
||||
//now go through again and find the repeats
|
||||
for (i = 0; i < results.length; i++) {
|
||||
tokens = results[i][0].toLowerCase().match(wordFinder)
|
||||
for (let token of tokens) {
|
||||
if (wordCounts[token] > 1) { //must be a repeat
|
||||
results[i][1] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
function augmentQueries(url, results, queryDatabase) {
|
||||
//Tries to spot any search queries in the url
|
||||
//Doubles the score of anything that contains a search query word
|
||||
|
||||
if (verbose) console.log("URL: " + url)
|
||||
|
||||
let queries = [] //a list of strings
|
||||
url = parseUri(url) //
|
||||
|
||||
if (queryDatabase.hasOwnProperty(url.host)) { //if the domain is in the db
|
||||
if (verbose) console.log("Domain: " + url.host + " is in the database")
|
||||
if (verbose) console.log("There are " + Object.keys(url.queryKey).length + " keys in the url")
|
||||
for (let variable in url.queryKey) { //iterate through url get variables
|
||||
if (queryDatabase[url.host].hasOwnProperty(variable)) { //if in the db
|
||||
query = unescape(url.queryKey[variable]) //append to list
|
||||
queries.concat(query.match(wordFinder))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//now find any result that contains a query word
|
||||
if (queries.length > 0) {
|
||||
for (let result in results) {
|
||||
if (verbose) console.log("Iterating through results")
|
||||
for (let word of queries) {
|
||||
if (results[result][0].indexOf(word) != -1) {
|
||||
results[result][1] *= 2 //double the score
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// Auxiliary functions, matchers, options etc
|
||||
|
||||
const {data} = require("sdk/self"); //not quite sure why this is necessary
|
||||
let {TextEncoder, TextDecoder, OS} = Cu.import("resource://gre/modules/osfile.jsm", {}); //for file IO
|
||||
let historyService = Cc["@mozilla.org/browser/nav-history-service;1"].getService(Ci.nsINavHistoryService);
|
||||
let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
|
||||
scriptLoader.loadSubScript(data.url("words.js"));
|
||||
scriptLoader.loadSubScript(data.url("rules.js"));
|
||||
|
||||
function getDomain(url) {
|
||||
//returns the (sub)domain of a url
|
||||
//subdomains are treated as different entities to top level urls
|
||||
if (url.indexOf("://") != -1) {
|
||||
url = url.split("://")[1]
|
||||
if (url.indexOf("/") != -1) {
|
||||
url = url.split("/")[0]
|
||||
}
|
||||
if (url.indexOf("?") != -1) {
|
||||
url = url.split("?")[0]
|
||||
}
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
function getHistory(start, end) {
|
||||
//Generator that yields the most recent history urls one by one
|
||||
//Returned in the form [url, title, timestamp]
|
||||
|
||||
//make a blank query
|
||||
let options = historyService.getNewQueryOptions();
|
||||
options.sortingMode = Ci.nsINavHistoryQueryOptions.SORT_BY_DATE_DESCENDING;
|
||||
let query = historyService.getNewQuery();
|
||||
query.beginTime = start;
|
||||
query.endTime = end;
|
||||
let result = historyService.executeQuery(query, options);
|
||||
|
||||
//open up the results
|
||||
let cont = result.root;
|
||||
cont.containerOpen = true;
|
||||
|
||||
//yield whatever there is
|
||||
for (let i = 0; i < cont.childCount; i++) {
|
||||
let node = cont.getChild(i);
|
||||
yield [node.uri, node.title, node.time, cont.childCount];
|
||||
}
|
||||
|
||||
//close the results container
|
||||
cont.containerOpen = false;
|
||||
}
|
||||
|
||||
function parseUri(str) {
|
||||
// parseUri 1.2.2
|
||||
// (c) Steven Levithan <stevenlevithan.com>
|
||||
// MIT License
|
||||
// http://blog.stevenlevithan.com/archives/parseuri
|
||||
var o = parseUri.options,
|
||||
m = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
|
||||
uri = {},
|
||||
i = 14;
|
||||
|
||||
while (i--) uri[o.key[i]] = m[i] || "";
|
||||
|
||||
uri[o.q.name] = {};
|
||||
uri[o.key[12]].replace(o.q.parser, function($0, $1, $2) {
|
||||
if ($1) uri[o.q.name][$1] = $2;
|
||||
});
|
||||
|
||||
return uri;
|
||||
};
|
||||
|
||||
parseUri.options = {
|
||||
strictMode: false,
|
||||
key: ["source", "protocol", "authority", "userInfo", "user", "password", "host", "port", "relative", "path", "directory", "file", "query", "anchor"],
|
||||
q: {
|
||||
name: "queryKey",
|
||||
parser: /(?:^|&)([^&=]*)=?([^&]*)/g
|
||||
},
|
||||
parser: {
|
||||
strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
|
||||
loose: /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
|
||||
}
|
||||
};
|
||||
|
||||
String.prototype.endsWith = function(suffix) {
|
||||
//http://stackoverflow.com/a/2548133/849354
|
||||
return this.indexOf(suffix, this.length - suffix.length) !== -1;
|
||||
};
|
||||
|
||||
var wordFinder = RegExp("[a-z]{3,}", "g") //tokenizes english sentences
|
||||
var spaceFinder = RegExp(/.+(%20|\+|\s).+/g) //finds get variable values that have spaces in them
|
||||
//bizarrely, if spaceFinder is declared in the way wordFinder is (two args), it returns an error. Oh JS...
|
||||
|
||||
function sortDescendingBySecondElement(first, second) {
|
||||
//function to be used in sort(some_function)
|
||||
//does what it says on the tin
|
||||
first = first[1]
|
||||
second = second[1]
|
||||
if (first == second) {
|
||||
return 0
|
||||
} else {
|
||||
if (first > second) {
|
||||
return false
|
||||
} else {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function sortDescendingByElementLength(first, second) {
|
||||
//sorting function to sort a list of strings
|
||||
return second.length - first.length
|
||||
}
|
||||
|
||||
function loadClassifications() {
|
||||
//returns an id to iab mapping
|
||||
//loads meta information into an object with timestamp and id
|
||||
let decoder = new TextDecoder();
|
||||
let promise = OS.File.read(OS.Path.join(OS.Constants.Path.profileDir, "meta.json"));
|
||||
promise = promise.then(
|
||||
function onSuccess(array) {
|
||||
let info = decoder.decode(array);
|
||||
info = JSON.parse(info)
|
||||
|
||||
//now expand it
|
||||
//create an id-to-text version of the mapping
|
||||
id_to_text = {}
|
||||
for (let iab in info['mapping']) {
|
||||
id = info['mapping'][iab]
|
||||
id_to_text[id] = iab
|
||||
}
|
||||
|
||||
//need id to text version of iab
|
||||
for (let visitid in info['classifications']) {
|
||||
mapping_id = info['classifications'][visitid]
|
||||
info['classifications'][visitid] = id_to_text[mapping_id]
|
||||
}
|
||||
|
||||
return info['classifications']
|
||||
|
||||
},
|
||||
function onFailure() {
|
||||
return false //file doesn't exist
|
||||
}
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
//for the extension main.js to access
|
||||
exports.LWCAClassifier = LWCAClassifier
|
||||
exports.ComponentDatabase = ComponentDatabase
|
|
@@ -46,7 +46,9 @@ exports.testUtils = {
return false;
},

getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords, listener, regionCode}) {
getWorker : function getWorker({namespace, domainRules, textModel, urlStopWords,
listener, regionCode, domain_rules, host_rules, path_rules,
words_tree, ignore_words, ignore_domains, ignore_exts, bad_domain_specific}) {
let worker = new ChromeWorker(data.url("interests/interestsWorker.js"));
worker.addEventListener("message", listener, false);
worker.addEventListener("error", listener, false);
@@ -58,7 +60,15 @@ exports.testUtils = {
interestsDataType: "dfr",
interestsData: domainRules,
interestsClassifierModel: textModel,
interestsUrlStopwords: urlStopWords
interestsUrlStopwords: urlStopWords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
}
});
return worker;

@@ -22,7 +22,6 @@ const {DayCountRankerBolt} = require("streams/dayCountRankerBolt");
const {DailyInterestsSpout} = require("streams/dailyInterestsSpout");
const {ChartDataProcessorBolt} = require("streams/chartDataProcessorBolt");
const {getPlacesHostForURI, getBaseDomain} = require("Utils");
const {LWCAClassifier} = require("lwca_refined");
const test = require("sdk/test");
const {data} = require("sdk/self");

@@ -63,11 +62,7 @@ exports["test read all"] = function test_readAll(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -100,12 +95,8 @@ exports["test read from given timestamp"] = function test_readFromGivenTimestamp
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, (today-10)*MICROS_PER_DAY, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -143,12 +134,8 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

// only read starting from id == 10
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -167,7 +154,7 @@ exports["test chunk size 1"] = function test_ChunkSize1(assert, done) {
// now set chunksize to 1 and read from same id
storageBackend = {};
streamObjects = initStream(storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 10, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20, chunkSize: 1});

assertDeferred = oldPromise.defer();
@@ -200,11 +187,7 @@ exports["test accumulation"] = function test_Accumulation(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(), streamObjects, 0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20});

let assertDeferred = oldPromise.defer();
@@ -244,11 +227,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);

let processDeferred;

@@ -298,7 +277,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
}
});

historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects, 0, storageBackend);
let promise = historyReader.resubmitHistory({startDay: today-61});
let cycles = 0;
while (true) {
@@ -309,7 +288,7 @@ exports["test stop and restart"] = function test_StopAndRestart(assert, done) {
if (lastTimeStamp == theVeryLastTimeStamp) {
break;
}
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, lwcaClassifier, storageBackend);
historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,lastTimeStamp, storageBackend);
promise = historyReader.resubmitHistory({startDay: today-61});
cycles ++;
}
@@ -362,11 +341,7 @@ exports["test tldCounter"] = function test_TldCounter(assert, done) {
let storageBackend = {};
let streamObjects = initStream(storageBackend);

let worker = new ChromeWorker(data.url("interests/lwcaWorker.js"));
let lwcaClassifier = new LWCAClassifier(worker);
yield lwcaClassifier.init();

let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, lwcaClassifier, storageBackend);
let historyReader = new HistoryReader(gWorkerFactory.getCurrentWorkers(),streamObjects,0, storageBackend);
yield historyReader.resubmitHistory({startDay: today-20},1);
assert.deepEqual(storageBackend.tldCounter,
{"au":{"mysql.au":1,"facebook.au":1},

@@ -8,6 +8,7 @@

const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@@ -64,13 +65,26 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};


let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-Matching",
regionCode: 'zh-CN',
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -8,6 +8,7 @@

const {testUtils} = require("./helpers");
const {Cc, Ci, Cu, ChromeWorker} = require("chrome");
const {data} = require("sdk/self");
const oldPromise = require("sdk/core/promise");
Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/NetUtil.jsm");
@@ -170,12 +171,24 @@ exports["test default matcher"] = function test_default_matcher(assert, done) {
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-Matching",
listener: workerTester,
domainRules: testDomainRules,
textModel: null,
urlStopWords: ['php', 'html']
urlStopWords: ['php', 'html'],
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -54,12 +54,24 @@ exports["test edrules text"] = function test_edrules_text(assert, done) {
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-edrules-text",
listener: workerTester,
domainRules: null,
textModel: interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {
@@ -166,12 +178,24 @@ exports["test text classifier"] = function test_text_classification(assert, done
} // end of handleEvent
};

let scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"].getService(Ci.mozIJSSubScriptLoader);
scriptLoader.loadSubScript(data.url("words.js"));
scriptLoader.loadSubScript(data.url("rules.js"));

let worker = testUtils.getWorker({
namespace: "test-text-classifier",
listener: workerTester,
domainRules: null,
textModel: riggedMatchTests.interestsClassifierModel,
urlStopWords: interestsUrlStopwords,
domain_rules: domain_rules,
host_rules: host_rules,
path_rules: path_rules,
words_tree: words_tree,
ignore_words: ignore_words,
ignore_domains: ignore_domains,
ignore_exts: ignore_exts,
bad_domain_specific: bad_domain_specific
});

Task.spawn(function() {

@@ -24,7 +24,8 @@ exports["test interest classifier"] = function test_UrlClassifier(assert, done)
let results = yield urlClassifier.classifyPage("http://www.autoblog.com/","Drive honda");
assert.equal(Object.keys(results).length, workers.length);
assert.deepEqual(results["58-cat"].results,
[{"type":"rules","interests":["cars"]},
[{"type": "lwca", interests: ["uncategorized"], subcat: "dummy"},
{"type":"rules","interests":["cars"]},
{"type":"keywords","interests":[]},
{"type":"combined","interests":["cars"]}
]);