Initial commit for http request id extraction

This commit is contained in:
Christian 2014-03-22 13:22:50 -04:00
Родитель 928d7481c7
Коммит 4951899c14
3 изменённых файлов: 101 добавлений и 22 удалений

28
census/census_util.py Normal file
Просмотреть файл

@ -0,0 +1,28 @@
import difflib
import itertools
import urlparse
from tld import get_tld
# are all items the same?
def all_same(items):
    """Return True when every element of ``items`` equals the first one.

    An empty sequence yields True because no element can disagree.
    """
    for candidate in items:
        if not (candidate == items[0]):
            return False
    return True
# are all strings of the same length?
def all_same_len(items):
    """Return True when every element of ``items`` is as long as the first.

    An empty sequence yields True because no element can disagree.
    """
    for entry in items:
        if len(entry) != len(items[0]):
            return False
    return True
# Are two cookies more than 80% similar according to the Ratcliff-Obershelp metric?
def ro_similar(seq1, seq2, threshold=0.8):
    """Tell whether two sequences are similar under difflib's ratio.

    Args:
        seq1: first sequence to compare (e.g. a cookie value string).
        seq2: second sequence to compare.
        threshold: similarity ratio that must be strictly exceeded for the
            sequences to count as similar. Defaults to 0.8, the cut-off the
            heuristic has always used.

    Returns:
        True when ``SequenceMatcher.ratio()`` is strictly greater than
        ``threshold``, False otherwise.
    """
    # NOTE: SequenceMatcher is used with its default settings, so difflib's
    # automatic junk heuristic still applies to long sequences.
    return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() > threshold
# Are all cookies in a list pairwise-dissimilar (i.e. fail ro-test)
def all_dissimilar(items):
    """Return True when no two elements of ``items`` pass ``ro_similar``.

    Every unordered pair of elements is compared. With fewer than two
    elements there are no pairs, so the result is True.
    """
    # Iterate the pairs lazily: the check stops at the first similar pair
    # without first building the full list of combinations.
    return not any(
        ro_similar(first, second)
        for first, second in itertools.combinations(items, 2)
    )
# gets the domain from a url
def extract_domain(url):
    """Return the domain part of ``url``.

    The result of ``get_tld(url)`` is preferred. If that call raises, the
    network location (``netloc``) parsed from the URL is returned instead.
    """
    try:
        return get_tld(url)
    except Exception:
        # Best-effort fallback for URLs get_tld cannot handle. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        return urlparse.urlparse(url).netloc

Просмотреть файл

@ -1,28 +1,10 @@
# This module is used to extract persistent cookie IDs using the same heuristics from the PETS 2014 paper
import census_util
from collections import defaultdict
import difflib
import itertools
import sqlite3 as lite
from dateutil import parser
import datetime
# are all items the same?
def all_same(items):
    """Return True when no element of ``items`` differs from the first one.

    An empty sequence yields True because there is nothing to compare.
    """
    return not any(not (current == items[0]) for current in items)
# are all strings of the same length?
def all_same_len(items):
    """Return True when no element's length differs from the first one's.

    An empty sequence yields True because there is nothing to compare.
    """
    return not any(len(current) != len(items[0]) for current in items)
# Are two cookies more than 80% similar according to the Ratcliff-Obershelp metric?
def ro_similar(seq1, seq2, threshold=0.8):
    """Tell whether two sequences are similar under difflib's ratio.

    Args:
        seq1: first sequence to compare (e.g. a cookie value string).
        seq2: second sequence to compare.
        threshold: similarity ratio that must be strictly exceeded for the
            sequences to count as similar. Defaults to 0.8, the cut-off the
            heuristic has always used.

    Returns:
        True when ``SequenceMatcher.ratio()`` is strictly greater than
        ``threshold``, False otherwise.
    """
    matcher = difflib.SequenceMatcher(a=seq1, b=seq2)
    return matcher.ratio() > threshold
# Are all cookies in a list pairwise-dissimilar (i.e. fail ro-test)
def all_dissimilar(items):
    """Return True when no two elements of ``items`` pass ``ro_similar``.

    Every unordered pair of elements is compared. With fewer than two
    elements there are no pairs, so the result is True.
    """
    # Pairs are consumed lazily so the check can stop at the first similar
    # pair instead of building the full list of combinations up front.
    for first, second in itertools.combinations(items, 2):
        if ro_similar(first, second):
            return False
    return True
# builds a dictionary with keys = (domain, name) and values being list of cookie values
# values must be from non short-lived cookies and consistent across the crawls
@ -44,7 +26,7 @@ def extract_cookies_from_db(db_name):
# only keep cookies with values that remain constant throughout the crawl
cookie_dict = {}
for cookie in raw_cookie_dict:
if all_same(raw_cookie_dict[cookie]):
if census_util.all_same(raw_cookie_dict[cookie]):
cookie_dict[cookie] = raw_cookie_dict[cookie][0]
return cookie_dict
@ -61,8 +43,8 @@ def extract_persistent_ids(cookie_dicts):
raw_id_dict[cookie].append(cookie_dict[cookie])
for cookie in raw_id_dict:
if len(raw_id_dict[cookie]) > 1 and all_same_len(raw_id_dict[cookie]) \
and len(raw_id_dict[cookie][0]) > 5 and all_dissimilar(raw_id_dict[cookie]):
if len(raw_id_dict[cookie]) > 1 and census_util.all_same_len(raw_id_dict[cookie]) \
and len(raw_id_dict[cookie][0]) > 5 and census_util.all_dissimilar(raw_id_dict[cookie]):
pass
#print str(cookie) + "\t" + str(raw_id_dict[cookie])

Просмотреть файл

@ -0,0 +1,69 @@
import sqlite3 as lite
import urlparse
import census_util
from collections import defaultdict
def add_params(raw_params, domain, domain_dict):
    """Merge one URL's query parameters into the per-domain accumulator.

    ``raw_params`` maps a domain to a ``defaultdict(list)`` of parameter
    name -> observed values. Every value listed in ``domain_dict`` (parameter
    name -> list of values) is appended under ``domain``. The accumulator is
    modified in place and also returned.
    """
    # create the per-domain bucket the first time this domain is seen
    if domain not in raw_params:
        raw_params[domain] = defaultdict(list)
    collected = raw_params[domain]
    for name, values in domain_dict.items():
        for single_value in values:
            collected[name].append(single_value)
    return raw_params
def extract_parameters_from_db(db_name):
    """Collect URL query parameters whose value is constant within one crawl.

    Every ``url`` in the ``http_requests`` table of the SQLite database at
    ``db_name`` is parsed. Parameter values are grouped by the domain that
    ``census_util.extract_domain`` returns and by parameter name.

    Args:
        db_name: path of the crawl's SQLite database.

    Returns:
        A dict mapping domain -> {parameter name: value}. It contains only
        the parameters for which ``census_util.all_same`` holds over all
        observed values. A domain with no such parameter maps to an empty
        dict.
    """
    raw_param_dict = {}
    con = lite.connect(db_name)
    try:
        cur = con.cursor()
        for url, in cur.execute('SELECT url FROM http_requests'):
            # parse_qs yields an empty dict when there is no usable query
            # string, so one emptiness test covers every "no parameters" case.
            query_params = urlparse.parse_qs(urlparse.urlparse(url).query)
            if not query_params:
                continue
            # only resolve the domain for URLs that carry parameters
            domain = census_util.extract_domain(url)
            raw_param_dict = add_params(raw_param_dict, domain, query_params)
    finally:
        # release the database handle even if processing a URL fails
        con.close()

    # throw away parameters that do not stay the same the entire time
    param_dict = {}
    for domain, params in raw_param_dict.items():
        param_dict[domain] = {}
        for param, values in params.items():
            if census_util.all_same(values):
                param_dict[domain][param] = values[0]
    return param_dict
def extract_persistent_parameters(param_dicts):
    """Print query parameters that look like persistent identifiers.

    Args:
        param_dicts: iterable of per-crawl dicts, each mapping
            domain -> {parameter name: value}.

    A parameter is reported when all of the following hold for the values
    gathered across the crawls:
      * it was seen in more than one crawl,
      * its first value is longer than 5 characters,
      * all values have the same length (``census_util.all_same_len``),
      * all values are pairwise dissimilar (``census_util.all_dissimilar``).

    Each reported parameter is written to stdout as one tab-separated line:
    domain, parameter name, list of values. Nothing is returned.
    """
    # gather, per domain and parameter, the value seen in each crawl
    raw_param_dict = {}
    for crawl_params in param_dicts:
        for domain in crawl_params:
            if domain not in raw_param_dict:
                raw_param_dict[domain] = defaultdict(list)
            for param in crawl_params[domain]:
                raw_param_dict[domain][param].append(crawl_params[domain][param])

    # report same-length parameter values that are also sufficiently
    # dissimilar and long enough
    for domain in raw_param_dict:
        for param in raw_param_dict[domain]:
            values = raw_param_dict[domain][param]
            if len(values) > 1 and len(values[0]) > 5 \
                    and census_util.all_same_len(values) \
                    and census_util.all_dissimilar(values):
                # a single parenthesized argument prints identically under
                # the Python 2 print statement and the Python 3 function
                print(domain + "\t" + param + "\t" + str(values))
if __name__ == "__main__":
    import sys

    # Crawl databases may be passed on the command line. With no arguments
    # the original hard-coded pair of crawls is used.
    db_paths = sys.argv[1:] or [
        "/home/christian/Desktop/crawl1.sqlite",
        "/home/christian/Desktop/crawl2.sqlite",
    ]
    extract_persistent_parameters(
        [extract_parameters_from_db(path) for path in db_paths])