зеркало из https://github.com/openwpm/OpenWPM.git
Initial commit for http request id extraction
This commit is contained in:
Родитель
928d7481c7
Коммит
4951899c14
|
@ -0,0 +1,28 @@
|
|||
import difflib
|
||||
import itertools
|
||||
import urlparse
|
||||
from tld import get_tld
|
||||
|
||||
# are all items the same?
def all_same(items):
    """Return True when every element of items equals the first (vacuously True when empty)."""
    for item in items:
        if item != items[0]:
            return False
    return True
|
||||
|
||||
# are all strings of the same length?
def all_same_len(items):
    """Return True when every string in items has the same length (vacuously True when empty)."""
    lengths = [len(item) for item in items]
    return all(length == lengths[0] for length in lengths)
|
||||
|
||||
# Are two cookies more than 80% similar in accordance to Ratcliff-Obershelp metric
def ro_similar(seq1, seq2):
    """Return True when the Ratcliff-Obershelp similarity ratio of the two sequences exceeds 0.8."""
    matcher = difflib.SequenceMatcher(a=seq1, b=seq2)
    return matcher.ratio() > 0.8
|
||||
|
||||
# Are all cookies in a list pairwise-dissimilar (i.e. fail ro-test)
def all_dissimilar(items):
    """Return True when no pair of values in items is ro_similar.

    Vacuously True for fewer than two items. Iterates the combinations
    lazily instead of materializing the O(n^2) pair list up front, and
    short-circuits on the first similar pair.
    """
    return all(not ro_similar(a, b) for a, b in itertools.combinations(items, 2))
|
||||
|
||||
# gets the domain from a url
def extract_domain(url):
    """Return the registered (pay-level) domain of url via tld.get_tld.

    Falls back to the netloc reported by urlparse when get_tld raises
    (e.g. unknown TLD or unparsable URL). The original bare `except:`
    also swallowed SystemExit/KeyboardInterrupt; narrowed to Exception.
    """
    try:
        return get_tld(url)
    except Exception:
        return urlparse.urlparse(url).netloc
|
|
@ -1,28 +1,10 @@
|
|||
# This module is used to extract persistent cookie IDs using the same heuristics from the PETS 2014 paper
|
||||
|
||||
import census_util
|
||||
from collections import defaultdict
|
||||
import difflib
|
||||
import itertools
|
||||
import sqlite3 as lite
|
||||
from dateutil import parser
|
||||
import datetime
|
||||
|
||||
# are all items the same?
|
||||
def all_same(items):
|
||||
return all(x == items[0] for x in items)
|
||||
|
||||
# are all strings of the same length?
|
||||
def all_same_len(items):
|
||||
return all(len(x) == len(items[0]) for x in items)
|
||||
|
||||
# Are two cookies more than 80% similar in accordance to Ratcliff-Obershelp metric
|
||||
def ro_similar(seq1, seq2):
|
||||
return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() > 0.8
|
||||
|
||||
# Are all cookies in a list pairwise-dissimilar (i.e. fail ro-test)
|
||||
def all_dissimilar(items):
|
||||
pairs = list(itertools.combinations(items, 2))
|
||||
return all(not ro_similar(x[0], x[1]) for x in pairs)
|
||||
|
||||
# builds a dictionary with keys = (domain, name) and values being list of cookie values
|
||||
# values must be from non short-lived cookies and consistent across the crawls
|
||||
|
@ -44,7 +26,7 @@ def extract_cookies_from_db(db_name):
|
|||
# only keep cookies with values that remain constant throughout the crawl
|
||||
cookie_dict = {}
|
||||
for cookie in raw_cookie_dict:
|
||||
if all_same(raw_cookie_dict[cookie]):
|
||||
if census_util.all_same(raw_cookie_dict[cookie]):
|
||||
cookie_dict[cookie] = raw_cookie_dict[cookie][0]
|
||||
|
||||
return cookie_dict
|
||||
|
@ -61,8 +43,8 @@ def extract_persistent_ids(cookie_dicts):
|
|||
raw_id_dict[cookie].append(cookie_dict[cookie])
|
||||
|
||||
for cookie in raw_id_dict:
|
||||
if len(raw_id_dict[cookie]) > 1 and all_same_len(raw_id_dict[cookie]) \
|
||||
and len(raw_id_dict[cookie][0]) > 5 and all_dissimilar(raw_id_dict[cookie]):
|
||||
if len(raw_id_dict[cookie]) > 1 and census_util.all_same_len(raw_id_dict[cookie]) \
|
||||
and len(raw_id_dict[cookie][0]) > 5 and census_util.all_dissimilar(raw_id_dict[cookie]):
|
||||
pass
|
||||
#print str(cookie) + "\t" + str(raw_id_dict[cookie])
|
||||
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
import sqlite3 as lite
|
||||
import urlparse
|
||||
import census_util
|
||||
from collections import defaultdict
|
||||
|
||||
def add_params(raw_params, domain, domain_dict):
    """Merge domain_dict's parameter values into raw_params under domain.

    raw_params maps domain -> defaultdict(list) of param -> observed values;
    domain_dict maps param -> list of values (as produced by urlparse.parse_qs).
    Mutates raw_params in place and returns it for chaining convenience.
    """
    # create the per-domain accumulator on first sight of this domain
    if domain not in raw_params:
        raw_params[domain] = defaultdict(list)

    # extend() replaces the original value-by-value append loop
    for param in domain_dict:
        raw_params[domain][param].extend(domain_dict[param])

    return raw_params
|
||||
|
||||
|
||||
def extract_parameters_from_db(db_name):
    """Collect URL query parameters whose value is constant across a crawl DB.

    Reads every url from the http_requests table of the sqlite database at
    db_name, groups query parameter values by (domain, param), and returns a
    dict of domain -> {param: value} keeping only parameters whose observed
    value never changed during the crawl.
    """
    raw_param_dict = {}

    con = lite.connect(db_name)
    try:
        cur = con.cursor()
        for url, in cur.execute('SELECT url FROM http_requests'):
            domain = census_util.extract_domain(url)
            # urlparse(...).query is always a string (possibly empty),
            # so no None check is needed; parse_qs("") yields {}.
            query = urlparse.urlparse(url).query
            param_dict = urlparse.parse_qs(query)
            if len(param_dict) == 0:
                continue

            raw_param_dict = add_params(raw_param_dict, domain, param_dict)
    finally:
        # the original leaked the connection; always release it
        con.close()

    # throw away parameters that do not stay the same the entire time
    param_dict = {}
    for domain in raw_param_dict:
        param_dict[domain] = {}
        for param in raw_param_dict[domain]:
            if census_util.all_same(raw_param_dict[domain][param]):
                param_dict[domain][param] = raw_param_dict[domain][param][0]

    return param_dict
|
||||
|
||||
def extract_persistent_parameters(param_dicts):
    """Find query parameters that look like persistent user identifiers.

    param_dicts is a list of per-crawl dicts (domain -> {param: value}) as
    returned by extract_parameters_from_db. A parameter qualifies when it was
    seen in more than one crawl with same-length, sufficiently long (> 5 chars)
    and pairwise-dissimilar values. Prints each hit and returns them as a dict
    of domain -> {param: [values]} (the original built this dict but never
    filled or returned it).
    """
    raw_param_dict = {}
    # renamed from `dict`, which shadowed the builtin
    for crawl_params in param_dicts:
        for domain in crawl_params:
            if domain not in raw_param_dict:
                raw_param_dict[domain] = defaultdict(list)

            for param in crawl_params[domain]:
                raw_param_dict[domain][param].append(crawl_params[domain][param])

    # extract same-lengthed parameter values that are also sufficiently dis-similar and long enough
    param_dict = {}
    for domain in raw_param_dict:
        param_dict[domain] = {}
        for param in raw_param_dict[domain]:
            values = raw_param_dict[domain][param]
            if len(values) > 1 and len(values[0]) > 5 \
                    and census_util.all_same_len(values) \
                    and census_util.all_dissimilar(values):
                param_dict[domain][param] = values
                # single parenthesized argument: valid in both Py2 and Py3
                print(domain + "\t" + param + "\t" + str(values))

    return param_dict
|
||||
|
||||
if __name__ == "__main__":
    # compare two crawl databases for persistent identifier parameters
    crawl_dbs = [
        "/home/christian/Desktop/crawl1.sqlite",
        "/home/christian/Desktop/crawl2.sqlite",
    ]
    extract_persistent_parameters([extract_parameters_from_db(db) for db in crawl_dbs])
|
Загрузка…
Ссылка в новой задаче