Mirror of https://github.com/openwpm/OpenWPM.git
Finished PETS heuristics for cookie extraction
This commit is contained in:
Parent
4951899c14
Commit
482ec0f35d
@@ -5,20 +5,20 @@ from collections import defaultdict
 import sqlite3 as lite
 from dateutil import parser


 # builds a dictionary with keys = (domain, name) and values being list of cookie values
 # values must be from non short-lived cookies and consistent across the crawls
 # after extracting from a _single_ FP database # TODO: our cookie database
 # extracts from a single OpenWPM database
 def extract_cookies_from_db(db_name):
     con = lite.connect(db_name)
     cur = con.cursor()

-    raw_cookie_dict = defaultdict(list) # cookie dict containing list of values
     # first, add all cookie/value pairs for cookies that live at least one month
+    raw_cookie_dict = defaultdict(list) # maps (domain, names) to lists of values
     for domain, name, value, access, expiry in cur.execute('SELECT domain, name, value, accessed, expiry FROM cookies'):
         # TODO: extract domains
         domain = domain if len(domain) == 0 or domain[0] != "." else domain[1:]

         # prune away cookies with expiry times under a month
-        if (parser.parse(expiry) - parser.parse(access)).days < 30:
+        if (parser.parse(expiry).replace(tzinfo=None) - parser.parse(access).replace(tzinfo=None)).days < 30:
             continue

         raw_cookie_dict[(domain, name)].append(value)
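The rewritten expiry check strips tzinfo from both datetimes before subtracting, presumably because dateutil parses some accessed/expiry strings (for example ones ending in "GMT") into offset-aware datetimes and others into naive ones, and Python refuses to mix the two in arithmetic. A minimal illustration of the failure mode and of the fix, not part of the commit and using made-up timestamps:

# Illustration only: mixing offset-aware and offset-naive datetimes is what
# the added .replace(tzinfo=None) calls guard against.
from dateutil import parser

expiry = parser.parse("Tue, 19 Jan 2038 03:14:07 GMT")  # has a timezone -> offset-aware
access = parser.parse("2014-01-01 12:00:00")            # no timezone -> naive

try:
    print((expiry - access).days)
except TypeError as err:
    print(err)  # can't subtract offset-naive and offset-aware datetimes

# Dropping tzinfo on both sides, as the new check does, makes the subtraction safe.
print((expiry.replace(tzinfo=None) - access.replace(tzinfo=None)).days)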
@@ -35,21 +35,30 @@
 # finds common ids that have values of the same length
 # are not the same and have pairwise similarities < 0.9 (which covers all being equal)
 # an ID must appear in at least 2 different crawls (otherwise, can't make a definitive statement about it)
 # prunes away cookies with lengths less than or equal to 5 (these strings are probably too short)
 # returns dictionary with domains as keys and cookie names as values
 def extract_persistent_ids(cookie_dicts):
     raw_id_dict = defaultdict(list) # for each cookie, a list of the values across each crawl

     # combine all smaller cookie dictionaries into a larger dictionary
     for cookie_dict in cookie_dicts:
         for cookie in cookie_dict:
             raw_id_dict[cookie].append(cookie_dict[cookie])

     domain_dict = defaultdict(list) # for each domain, a list of candidate ID cookies

     # prune away cookies that fail one of our unique ID heuristics
     for cookie in raw_id_dict:
-        if len(raw_id_dict[cookie]) > 1 and census_util.all_same_len(raw_id_dict[cookie]) \
-                and len(raw_id_dict[cookie][0]) > 5 and census_util.all_dissimilar(raw_id_dict[cookie]):
-            pass
-            #print str(cookie) + "\t" + str(raw_id_dict[cookie])
+        if len(raw_id_dict[cookie]) <= 1 or len(raw_id_dict[cookie][0]) <= 5 \
+                or not census_util.all_same_len(raw_id_dict[cookie]) \
+                or not census_util.all_dissimilar(raw_id_dict[cookie]):
+            continue

         domain_dict[cookie[0]].append(cookie[1])

     return domain_dict

 if __name__ == "__main__":
-    c1 = extract_cookies_from_db("/Users/Christian/Desktop/data/crawl1.sqlite")
-    c2 = extract_cookies_from_db("/Users/Christian/Desktop/data/crawl2.sqlite")
-    c3 = extract_cookies_from_db("/Users/Christian/Desktop/data/crawl3.sqlite")
-    extract_persistent_ids([c1, c2, c3])
+    c1 = extract_cookies_from_db("/home/christian/Desktop/crawl1.sqlite")
+    c2 = extract_cookies_from_db("/home/christian/Desktop/crawl2.sqlite")
+    print extract_persistent_ids([c1, c2])
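The ID heuristics lean on census_util.all_same_len and census_util.all_dissimilar, which are not part of this diff. A rough sketch of what such helpers could look like, inferred only from the comments above (equal lengths, pairwise similarity below 0.9) and offered as an assumption rather than the module's actual code:

# Hypothetical stand-ins for the census_util helpers referenced above; the real
# implementations are not shown in this diff, so treat these as illustrative only.
from difflib import SequenceMatcher

def all_same_len(values):
    # every candidate ID value observed across crawls has the same length
    return len(set(len(v) for v in values)) == 1

def all_dissimilar(values, threshold=0.9):
    # every pair of values has similarity below the threshold, which also
    # rules out the case where all values are identical (similarity 1.0)
    for i in range(len(values)):
        for j in range(i + 1, len(values)):
            if SequenceMatcher(None, values[i], values[j]).ratio() >= threshold:
                return False
    return True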