Adds additional cookie heuristics that find IDs buried in parameter=value strings inside cookie values

This commit is contained in:
Christian 2014-03-24 17:08:20 -04:00
Родитель 482ec0f35d
Коммит 9f86d97d1a
2 изменённых файлов: 20 добавлений и 5 удалений

Просмотреть файл

@ -13,7 +13,7 @@ def all_same_len(items):
# Are two cookies more than 80% similar according to the Ratcliff-Obershelp metric
def ro_similar(seq1, seq2):
return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() > 0.8
return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() > 0.5
# Are all cookies in a list pairwise-dissimilar (i.e. fail ro-test)
def all_dissimilar(items):

Просмотреть файл

@ -21,7 +21,9 @@ def extract_cookies_from_db(db_name):
if (parser.parse(expiry).replace(tzinfo=None) - parser.parse(access).replace(tzinfo=None)).days < 30:
continue
# add the basic values, then try to parse inner parameters and add them
raw_cookie_dict[(domain, name)].append(value)
add_inner_parameters(raw_cookie_dict, domain, name, value)
# only keep cookies with values that remain constant throughout the crawl
cookie_dict = {}
@ -31,6 +33,19 @@ def extract_cookies_from_db(db_name):
return cookie_dict
# goes through a cookie's value and looks for inner parameters of the form id=XXX&time=YYY
# then appends to raw_cookie_dict the value XXX under the key (domain, name#id)
# and the value YYY under the key (domain, name#time)
# currently tries two known delimiters (":" and "&")
def add_inner_parameters(raw_cookie_dict, domain, name, value):
    """Record key=value parameters embedded inside a single cookie value.

    The cookie value is split on each known inner delimiter in turn.  Every
    resulting fragment that has exactly one "=" is treated as an inner
    parameter and its value is appended to
    raw_cookie_dict[(domain, name + "#" + key)].

    raw_cookie_dict -- mapping from (domain, cookie name) to a list of values;
                       it is indexed without a prior membership check, so it
                       presumably is a defaultdict(list) (confirm at the
                       call site)
    domain          -- first element of the dictionary key
    name            -- cookie name; the inner key is attached as "name#key"
    value           -- raw cookie value to scan

    Returns None; raw_cookie_dict is modified in place.
    """
    delimiters = [":", "&"]  # currently known inner cookie delimiters

    # Each delimiter is tried independently, so a fragment that contains
    # neither delimiter (e.g. the whole value "id=abc") survives both splits
    # unchanged.  Track what has been recorded so that one observation of a
    # cookie adds each inner (key, value) pair at most once.
    recorded = set()
    for delimiter in delimiters:
        for part in value.split(delimiter):
            params = part.split("=")
            # exactly one "=": fragments still holding several parameters
            # (split on the wrong delimiter) or none at all are skipped
            if len(params) != 2:
                continue
            pair = (params[0], params[1])
            if pair in recorded:
                continue
            recorded.add(pair)
            raw_cookie_dict[(domain, name + "#" + params[0])].append(params[1])
# takes in dictionaries of persistent, non-session cookies
# finds common ids that have values of the same length
# are not the same and have pairwise similarities < 0.9 (which covers all being equal)
@ -49,7 +64,7 @@ def extract_persistent_ids(cookie_dicts):
# prune away cookies that fail one of our unique ID heuristics
for cookie in raw_id_dict:
if len(raw_id_dict[cookie]) <= 1 or len(raw_id_dict[cookie][0]) <= 5 \
if len(raw_id_dict[cookie]) <= 1 or len(raw_id_dict[cookie][0]) <= 5 or len(raw_id_dict[cookie][0]) > 100 \
or not census_util.all_same_len(raw_id_dict[cookie]) \
or not census_util.all_dissimilar(raw_id_dict[cookie]):
continue
@ -59,6 +74,6 @@ def extract_persistent_ids(cookie_dicts):
return domain_dict
if __name__ == "__main__":
c1 = extract_cookies_from_db("/home/christian/Desktop/crawl1.sqlite")
c2 = extract_cookies_from_db("/home/christian/Desktop/crawl2.sqlite")
print extract_persistent_ids([c1, c2])
c1 = extract_cookies_from_db("/home/christian/Desktop/crawl1.sqlite")
c2 = extract_cookies_from_db("/home/christian/Desktop/crawl2.sqlite")
print len(extract_persistent_ids([c1, c2]))