From 4e1f491c6c0f1083ee58e39d2b7a6186f44e10bd Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Apr 2018 11:28:27 -0300 Subject: [PATCH] data bricks evercookie search script added --- analyses/Databricks Evercookie Search.ipynb | 1 + analyses/Databricks Evercookie Search.md | 382 +++++++++++++++++++ analyses/Databricks Evercookie Search.py | 389 ++++++++++++++++++++ 3 files changed, 772 insertions(+) create mode 100644 analyses/Databricks Evercookie Search.ipynb create mode 100644 analyses/Databricks Evercookie Search.md create mode 100644 analyses/Databricks Evercookie Search.py diff --git a/analyses/Databricks Evercookie Search.ipynb b/analyses/Databricks Evercookie Search.ipynb new file mode 100644 index 0000000..f923a35 --- /dev/null +++ b/analyses/Databricks Evercookie Search.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["### code to remount data if necessary\nThis is the critical code for accessing data from S3.\n\nMounting essentially creates a filesystem within the cluster located at `/dbfs/mnt/$BUCKET_NAME`. You can view the files via sh, or in python (See: \"basic text file reitreval example\" for python example and \"Example of shell file retrieval\" for a shell example)"],"metadata":{}},{"cell_type":"code","source":["# ACCESS_KEY = \"\"\n# SECRET_KEY = \"\"\n# ENCODED_SECRET_KEY = SECRET_KEY.replace(\"/\", \"%2F\")\nBUCKET_NAME = \"safe-ucosp-2017\"\nMOUNT_NAME = \"safe-browsing\"\n\n# dbutils.fs.mount(\"s3a://%s:%s@%s\" % (ACCESS_KEY, ENCODED_SECRET_KEY, BUCKET_NAME), \"/mnt/%s\" % MOUNT_NAME)\n# dbutils.fs.mounts()\n"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"markdown","source":["### basic text file retrieval example in Python\nmyRDD.first() returns the first file in the rdd, which in this case is the only file"],"metadata":{}},{"cell_type":"code","source":["filename = \"1_1c1303ea628d99c741eef0ec90714853e72b7718022f1463903fb367.json\"\n\nfile = \"/mnt/%s/%s\" % (MOUNT_NAME, filename)\n\nmyRDD = sc.textFile(file)\nmyRDD.first()\n"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["### Functions to load data"],"metadata":{}},{"cell_type":"code","source":["import requests\nimport os\nimport json\n\nfrom os import listdir\nfrom os.path import isfile, join\n\nproject_root_directory = os.path.dirname(\"/\")\n\ndef load_index_file():\n \"\"\"Load the file containing all indexes of the bucket and return it as an array of indexes - will download the index file if it's not in local storage\"\"\"\n file_name = \"file_index.txt\"\n file_path = os.path.join(project_root_directory, file_name)\n if not os.path.isfile(file_path):\n fetch_file(\"http://www.arewedatayet.com/\", file_name, 'wb+')\n\n with open(file_path, \"r\") as f:\n lines = f.readlines()\n return lines\n\n \ndef fetch_file(file_url, file_name, mode):\n \"\"\"Fetch file from base_url and store it in the project root directory.\n Keyword arguments:\n file_url -- url of where the file is\n file_name -- name of the file to be saved as\n mode -- mode of the file to open it in (for example: wb+) \n \"\"\"\n file = requests.get(file_url + file_name)\n\n file_path = os.path.join(project_root_directory, file_name)\n with open(file_path, mode) as f:\n f.write(file.content)\n\n\ndef load_file_via_rdd(file_count, lines):\n \"\"\" \n file_count: int or \"all\"\n indicating all files or a specific subset of files from the start of the data\n lines:\n an array containing the list of the json file names\n \n return: rdd file containing the data based on the parameters 
provided\n \"\"\"\n if file_count == \"all\":\n file_count = len(lines)\n url_list = map(url, lines[0:file_count])\n return sc.textFile(','.join(url_list))\n\ndef url(filename):\n \"\"\" converts index file name into mount storage location file name \"\"\"\n x = \"/mnt/%s/%s\" % (MOUNT_NAME, filename)\n return x.strip()\n\ndef prep_rdd(rdd):\n \"\"\" converts rdd of files into rdd of json \"\"\"\n return rdd.map(prep_data).flatMap(lambda x: json.loads(x))\n\ndef prep_data(data):\n \"\"\" converts S3 data file into json \"\"\"\n return \"[\" + data[1:-1] + \"]\"\n\ndef p(s):\n \"\"\" Quick and dirty file print function for a collected rdd \"\"\"\n for p in s:\n print(p)\n\n### Alternative attempts at Retrieving data\n# \"http://\"+BUCKET_NAME+\".s3.amazonaws.com/\"+filename\n# \"s3a://%s:%s@%s/%s\" % (ACCESS_ID, ENCODED_SECRET_KEY, BUCKET_NAME, filename)\n"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["### Load parquet file"],"metadata":{}},{"cell_type":"code","source":["file = \"/mnt/%s/safe_dataset/v1/clean.parquet/*.snappy.parquet\" % (MOUNT_NAME)\n\nclean_parquet_df = sqlContext.read.parquet(file)\ndisplay(clean_parquet_df)"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"markdown","source":["### Load Index File\nIndex file contains the names of all the files on S3. The index file hosted on Martins server. I have tried to load in all the files via the file system unsuccessfully. The index file is very large and is loaded over the internet so it takes a while to load."],"metadata":{}},{"cell_type":"code","source":["index = load_index_file()\n"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"markdown","source":["### Examples of file retrieval via shell"],"metadata":{}},{"cell_type":"code","source":["%sh\ncd /\ncd dbfs/mnt/safe-browsing/\nhead 1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json\n\n"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["%sh\ncd /\n\ncd dbfs/mnt/safe-browsing/safe_dataset/v1/clean.parquet\nls\n# head -n5 'part-00000-dfb456b1-7820-4f5f-a380-3f8e7c566a60-c000.snappy.parquet'\n"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"markdown","source":["### Prep and Cache Data"],"metadata":{}},{"cell_type":"code","source":["\nrdd_files = load_file_via_rdd(20000, index)\nrdd_files = prep_rdd(rdd_files)\n\n\n"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"markdown","source":["## Attempt to search for Evercookies\nEvercookie is a JavaScript-based application created by Samy Kamkar which produces zombie cookies in a web browser that are intentionally difficult to delete. The source code is found here: [Evercookie script](https://github.com/samyk/evercookie/blob/master/js/evercookie.js). Additional information can be found at the website https://samy.pl/evercookie/\n\nbelow is a list of candidate attributes that could be used a sign of an evercookie is being used. 
\nA significant sign of an evercookie being used would be that the same value is being passed multiple times to these API calls.\n\n``` \n option_list = [\n \"cookie\", \"name\", \"embedSWF\", # misc\n \"createElement\", \"getContext\", # canvas\n \"localStorage\", \"sessionStorage\", \"globalStorage\", \"Storage\" # storage\n \"indexedDB\", \"mozIndexedDB\", \"openDatabase\", \"IDBTransaction\", \"IDBKeyRange\", # DB\n ]\n```"],"metadata":{}},{"cell_type":"markdown","source":["The below script is to get a sense of candidate api calls"],"metadata":{}},{"cell_type":"code","source":["\nuniqueSymbols = rdd_files.map(lambda x : x['symbol']).distinct().collect()\np(uniqueSymbols)"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"markdown","source":["### list of helper functions for investigation into evercookies"],"metadata":{}},{"cell_type":"code","source":["import time\nimport json\nfrom urlparse import urlparse\nfrom pandas.io.json import json_normalize\n\n\"\"\" misc functions \"\"\"\ndef quick_print(rdd):\n \"\"\" collection and print rdd \"\"\"\n for p in rdd.collect():\n print(p)\n\ndef extract_domain_name(data):\n \"\"\" extracts the domain name from a url to track patterns across domains \"\"\"\n parsed_uri = urlparse( data )\n return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)\n\ndef unique_list(rdd_list):\n \"\"\" convert rdd list into a list of unique values \"\"\"\n return list(set(rdd_list))\n\ndef get_cookie_info(data):\n \"\"\" generate tuple of domain name, symbol and arguments for the function call \"\"\"\n location = extract_domain_name(data['location'])\n arguments = data.get('arguments', '')\n return (location, (data['symbol'], arguments))\n\ndef get_cookie_info_list_args(data):\n \"\"\" generate tuples with json values as arguments, json values might contain overlap with other data API call args? 
\"\"\"\n location = extract_domain_name(data['location'])\n arguments = extract_json_values(data.get('arguments', ''))\n return (location, (data['symbol'], arguments))\n\n\"\"\" functions to check if the api call is an evercookie \"\"\"\nev_cookie_api = [\n \"cookie\", \"name\", \"embedSWF\", #misc\n \"createElement\", \"getContext\", #canvas\n \"localStorage\", \"sessionStorage\", \"globalStorage\", \"Storage\" #storage\n \"indexedDB\", \"mozIndexedDB\", \"openDatabase\", \"IDBTransaction\", \"IDBKeyRange\", # DB\n]\n\ndef ev_cookie_candidate(symbol_arr):\n \"\"\" check if the symbol is a possible API call that could be used in an evercookie \"\"\"\n for symbol in symbol_arr[1]:\n for attr_name in symbol[0].split(\".\"):\n if attr_name in ev_cookie_api:\n return True\n return False\n\ndef api_call_is_ev_cookie_candidate(api_call):\n \"\"\" check if api call is a possible evercookie api call \"\"\"\n for attr_name in api_call.split(\".\"):\n if attr_name in ev_cookie_api:\n return True\n return False\n\n\"\"\" Json Extraction related \"\"\"\ndef flatten_json(y):\n \"\"\" flattens json including arrays \"\"\"\n out = {}\n def flatten(x, name=''):\n if type(x) is dict:\n for a in x:\n flatten(x[a], name + a + '_')\n elif type(x) is list:\n i = 0\n for a in x:\n flatten(a, name + str(i) + '_')\n i += 1\n else:\n out[name[:-1]] = x\n\n flatten(y)\n return out\n\ndef extract_json_values(arg):\n \"\"\" extract json values into a tuple or store the data into a tuple \"\"\"\n try:\n json_obj = json.loads(arg)\n flat = flatten_json(json_obj) # flatten and normalize json to extract all values\n return tuple(json_normalize(flat).values()) # if arg is json return values, as values might contain overlap with other api calls\n except JSONDecodeError: \n return tuple(arg)\n\n\"\"\" Key related filtering funtions \"\"\"\ndef hasNumbers(inputString):\n \"\"\" true if string contains a number \"\"\"\n return any(char.isdigit() for char in inputString)\n\ndef is_possible_key(arg):\n \"\"\" is string and string length 5 or greater and has a number in the string. We are assuming that it is possibly an ID \"\"\"\n return type(arg) is str and len(arg) >= 5 and hasNumbers(arg)\n\ndef site_has_possible_key(domain_data):\n \"\"\" dig through nested data to check if the domain has an API call with a possible ID \"\"\"\n for api_call in domain_data[1]:\n for arg in api_call[1]:\n if is_possible_key(arg):\n return True\n return False\n\ndef has_overlap_arg(domain_data):\n \"\"\" dig through nested data to check if the domain has a candidate function with an argument that overlaps and its a key \"\"\"\n arg_total_list = []\n for api_call in domain_data[1]:\n if api_call_is_ev_cookie_candidate(api_call[0]):\n arg_list = list(api_call[1])\n print(arg_total_list)\n for arg in arg_list:\n if arg in arg_total_list and is_possible_key(arg):\n return True\n arg_total_list.extend(arg_list)\n return False\n"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"markdown","source":["### Below was a preliminary attempt to search for Evercookies\nAs a preliminary step I have generate an rdd of tuples containing the domains and the associated api calls with their arguments\nThen I filter out all the domains that don't make api calls to evercookie caching techniques."],"metadata":{}},{"cell_type":"code","source":["out = rdd_files.map(get_cookie_info).groupByKey().mapValues(unique_list)\nout = out.filter(ev_cookie_candidate )\n\n\"\"\"\ncurrent structure:\n[\n (domain_name, [(api_call, arg_string), ...] 
)\n]\n\"\"\"\nquick_print(out)\n"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"markdown","source":["### Evercookie attempt 2\nExtract JSON values passed to the API as possible duplicate values used by evercookies. Remove all domains that do not have overlapping values in the args passed to the API calls.\n\nThe result is a lot of false positives. It is difficult to find any good candidates for further examination. Ideas to narrow down include: searching for IDs (alphanumeric?), filtering out obvious values like True and False."],"metadata":{}},{"cell_type":"code","source":["rdd_files = load_file_via_rdd(\"all\", index)\nrdd_files = prep_rdd(rdd_files)"],"metadata":{},"outputs":[],"execution_count":24},{"cell_type":"code","source":["pout = rdd_files.filter(lambda x : x.get('arguments', '') != '')\n\npout = pout.map(get_cookie_info_list_args).groupByKey().mapValues(unique_list)\n\n\"\"\"\ncurrent structure:\n[\n (domain_name, [(api_call, (arg1,arg2,...)), ...] )\n]\n\"\"\"\n\n# filter domains that don't make calls to web apis used by evercookies\npout = pout.filter(ev_cookie_candidate)\n\npout = pout.filter(site_has_possible_key)\npout = pout.filter(has_overlap_arg)\n\nquick_print(pout)\n"],"metadata":{},"outputs":[],"execution_count":25},{"cell_type":"code","source":["all_files = load_file_via_rdd(\"all\", index)\nall_files = prep_rdd(all_files)"],"metadata":{},"outputs":[],"execution_count":26},{"cell_type":"code","source":["def isthing(data):\n return \"evercookie.js\" in data['script_url']\n\nall_files.filter(isthing).count()"],"metadata":{},"outputs":[],"execution_count":27}],"metadata":{"name":"Databricks Evercookie Search","notebookId":221887796231301},"nbformat":4,"nbformat_minor":0}
diff --git a/analyses/Databricks Evercookie Search.md b/analyses/Databricks Evercookie Search.md
new file mode 100644
index 0000000..37a428e
--- /dev/null
+++ b/analyses/Databricks Evercookie Search.md
@@ -0,0 +1,382 @@
+### code to remount data if necessary
+This is the critical code for accessing data from S3.
+
+Mounting essentially creates a filesystem within the cluster located at `/dbfs/mnt/$BUCKET_NAME`.
+You can view the files via sh or in Python (see "basic text file retrieval example in Python" for a Python example and "Examples of file retrieval via shell" for a shell example).
+
+```python
+# ACCESS_KEY = ""
+# SECRET_KEY = ""
+# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
+BUCKET_NAME = "safe-ucosp-2017"
+MOUNT_NAME = "safe-browsing"
+
+# dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)
+# dbutils.fs.mounts()
+
+```
+
+### basic text file retrieval example in Python
+myRDD.first() returns the first file in the RDD, which in this case is the only file.
+
+```python
+filename = "1_1c1303ea628d99c741eef0ec90714853e72b7718022f1463903fb367.json"
+
+file = "/mnt/%s/%s" % (MOUNT_NAME, filename)
+
+myRDD = sc.textFile(file)
+myRDD.first()
+
+```
+
+### Functions to load data
+
+```python
+import requests
+import os
+import json
+
+from os import listdir
+from os.path import isfile, join
+
+project_root_directory = os.path.dirname("/")
+
+def load_index_file():
+    """Load the file containing all indexes of the bucket and return it as an array of indexes - will download the index file if it's not in local storage"""
+    file_name = "file_index.txt"
+    file_path = os.path.join(project_root_directory, file_name)
+    if not os.path.isfile(file_path):
+        fetch_file("http://www.arewedatayet.com/", file_name, 'wb+')
+
+    with open(file_path, "r") as f:
+        lines = f.readlines()
+    return lines
+
+
+def fetch_file(file_url, file_name, mode):
+    """Fetch file from base_url and store it in the project root directory.
+    Keyword arguments:
+    file_url -- url of where the file is
+    file_name -- name of the file to be saved as
+    mode -- mode of the file to open it in (for example: wb+)
+    """
+    file = requests.get(file_url + file_name)
+
+    file_path = os.path.join(project_root_directory, file_name)
+    with open(file_path, mode) as f:
+        f.write(file.content)
+
+
+def load_file_via_rdd(file_count, lines):
+    """
+    file_count: int or "all"
+        indicating all files or a specific subset of files from the start of the data
+    lines:
+        an array containing the list of the json file names
+
+    return: rdd containing the data based on the parameters provided
+    """
+    if file_count == "all":
+        file_count = len(lines)
+    url_list = map(url, lines[0:file_count])
+    return sc.textFile(','.join(url_list))
+
+def url(filename):
+    """ converts index file name into mount storage location file name """
+    x = "/mnt/%s/%s" % (MOUNT_NAME, filename)
+    return x.strip()
+
+def prep_rdd(rdd):
+    """ converts rdd of files into rdd of json """
+    return rdd.map(prep_data).flatMap(lambda x: json.loads(x))
+
+def prep_data(data):
+    """ converts S3 data file into json """
+    return "[" + data[1:-1] + "]"
+
+def p(s):
+    """ Quick and dirty print function for a collected rdd """
+    for p in s:
+        print(p)
+
+### Alternative attempts at Retrieving data
+# "http://"+BUCKET_NAME+".s3.amazonaws.com/"+filename
+# "s3a://%s:%s@%s/%s" % (ACCESS_ID, ENCODED_SECRET_KEY, BUCKET_NAME, filename)
+
+```
+
+### Load parquet file
+
+```python
+file = "/mnt/%s/safe_dataset/v1/clean.parquet/*.snappy.parquet" % (MOUNT_NAME)
+
+clean_parquet_df = sqlContext.read.parquet(file)
+display(clean_parquet_df)
+```
+
+### Load Index File
+The index file contains the names of all the files on S3. It is hosted on Martin's server; I have tried, unsuccessfully, to load all the files via the file system. The index file is very large and is loaded over the internet, so it takes a while to load.
+
+```python
+index = load_index_file()
+
+```
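+
+Because the download is slow, a quick sanity check (a minimal sketch, assuming the cell above has run) is to look at the number of entries and the first few file names:
+
+```python
+# Sketch: sanity-check the loaded index; `index` is the list of lines returned
+# by load_index_file(), one S3 file name per line (note the trailing newlines).
+print(len(index))
+for name in index[:5]:
+    print(name.strip())
+```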
+
+### Examples of file retrieval via shell
+
+```python
+%sh
+cd /
+cd dbfs/mnt/safe-browsing/
+head 1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json
+
+
+```
+
+```python
+%sh
+cd /
+
+cd dbfs/mnt/safe-browsing/safe_dataset/v1/clean.parquet
+ls
+# head -n5 'part-00000-dfb456b1-7820-4f5f-a380-3f8e7c566a60-c000.snappy.parquet'
+
+```
+
+### Prep and Cache Data
+
+```python
+
+rdd_files = load_file_via_rdd(20000, index)
+rdd_files = prep_rdd(rdd_files)
+
+
+
+```
+
+## Attempt to search for Evercookies
+Evercookie is a JavaScript-based application created by Samy Kamkar which produces zombie cookies in a web browser that are intentionally difficult to delete. The source code can be found here: [Evercookie script](https://github.com/samyk/evercookie/blob/master/js/evercookie.js). Additional information can be found at https://samy.pl/evercookie/.
+
+Below is a list of candidate attributes whose use could be a sign that an evercookie is in use.
+A significant sign of an evercookie being used would be that the same value is being passed multiple times to these API calls.
+
+```
+option_list = [
+    "cookie", "name", "embedSWF",  # misc
+    "createElement", "getContext",  # canvas
+    "localStorage", "sessionStorage", "globalStorage", "Storage",  # storage
+    "indexedDB", "mozIndexedDB", "openDatabase", "IDBTransaction", "IDBKeyRange",  # DB
+]
+```
+
+The script below is to get a sense of the candidate API calls.
+
+```python
+
+uniqueSymbols = rdd_files.map(lambda x : x['symbol']).distinct().collect()
+p(uniqueSymbols)
+```
+
+### list of helper functions for investigation into evercookies
+
+```python
+import time
+import json
+from urlparse import urlparse
+from pandas.io.json import json_normalize
+
+""" misc functions """
+def quick_print(rdd):
+    """ collect and print an rdd """
+    for p in rdd.collect():
+        print(p)
+
+def extract_domain_name(data):
+    """ extracts the domain name from a url to track patterns across domains """
+    parsed_uri = urlparse(data)
+    return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+
+def unique_list(rdd_list):
+    """ convert rdd list into a list of unique values """
+    return list(set(rdd_list))
+
+def get_cookie_info(data):
+    """ generate tuple of domain name, symbol and arguments for the function call """
+    location = extract_domain_name(data['location'])
+    arguments = data.get('arguments', '')
+    return (location, (data['symbol'], arguments))
+
+def get_cookie_info_list_args(data):
+    """ generate tuples with json values as arguments; json values might contain overlap with other API call args """
+    location = extract_domain_name(data['location'])
+    arguments = extract_json_values(data.get('arguments', ''))
+    return (location, (data['symbol'], arguments))
+
+""" functions to check if the api call is an evercookie """
+ev_cookie_api = [
+    "cookie", "name", "embedSWF",  # misc
+    "createElement", "getContext",  # canvas
+    "localStorage", "sessionStorage", "globalStorage", "Storage",  # storage
+    "indexedDB", "mozIndexedDB", "openDatabase", "IDBTransaction", "IDBKeyRange",  # DB
+]
+
+def ev_cookie_candidate(symbol_arr):
+    """ check if the symbol is a possible API call that could be used in an evercookie """
+    for symbol in symbol_arr[1]:
+        for attr_name in symbol[0].split("."):
+            if attr_name in ev_cookie_api:
+                return True
+    return False
+
+def api_call_is_ev_cookie_candidate(api_call):
+    """ check if api call is a possible evercookie api call """
+    for attr_name in api_call.split("."):
+        if attr_name in ev_cookie_api:
+            return True
+    return False
+
+""" Json Extraction related """
+def flatten_json(y):
+    """ flattens json including arrays """
+    out = {}
+    def flatten(x, name=''):
+        if type(x) is dict:
+            for a in x:
+                flatten(x[a], name + a + '_')
+        elif type(x) is list:
+            i = 0
+            for a in x:
+                flatten(a, name + str(i) + '_')
+                i += 1
+        else:
+            out[name[:-1]] = x
+
+    flatten(y)
+    return out
+
+def extract_json_values(arg):
+    """ extract json values into a tuple or store the data into a tuple """
+    try:
+        json_obj = json.loads(arg)
+        flat = flatten_json(json_obj)  # flatten and normalize json to extract all values
+        return tuple(json_normalize(flat).values[0])  # if arg is json return its values, as values might overlap with other api calls
+    except ValueError:  # json.loads raises ValueError when arg is not valid JSON
+        return (arg,)
+
+""" Key related filtering functions """
+def hasNumbers(inputString):
+    """ true if string contains a number """
+    return any(char.isdigit() for char in inputString)
+
+def is_possible_key(arg):
+    """ is a string of length 5 or greater that contains a number; we assume it is possibly an ID """
+    return type(arg) is str and len(arg) >= 5 and hasNumbers(arg)
+
+def site_has_possible_key(domain_data):
+    """ dig through nested data to check if the domain has an API call with a possible ID """
+    for api_call in domain_data[1]:
+        for arg in api_call[1]:
+            if is_possible_key(arg):
+                return True
+    return False
+
+def has_overlap_arg(domain_data):
+    """ dig through nested data to check if the domain has a candidate function with an argument that overlaps and is a key """
+    arg_total_list = []
+    for api_call in domain_data[1]:
+        if api_call_is_ev_cookie_candidate(api_call[0]):
+            arg_list = list(api_call[1])
+            # print(arg_total_list)  # debug
+            for arg in arg_list:
+                if arg in arg_total_list and is_possible_key(arg):
+                    return True
+            arg_total_list.extend(arg_list)
+    return False
+
+```
+
+### Below was a preliminary attempt to search for Evercookies
+As a preliminary step I have generated an RDD of tuples containing the domains and the associated API calls with their arguments.
+Then I filter out all the domains that don't make API calls to evercookie caching techniques.
+
+```python
+out = rdd_files.map(get_cookie_info).groupByKey().mapValues(unique_list)
+out = out.filter(ev_cookie_candidate)
+
+"""
+current structure:
+[
+    (domain_name, [(api_call, arg_string), ...] )
+]
+"""
+quick_print(out)
+
+```
+
+### Evercookie attempt 2
+Extract JSON values passed to the API as possible duplicate values used by evercookies.
+Remove all domains that do not have overlapping values in the args passed to the API calls.
+
+The result is a lot of false positives. It is difficult to find any good candidates for further examination. Ideas to narrow down include: searching for IDs (alphanumeric?), filtering out obvious values like True and False.
+
+```python
+rdd_files = load_file_via_rdd("all", index)
+rdd_files = prep_rdd(rdd_files)
+```
+
+```python
+pout = rdd_files.filter(lambda x : x.get('arguments', '') != '')
+
+pout = pout.map(get_cookie_info_list_args).groupByKey().mapValues(unique_list)
+
+"""
+current structure:
+[
+    (domain_name, [(api_call, (arg1,arg2,...)), ...] )
+]
+"""
+
+# filter domains that don't make calls to web apis used by evercookies
+pout = pout.filter(ev_cookie_candidate)
+
+pout = pout.filter(site_has_possible_key)
+pout = pout.filter(has_overlap_arg)
+
+quick_print(pout)
+
+```
+
+```python
+all_files = load_file_via_rdd("all", index)
+all_files = prep_rdd(all_files)
+```
+
+```python
+def isthing(data):
+    return "evercookie.js" in data['script_url']
+
+all_files.filter(isthing).count()
+```
diff --git a/analyses/Databricks Evercookie Search.py b/analyses/Databricks Evercookie Search.py
new file mode 100644
index 0000000..b0b9224
--- /dev/null
+++ b/analyses/Databricks Evercookie Search.py
@@ -0,0 +1,389 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC ### code to remount data if necessary
+# MAGIC This is the critical code for accessing data from S3.
+# MAGIC
+# MAGIC Mounting essentially creates a filesystem within the cluster located at `/dbfs/mnt/$BUCKET_NAME`. You can view the files via sh or in Python (see "basic text file retrieval example in Python" for a Python example and "Examples of file retrieval via shell" for a shell example).
+
+# COMMAND ----------
+
+# ACCESS_KEY = ""
+# SECRET_KEY = ""
+# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
+BUCKET_NAME = "safe-ucosp-2017"
+MOUNT_NAME = "safe-browsing"
+
+# dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)
+# dbutils.fs.mounts()
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### basic text file retrieval example in Python
+# MAGIC myRDD.first() returns the first file in the RDD, which in this case is the only file.
+
+# COMMAND ----------
+
+filename = "1_1c1303ea628d99c741eef0ec90714853e72b7718022f1463903fb367.json"
+
+file = "/mnt/%s/%s" % (MOUNT_NAME, filename)
+
+myRDD = sc.textFile(file)
+myRDD.first()
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Functions to load data
+
+# COMMAND ----------
+
+import requests
+import os
+import json
+
+from os import listdir
+from os.path import isfile, join
+
+project_root_directory = os.path.dirname("/")
+
+def load_index_file():
+    """Load the file containing all indexes of the bucket and return it as an array of indexes - will download the index file if it's not in local storage"""
+    file_name = "file_index.txt"
+    file_path = os.path.join(project_root_directory, file_name)
+    if not os.path.isfile(file_path):
+        fetch_file("http://www.arewedatayet.com/", file_name, 'wb+')
+
+    with open(file_path, "r") as f:
+        lines = f.readlines()
+    return lines
+
+
+def fetch_file(file_url, file_name, mode):
+    """Fetch file from base_url and store it in the project root directory.
+    Keyword arguments:
+    file_url -- url of where the file is
+    file_name -- name of the file to be saved as
+    mode -- mode of the file to open it in (for example: wb+)
+    """
+    file = requests.get(file_url + file_name)
+
+    file_path = os.path.join(project_root_directory, file_name)
+    with open(file_path, mode) as f:
+        f.write(file.content)
+
+
+def load_file_via_rdd(file_count, lines):
+    """
+    file_count: int or "all"
+        indicating all files or a specific subset of files from the start of the data
+    lines:
+        an array containing the list of the json file names
+
+    return: rdd containing the data based on the parameters provided
+    """
+    if file_count == "all":
+        file_count = len(lines)
+    url_list = map(url, lines[0:file_count])
+    return sc.textFile(','.join(url_list))
+
+def url(filename):
+    """ converts index file name into mount storage location file name """
+    x = "/mnt/%s/%s" % (MOUNT_NAME, filename)
+    return x.strip()
+
+def prep_rdd(rdd):
+    """ converts rdd of files into rdd of json """
+    return rdd.map(prep_data).flatMap(lambda x: json.loads(x))
+
+def prep_data(data):
+    """ converts S3 data file into json """
+    return "[" + data[1:-1] + "]"
+
+def p(s):
+    """ Quick and dirty print function for a collected rdd """
+    for p in s:
+        print(p)
+
+### Alternative attempts at Retrieving data
+# "http://"+BUCKET_NAME+".s3.amazonaws.com/"+filename
+# "s3a://%s:%s@%s/%s" % (ACCESS_ID, ENCODED_SECRET_KEY, BUCKET_NAME, filename)
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Load parquet file
+
+# COMMAND ----------
+
+file = "/mnt/%s/safe_dataset/v1/clean.parquet/*.snappy.parquet" % (MOUNT_NAME)
+
+clean_parquet_df = sqlContext.read.parquet(file)
+display(clean_parquet_df)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Load Index File
+# MAGIC The index file contains the names of all the files on S3. It is hosted on Martin's server; I have tried, unsuccessfully, to load all the files via the file system. The index file is very large and is loaded over the internet, so it takes a while to load.
+
+# COMMAND ----------
+
+index = load_index_file()
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Examples of file retrieval via shell
+
+# COMMAND ----------
+
+# MAGIC %sh
+# MAGIC cd /
+# MAGIC cd dbfs/mnt/safe-browsing/
+# MAGIC head 1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json
+
+# COMMAND ----------
+
+# MAGIC %sh
+# MAGIC cd /
+# MAGIC
+# MAGIC cd dbfs/mnt/safe-browsing/safe_dataset/v1/clean.parquet
+# MAGIC ls
+# MAGIC # head -n5 'part-00000-dfb456b1-7820-4f5f-a380-3f8e7c566a60-c000.snappy.parquet'
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Prep and Cache Data
+
+# COMMAND ----------
+
+
+rdd_files = load_file_via_rdd(20000, index)
+rdd_files = prep_rdd(rdd_files)
+
+
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Attempt to search for Evercookies
+# MAGIC Evercookie is a JavaScript-based application created by Samy Kamkar which produces zombie cookies in a web browser that are intentionally difficult to delete. The source code can be found here: [Evercookie script](https://github.com/samyk/evercookie/blob/master/js/evercookie.js). Additional information can be found at https://samy.pl/evercookie/.
+# MAGIC
+# MAGIC Below is a list of candidate attributes whose use could be a sign that an evercookie is in use.
+# MAGIC A significant sign of an evercookie being used would be that the same value is being passed multiple times to these API calls.
+# MAGIC
+# MAGIC ```
+# MAGIC option_list = [
+# MAGIC     "cookie", "name", "embedSWF",  # misc
+# MAGIC     "createElement", "getContext",  # canvas
+# MAGIC     "localStorage", "sessionStorage", "globalStorage", "Storage",  # storage
+# MAGIC     "indexedDB", "mozIndexedDB", "openDatabase", "IDBTransaction", "IDBKeyRange",  # DB
+# MAGIC ]
+# MAGIC ```
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC The script below is to get a sense of the candidate API calls.
+
+# COMMAND ----------
+
+
+uniqueSymbols = rdd_files.map(lambda x : x['symbol']).distinct().collect()
+p(uniqueSymbols)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### list of helper functions for investigation into evercookies
+
+# COMMAND ----------
+
+import time
+import json
+from urlparse import urlparse
+from pandas.io.json import json_normalize
+
+""" misc functions """
+def quick_print(rdd):
+    """ collect and print an rdd """
+    for p in rdd.collect():
+        print(p)
+
+def extract_domain_name(data):
+    """ extracts the domain name from a url to track patterns across domains """
+    parsed_uri = urlparse(data)
+    return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+
+def unique_list(rdd_list):
+    """ convert rdd list into a list of unique values """
+    return list(set(rdd_list))
+
+def get_cookie_info(data):
+    """ generate tuple of domain name, symbol and arguments for the function call """
+    location = extract_domain_name(data['location'])
+    arguments = data.get('arguments', '')
+    return (location, (data['symbol'], arguments))
+
+def get_cookie_info_list_args(data):
+    """ generate tuples with json values as arguments; json values might contain overlap with other API call args """
+    location = extract_domain_name(data['location'])
+    arguments = extract_json_values(data.get('arguments', ''))
+    return (location, (data['symbol'], arguments))
+
+""" functions to check if the api call is an evercookie """
+ev_cookie_api = [
+    "cookie", "name", "embedSWF",  # misc
+    "createElement", "getContext",  # canvas
+    "localStorage", "sessionStorage", "globalStorage", "Storage",  # storage
+    "indexedDB", "mozIndexedDB", "openDatabase", "IDBTransaction", "IDBKeyRange",  # DB
+]
+
+def ev_cookie_candidate(symbol_arr):
+    """ check if the symbol is a possible API call that could be used in an evercookie """
+    for symbol in symbol_arr[1]:
+        for attr_name in symbol[0].split("."):
+            if attr_name in ev_cookie_api:
+                return True
+    return False
+
+def api_call_is_ev_cookie_candidate(api_call):
+    """ check if api call is a possible evercookie api call """
+    for attr_name in api_call.split("."):
+        if attr_name in ev_cookie_api:
+            return True
+    return False
+
+""" Json Extraction related """
+def flatten_json(y):
+    """ flattens json including arrays """
+    out = {}
+    def flatten(x, name=''):
+        if type(x) is dict:
+            for a in x:
+                flatten(x[a], name + a + '_')
+        elif type(x) is list:
+            i = 0
+            for a in x:
+                flatten(a, name + str(i) + '_')
+                i += 1
+        else:
+            out[name[:-1]] = x
+
+    flatten(y)
+    return out
+
+def extract_json_values(arg):
+    """ extract json values into a tuple or store the data into a tuple """
+    try:
+        json_obj = json.loads(arg)
+        flat = flatten_json(json_obj)  # flatten and normalize json to extract all values
+        return tuple(json_normalize(flat).values[0])  # if arg is json return its values, as values might overlap with other api calls
+    except ValueError:  # json.loads raises ValueError when arg is not valid JSON
+        return (arg,)
+
+""" Key related filtering functions """
+def hasNumbers(inputString):
+    """ true if string contains a number """
+    return any(char.isdigit() for char in inputString)
+
+def is_possible_key(arg):
+    """ is a string of length 5 or greater that contains a number; we assume it is possibly an ID """
+    return type(arg) is str and len(arg) >= 5 and hasNumbers(arg)
+
+def site_has_possible_key(domain_data):
+    """ dig through nested data to check if the domain has an API call with a possible ID """
+    for api_call in domain_data[1]:
+        for arg in api_call[1]:
+            if is_possible_key(arg):
+                return True
+    return False
+
+def has_overlap_arg(domain_data):
+    """ dig through nested data to check if the domain has a candidate function with an argument that overlaps and is a key """
+    arg_total_list = []
+    for api_call in domain_data[1]:
+        if api_call_is_ev_cookie_candidate(api_call[0]):
+            arg_list = list(api_call[1])
+            # print(arg_total_list)  # debug
+            for arg in arg_list:
+                if arg in arg_total_list and is_possible_key(arg):
+                    return True
+            arg_total_list.extend(arg_list)
+    return False
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Below was a preliminary attempt to search for Evercookies
+# MAGIC As a preliminary step I have generated an RDD of tuples containing the domains and the associated API calls with their arguments.
+# MAGIC Then I filter out all the domains that don't make API calls to evercookie caching techniques.
+
+# COMMAND ----------
+
+out = rdd_files.map(get_cookie_info).groupByKey().mapValues(unique_list)
+out = out.filter(ev_cookie_candidate)
+
+"""
+current structure:
+[
+    (domain_name, [(api_call, arg_string), ...] )
+]
+"""
+quick_print(out)
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Evercookie attempt 2
+# MAGIC Extract JSON values passed to the API as possible duplicate values used by evercookies. Remove all domains that do not have overlapping values in the args passed to the API calls.
+# MAGIC
+# MAGIC The result is a lot of false positives. It is difficult to find any good candidates for further examination. Ideas to narrow down include: searching for IDs (alphanumeric?), filtering out obvious values like True and False.
+
+# COMMAND ----------
+
+rdd_files = load_file_via_rdd("all", index)
+rdd_files = prep_rdd(rdd_files)
+
+# COMMAND ----------
+
+pout = rdd_files.filter(lambda x : x.get('arguments', '') != '')
+
+pout = pout.map(get_cookie_info_list_args).groupByKey().mapValues(unique_list)
+
+"""
+current structure:
+[
+    (domain_name, [(api_call, (arg1,arg2,...)), ...] )
+]
+"""
+
+# filter domains that don't make calls to web apis used by evercookies
+pout = pout.filter(ev_cookie_candidate)
+
+pout = pout.filter(site_has_possible_key)
+pout = pout.filter(has_overlap_arg)
+
+quick_print(pout)
+
+
+# COMMAND ----------
+
+all_files = load_file_via_rdd("all", index)
+all_files = prep_rdd(all_files)
+
+# COMMAND ----------
+
+def isthing(data):
+    return "evercookie.js" in data['script_url']
+
+all_files.filter(isthing).count()
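+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Possible next steps (sketches only, not run as part of this notebook): a stricter version of is_possible_key along the lines suggested above, and a listing of which domains actually load a script named evercookie.js rather than just counting the matches.
+
+# COMMAND ----------
+
+# Sketch: a stricter candidate-ID check following the ideas above (alphanumeric
+# IDs, drop obvious values like True/False). `is_stronger_key` is hypothetical
+# and is not used in the runs above.
+def is_stronger_key(arg):
+    if type(arg) is not str or len(arg) < 8:
+        return False
+    if arg.lower() in ("true", "false", "null", "undefined"):
+        return False
+    return any(c.isdigit() for c in arg) and any(c.isalpha() for c in arg)
+
+# COMMAND ----------
+
+# Sketch: (domain, script_url) pairs for scripts whose URL mentions evercookie.js.
+# Assumes the record fields used above ('location', 'script_url') and the helpers
+# defined earlier (extract_domain_name, quick_print, isthing).
+ev_script_domains = all_files.filter(isthing).map(lambda x: (extract_domain_name(x['location']), x['script_url'])).distinct()
+quick_print(ev_script_domains)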