1
0
Форкнуть 0

UCOSP project fresh/historyless commit.

This commit is contained in:
mlopatka 2018-03-26 11:15:13 +02:00
Родитель 3cc44edbe6
Коммит 81e5cfba55
16 изменённых файлов: 12862 добавлений и 0 удалений

114
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1,114 @@
# Miscellaneous OS-generated files
.DS_Store
# Large text files are kept locally and are not committed
file_index.txt
analyses/result.csv
cache/
/analyses/cache
.ipynb_checkpoints/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# cache
cache/

2
README.md Normal file
Просмотреть файл

@ -0,0 +1,2 @@
# sb2018
Safe Browsing Project - Contributions from the UCOSP 2018 winter semester cohort

309
analyses/SampleData.ipynb Normal file

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

1773
analyses/SampleData.md Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

0
analyses/__init__.py Normal file
Просмотреть файл

Просмотреть файл

@ -0,0 +1,883 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import sys\n",
"sys.path.append('..')\n",
"from utils.load_data_util import load_random_data\n",
"\n",
"result = load_random_data(10, seed=42)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>key</th>\n",
" <th>value</th>\n",
" <th>script_url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>RNLBSERVERID</td>\n",
" <td>ded6726</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>_ga</td>\n",
" <td>GA1.2.692713596.1513387628</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>_gid</td>\n",
" <td>GA1.2.1540566351.1513387628</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>_gat</td>\n",
" <td>1</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>splash_i</td>\n",
" <td>false</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>expires</td>\n",
" <td>Sun, 16 Dec 2018 01:27:12 GMT</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>https://syndication.exosrv.com/splash.php?idzo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Adshow</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>z_pro_city</td>\n",
" <td>s_provice%3Dmixiegenzhou%26s_city%3Dnull</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>userProvinceId</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>userCityId</td>\n",
" <td>0</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>userCountyId</td>\n",
" <td>0</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>userLocationId</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>ip_ck</td>\n",
" <td>4c+H5PP1j7QuNjg2MzkyLjE1MTM0MTU0NzQ%3D</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>lv</td>\n",
" <td>1513415476</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>vn</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Adshow</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>z_pro_city</td>\n",
" <td>s_provice%3Dmixiegenzhou%26s_city%3Dnull</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>userProvinceId</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>userCityId</td>\n",
" <td>0</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>userCountyId</td>\n",
" <td>0</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>userLocationId</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>ip_ck</td>\n",
" <td>4c+H5PP1j7QuNjg2MzkyLjE1MTM0MTU0NzQ%3D</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>lv</td>\n",
" <td>1513415476</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>vn</td>\n",
" <td>1</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>POSMEDIAID</td>\n",
" <td>c8de8cfb85858ad6c30636190806b8fc9b43af469b42ff...</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>expires</td>\n",
" <td>Thu, 30 Nov 2090 18:22:56 GMT</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>domain</td>\n",
" <td>.ydjs.zol.com.cn</td>\n",
" <td>http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>sd</td>\n",
" <td>1</td>\n",
" <td>https://content.adriver.ru/banners/0002186/000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4822</th>\n",
" <td>__utmt</td>\n",
" <td>1</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4823</th>\n",
" <td>__utma</td>\n",
" <td>219845656.2053000654.1513465816.1513465816.151...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4824</th>\n",
" <td>__utmb</td>\n",
" <td>219845656.1.10.1513465816</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4825</th>\n",
" <td>__utmc</td>\n",
" <td>219845656</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4826</th>\n",
" <td>__utmz</td>\n",
" <td>219845656.1513465816.1.1.utmcsr=(direct)|utmcc...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4827</th>\n",
" <td>__utmt</td>\n",
" <td>1</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4828</th>\n",
" <td>__utma</td>\n",
" <td>219845656.2053000654.1513465816.1513465816.151...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4829</th>\n",
" <td>__utmb</td>\n",
" <td>219845656.1.10.1513465816</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4830</th>\n",
" <td>__utmc</td>\n",
" <td>219845656</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4831</th>\n",
" <td>__utmz</td>\n",
" <td>219845656.1513465816.1.1.utmcsr=(direct)|utmcc...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4832</th>\n",
" <td>__utmt</td>\n",
" <td>1</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4833</th>\n",
" <td>__utma</td>\n",
" <td>219845656.2053000654.1513465816.1513465816.151...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4834</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4835</th>\n",
" <td>expires</td>\n",
" <td>Mon, 16 Dec 2019 23:10:15 GMT</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4836</th>\n",
" <td>domain</td>\n",
" <td>pepper.pr.co</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4837</th>\n",
" <td>__utmb</td>\n",
" <td>219845656.2.9.1513465816</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4838</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4839</th>\n",
" <td>expires</td>\n",
" <td>Sat, 16 Dec 2017 23:40:15 GMT</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4840</th>\n",
" <td>domain</td>\n",
" <td>pepper.pr.co</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4841</th>\n",
" <td>__utmc</td>\n",
" <td>219845656</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4842</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4843</th>\n",
" <td>domain</td>\n",
" <td>pepper.pr.co</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4844</th>\n",
" <td>__utmz</td>\n",
" <td>219845656.1513465816.1.1.utmcsr=(direct)|utmcc...</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4845</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4846</th>\n",
" <td>expires</td>\n",
" <td>Sun, 17 Jun 2018 11:10:15 GMT</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4847</th>\n",
" <td>domain</td>\n",
" <td>pepper.pr.co</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4848</th>\n",
" <td>__utmv</td>\n",
" <td></td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4849</th>\n",
" <td>path</td>\n",
" <td>/</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4850</th>\n",
" <td>expires</td>\n",
" <td>Sat, 16 Dec 2017 23:10:15 GMT</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4851</th>\n",
" <td>domain</td>\n",
" <td>pepper.pr.co</td>\n",
" <td>http://stats.g.doubleclick.net/dc.js</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4852 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" key value script_url\n",
"0 RNLBSERVERID ded6726 https://syndication.exosrv.com/splash.php?idzo...\n",
"1 _ga GA1.2.692713596.1513387628 https://syndication.exosrv.com/splash.php?idzo...\n",
"2 _gid GA1.2.1540566351.1513387628 https://syndication.exosrv.com/splash.php?idzo...\n",
"3 _gat 1 https://syndication.exosrv.com/splash.php?idzo...\n",
"4 splash_i false https://syndication.exosrv.com/splash.php?idzo...\n",
"5 expires Sun, 16 Dec 2018 01:27:12 GMT https://syndication.exosrv.com/splash.php?idzo...\n",
"6 path / https://syndication.exosrv.com/splash.php?idzo...\n",
"7 Adshow 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"8 z_pro_city s_provice%3Dmixiegenzhou%26s_city%3Dnull http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"9 userProvinceId 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"10 userCityId 0 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"11 userCountyId 0 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"12 userLocationId 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"13 ip_ck 4c+H5PP1j7QuNjg2MzkyLjE1MTM0MTU0NzQ%3D http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"14 lv 1513415476 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"15 vn 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"16 Adshow 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"17 z_pro_city s_provice%3Dmixiegenzhou%26s_city%3Dnull http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"18 userProvinceId 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"19 userCityId 0 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"20 userCountyId 0 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"21 userLocationId 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"22 ip_ck 4c+H5PP1j7QuNjg2MzkyLjE1MTM0MTU0NzQ%3D http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"23 lv 1513415476 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"24 vn 1 http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"25 POSMEDIAID c8de8cfb85858ad6c30636190806b8fc9b43af469b42ff... http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"26 path / http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"27 expires Thu, 30 Nov 2090 18:22:56 GMT http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"28 domain .ydjs.zol.com.cn http://ydjs.zol.com.cn/m.html?mediaid=c8de8cfb...\n",
"29 sd 1 https://content.adriver.ru/banners/0002186/000...\n",
"... ... ... ...\n",
"4822 __utmt 1 http://stats.g.doubleclick.net/dc.js\n",
"4823 __utma 219845656.2053000654.1513465816.1513465816.151... http://stats.g.doubleclick.net/dc.js\n",
"4824 __utmb 219845656.1.10.1513465816 http://stats.g.doubleclick.net/dc.js\n",
"4825 __utmc 219845656 http://stats.g.doubleclick.net/dc.js\n",
"4826 __utmz 219845656.1513465816.1.1.utmcsr=(direct)|utmcc... http://stats.g.doubleclick.net/dc.js\n",
"4827 __utmt 1 http://stats.g.doubleclick.net/dc.js\n",
"4828 __utma 219845656.2053000654.1513465816.1513465816.151... http://stats.g.doubleclick.net/dc.js\n",
"4829 __utmb 219845656.1.10.1513465816 http://stats.g.doubleclick.net/dc.js\n",
"4830 __utmc 219845656 http://stats.g.doubleclick.net/dc.js\n",
"4831 __utmz 219845656.1513465816.1.1.utmcsr=(direct)|utmcc... http://stats.g.doubleclick.net/dc.js\n",
"4832 __utmt 1 http://stats.g.doubleclick.net/dc.js\n",
"4833 __utma 219845656.2053000654.1513465816.1513465816.151... http://stats.g.doubleclick.net/dc.js\n",
"4834 path / http://stats.g.doubleclick.net/dc.js\n",
"4835 expires Mon, 16 Dec 2019 23:10:15 GMT http://stats.g.doubleclick.net/dc.js\n",
"4836 domain pepper.pr.co http://stats.g.doubleclick.net/dc.js\n",
"4837 __utmb 219845656.2.9.1513465816 http://stats.g.doubleclick.net/dc.js\n",
"4838 path / http://stats.g.doubleclick.net/dc.js\n",
"4839 expires Sat, 16 Dec 2017 23:40:15 GMT http://stats.g.doubleclick.net/dc.js\n",
"4840 domain pepper.pr.co http://stats.g.doubleclick.net/dc.js\n",
"4841 __utmc 219845656 http://stats.g.doubleclick.net/dc.js\n",
"4842 path / http://stats.g.doubleclick.net/dc.js\n",
"4843 domain pepper.pr.co http://stats.g.doubleclick.net/dc.js\n",
"4844 __utmz 219845656.1513465816.1.1.utmcsr=(direct)|utmcc... http://stats.g.doubleclick.net/dc.js\n",
"4845 path / http://stats.g.doubleclick.net/dc.js\n",
"4846 expires Sun, 17 Jun 2018 11:10:15 GMT http://stats.g.doubleclick.net/dc.js\n",
"4847 domain pepper.pr.co http://stats.g.doubleclick.net/dc.js\n",
"4848 __utmv http://stats.g.doubleclick.net/dc.js\n",
"4849 path / http://stats.g.doubleclick.net/dc.js\n",
"4850 expires Sat, 16 Dec 2017 23:10:15 GMT http://stats.g.doubleclick.net/dc.js\n",
"4851 domain pepper.pr.co http://stats.g.doubleclick.net/dc.js\n",
"\n",
"[4852 rows x 3 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collect every `key=value` pair written to document.cookie, tagged with the\n",
"# URL of the script that performed the write.\n",
"is_cookie_write = result['symbol'] == \"window.document.cookie\"\n",
"# na=False keeps rows with a missing value out of the mask; regex=False\n",
"# matches \"=\" as a literal character.\n",
"has_pair = result['value'].str.contains(\"=\", na=False, regex=False)\n",
"result_cookie = result.loc[is_cookie_write & has_pair]\n",
"\n",
"# Accumulate plain dicts and build the frame once: DataFrame.append copied\n",
"# the whole frame on every call and was removed in pandas 2.0.\n",
"cookie_rows = []\n",
"for _idx, row in result_cookie.iterrows():\n",
"    # A single cookie string may carry several `;`-separated attributes.\n",
"    for kv in row['value'].split(\";\"):\n",
"        if \"=\" in kv:\n",
"            # Split on the first \"=\" only, so values containing \"=\" stay intact.\n",
"            key, _sep, value = kv.strip().partition(\"=\")\n",
"            cookie_rows.append({'key': key, 'value': value, 'script_url': row['script_url']})\n",
"\n",
"cookiedf = pd.DataFrame(cookie_rows, columns=['key', 'value', 'script_url'])\n",
"cookiedf\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'json': {'i18nextLng': 'en-US'},\n",
" 'script_url': 'https://syndication.exosrv.com/splash.php?idzone=1931806&type=3&sub=1'},\n",
" {'json': {},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSLocalStorageTest': 'CSLocalStorageTest'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '0',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://js.datadome.co/tags.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '0',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://d1m6l9dfulcyw7.cloudfront.net/uxa/a82093cd9cdbf.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'http://static.criteo.net/js/ld/ld.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071',\n",
" 'RALROFL': '{\"ltm\":\"2017-12-16 00:47:43\",\"acc\":10,\"cntln\":\"fr\",\"reqc\":\"success\",\"pgt\":\"nav\",\"pgn\":\"nav\",\"aid\":5,\"cp\":{\"rg\":null,\"buyer\":\"2\",\"rsp\":5,\"ekm\":\"\",\"usergenre\":null,\"usertrackinggroup\":null,\"club_status\":\"\"},\"pgl\":\"PC\",\"icategories\":\"jardin,mobilier-de-jardin\",\"cat1\":\"jardin\",\"cat2\":\"mobilier-de-jardin\",\"etype\":\"pv\",\"url\":\"http://www.priceminister.com/nav/jardin_mobilier-de-jardin/f2/Coffre+de+jardin\",\"tid\":\"9c355f85\",\"tzo\":0,\"res\":\"1366x768\",\"jav\":false,\"bln\":\"en-US\",\"ua\":\"Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0\",\"online\":true,\"ver\":\"1.0.20d\",\"rqtime\":1041,\"ldtime\":2446,\"astime\":4157,\"navtype\":0,\"ifr\":0,\"pgid\":\"d61245cc156d369b\",\"cks\":\"4fd32bc9-4a6d-4d63-88b6-bdefd08f3c22\"}'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071',\n",
" 'RALROFL': '{\"ltm\":\"2017-12-16 00:47:43\",\"acc\":10,\"cntln\":\"fr\",\"reqc\":\"success\",\"pgt\":\"nav\",\"pgn\":\"nav\",\"aid\":5,\"cp\":{\"rg\":null,\"buyer\":\"2\",\"rsp\":5,\"ekm\":\"\",\"usergenre\":null,\"usertrackinggroup\":null,\"club_status\":\"\"},\"pgl\":\"PC\",\"icategories\":\"jardin,mobilier-de-jardin\",\"cat1\":\"jardin\",\"cat2\":\"mobilier-de-jardin\",\"etype\":\"pv\",\"url\":\"http://www.priceminister.com/nav/jardin_mobilier-de-jardin/f2/Coffre+de+jardin\",\"tid\":\"9c355f85\",\"tzo\":0,\"res\":\"1366x768\",\"jav\":false,\"bln\":\"en-US\",\"ua\":\"Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0\",\"online\":true,\"ver\":\"1.0.20d\",\"rqtime\":1041,\"ldtime\":2446,\"astime\":4157,\"navtype\":0,\"ifr\":0,\"pgid\":\"d61245cc156d369b\",\"cks\":\"4fd32bc9-4a6d-4d63-88b6-bdefd08f3c22\"}'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:CACHE_BUSTING': '\"263385\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_HIT_DATE': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:LAST_VISIT': '\"1513385259\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PAGE_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:PROJECT_ID': '269',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:RECORDING': '\"0\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SCROLL_RATE': '7',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:SESSION_NUMBER': '\"1\"',\n",
" 'CSStorageData:stores:412876:eventsStore:maps:eventsMetaData:USER_ID': '\"d8151d4d-1a7c-a810-c2dd-5b1cc3a0f286\"',\n",
" 'CSStorageData:stores:412876:eventsStore:queues:events:0': '[0,3485,1366,697]',\n",
" 'CSStorageData:timeStamp': '1513385259071',\n",
" 'RALROFL': '{\"ltm\":\"2017-12-16 00:47:43\",\"acc\":10,\"cntln\":\"fr\",\"reqc\":\"success\",\"pgt\":\"nav\",\"pgn\":\"nav\",\"aid\":5,\"cp\":{\"rg\":null,\"buyer\":\"2\",\"rsp\":5,\"ekm\":\"\",\"usergenre\":null,\"usertrackinggroup\":null,\"club_status\":\"\"},\"pgl\":\"PC\",\"icategories\":\"jardin,mobilier-de-jardin\",\"cat1\":\"jardin\",\"cat2\":\"mobilier-de-jardin\",\"etype\":\"pv\",\"url\":\"http://www.priceminister.com/nav/jardin_mobilier-de-jardin/f2/Coffre+de+jardin\",\"tid\":\"9c355f85\",\"tzo\":0,\"res\":\"1366x768\",\"jav\":false,\"bln\":\"en-US\",\"ua\":\"Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0\",\"online\":true,\"ver\":\"1.0.20d\",\"rqtime\":1041,\"ldtime\":2446,\"astime\":4157,\"navtype\":0,\"ifr\":0,\"pgid\":\"d61245cc156d369b\",\"cks\":\"4fd32bc9-4a6d-4d63-88b6-bdefd08f3c22\"}'},\n",
" 'script_url': 'https://pmcdn.staticpmrk.com/rakuten-static-deliver/app/397.0.1/20170919/static/front/libraries/ral/ral-1.0.20.js'},\n",
" {'json': {'_at.cww': '{\"value\":true,\"expires\":1513386547738}',\n",
" 'at-lojson-cache-xa-4d83f5dd760fecd5': '{\"config\":null,\"perConfig\":{}}',\n",
" 'at-rand': '0.6129927146038678',\n",
" 'google_experiment_mod': '570',\n",
" 'google_pub_config': '{\"sraConfigs\":{\"2\":{\"sraTimeout\":60000},\"4\":{\"sraTimeout\":60000}}}'},\n",
" 'script_url': 'https://s7.addthis.com/js/250/addthis_widget.js#pubid=xa-4d83f5dd760fecd5'}]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test = result.loc[(result['symbol'] == \"window.localStorage\")]\n",
"localStorage = []\n",
"for i, row in test.iterrows():\n",
" localStorage.append({'json': json.loads(row['value']), 'script_url':row['script_url']})\n",
"\n",
"localStorage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# HTTPS and Mixed Content Vulnerability Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook looks at 2 things from the crawl dataset:\n",
"1. What percentage of websites use https.\n",
"2. How many websites are using mixed content. \n",
"\n",
"Mixed content is when an HTTPS webpage loads resources, such as JavaScript files, over an insecure HTTP connection."
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"from utils import load_data_util"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"files_to_analyze = 10000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Download webcrawl data from S3 and build a dictionary with webpage urls as keys and HTTP / HTTPS information as values."
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = load_data_util.load_random_data(files_to_analyze, False, 42, False)\n",
"\n",
"result = {}\n",
"for index, row in data.iterrows():\n",
" # get the url of the webpage that was being crawled and use that as a unique key.\n",
" key = row['location']\n",
" \n",
" if key not in result:\n",
" # check if the webpage is using https.\n",
" is_https = False\n",
" if key.split(\":\")[0] == \"https\":\n",
" is_https = True\n",
"\n",
" result[key] = {\n",
" \"is_https\": is_https,\n",
" \"http_script_urls\": 0,\n",
" \"https_script_urls\": 0\n",
" }\n",
"\n",
" # record the number of javascript function calls for the webpage \n",
" # whose script url is fetched using http or https.\n",
" url_protocol = row[\"script_url\"].split(\"://\")[0]\n",
" if url_protocol == \"http\":\n",
" result[key]['http_script_urls'] += 1\n",
" elif url_protocol == \"https\":\n",
" result[key]['https_script_urls'] += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analyze the collected data to get:\n",
"* A count of the number of websites that use https.\n",
"* A list of websites that have mixed content."
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"urls_of_websites_with_mixed_content_vulnerability = []\n",
"number_of_https_websites = 0\n",
"x = 0\n",
"for key in result:\n",
" if result[key]['is_https']:\n",
" if result[key]['http_script_urls'] > 0:\n",
" urls_of_websites_with_mixed_content_vulnerability.append(key)\n",
" number_of_https_websites += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print out information for the total number of webpages that use HTTPS."
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"69.79% (6979/10000) of websites use https.\n"
]
}
],
"source": [
"percent_of_websites_using_https = round(number_of_https_websites / files_to_analyze * 100, 4)\n",
"print(\n",
" str(percent_of_websites_using_https) + \"% (\" + \n",
" str(number_of_https_websites) + \"/\" + str(files_to_analyze) + \n",
" \") of websites use https.\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print out information collected for the number of webpages that have mixed content."
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0% (0/10000) of websites have mixed content.\n"
]
}
],
"source": [
"number_of_websites_vulnerable = len(urls_of_websites_with_mixed_content_vulnerability)\n",
"percent_of_websites_vulnerable = number_of_websites_vulnerable / files_to_analyze * 100\n",
"print(\n",
" str(percent_of_websites_vulnerable) + \"% (\" + \n",
" str(number_of_websites_vulnerable) + \"/\" + str(files_to_analyze) + \n",
" \") of websites have mixed content.\"\n",
")\n",
"\n",
"if number_of_websites_vulnerable > 0:\n",
" print(\"The following websites have mixed content:\")\n",
"for url in urls_of_websites_with_mixed_content_vulnerability:\n",
" print(url)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

104
analyses/https_analysis.md Normal file
Просмотреть файл

@ -0,0 +1,104 @@
# HTTPS and Mixed Content Vulnerability Analysis
This notebook looks at 2 things from the crawl dataset:
1. What percentage of websites use https.
2. How many websites are using mixed content.
Mixed content is when an HTTPS webpage loads resources, such as JavaScript files, over an insecure HTTP connection.
```python
import sys
sys.path.append('..')
from utils import load_data_util
```
```python
files_to_analyze = 10000
```
#### Download webcrawl data from S3 and build a dictionary with webpage urls as keys and HTTP / HTTPS information as values.
```python
data = load_data_util.load_random_data(files_to_analyze, False, 42, False)
result = {}
for index, row in data.iterrows():
# get the url of the webpage that was being crawled and use that as a unique key.
key = row['location']
if key not in result:
# check if the webpage is using https.
is_https = False
if key.split(":")[0] == "https":
is_https = True
result[key] = {
"is_https": is_https,
"http_script_urls": 0,
"https_script_urls": 0
}
# record the number of javascript function calls for the webpage
# whose script url is fetched using http or https.
url_protocol = row["script_url"].split("://")[0]
if url_protocol == "http":
result[key]['http_script_urls'] += 1
elif url_protocol == "https":
result[key]['https_script_urls'] += 1
```
### Analyze the collected data to get:
* A count of the number of websites that use https.
* A list of websites that have mixed content.
```python
urls_of_websites_with_mixed_content_vulnerability = []
number_of_https_websites = 0
x = 0
for key in result:
if result[key]['is_https']:
if result[key]['http_script_urls'] > 0:
urls_of_websites_with_mixed_content_vulnerability.append(key)
number_of_https_websites += 1
```
### Print out information for the total number of webpages that use HTTPS.
```python
percent_of_websites_using_https = round(number_of_https_websites / files_to_analyze * 100, 4)
print(
str(percent_of_websites_using_https) + "% (" +
str(number_of_https_websites) + "/" + str(files_to_analyze) +
") of websites use https."
)
```
69.79% (6979/10000) of websites use https.
### Print out information collected for the number of webpages that have mixed content.
```python
number_of_websites_vulnerable = len(urls_of_websites_with_mixed_content_vulnerability)
percent_of_websites_vulnerable = number_of_websites_vulnerable / files_to_analyze * 100
print(
str(percent_of_websites_vulnerable) + "% (" +
str(number_of_websites_vulnerable) + "/" + str(files_to_analyze) +
") of websites have mixed content."
)
if number_of_websites_vulnerable > 0:
print("The following websites have mixed content:")
for url in urls_of_websites_with_mixed_content_vulnerability:
print(url)
```
0.0% (0/10000) of websites have mixed content.

77
analyses/sample_data.py Normal file
Просмотреть файл

@ -0,0 +1,77 @@
import pandas as pd
def make_entity_list_df():
    """Build a DataFrame of entities from the Disconnect services list.

    The services JSON is downloaded, and every entity found under each
    category becomes one row indexed by the entity name.

    Output:
    df with two columns: "resources" (the value stored under the entity's
    first url key) and "count" (initialised to 0 for every row).
    """
    services = pd.read_json(
        "https://raw.githubusercontent.com/disconnectme/disconnect-tracking-protection/master/services.json")

    per_entity_frames = []
    for entities in services["categories"]:  # One list of entities per category, eg. "Advertising".
        for record in entities:  # eg. {'reddit': {'http://www.reddit.com/': ['reddit.com']}}
            entity_name = list(record)[0]
            details = record[entity_name]
            first_url = list(details)[0]

            # Move the value from its url-specific key to a shared "resources"
            # key so that all entities line up in one column after concat.
            details["resources"] = details.pop(first_url)

            # One single-row frame per entity, indexed by the entity name.
            per_entity_frames.append(pd.DataFrame.from_dict(record, orient="index"))

    combined = pd.concat(per_entity_frames)
    combined["count"] = 0
    return combined[["resources", "count"]]
def sample_random_files(files):
    """Produce per-entity script usage statistics for a subset of files.

    Keyword arguments:
    files -- df of random files produced by load_random_data(); only its
             "location" and "script_url" columns are read here

    Output:
    df with the columns "resources", "count" and "calledFrom", restricted to
    entities with count > 0. "count" is the number of unique
    (location, script_url) pairs whose script host matched one of the
    entity's resources, and "calledFrom" lists the matching locations.
    """
    result = make_entity_list_df()

    # Keep a single row per (location, script_url) pair so that a page calling
    # the same script many times is only counted once.
    uniquecalls = files.drop_duplicates(subset=["location", "script_url"], keep="last")

    # Accumulate by row position instead of by index label: positions stay
    # unambiguous even if the same entity name occurs more than once in the
    # index, and every row gets its own independent list.
    resources = list(result["resources"])
    counts = [0] * len(resources)
    called_from = [[] for _ in resources]

    for _, call in uniquecalls.iterrows():
        url = call["script_url"]
        if "//" in url:
            # Isolate the host: the text between "//" and the next "/".
            url = url.split("//")[1].split("/")[0]
        else:
            print("irregular script_url: ", url)

        # Credit the call to the first entity owning a resource that occurs in
        # the script host, then stop scanning the remaining entities.
        # NOTE(review): this is a plain substring test, so a short resource
        # domain can match inside an unrelated longer host - confirm whether
        # suffix matching on the domain is wanted instead.
        for position, sites in enumerate(resources):
            if any(site in url for site in sites):
                counts[position] += 1
                called_from[position].append(call["location"])
                print("match found! ", "script_url: ", url, "calledFrom: ", call["location"])
                break

    result["count"] = counts
    result["calledFrom"] = called_from
    return result[result["count"] > 0]
# Module-level entity list built when this file is imported or run.
# NOTE(review): make_entity_list_df() downloads the services JSON, so importing
# this module performs a network request - confirm this is intended.
el = make_entity_list_df()

Просмотреть файл

@ -0,0 +1,115 @@
symbol,operation,N
window.Storage.getItem,call,4254
window.Storage.setItem,call,1698
window.Storage.removeItem,call,1224
CanvasRenderingContext2D.fillText,call,224
HTMLCanvasElement.getContext,call,171
window.Storage.key,call,110
CanvasRenderingContext2D.measureText,call,60
CanvasRenderingContext2D.createRadialGradient,call,59
HTMLCanvasElement.toDataURL,call,51
CanvasRenderingContext2D.fillRect,call,46
window.Storage.hasOwnProperty,call,33
CanvasRenderingContext2D.fill,call,31
CanvasRenderingContext2D.arc,call,22
CanvasRenderingContext2D.getImageData,call,20
CanvasRenderingContext2D.createImageData,call,17
CanvasRenderingContext2D.putImageData,call,17
CanvasRenderingContext2D.bezierCurveTo,call,12
CanvasRenderingContext2D.rect,call,10
CanvasRenderingContext2D.stroke,call,9
CanvasRenderingContext2D.createLinearGradient,call,9
CanvasRenderingContext2D.save,call,8
CanvasRenderingContext2D.restore,call,8
HTMLCanvasElement.getAttribute,call,7
HTMLCanvasElement.addEventListener,call,6
RTCPeerConnection.createDataChannel,call,4
RTCPeerConnection.createOffer,call,4
RTCPeerConnection.setLocalDescription,call,4
CanvasRenderingContext2D.isPointInPath,call,3
CanvasRenderingContext2D.clip,call,2
window.Storage.clear,call,1
HTMLCanvasElement.getBoundingClientRect,call,1
AudioContext.createOscillator,call,1
window.document.cookie,get,13490
window.navigator.userAgent,get,6738
window.localStorage,get,4067
window.sessionStorage,get,1481
window.name,get,1224
window.navigator.plugins[Shockwave Flash].description,get,852
window.screen.colorDepth,get,721
window.navigator.appName,get,588
window.navigator.language,get,557
window.navigator.platform,get,529
window.navigator.plugins[Shockwave Flash].name,get,483
window.navigator.cookieEnabled,get,352
window.navigator.appVersion,get,283
window.navigator.vendor,get,256
HTMLCanvasElement.offsetWidth,get,173
HTMLCanvasElement.offsetHeight,get,173
HTMLCanvasElement.offsetTop,get,171
HTMLCanvasElement.offsetLeft,get,171
window.navigator.doNotTrack,get,153
window.navigator.product,get,139
window.navigator.plugins[Shockwave Flash].filename,get,135
window.Storage.length,get,122
window.navigator.mimeTypes[application/x-shockwave-flash].type,get,117
window.navigator.languages,get,102
window.screen.pixelDepth,get,85
window.navigator.plugins[Shockwave Flash].version,get,75
window.navigator.plugins[Shockwave Flash].length,get,69
window.navigator.mimeTypes[application/futuresplash].type,get,66
window.navigator.mimeTypes[application/x-shockwave-flash].suffixes,get,43
window.navigator.mimeTypes[application/futuresplash].suffixes,get,43
window.navigator.productSub,get,41
window.navigator.mimeTypes[application/x-shockwave-flash].description,get,38
window.navigator.mimeTypes[application/futuresplash].description,get,38
window.navigator.oscpu,get,34
window.navigator.onLine,get,32
window.navigator.geolocation,get,32
HTMLCanvasElement.style,get,31
HTMLCanvasElement.height,get,31
window.navigator.appCodeName,get,28
HTMLCanvasElement.width,get,28
window.navigator.buildID,get,20
window.navigator.vendorSub,get,19
RTCPeerConnection.localDescription,get,6
RTCPeerConnection.remoteDescription,get,6
RTCPeerConnection.signalingState,get,6
RTCPeerConnection.iceGatheringState,get,6
RTCPeerConnection.onicecandidate,get,6
HTMLCanvasElement.nodeName,get,4
RTCPeerConnection.idpLoginUrl,get,3
RTCPeerConnection.peerIdentity,get,3
RTCPeerConnection.onremovestream,get,3
HTMLCanvasElement.nodeType,get,3
HTMLCanvasElement.className,get,3
HTMLCanvasElement.tagName,get,2
HTMLCanvasElement.firstElementChild,get,2
HTMLCanvasElement.firstChild,get,2
HTMLCanvasElement.nextElementSibling,get,2
OscillatorNode.frequency,get,2
CanvasRenderingContext2D.globalCompositeOperation,get,1
HTMLCanvasElement.parentNode,get,1
HTMLCanvasElement.clientTop,get,1
HTMLCanvasElement.clientLeft,get,1
HTMLCanvasElement.localName,get,1
HTMLCanvasElement.attributes,get,1
HTMLCanvasElement.childNodes,get,1
window.document.cookie,set,3653
window.name,set,204
CanvasRenderingContext2D.fillStyle,set,163
CanvasRenderingContext2D.font,set,161
HTMLCanvasElement.height,set,47
HTMLCanvasElement.width,set,47
CanvasRenderingContext2D.textBaseline,set,36
CanvasRenderingContext2D.shadowColor,set,20
CanvasRenderingContext2D.shadowBlur,set,20
CanvasRenderingContext2D.shadowOffsetX,set,20
CanvasRenderingContext2D.shadowOffsetY,set,20
CanvasRenderingContext2D.strokeStyle,set,15
CanvasRenderingContext2D.lineWidth,set,14
CanvasRenderingContext2D.globalCompositeOperation,set,13
RTCPeerConnection.onicecandidate,set,8
HTMLCanvasElement.requestPointerLock,set,1
CanvasRenderingContext2D.lineJoin,set,1
1 symbol operation N
2 window.Storage.getItem call 4254
3 window.Storage.setItem call 1698
4 window.Storage.removeItem call 1224
5 CanvasRenderingContext2D.fillText call 224
6 HTMLCanvasElement.getContext call 171
7 window.Storage.key call 110
8 CanvasRenderingContext2D.measureText call 60
9 CanvasRenderingContext2D.createRadialGradient call 59
10 HTMLCanvasElement.toDataURL call 51
11 CanvasRenderingContext2D.fillRect call 46
12 window.Storage.hasOwnProperty call 33
13 CanvasRenderingContext2D.fill call 31
14 CanvasRenderingContext2D.arc call 22
15 CanvasRenderingContext2D.getImageData call 20
16 CanvasRenderingContext2D.createImageData call 17
17 CanvasRenderingContext2D.putImageData call 17
18 CanvasRenderingContext2D.bezierCurveTo call 12
19 CanvasRenderingContext2D.rect call 10
20 CanvasRenderingContext2D.stroke call 9
21 CanvasRenderingContext2D.createLinearGradient call 9
22 CanvasRenderingContext2D.save call 8
23 CanvasRenderingContext2D.restore call 8
24 HTMLCanvasElement.getAttribute call 7
25 HTMLCanvasElement.addEventListener call 6
26 RTCPeerConnection.createDataChannel call 4
27 RTCPeerConnection.createOffer call 4
28 RTCPeerConnection.setLocalDescription call 4
29 CanvasRenderingContext2D.isPointInPath call 3
30 CanvasRenderingContext2D.clip call 2
31 window.Storage.clear call 1
32 HTMLCanvasElement.getBoundingClientRect call 1
33 AudioContext.createOscillator call 1
34 window.document.cookie get 13490
35 window.navigator.userAgent get 6738
36 window.localStorage get 4067
37 window.sessionStorage get 1481
38 window.name get 1224
39 window.navigator.plugins[Shockwave Flash].description get 852
40 window.screen.colorDepth get 721
41 window.navigator.appName get 588
42 window.navigator.language get 557
43 window.navigator.platform get 529
44 window.navigator.plugins[Shockwave Flash].name get 483
45 window.navigator.cookieEnabled get 352
46 window.navigator.appVersion get 283
47 window.navigator.vendor get 256
48 HTMLCanvasElement.offsetWidth get 173
49 HTMLCanvasElement.offsetHeight get 173
50 HTMLCanvasElement.offsetTop get 171
51 HTMLCanvasElement.offsetLeft get 171
52 window.navigator.doNotTrack get 153
53 window.navigator.product get 139
54 window.navigator.plugins[Shockwave Flash].filename get 135
55 window.Storage.length get 122
56 window.navigator.mimeTypes[application/x-shockwave-flash].type get 117
57 window.navigator.languages get 102
58 window.screen.pixelDepth get 85
59 window.navigator.plugins[Shockwave Flash].version get 75
60 window.navigator.plugins[Shockwave Flash].length get 69
61 window.navigator.mimeTypes[application/futuresplash].type get 66
62 window.navigator.mimeTypes[application/x-shockwave-flash].suffixes get 43
63 window.navigator.mimeTypes[application/futuresplash].suffixes get 43
64 window.navigator.productSub get 41
65 window.navigator.mimeTypes[application/x-shockwave-flash].description get 38
66 window.navigator.mimeTypes[application/futuresplash].description get 38
67 window.navigator.oscpu get 34
68 window.navigator.onLine get 32
69 window.navigator.geolocation get 32
70 HTMLCanvasElement.style get 31
71 HTMLCanvasElement.height get 31
72 window.navigator.appCodeName get 28
73 HTMLCanvasElement.width get 28
74 window.navigator.buildID get 20
75 window.navigator.vendorSub get 19
76 RTCPeerConnection.localDescription get 6
77 RTCPeerConnection.remoteDescription get 6
78 RTCPeerConnection.signalingState get 6
79 RTCPeerConnection.iceGatheringState get 6
80 RTCPeerConnection.onicecandidate get 6
81 HTMLCanvasElement.nodeName get 4
82 RTCPeerConnection.idpLoginUrl get 3
83 RTCPeerConnection.peerIdentity get 3
84 RTCPeerConnection.onremovestream get 3
85 HTMLCanvasElement.nodeType get 3
86 HTMLCanvasElement.className get 3
87 HTMLCanvasElement.tagName get 2
88 HTMLCanvasElement.firstElementChild get 2
89 HTMLCanvasElement.firstChild get 2
90 HTMLCanvasElement.nextElementSibling get 2
91 OscillatorNode.frequency get 2
92 CanvasRenderingContext2D.globalCompositeOperation get 1
93 HTMLCanvasElement.parentNode get 1
94 HTMLCanvasElement.clientTop get 1
95 HTMLCanvasElement.clientLeft get 1
96 HTMLCanvasElement.localName get 1
97 HTMLCanvasElement.attributes get 1
98 HTMLCanvasElement.childNodes get 1
99 window.document.cookie set 3653
100 window.name set 204
101 CanvasRenderingContext2D.fillStyle set 163
102 CanvasRenderingContext2D.font set 161
103 HTMLCanvasElement.height set 47
104 HTMLCanvasElement.width set 47
105 CanvasRenderingContext2D.textBaseline set 36
106 CanvasRenderingContext2D.shadowColor set 20
107 CanvasRenderingContext2D.shadowBlur set 20
108 CanvasRenderingContext2D.shadowOffsetX set 20
109 CanvasRenderingContext2D.shadowOffsetY set 20
110 CanvasRenderingContext2D.strokeStyle set 15
111 CanvasRenderingContext2D.lineWidth set 14
112 CanvasRenderingContext2D.globalCompositeOperation set 13
113 RTCPeerConnection.onicecandidate set 8
114 HTMLCanvasElement.requestPointerLock set 1
115 CanvasRenderingContext2D.lineJoin set 1

5706
dataExplorationScript.ipynb Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

940
dataExplorationScript.md Normal file
Просмотреть файл

@ -0,0 +1,940 @@
```python
import boto3
import botocore
import json
import pandas as pd
import utils.load_data_util
# Pandas Display Settings to allow the dataframe to display in one view
pd.set_option('display.max_columns', 500)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 50000)
s3 = boto3.resource('s3')
```
```python
def process_string(data):
    """Turn the raw text of a data file into a JSON array string.

    The first and the last character of ``data`` are dropped and what is left
    is wrapped in square brackets.
    """
    opening, closing = "[", "]"
    inner = data[1:-1]
    return opening + inner + closing
def count_key(data, key, key_value_count):
    """Tally how often each value stored under ``key`` occurs across ``data``.

    Every element of ``data`` is indexed with ``key``. The tally is added to
    the ``key_value_count`` mapping in place; nothing is returned.
    """
    for entry in data:
        observed = entry[key]
        seen_so_far = key_value_count.get(observed, 0)
        key_value_count[observed] = seen_so_far + 1
```
```python
result = utils.load_data_util.load_random_data(50)
```
```python
unique_args = result.arguments.unique()
```
```python
# Dump every unique "arguments" value to uniqueArgs.txt, one value per line.
# The file is opened in binary mode, so each line is encoded to UTF-8 by hand.
count = 0
with open("uniqueArgs.txt", "wb") as f:
    for arg in unique_args:
        # count ends up holding the number of values written.
        count += 1
        f.write((str(arg)+"\n").encode("utf-8"))
```
```python
grouped_by_symbol = result.groupby(['symbol']).count()
```
```python
grouped_by_symbol
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>arguments</th>
<th>call_stack</th>
<th>crawl_id</th>
<th>file_number</th>
<th>func_name</th>
<th>in_iframe</th>
<th>location</th>
<th>operation</th>
<th>script_col</th>
<th>script_line</th>
<th>script_loc_eval</th>
<th>script_url</th>
<th>time_stamp</th>
<th>value</th>
</tr>
<tr>
<th>symbol</th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th>CanvasRenderingContext2D.fillRect</th>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>CanvasRenderingContext2D.fillStyle</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>CanvasRenderingContext2D.textBaseline</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>HTMLCanvasElement.getContext</th>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
</tr>
<tr>
<th>HTMLCanvasElement.height</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>HTMLCanvasElement.style</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>HTMLCanvasElement.width</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>RTCPeerConnection.iceGatheringState</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>RTCPeerConnection.idpLoginUrl</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>RTCPeerConnection.localDescription</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>RTCPeerConnection.onicecandidate</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>RTCPeerConnection.onremovestream</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>RTCPeerConnection.peerIdentity</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>RTCPeerConnection.remoteDescription</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>RTCPeerConnection.signalingState</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>window.Storage.getItem</th>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
<td>182</td>
</tr>
<tr>
<th>window.Storage.key</th>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
</tr>
<tr>
<th>window.Storage.length</th>
<td>0</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
</tr>
<tr>
<th>window.Storage.removeItem</th>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
<td>35</td>
</tr>
<tr>
<th>window.Storage.setItem</th>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
<td>49</td>
</tr>
<tr>
<th>window.document.cookie</th>
<td>0</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
<td>479</td>
</tr>
<tr>
<th>window.localStorage</th>
<td>0</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
<td>94</td>
</tr>
<tr>
<th>window.name</th>
<td>0</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>31</td>
</tr>
<tr>
<th>window.navigator.appCodeName</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>window.navigator.appName</th>
<td>0</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
</tr>
<tr>
<th>window.navigator.appVersion</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>window.navigator.cookieEnabled</th>
<td>0</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
<td>14</td>
</tr>
<tr>
<th>window.navigator.language</th>
<td>0</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
<td>21</td>
</tr>
<tr>
<th>window.navigator.mimeTypes[application/futuresplash].type</th>
<td>0</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
<td>4</td>
</tr>
<tr>
<th>window.navigator.mimeTypes[application/x-shockwave-flash].type</th>
<td>0</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
<td>3</td>
</tr>
<tr>
<th>window.navigator.onLine</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>window.navigator.platform</th>
<td>0</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
<td>23</td>
</tr>
<tr>
<th>window.navigator.plugins[Shockwave Flash].description</th>
<td>0</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
<td>39</td>
</tr>
<tr>
<th>window.navigator.plugins[Shockwave Flash].filename</th>
<td>0</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
</tr>
<tr>
<th>window.navigator.plugins[Shockwave Flash].length</th>
<td>0</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
<td>9</td>
</tr>
<tr>
<th>window.navigator.plugins[Shockwave Flash].name</th>
<td>0</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
<td>10</td>
</tr>
<tr>
<th>window.navigator.plugins[Shockwave Flash].version</th>
<td>0</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
</tr>
<tr>
<th>window.navigator.product</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>window.navigator.productSub</th>
<td>0</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<th>window.navigator.userAgent</th>
<td>0</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
<td>258</td>
</tr>
<tr>
<th>window.navigator.vendor</th>
<td>0</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
<td>7</td>
</tr>
<tr>
<th>window.navigator.vendorSub</th>
<td>0</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>window.screen.colorDepth</th>
<td>0</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
<td>22</td>
</tr>
<tr>
<th>window.screen.pixelDepth</th>
<td>0</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
<td>5</td>
</tr>
<tr>
<th>window.sessionStorage</th>
<td>0</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
<td>65</td>
</tr>
</tbody>
</table>
</div>
```python
result.corr()
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>crawl_id</th>
<th>file_number</th>
<th>in_iframe</th>
</tr>
</thead>
<tbody>
<tr>
<th>crawl_id</th>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
</tr>
<tr>
<th>file_number</th>
<td>NaN</td>
<td>1.000000</td>
<td>0.137485</td>
</tr>
<tr>
<th>in_iframe</th>
<td>NaN</td>
<td>0.137485</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
</div>

2326
explore-crawl-data.ipynb Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

116
schema.md Normal file
Просмотреть файл

@ -0,0 +1,116 @@
* __call_stack:__
* __Type:__ String
* __Description:__ The call stack at the point when the function is called. The output is in the format: (function_name)(@)(javascript_source_file)(:)(line_number)(:)(column_number)(new_line_character)
* __Example:__
```
jQuery.cookie@https://cdn.livechatinc.com/js/embedded.20171215135707.js:5:8393\nStore</s.get@https://cdn.livechatinc.com/js/embedded.20171215135707.js:8:3323\nStore</</s[p]@https://cdn.livechatinc.com/js/embedded.20171215135707.js:8:3746\nWindowsCommunicator.prototype.startCheckingForMainWindow/e<@https://cdn.livechatinc.com/js/embedded.20171215135707.js:10:11730
```
* __crawl_id:__
* __Type:__ Integer
* __Description:__ Crawl_id appears to be the value 1 for all json files. It is possible this field was not used when generating the data using the crawler.
* __Example:__ 1
* __func_name:__
* __Type:__ String
* __Description:__ The name of the javascript function. Due to obfuscation the functions are often nonsensical and thus can be thought of as tokens. Anonymous functions will not have a name and the value will be an empty string.
* __Examples:__
```
""
a<4k
getName
```
* __in_iframe:__
* __Type:__ boolean
* __Description:__ in_iframe is a boolean that indicates that the javascript code was run inside of an iframe. This is new functionality that was added on top of the original OpenWPM repository.
* __location:__
* __Type:__ string
* __Description:__ The url of the file that was being crawled to generate the json file. All objects in a json file should have the same location value. The url can be for any type of file such as .html, .js or have no file extension.
* __Examples:__
```
https://www.dresslily.com/bottom-c-36.html
http://www.vidalfrance.com/component/forme/?fid=2
```
* __operation:__
* __Type:__ string
* __Description:__ Corresponds to the "symbol" field. Operation is a call if the symbol is a method. Get/set operations get and set symbols that are properties with values.
* __Possible Values:__ get, call, set
* __script_col:__
* __Type:__ string
* __Description:__ The column in the `script_line` where the function call starts. Note: currently some strings do not contain numbers, but instead they contain urls such as the example below.
* __Examples:__
```
57
211
//hdjs.hiido.com/hiido_internal.js?siteid=mhssj
```
* __script_line:__
* __Type:__ string
* __Description:__ The line in the file, indicated in the above `location` element, where the function call is located. Note: Currently some strings do not contain numbers, but instead they contain the protocol identifier for a url, such as in the example below.
* __Examples:__
```
12
129
http
https
```
* __script_loc_eval:__
* __Type:__ string
* __Description:__ If a function call is generated using the `eval()` function, or is created using `new Function()`, then the "script_loc_eval" value will be set. For example `eval("console.log('my message')")` or `var log = new Function("message", "console.log(message)"); log("my message");` will both cause the "script_loc_eval" value to be set when the function calls are collected. The format of "script_loc_eval" is: (line) (LINE_NUMBER) (>) (eval | Function) and can be repeated multiple times. Additional information on how the eval line number is generated can be found at the bottom of the [MDN page](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Error/Stack) which discusses the `Error` object's `stack` property. The "script_loc_eval" element is generated from this stack property.
* __Examples:__
```
""
line 2 > eval
line 70 > Function
line 140 > eval line 232 > Function
line 1 > Function line 1 > eval line 1 > eval
```
* __script_url:__
* __Type:__ string
* __Description:__ The url of the file where the javascript function call was run. This may be the same value as "location", or it may be an external web url that was loaded into the website with the use of the `<script>` tag.
* __Examples:__
```
http://www.google-analytics.com/analytics.js
http://ajax.googleapis.com/ajax/libs/jquery/1.6/jquery.min.js
http://pw.myersinfosys.com/javascripts/jquery-cookie.js?rwdv2
https://g.alicdn.com/alilog/oneplus/blk.html#coid=52m7EjiWaj8CASPiP1nwaYXC&noid=&grd=n
```
* __symbol:__
* __Type:__ string
* __Description:__ Either a Web API interface property (with a value) or method (which may take args as listed in "arguments" field). Symbol corresponds to "operation" field.
* __Examples:__
```
window.Storage.getItem
window.navigator.userAgent
CanvasRenderingContext2D.textBaseline
```
* __time_stamp:__
* __Type:__ string
* __Description:__ The timestamp of when the javascript function information was collected. The timestamp is collected using JavaScript's Date.now() function. It is in the format YYYY-MM-DDTHH:mm:ss.sssZ.
* YYYY-MM-DD is the: year-month-day.
* "T" is a delimiter to separate the two sections.
* HH:mm:ss.sss represents the: hours, minutes, seconds, and milliseconds.
* Z is optional and denotes the time zone. Z represents the time zone UTC+0.
* __Examples:__
```
2017-12-16T00:17:37.973Z
2017-12-16T00:24:09.355Z
2017-12-16T08:10:24.749Z
```
* __value:__
* __Type:__ string
* __Description:__ The value that the function returned.
* __Examples:__
```
""
{}
Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0
\_ga=GA1.2.1076416180.1513383458; \_gid=GA1.2.1940452730.1513383458
{"name": "example", "Browser": "Mozilla/5.0"}
```
* __arguments:__
* __Type:__ object
* __Description:__ Optional property which lists the arguments taken by the method in "symbol" field.
* __Examples:__
```
{\"0\":\"liveAgentPc\"}
{\"0\":\"liveAgentPage_0\",\"1\":\"http://www.alamy.com/help/what-is-model-release-property-release.aspx\"}
```

0
utils/__init__.py Normal file
Просмотреть файл

191
utils/load_data_util.py Normal file
Просмотреть файл

@ -0,0 +1,191 @@
import io
import json
import os
import random
from pathlib import Path

import boto3
import botocore
import pandas as pd
import requests
# Name of the S3 bucket the loader functions in this module read from.
BUCKET_NAME = "safe-ucosp-2017"

# Widen the pandas display settings so wide DataFrames are not wrapped.
pd.set_option('display.max_columns', 500)
pd.set_option('display.expand_frame_repr', False)

# Shared S3 handles used by the loader functions below.
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

# Directories are resolved relative to this file: the project root is its
# parent directory and downloaded data is cached in <project root>/cache.
this_files_directory = os.path.dirname(os.path.realpath(__file__))
project_root_directory = os.path.join(this_files_directory, "..")
cache_file_directory = os.path.join(project_root_directory, "cache")

# exist_ok avoids the check-then-create race that made the import fail when
# two processes imported this module at the same moment.
os.makedirs(cache_file_directory, exist_ok=True)
def load_data(number_of_files, to_output_csv=True, cache_s3_data=False):
    """Load the files from the beginning of the bucket

    Keyword arguments:
    number_of_files -- number of files to load
    to_output_csv -- boolean to save the result of the data into a csv file called result.csv (default True)
    cache_s3_data -- boolean to cache each file once downloaded in a json file to speed up future data loads. (default False)

    Returns one DataFrame holding the rows of every loaded file; each row
    carries the 1-based number of the file it came from in 'file_number'.
    An empty DataFrame is returned when no file was loaded.
    """
    # Take the first number_of_files objects and transform each of them into
    # a DataFrame, numbering the files from 1 in the order they are listed.
    frames = [
        load_data_to_dataframe(bucket_data_object, file_number, cache_s3_data)
        for file_number, bucket_data_object
        in enumerate(bucket.objects.limit(number_of_files), start=1)
    ]

    # Concat DataFrames generated from each file into a large DataFrame.
    # pd.concat raises ValueError on an empty list, so guard that case.
    result = pd.concat(frames) if frames else pd.DataFrame()

    # Output the results to a csv if desired
    if to_output_csv:
        result.to_csv('result.csv', header=True, index=False, encoding='utf-8')
    return result
def transform_into_dataframe(data, file_number):
    """Process the data, read it into a pandas DataFrame and add a column for file number

    Keyword arguments:
    data -- the string containing the file data
    file_number -- the file number the data came from

    Returns the DataFrame parsed from data with an extra 'file_number'
    column set to file_number on every row.
    """
    # Recent pandas versions deprecate handing literal JSON text to
    # read_json, so JSON text is wrapped in a file-like object first.
    # Anything that does not look like a JSON document is passed through
    # untouched so existing callers keep working.
    if isinstance(data, str) and data.lstrip().startswith(("[", "{")):
        source = io.StringIO(data)
    else:
        source = data
    frame = pd.read_json(source)
    frame['file_number'] = file_number
    return frame
def process_string(data):
    """Turn the raw text of a data file into a JSON array string.

    The first and the last character of data are dropped and what is left is
    wrapped in square brackets.
    """
    opening, closing = "[", "]"
    inner = data[1:-1]
    return opening + inner + closing
def create_file_index():
    """Append the key of every object in the bucket to file_index.json, one per line.

    The running total is printed after every 1000 keys as a progress marker.
    """
    with open("file_index.json", "a+") as index_file:
        for written, entry in enumerate(bucket.objects.all(), start=1):
            index_file.write(entry.key + "\n")
            if written % 1000 == 0:
                print(written)
def validate_file_fetch():
    """Return True when no line of file_index.json occurs more than once."""
    with open("file_index.json") as index_file:
        entries = index_file.readlines()
    distinct_entries = set(entries)
    return len(distinct_entries) == len(entries)
def fetch_file(file_url, file_name, mode):
    """Download a file and store it in the project root directory.

    Keyword arguments:
    file_url -- base url the file is served from; file_name is appended to it
    file_name -- name of the file to download and to save it as
    mode -- mode to open the local file in; the payload is written as bytes, so use a binary mode (for example: wb+)

    Raises requests.HTTPError when the server answers with an error status,
    so that an error page is never saved as if it were the requested file.
    """
    response = requests.get(file_url + file_name)
    # Fail loudly on 4xx/5xx responses instead of writing their body to disk.
    response.raise_for_status()
    file_path = os.path.join(project_root_directory, file_name)
    with open(file_path, mode) as f:
        f.write(response.content)
def load_index_file():
    """Return the lines of the bucket index file as a list.

    The index is expected at file_index.txt in the project root directory;
    when it is not there it is downloaded first. The lines are returned as
    read, including their line endings.
    """
    file_name = "file_index.txt"
    file_path = os.path.join(project_root_directory, file_name)
    index_is_local = os.path.isfile(file_path)
    if not index_is_local:
        fetch_file("http://www.arewedatayet.com/", file_name, 'wb+')
    with open(file_path, "r") as index_file:
        return index_file.readlines()
def load_random_data(number_of_files, to_output_csv=True, seed=None, cache_s3_data=False):
    """Load random files from the bucket

    Keyword arguments:
    number_of_files -- number of files to load
    to_output_csv -- boolean to save the result of the data into a csv file called result.csv (default True)
    seed -- seed for generating random samples (default None)
    cache_s3_data -- boolean to cache each file once downloaded in a json file to speed up future data loads. (default False)

    Returns one DataFrame holding the rows of every loaded file; each row
    carries the 1-based number of the file it came from in 'file_number'.
    An empty DataFrame is returned when no file was loaded.
    """
    lines = load_index_file()
    random.seed(seed)
    # Pick number_of_files random keys from the index
    samples = random.sample(lines, number_of_files)

    # Transform every sampled file into a DataFrame, numbering them from 1
    # in the order they were drawn.
    frames = []
    for file_number, sample in enumerate(samples, start=1):
        # Index lines still carry their line ending; the key itself does not.
        key = sample.strip()
        bucket_data_object = s3.ObjectSummary(BUCKET_NAME, key)
        frames.append(load_data_to_dataframe(bucket_data_object, file_number, cache_s3_data))

    # Concat DataFrames generated from each file into a large DataFrame.
    # pd.concat raises ValueError on an empty list, so guard that case.
    result = pd.concat(frames) if frames else pd.DataFrame()

    # Output the results to a csv if desired
    if to_output_csv:
        result.to_csv('result.csv', header=True, index=False, encoding='utf-8')
    return result
def get_json_data(bucket_data_object, cache_s3_data):
    """Get the json data from either the given s3 bucket or the cache.

    Keyword arguments:
    bucket_data_object -- a boto3 ObjectSummary class that is connected to the json data we want to load
    cache_s3_data -- boolean which determines if we cache the downloaded data in a file in the cache directory

    Returns the json data as a string. A cached copy is always used when one
    exists, whatever the value of cache_s3_data.
    """
    file_name = os.path.join(cache_file_directory, bucket_data_object.key)

    if os.path.isfile(file_name):
        # Cache hit: the context manager closes the handle, which the
        # previous bare open() call never did.
        with open(file_name) as cached_file:
            return json.dumps(json.load(cached_file))

    # Cache miss: get data from S3
    bucket_json_data = bucket_data_object.get()
    # data is in a byte format so we must decode it to utf-8.
    json_data = bucket_json_data['Body'].read().decode("utf-8")
    json_data = process_string(json_data)

    if cache_s3_data:
        # Serialize before touching the file system: if the data is not valid
        # json this raises without leaving an empty cache file behind, which
        # would otherwise break every later load of this key.
        json_dump = json.dumps(json.loads(json_data), sort_keys=True, indent=2)
        # Create the parent directory in case the key contains path separators.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "w") as data_file:
            # pretty printed json data
            data_file.write(json_dump)
    return json_data
def load_data_to_dataframe(bucket_data_object, file_number, cache_s3_data):
    """Fetch the json data behind a bucket object and return it as a DataFrame

    Keyword arguments:
    bucket_data_object -- a boto3 ObjectSummary class that is connected to the json data we want to load
    file_number -- file number to be attached to the DataFrame
    cache_s3_data -- passed on to get_json_data unchanged
    """
    return transform_into_dataframe(
        get_json_data(bucket_data_object, cache_s3_data),
        file_number,
    )
def extract_column_json_to_list(series_to_be_processed):
    """Map every json string of a column to the list of the values it holds

    Keyword arguments:
    series_to_be_processed -- a pandas Series that contains strings in the form of json

    Entries that are not strings are mapped to an empty list.
    """
    def values_of(cell):
        # Only strings are parsed; everything else yields no values.
        if not isinstance(cell, str):
            return []
        parsed = json.loads(cell)
        return list(parsed.values())

    return series_to_be_processed.map(values_of)