1
0
Форкнуть 0

Minor updates to Spark eval analysis notebook:

- reran on our DataBricks cluster
- added compatibility code to run from Python 2
- added some description
- print out each count as they are computed
- cleaned up final print statements
This commit is contained in:
Dave Zeber 2018-05-10 17:03:58 -05:00
Родитель cc8e0b6fed
Коммит 40425fafe0
4 изменённых файлов: 100 добавлений и 127 удалений

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -9,35 +9,52 @@ Similairly to in the other analysis, we will refer to using eval, rather than bo
After looking at the entire dataset we found:
1. 3.72% of function calls are created using eval.
2. 8.76% of web pages have atleast 1 function call created using eval. This is very similair to the samples taken in the previous analysis which was 8.92%.
3. 32.65% of scripts hosting functions created using eval are hosted on a different domain that the domain of the web page that uses them. This is rounghly half of what was found in the earlier sample based analysis.
2. 8.76% of web pages have at least 1 function call created using eval. This is very similar to the samples taken in the previous analysis, which was 8.92%.
3. 32.65% of scripts hosting functions created using eval are hosted on a different domain that the domain of the web page that uses them. This is roughly half of what was found in the earlier sample based analysis.
```python
## Python 2/3 compatibility.
from __future__ import absolute_import
from __future__ import division
from pyspark.sql.functions import udf
from __future__ import print_function
from __future__ import unicode_literals
from future import standard_library
standard_library.install_aliases()
from six import text_type
from pyspark.sql.functions import udf
import numpy as np
import tldextract
```
```python
#BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
#ACCESS_KEY = "YOUR_KEY"
#SECRET_KEY = "YOUR_KEY"
#ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
#AWS_BUCKET_NAME = BUCKET
#S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
#MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
#mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
#already_mounted = np.any(mountPoints() == MOUNT)
#if not already_mounted:
# dbutils.fs.mount(S3_LOCATION, MOUNT)
#display(dbutils.fs.ls(MOUNT))
```
```python
BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
ACCESS_KEY = "YOUR_KEY"
SECRET_KEY = "YOUR_KEY"
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = BUCKET
S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
already_mounted = np.any(mountPoints() == MOUNT)
mountPoints = [m.mountPoint for m in dbutils.fs.mounts()]
already_mounted = MOUNT in mountPoints
if not already_mounted:
dbutils.fs.mount(S3_LOCATION, MOUNT)
display(dbutils.fs.ls(MOUNT))
dbutils.fs.mount("s3://" + BUCKET, MOUNT)
```
@ -45,29 +62,42 @@ display(dbutils.fs.ls(MOUNT))
df = spark.read.parquet("{}/{}".format(MOUNT, 'clean.parquet'))
```
Compute counts of:
- the number of unique URLs crawled
- the total number of function calls (rows) in the dataset
```python
unique_webpages = df.groupBy('location').count()
unique_webpage_count = unique_webpages.count()
unique_webpage_count
```
```python
function_count = df.count()
function_count
```
And now for `eval` calls:
```python
eval_calls = df[df.script_loc_eval.startswith('line')]
eval_call_count = eval_calls.count()
eval_call_count
```
```python
webpage_using_eval = eval_calls.groupBy('location').count()
webpage_using_eval_count = webpage_using_eval.count()
webpage_using_eval_count
```
How many of the scripts using `eval` calls are on a different domain from the original page?
```python
def check_if_different_domain(location, script_url):
@ -84,32 +114,33 @@ different_domain_df = eval_calls.withColumn(
different_domains = different_domain_df[different_domain_df.is_different_domain == True].groupBy('script_url').count()
different_domain_count = different_domains.count()
different_domain_count
```
```python
unique_script_urls = eval_calls.groupBy('script_url').count()
unique_script_urls_count = unique_script_urls.count()
unique_script_urls_count
```
```python
eval_call_percentage = round(eval_call_count / function_count * 100, 2)
print(str(eval_call_percentage) + "% (" + str(eval_call_count) + "/" + str(function_count) + ") of total function calls are created using eval.")
eval_call_percentage = eval_call_count / function_count * 100.0
print("{:.2f}% ({:,}/{:,}) of total function calls are created using eval.".format(
eval_call_percentage, eval_call_count, function_count))
```
3.72% (4228327/113714009) of total function calls are created using eval.
```python
webpage_using_eval_percentage = round(webpage_using_eval_count / unique_webpage_count * 100 , 2)
print(str(webpage_using_eval_percentage) + "% (" + str(webpage_using_eval_count) + "/" + str(unique_webpage_count) + ") of web pages have atleast 1 function created using eval.")
webpage_using_eval_percentage = webpage_using_eval_count / unique_webpage_count * 100.0
print("{:.2f}% ({:,}/{:,}) of web pages have at least 1 function created using eval.".format(
webpage_using_eval_percentage, webpage_using_eval_count, unique_webpage_count))
```
8.76% (180395/2059503) of web pages have atleast 1 function created using eval.
```python
different_domain_percentage = round(different_domain_count / unique_script_urls_count * 100, 2)
print(str(different_domain_percentage) + "% (" + str(different_domain_count) + "/" + str(unique_script_urls_count) + ") of scripts are hosted on a different domain than the web page that called them.")
different_domain_percentage = different_domain_count / unique_script_urls_count * 100.0
print("{:.2f}% ({:,}/{:,}) of scripts using eval are hosted on a different domain than the web page that called them.".format(
different_domain_percentage, different_domain_count, unique_script_urls_count))
```
32.65% (10235/31349) of scripts are hosted on a different domain than the web page that called them.

Просмотреть файл

@ -1,100 +0,0 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ### Eval and Function Analysis with Spark
# MAGIC
# MAGIC This is a continuation of the analysis on the usage of Eval and the Function class to create Javascript functions for web pages in the webcrawl dataset. This analysis uses Spark to look at the entire dataset, as apposed to the other analysis which looked at random samples in the dataset. The primary purpose of this analysis is to compare an analysis of the entire dataset with the previous analysis of small samples. This is being done to check if the small sample was representative of the entire dataset.
# MAGIC
# MAGIC The previous Analysis can be found here: [eval_analysis.ipynb](eval_analysis.ipynb) and [eval_analysis.md](eval_analysis.md). An explanation for what eval is and a justification for why we are looking at eval usage can be found in the previous analysis.
# MAGIC
# MAGIC Similairly to in the other analysis, we will refer to using eval, rather than both eval and new Function, for simplicity.
# MAGIC
# MAGIC After looking at the entire dataset we found:
# MAGIC 1. 3.72% of function calls are created using eval.
# MAGIC 2. 8.76% of web pages have atleast 1 function call created using eval. This is very similair to the samples taken in the previous analysis which was 8.92%.
# MAGIC 3. 32.65% of scripts hosting functions created using eval are hosted on a different domain that the domain of the web page that uses them. This is rounghly half of what was found in the earlier sample based analysis.
# COMMAND ----------
from __future__ import division
from pyspark.sql.functions import udf
import numpy as np
import tldextract
# COMMAND ----------
BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
ACCESS_KEY = "YOUR_KEY"
SECRET_KEY = "YOUR_KEY"
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = BUCKET
S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
already_mounted = np.any(mountPoints() == MOUNT)
if not already_mounted:
dbutils.fs.mount(S3_LOCATION, MOUNT)
display(dbutils.fs.ls(MOUNT))
# COMMAND ----------
df = spark.read.parquet("{}/{}".format(MOUNT, 'clean.parquet'))
# COMMAND ----------
unique_webpages = df.groupBy('location').count()
unique_webpage_count = unique_webpages.count()
# COMMAND ----------
function_count = df.count()
# COMMAND ----------
eval_calls = df[df.script_loc_eval.startswith('line')]
eval_call_count = eval_calls.count()
# COMMAND ----------
webpage_using_eval = eval_calls.groupBy('location').count()
webpage_using_eval_count = webpage_using_eval.count()
# COMMAND ----------
def check_if_different_domain(location, script_url):
location_domain = tldextract.extract(location).domain
script_domain = tldextract.extract(script_url).domain
return location_domain != script_domain
is_different_domain = udf(check_if_different_domain)
different_domain_df = eval_calls.withColumn(
'is_different_domain', is_different_domain(eval_calls.location, eval_calls.script_url)
)
different_domains = different_domain_df[different_domain_df.is_different_domain == True].groupBy('script_url').count()
different_domain_count = different_domains.count()
# COMMAND ----------
unique_script_urls = eval_calls.groupBy('script_url').count()
unique_script_urls_count = unique_script_urls.count()
# COMMAND ----------
eval_call_percentage = round(eval_call_count / function_count * 100, 2)
print(str(eval_call_percentage) + "% (" + str(eval_call_count) + "/" + str(function_count) + ") of total function calls are created using eval.")
# COMMAND ----------
webpage_using_eval_percentage = round(webpage_using_eval_count / unique_webpage_count * 100 , 2)
print(str(webpage_using_eval_percentage) + "% (" + str(webpage_using_eval_count) + "/" + str(unique_webpage_count) + ") of web pages have atleast 1 function created using eval.")
# COMMAND ----------
different_domain_percentage = round(different_domain_count / unique_script_urls_count * 100, 2)
print(str(different_domain_percentage) + "% (" + str(different_domain_count) + "/" + str(unique_script_urls_count) + ") of scripts are hosted on a different domain than the web page that called them.")