Updated cryptojacking notebook:

- added some compatibility code for running in Python 2 - added rendered HTML showing cell outputs
2018-04-21 12:52:34 +02:00 · 2018-04-21 12:52:34 +02:00 · e42d285e8d
--- a/analyses/cryptojacking_analysis.html
+++ b/analyses/cryptojacking_analysis.html
--- a/analyses/cryptojacking_analysis.ipynb
+++ b/analyses/cryptojacking_analysis.ipynb
--- a/analyses/cryptojacking_analysis.md
+++ b/analyses/cryptojacking_analysis.md
@ -37,7 +37,7 @@ Sources:
 ### Results: Summary

 Less than 0.02% of script calls are detected with cryptojacking.  
-However, it is important to note that cryptojacking code can be executed in order ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes,  or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.  
+However, it is important to note that cryptojacking code can be executed in other ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes,  or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.  
 It is likely that the actual rate of cryptojacking is higher. 

 The majority of domains that were detected with cryptojacking are streaming sites. This is unsurprising as users will have streaming sites open for longer as they are watching shows/movies, and mining scripts can be executed longer. 
@ -68,21 +68,48 @@ number of cryptojacking hosts used: 11

 ```
 import numpy as np
+## Compatibility with Python 2.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from builtins import open
+from builtins import str
+from builtins import basestring
+
+from future import standard_library
+standard_library.install_aliases()
+```
+
+
+```
 BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
-
-ACCESS_KEY = "MY_ACCESS_KEY"
-SECRET_KEY = "MY_SECRET_KEY"
-ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
-AWS_BUCKET_NAME = BUCKET
-
-S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
 MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
-
-mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
-already_mounted = np.any(mountPoints() == MOUNT)
+mountPoints = [m.mountPoint for m in dbutils.fs.mounts()]
+already_mounted = MOUNT in mountPoints
 if not already_mounted:
-    dbutils.fs.mount(S3_LOCATION, MOUNT)
-display(dbutils.fs.ls(MOUNT))
+  dbutils.fs.mount("s3://" + BUCKET, MOUNT)
+```
+
+
+```
+#import numpy as np
+#BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
+
+#ACCESS_KEY = "MY_ACCESS_KEY"
+#SECRET_KEY = "MY_SECRET_KEY"
+#ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
+#AWS_BUCKET_NAME = BUCKET
+
+#S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
+#MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
+
+#mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
+#already_mounted = np.any(mountPoints() == MOUNT)
+#if not already_mounted:
+#    dbutils.fs.mount(S3_LOCATION, MOUNT)
+#display(dbutils.fs.ls(MOUNT))
 ```

 Note: Analysis was done on union of clean and invalid parquet files but no cryptojacking was detected in invalid files - can avoid reading invalid files to save computation time.
@ -120,9 +147,9 @@ def get_cryptojacking_sites():
    skipped_suffixes = ["com", ""]
    
    for line in data: 
-        string = str(line)
-        if string.startswith("b'0.0.0.0"):
-            sites.append(string[len("b'0.0.0.0 "):-len("\\\\n")])
+        string = str(line).strip()
+        if string.startswith("0.0.0.0"):
+            sites.append(string[8:])
    print("Unmodified list: " + str(sites))
            
    for site in sites:
@ -164,7 +191,7 @@ cryptoDF_count


 ```
-cryptoDF.show(500, False)
+cryptoDF.show(50, False)
 ```


@ -181,13 +208,14 @@ domains = cryptoDF.select('location').rdd.distinct()
 domains_count = domains.count()
 domains_count

-#14523 (without prefixes/suffixes)
-#49 (with prefixes/suffixes)
+#14523 (with prefixes/suffixes)
+#49 (without prefixes/suffixes)
 ```


 ```
 #Which domains are the "worst" for crypto-jacking - i.e., all the domains where crypto-jacking was detecting, ordered by the number of crypto-jacking scripts observed on them.
+# However, since we have stripped prefixes and suffixes, this counts instances of crypto-jacking scripts across all pages in the dataset sharing this domain.
 domain_with_counts = cryptoDF.groupby("location").count()
 ordered_domains_with_counts = domain_with_counts.orderBy("count", ascending=False).show(100)

--- a/analyses/cryptojacking_analysis.py
+++ b/analyses/cryptojacking_analysis.py
@ -39,7 +39,7 @@
 # MAGIC %md ### Results: Summary
 # MAGIC 
 # MAGIC Less than 0.02% of script calls are detected with cryptojacking.  
-# MAGIC However, it is important to note that cryptojacking code can be executed in order ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes,  or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.  
+# MAGIC However, it is important to note that cryptojacking code can be executed in other ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes,  or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.  
 # MAGIC It is likely that the actual rate of cryptojacking is higher. 
 # MAGIC 
 # MAGIC The majority of domains that were detected with cryptojacking are streaming sites. This is unsurprising as users will have streaming sites open for longer as they are watching shows/movies, and mining scripts can be executed longer. 
@ -74,21 +74,46 @@
 # COMMAND ----------

 import numpy as np
+## Compatibility with Python 2.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from builtins import open
+from builtins import str
+from builtins import basestring
+
+from future import standard_library
+standard_library.install_aliases()
+
+# COMMAND ----------
+
 BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
-
-ACCESS_KEY = "MY_ACCESS_KEY"
-SECRET_KEY = "MY_SECRET_KEY"
-ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
-AWS_BUCKET_NAME = BUCKET
-
-S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
 MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
-
-mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
-already_mounted = np.any(mountPoints() == MOUNT)
+mountPoints = [m.mountPoint for m in dbutils.fs.mounts()]
+already_mounted = MOUNT in mountPoints
 if not already_mounted:
-    dbutils.fs.mount(S3_LOCATION, MOUNT)
-display(dbutils.fs.ls(MOUNT))
+  dbutils.fs.mount("s3://" + BUCKET, MOUNT)
+
+# COMMAND ----------
+
+#import numpy as np
+#BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
+
+#ACCESS_KEY = "MY_ACCESS_KEY"
+#SECRET_KEY = "MY_SECRET_KEY"
+#ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
+#AWS_BUCKET_NAME = BUCKET
+
+#S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
+#MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
+
+#mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
+#already_mounted = np.any(mountPoints() == MOUNT)
+#if not already_mounted:
+#    dbutils.fs.mount(S3_LOCATION, MOUNT)
+#display(dbutils.fs.ls(MOUNT))

 # COMMAND ----------

@ -125,9 +150,9 @@ def get_cryptojacking_sites():
    skipped_suffixes = ["com", ""]
    
    for line in data: 
-        string = str(line)
-        if string.startswith("b'0.0.0.0"):
-            sites.append(string[len("b'0.0.0.0 "):-len("\\\\n")])
+        string = str(line).strip()
+        if string.startswith("0.0.0.0"):
+            sites.append(string[8:])
    print("Unmodified list: " + str(sites))
            
    for site in sites:
@ -167,7 +192,7 @@ cryptoDF_count

 # COMMAND ----------

-cryptoDF.show(500, False)
+cryptoDF.show(50, False)

 # COMMAND ----------

@ -182,12 +207,13 @@ domains = cryptoDF.select('location').rdd.distinct()
 domains_count = domains.count()
 domains_count

-#14523 (without prefixes/suffixes)
-#49 (with prefixes/suffixes)
+#14523 (with prefixes/suffixes)
+#49 (without prefixes/suffixes)

 # COMMAND ----------

 #Which domains are the "worst" for crypto-jacking - i.e., all the domains where crypto-jacking was detecting, ordered by the number of crypto-jacking scripts observed on them.
+# However, since we have stripped prefixes and suffixes, this counts instances of crypto-jacking scripts across all pages in the dataset sharing this domain.
 domain_with_counts = cryptoDF.groupby("location").count()
 ordered_domains_with_counts = domain_with_counts.orderBy("count", ascending=False).show(100)