1
0
Форкнуть 0

Updated cryptojacking notebook:

- added some compatibility code for running in Python 2
- added rendered HTML showing cell outputs
This commit is contained in:
Dave Zeber 2018-04-21 12:52:34 +02:00
Родитель 02b434249c
Коммит e42d285e8d
4 изменённых файлов: 135 добавлений и 331 удалений

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -37,7 +37,7 @@ Sources:
### Results: Summary
Less than 0.02% of script calls are detected with cryptojacking.
However, it is important to note that cryptojacking code can be executed in order ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes, or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.
However, it is important to note that cryptojacking code can be executed in other ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes, or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.
It is likely that the actual rate of cryptojacking is higher.
The majority of domains that were detected with cryptojacking are streaming sites. This is unsurprising as users will have streaming sites open for longer as they are watching shows/movies, and mining scripts can be executed longer.
@ -68,21 +68,48 @@ number of cryptojacking hosts used: 11
```
import numpy as np
## Compatibility with Python 2.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from builtins import open
from builtins import str
from builtins import basestring
from future import standard_library
standard_library.install_aliases()
```
```
BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
ACCESS_KEY = "MY_ACCESS_KEY"
SECRET_KEY = "MY_SECRET_KEY"
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = BUCKET
S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
already_mounted = np.any(mountPoints() == MOUNT)
mountPoints = [m.mountPoint for m in dbutils.fs.mounts()]
already_mounted = MOUNT in mountPoints
if not already_mounted:
dbutils.fs.mount(S3_LOCATION, MOUNT)
display(dbutils.fs.ls(MOUNT))
dbutils.fs.mount("s3://" + BUCKET, MOUNT)
```
```
#import numpy as np
#BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
#ACCESS_KEY = "MY_ACCESS_KEY"
#SECRET_KEY = "MY_SECRET_KEY"
#ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
#AWS_BUCKET_NAME = BUCKET
#S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
#MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
#mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
#already_mounted = np.any(mountPoints() == MOUNT)
#if not already_mounted:
# dbutils.fs.mount(S3_LOCATION, MOUNT)
#display(dbutils.fs.ls(MOUNT))
```
Note: The analysis was run on the union of the clean and invalid parquet files, but no cryptojacking was detected in the invalid files - reading the invalid files can be skipped to save computation time.
@ -120,9 +147,9 @@ def get_cryptojacking_sites():
skipped_suffixes = ["com", ""]
for line in data:
string = str(line)
if string.startswith("b'0.0.0.0"):
sites.append(string[len("b'0.0.0.0 "):-len("\\\\n")])
string = str(line).strip()
if string.startswith("0.0.0.0"):
sites.append(string[8:])
print("Unmodified list: " + str(sites))
for site in sites:
@ -164,7 +191,7 @@ cryptoDF_count
```
cryptoDF.show(500, False)
cryptoDF.show(50, False)
```
@ -181,13 +208,14 @@ domains = cryptoDF.select('location').rdd.distinct()
domains_count = domains.count()
domains_count
#14523 (without prefixes/suffixes)
#49 (with prefixes/suffixes)
#14523 (with prefixes/suffixes)
#49 (without prefixes/suffixes)
```
```
#Which domains are the "worst" for crypto-jacking - i.e., all the domains where crypto-jacking was detected, ordered by the number of crypto-jacking scripts observed on them.
# However, since we have stripped prefixes and suffixes, this counts instances of crypto-jacking scripts across all pages in the dataset sharing this domain.
domain_with_counts = cryptoDF.groupby("location").count()
ordered_domains_with_counts = domain_with_counts.orderBy("count", ascending=False).show(100)

Просмотреть файл

@ -39,7 +39,7 @@
# MAGIC %md ### Results: Summary
# MAGIC
# MAGIC Less than 0.02% of script calls are detected with cryptojacking.
# MAGIC However, it is important to note that cryptojacking code can be executed in order ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes, or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.
# MAGIC However, it is important to note that cryptojacking code can be executed in other ways than by including the host .js script in a script tag. It can be disguised, stealthily executed in iframes, or directly used in a function of a first party script. Users may also face redirect loops that eventually lead to a page with a mining script. Another reason for the low detection rate could be that the prominence/popularity of the analysed sites from the web crawl is enough to dissuade site owners from implementing obvious cryptojacking script usage.
# MAGIC It is likely that the actual rate of cryptojacking is higher.
# MAGIC
# MAGIC The majority of domains that were detected with cryptojacking are streaming sites. This is unsurprising as users will have streaming sites open for longer as they are watching shows/movies, and mining scripts can be executed longer.
@ -74,21 +74,46 @@
# COMMAND ----------
import numpy as np
## Compatibility with Python 2.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from builtins import open
from builtins import str
from builtins import basestring
from future import standard_library
standard_library.install_aliases()
# COMMAND ----------
BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
ACCESS_KEY = "MY_ACCESS_KEY"
SECRET_KEY = "MY_SECRET_KEY"
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = BUCKET
S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
already_mounted = np.any(mountPoints() == MOUNT)
mountPoints = [m.mountPoint for m in dbutils.fs.mounts()]
already_mounted = MOUNT in mountPoints
if not already_mounted:
dbutils.fs.mount(S3_LOCATION, MOUNT)
display(dbutils.fs.ls(MOUNT))
dbutils.fs.mount("s3://" + BUCKET, MOUNT)
# COMMAND ----------
#import numpy as np
#BUCKET = 'safe-ucosp-2017/safe_dataset/v1'
#ACCESS_KEY = "MY_ACCESS_KEY"
#SECRET_KEY = "MY_SECRET_KEY"
#ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
#AWS_BUCKET_NAME = BUCKET
#S3_LOCATION = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME)
#MOUNT = "/mnt/{}".format(BUCKET.replace("/", "-"))
#mountPoints = lambda: np.array([m.mountPoint for m in dbutils.fs.mounts()])
#already_mounted = np.any(mountPoints() == MOUNT)
#if not already_mounted:
# dbutils.fs.mount(S3_LOCATION, MOUNT)
#display(dbutils.fs.ls(MOUNT))
# COMMAND ----------
@ -125,9 +150,9 @@ def get_cryptojacking_sites():
skipped_suffixes = ["com", ""]
for line in data:
string = str(line)
if string.startswith("b'0.0.0.0"):
sites.append(string[len("b'0.0.0.0 "):-len("\\\\n")])
string = str(line).strip()
if string.startswith("0.0.0.0"):
sites.append(string[8:])
print("Unmodified list: " + str(sites))
for site in sites:
@ -167,7 +192,7 @@ cryptoDF_count
# COMMAND ----------
cryptoDF.show(500, False)
cryptoDF.show(50, False)
# COMMAND ----------
@ -182,12 +207,13 @@ domains = cryptoDF.select('location').rdd.distinct()
domains_count = domains.count()
domains_count
#14523 (without prefixes/suffixes)
#49 (with prefixes/suffixes)
#14523 (with prefixes/suffixes)
#49 (without prefixes/suffixes)
# COMMAND ----------
#Which domains are the "worst" for crypto-jacking - i.e., all the domains where crypto-jacking was detected, ordered by the number of crypto-jacking scripts observed on them.
# However, since we have stripped prefixes and suffixes, this counts instances of crypto-jacking scripts across all pages in the dataset sharing this domain.
domain_with_counts = cryptoDF.groupby("location").count()
ordered_domains_with_counts = domain_with_counts.orderBy("count", ascending=False).show(100)