Родитель
5626586df5
Коммит
e9e883b662
16
README.md
16
README.md
|
@ -1,11 +1,15 @@
|
||||||
|
|
||||||
## Build and run tests
|
# `openwpm_utils`
|
||||||
|
|
||||||
$ python setup.py test
|
A collection of utilities for working with OpenWPM datasets
|
||||||
|
|
||||||
|
The `domain_utils` functions are also available as a standalone package: [domain_utils](https://github.com/mozilla/domain_utils)
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
$ pip install openwpm-utils
|
||||||
|
|
||||||
Or
|
Or
|
||||||
|
|
||||||
$ pip install -r requirements.txt
|
$ python setup.py install
|
||||||
$ pip install -r requirements_test.txt
|
|
||||||
$ py.test
|
|
||||||
|
|
||||||
|
|
|
@ -1,146 +1,2 @@
|
||||||
from __future__ import absolute_import
|
# Leave for compatibility
|
||||||
from __future__ import print_function
|
from domain_utils import *
|
||||||
import tempfile
|
|
||||||
import codecs
|
|
||||||
import os
|
|
||||||
import six
|
|
||||||
|
|
||||||
from ipaddress import ip_address
|
|
||||||
from functools import wraps
|
|
||||||
from publicsuffix import PublicSuffixList, fetch
|
|
||||||
from six.moves import range
|
|
||||||
from six.moves.urllib.parse import urlparse
|
|
||||||
|
|
||||||
# The Public Suffix List is cached in the system temp directory so it only
# has to be fetched once per machine (until the temp directory is cleaned).
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')


def get_psl(location=PSL_CACHE_LOC):
    """
    Return a PublicSuffixList built from the cached list at `location`.

    If no file exists at `location`, the list is first retrieved with
    `fetch()` and written there as UTF-8. An existing cache file is used
    as-is and is never refreshed by this function.

    :param location: path of the cache file; defaults to PSL_CACHE_LOC.
    :returns: a `PublicSuffixList` instance.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    # Read the cache inside a `with` block so the handle is closed once the
    # list has been built instead of leaking until garbage collection.
    # NOTE(review): relies on PublicSuffixList consuming the file in its
    # constructor rather than lazily - confirm against the installed version.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
|
|
||||||
|
|
||||||
|
|
||||||
def load_psl(function):
    """
    Decorator that supplies a `psl` keyword argument to `function`.

    When the caller passes `psl` explicitly, the call is forwarded
    unchanged. Otherwise the list returned by `get_psl()` is loaded on the
    first such call, stored on the returned wrapper as its `psl` attribute,
    and passed as `psl` on that and every later call.
    """
    @wraps(function)
    def _with_psl(*args, **kwargs):
        # Caller brought its own list: pass the call straight through.
        if 'psl' in kwargs:
            return function(*args, **kwargs)
        # Load lazily, and only once per decorated function.
        if _with_psl.psl is None:
            _with_psl.psl = get_psl()
        return function(*args, psl=_with_psl.psl, **kwargs)

    _with_psl.psl = None
    return _with_psl
|
|
||||||
|
|
||||||
|
|
||||||
def is_ip_address(hostname):
    """
    Return True if `hostname` parses as an IP address, False otherwise.

    The value is converted to text before being handed to `ip_address`;
    any ValueError raised along the way means "not an IP address".
    """
    try:
        # Keep the text conversion inside the try: a ValueError raised while
        # converting must also yield False.
        ip_address(six.text_type(hostname))
    except ValueError:
        is_ip = False
    else:
        is_ip = True
    return is_ip
|
|
||||||
|
|
||||||
|
|
||||||
@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Return the PS+1 (public suffix plus one label) of `url`.

    If the hostname of `url` is a valid IP address, that address is
    returned instead. If `url` has no hostname, None is returned.

    A PublicSuffixList object may be passed with keyword arg 'psl';
    otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")

    host = urlparse(url).hostname

    if is_ip_address(host):
        return host

    if host is None:
        # `url` may be malformed, relative, a `javascript:` or `data:` url,
        # or otherwise lack a network location.
        return None

    public_suffix_list = kwargs['psl']
    return public_suffix_list.get_public_suffix(host)
|
|
||||||
|
|
||||||
|
|
||||||
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Return a list of slices of a url's hostname down to the PS+1.

    If `include_ps` is set, the hostname slices will include the public
    suffix.

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    Special cases:
        * hostname is an IP address -> single-item list with that address
        * url has no hostname, or its PS+1 contains no '.' -> empty list

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.

    :raises ValueError: if no 'psl' keyword argument reaches the function.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # Relative, malformed, `javascript:` / `data:` urls have no hostname.
    # Treat them like other malformed input instead of passing None on to
    # the public suffix lookup.
    if hostname is None:
        return []

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    # Everything left of ".<ps_plus_1>"; empty when hostname == ps_plus_1.
    prefix = hostname[:-(len(ps_plus_1) + 1)]
    subdomains = prefix.split('.') if prefix else []

    subparts = [
        '.'.join(subdomains[i:]) + '.' + ps_plus_1
        for i in range(len(subdomains))
    ]
    subparts.append(ps_plus_1)
    if include_ps:
        # A '.' is guaranteed by the check above, so this cannot fail.
        subparts.append(ps_plus_1.partition('.')[2])
    return subparts
|
|
||||||
|
|
||||||
|
|
||||||
def get_stripped_url(url, scheme=False):
    """
    Return `url` stripped to (scheme)?+hostname+path.

    Query string, fragment, port and credentials are dropped. The hostname
    is the one reported by `urlparse`.

    A url without a hostname (relative url, `javascript:` url, empty
    string, ...) contributes an empty hostname, so only the path (and the
    scheme prefix, if requested) is returned. Previously this case raised
    TypeError from inside the exception handler.

    :param url: the url to strip.
    :param scheme: if true, prefix the result with `<scheme>://`.
    :returns: the stripped url as a string.
    """
    purl = urlparse(url)
    pieces = []
    if scheme:
        pieces.append(purl.scheme + '://')
    # `hostname` is None when the url has no network location; fall back to
    # an empty string rather than failing on the concatenation.
    pieces.append(purl.hostname or '')
    pieces.append(purl.path or '')
    return ''.join(pieces)
|
|
||||||
|
|
||||||
|
|
||||||
def get_stripped_urls(urls, scheme=False):
    """
    Return the urls in `urls`, each stripped to (scheme)?+hostname+path.

    Each url is passed through `get_stripped_url`. If `urls` is a set, a
    set is returned; otherwise a list in the same order as the input.

    :param urls: iterable of urls.
    :param scheme: forwarded to `get_stripped_url`.
    """
    # Collect the stripped values; the earlier loop called the helper but
    # discarded its result, so the output was always empty.
    new_urls = [get_stripped_url(url, scheme) for url in urls]
    if isinstance(urls, set):
        return set(new_urls)
    return new_urls
|
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
boto3
|
boto3
|
||||||
ipaddress
|
domain_utils==0.3.0
|
||||||
jsbeautifier
|
jsbeautifier
|
||||||
pandas
|
pandas
|
||||||
plyvel
|
plyvel
|
||||||
publicsuffix
|
|
||||||
pyarrow
|
pyarrow
|
||||||
pyspark
|
pyspark
|
||||||
s3fs
|
s3fs
|
||||||
|
|
|
@ -1,2 +0,0 @@
|
||||||
pytest==3.7.4
|
|
||||||
pytest-flake8==1.0.2
|
|
5
setup.py
5
setup.py
|
@ -2,8 +2,6 @@ from setuptools import setup
|
||||||
|
|
||||||
with open('requirements.txt') as f:
|
with open('requirements.txt') as f:
|
||||||
requirements = f.read().splitlines()
|
requirements = f.read().splitlines()
|
||||||
with open('requirements_test.txt') as f:
|
|
||||||
test_requirements = f.read().splitlines()
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
# Meta
|
# Meta
|
||||||
|
@ -18,8 +16,7 @@ setup(
|
||||||
|
|
||||||
# Dependencies
|
# Dependencies
|
||||||
install_requires=requirements,
|
install_requires=requirements,
|
||||||
tests_require=test_requirements,
|
setup_requires=['setuptools_scm',],
|
||||||
setup_requires=['setuptools_scm', 'pytest-runner'],
|
|
||||||
|
|
||||||
# Packaging
|
# Packaging
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
|
|
|
@ -1,14 +0,0 @@
|
||||||
import pytest
|
|
||||||
from crawl_utils.domain import (
|
|
||||||
get_ps_plus_1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_ps_plus_one_cloudfront():
    """The PS+1 of a cloudfront.net host keeps one label before the suffix."""
    url = 'https://my.domain.cloudfront.net'
    expected = 'domain.cloudfront.net'
    assert get_ps_plus_1(url) == expected
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Currently not supported")
def test_get_ps_plus_one_no_https():
    """Same lookup as the cloudfront test, but for a url without a scheme."""
    url = 'my.domain.cloudfront.net'
    expected = 'domain.cloudfront.net'
    assert get_ps_plus_1(url) == expected
|
|
Загрузка…
Ссылка в новой задаче