Родитель
5626586df5
Коммит
e9e883b662
16
README.md
16
README.md
|
@ -1,11 +1,15 @@
|
|||
|
||||
## Build and run tests
|
||||
# `openwpm_utils`
|
||||
|
||||
$ python setup.py test
|
||||
A collection of utilities for working with OpenWPM datasets
|
||||
|
||||
The domain utilities are available as a standalone package: [domain_utils](https://github.com/mozilla/domain_utils)
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
$ pip install openwpm-utils
|
||||
|
||||
Or
|
||||
|
||||
$ pip install -r requirements.txt
|
||||
$ pip install -r requirements_test.txt
|
||||
$ py.test
|
||||
|
||||
$ python setup.py install
|
||||
|
|
|
@ -1,146 +1,2 @@
|
|||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import tempfile
|
||||
import codecs
|
||||
import os
|
||||
import six
|
||||
|
||||
from ipaddress import ip_address
|
||||
from functools import wraps
|
||||
from publicsuffix import PublicSuffixList, fetch
|
||||
from six.moves import range
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
# The Public Suffix List is cached in the system temp directory
|
||||
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')
|
||||
|
||||
|
||||
def get_psl(location=PSL_CACHE_LOC):
    """
    Load the Public Suffix List, downloading it first if it is not cached.

    If no file exists at `location`, the list is retrieved with `fetch()`
    and written to `location` as UTF-8. The file at `location` is then
    parsed into a `PublicSuffixList`.

    Args:
        location: Path of the cache file. Defaults to `PSL_CACHE_LOC`.

    Returns:
        A `PublicSuffixList` built from the cached file.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        try:
            with codecs.open(location, 'w', encoding='utf8') as f:
                f.write(psl_file.read())
        finally:
            # Release the download handle even if writing the cache fails
            psl_file.close()
    # The list is parsed while the PublicSuffixList is constructed, so the
    # cache file can be closed as soon as the object exists.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
|
||||
|
||||
|
||||
def load_psl(function):
    """
    Decorator that supplies a default `psl` keyword argument.

    When the decorated function is called without a `psl` keyword, the
    list returned by `get_psl()` is passed instead. That list is loaded on
    the first such call and kept on the wrapper's `psl` attribute for
    later calls. A `psl` given by the caller is passed through untouched.
    """
    @wraps(function)
    def wrapper(*args, **kwargs):
        # Caller brought their own list: nothing to inject
        if 'psl' in kwargs:
            return function(*args, **kwargs)
        # First call without an explicit list: load it once and remember it
        if wrapper.psl is None:
            wrapper.psl = get_psl()
        return function(*args, psl=wrapper.psl, **kwargs)

    wrapper.psl = None
    return wrapper
|
||||
|
||||
|
||||
def is_ip_address(hostname):
    """
    Return True if `hostname`, converted to text, parses as an IP address
    and False otherwise.
    """
    try:
        # The text conversion stays inside the `try` so that a ValueError
        # raised by either step is reported as "not an IP address".
        ip_address(six.text_type(hostname))
    except ValueError:
        return False
    return True
|
||||
|
||||
|
||||
@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Return the PS+1 of `url`.

    If the hostname of the url is a valid IP address, that IP address is
    returned instead. If the url has no hostname at all, None is returned.

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")

    host = urlparse(url).hostname
    if host is None:
        # `url` may be malformed, relative, a `javascript:` or `data:` url,
        # or anything else that carries no hostname.
        return None
    if is_ip_address(host):
        return host
    return kwargs['psl'].get_public_suffix(host)
|
||||
|
||||
|
||||
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Return a list of slices of a url's hostname down to the PS+1.

    If `include_ps` is set, the hostname slices will include the public
    suffix.

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    A url whose hostname is an IP address yields a single-item list with
    that IP. A url without a hostname, or whose PS+1 contains no '.',
    yields an empty list.

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.

    Raises:
        ValueError: if no 'psl' keyword argument reaches the function.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # Without a hostname (relative or malformed url, `data:` url, ...) there
    # is nothing to slice, so return an empty list like other malformed input.
    if hostname is None:
        return []

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    # Everything in front of ".<ps_plus_1>" is the subdomain chain
    subdomains = hostname[:-(len(ps_plus_1) + 1)].split('.')
    if subdomains == ['']:
        subdomains = []

    subparts = [
        '.'.join(subdomains[i:]) + '.' + ps_plus_1
        for i in range(len(subdomains))
    ]
    subparts.append(ps_plus_1)
    if include_ps:
        # The '.' check above guarantees there is a label to strip
        subparts.append(ps_plus_1.split('.', 1)[1])
    return subparts
|
||||
|
||||
|
||||
def get_stripped_url(url, scheme=False):
    """
    Return `url` stripped to (scheme)?+hostname+path.

    Only the parsed hostname and path are kept, so the query string and
    fragment are dropped. A url without a hostname (e.g. a relative url)
    contributes an empty hostname instead of raising a TypeError.

    Args:
        url: The url to strip.
        scheme: If true, prefix the result with the url's scheme and '://'.

    Returns:
        The stripped url as a string.
    """
    purl = urlparse(url)
    prefix = purl.scheme + '://' if scheme else ''
    # `hostname` is None when the url has no network location
    hostname = purl.hostname or ''
    return prefix + hostname + purl.path
|
||||
|
||||
|
||||
def get_stripped_urls(urls, scheme=False):
    """
    Return the given urls, each stripped to (scheme)?+hostname+path.

    Args:
        urls: An iterable of urls.
        scheme: Passed through to `get_stripped_url` for every url.

    Returns:
        A set of stripped urls if `urls` is a set, otherwise a list in the
        same order as `urls`.
    """
    # Collect the stripped urls; the result of each call must be kept
    new_urls = [get_stripped_url(url, scheme) for url in urls]
    if isinstance(urls, set):
        return set(new_urls)
    return new_urls
|
||||
# Re-export everything from domain_utils so existing imports of this module keep working
|
||||
from domain_utils import *
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
boto3
|
||||
ipaddress
|
||||
domain_utils==0.3.0
|
||||
jsbeautifier
|
||||
pandas
|
||||
plyvel
|
||||
publicsuffix
|
||||
pyarrow
|
||||
pyspark
|
||||
s3fs
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
pytest==3.7.4
|
||||
pytest-flake8==1.0.2
|
5
setup.py
5
setup.py
|
@ -2,8 +2,6 @@ from setuptools import setup
|
|||
|
||||
with open('requirements.txt') as f:
|
||||
requirements = f.read().splitlines()
|
||||
with open('requirements_test.txt') as f:
|
||||
test_requirements = f.read().splitlines()
|
||||
|
||||
setup(
|
||||
# Meta
|
||||
|
@ -18,8 +16,7 @@ setup(
|
|||
|
||||
# Dependencies
|
||||
install_requires=requirements,
|
||||
tests_require=test_requirements,
|
||||
setup_requires=['setuptools_scm', 'pytest-runner'],
|
||||
setup_requires=['setuptools_scm',],
|
||||
|
||||
# Packaging
|
||||
include_package_data=True,
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
import pytest
|
||||
from crawl_utils.domain import (
|
||||
get_ps_plus_1,
|
||||
)
|
||||
|
||||
|
||||
def test_get_ps_plus_one_cloudfront():
    """Check the PS+1 extracted from an https cloudfront.net url."""
    url = 'https://my.domain.cloudfront.net'
    expected = 'domain.cloudfront.net'
    assert get_ps_plus_1(url) == expected
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Currently not supported")
def test_get_ps_plus_one_no_https():
    """Check the PS+1 extracted from a host given without a scheme."""
    bare_host = 'my.domain.cloudfront.net'
    expected = 'domain.cloudfront.net'
    assert get_ps_plus_1(bare_host) == expected
|
Загрузка…
Ссылка в новой задаче