Domain utils moved to separate package.

Fixes #15.
This commit is contained in:
Sarah Bird 2020-03-25 17:18:01 -05:00
Родитель 5626586df5
Коммит e9e883b662
6 изменённых файлов: 14 добавлений и 174 удалений

Просмотреть файл

@ -1,11 +1,15 @@
## Build and run tests
# `openwpm_utils`
$ python setup.py test
A collection of utilities for working with OpenWPM datasets
The domain\_utils are available as a standalone package [domain_utils](github.com/mozilla/domain_utils)
## Installation
$ pip install openwpm-utils
Or
$ pip install -r requirements.txt
$ pip install -r requirements_test.txt
$ py.test
$ python setup.py install

Просмотреть файл

@ -1,146 +1,2 @@
from __future__ import absolute_import
from __future__ import print_function
import tempfile
import codecs
import os
import six
from ipaddress import ip_address
from functools import wraps
from publicsuffix import PublicSuffixList, fetch
from six.moves import range
from six.moves.urllib.parse import urlparse
# We cache the Public Suffix List in temp directory
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')
def get_psl(location=PSL_CACHE_LOC):
"""
Grabs an updated public suffix list.
"""
if not os.path.isfile(location):
psl_file = fetch()
with codecs.open(location, 'w', encoding='utf8') as f:
f.write(psl_file.read())
psl_cache = codecs.open(location, encoding='utf8')
return PublicSuffixList(psl_cache)
def load_psl(function):
@wraps(function)
def wrapper(*args, **kwargs):
if 'psl' not in kwargs:
if wrapper.psl is None:
wrapper.psl = get_psl()
return function(*args, psl=wrapper.psl, **kwargs)
else:
return function(*args, **kwargs)
wrapper.psl = None
return wrapper
def is_ip_address(hostname):
"""
Check if the given string is a valid IP address
"""
try:
ip_address(six.text_type(hostname))
return True
except ValueError:
return False
@load_psl
def get_ps_plus_1(url, **kwargs):
"""
Returns the PS+1 of the url. This will also return
an IP address if the hostname of the url is a valid
IP address.
An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
otherwise a version cached in the system temp directory is used.
"""
if 'psl' not in kwargs:
raise ValueError(
"A PublicSuffixList must be passed as a keyword argument.")
hostname = urlparse(url).hostname
if is_ip_address(hostname):
return hostname
elif hostname is None:
# Possible reasons hostname is None, `url` is:
# * malformed
# * a relative url
# * a `javascript:` or `data:` url
# * many others
return
else:
return kwargs['psl'].get_public_suffix(hostname)
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
"""
Returns a list of slices of a url's hostname down to the PS+1
If `include_ps` is set, the hostname slices will include the public suffix
For example: http://a.b.c.d.com/path?query#frag would yield:
[a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
[a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True
An (optional) PublicSuffixList object can be passed with keyword arg 'psl'.
otherwise a version cached in the system temp directory is used.
"""
if 'psl' not in kwargs:
raise ValueError(
"A PublicSuffixList must be passed as a keyword argument.")
hostname = urlparse(url).hostname
# If an IP address, just return a single item list with the IP
if is_ip_address(hostname):
return [hostname]
subparts = list()
ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)
# We expect all ps_plus_1s to have at least one '.'
# If they don't, the url was likely malformed, so we'll just return an
# empty list
if '.' not in ps_plus_1:
return []
subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
if subdomains == ['']:
subdomains = []
for i in range(len(subdomains)):
subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
subparts.append(ps_plus_1)
if include_ps:
try:
subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
except Exception:
pass
return subparts
def get_stripped_url(url, scheme=False):
"""Returns a url stripped to (scheme)?+hostname+path"""
purl = urlparse(url)
surl = ''
if scheme:
surl += purl.scheme + '://'
try:
surl += purl.hostname + purl.path
except TypeError:
surl += purl.hostname
return surl
def get_stripped_urls(urls, scheme=False):
""" Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
new_urls = list()
for url in urls:
get_stripped_url(url, scheme)
if type(urls) == set:
return set(new_urls)
return new_urls
# Leave for compatibility
from domain_utils import *

Просмотреть файл

@ -1,9 +1,8 @@
boto3
ipaddress
domain_utils==0.3.0
jsbeautifier
pandas
plyvel
publicsuffix
pyarrow
pyspark
s3fs

Просмотреть файл

@ -1,2 +0,0 @@
pytest==3.7.4
pytest-flake8==1.0.2

Просмотреть файл

@ -2,8 +2,6 @@ from setuptools import setup
with open('requirements.txt') as f:
requirements = f.read().splitlines()
with open('requirements_test.txt') as f:
test_requirements = f.read().splitlines()
setup(
# Meta
@ -18,8 +16,7 @@ setup(
# Dependencies
install_requires=requirements,
tests_require=test_requirements,
setup_requires=['setuptools_scm', 'pytest-runner'],
setup_requires=['setuptools_scm',],
# Packaging
include_package_data=True,

Просмотреть файл

@ -1,14 +0,0 @@
import pytest
from crawl_utils.domain import (
get_ps_plus_1,
)
def test_get_ps_plus_one_cloudfront():
assert get_ps_plus_1(
'https://my.domain.cloudfront.net') == 'domain.cloudfront.net'
@pytest.mark.skip(reason="Currently not supported")
def test_get_ps_plus_one_no_https():
assert get_ps_plus_1('my.domain.cloudfront.net') == 'domain.cloudfront.net'