Domain utils moved to separate package.

Fixes #15.
This commit is contained in:
Sarah Bird 2020-03-25 17:18:01 -05:00
Родитель 5626586df5
Коммит e9e883b662
6 изменённых файлов: 14 добавлений и 174 удалений

Просмотреть файл

@ -1,11 +1,15 @@
## Build and run tests # `openwpm_utils`
$ python setup.py test A collection of utilities for working with OpenWPM datasets
The domain utilities are now available as a standalone package: [domain_utils](https://github.com/mozilla/domain_utils)
## Installation
$ pip install openwpm-utils
Or Or
$ pip install -r requirements.txt $ python setup.py install
$ pip install -r requirements_test.txt
$ py.test

Просмотреть файл

@ -1,146 +1,2 @@
from __future__ import absolute_import # Leave for compatibility
from __future__ import print_function from domain_utils import *
import tempfile
import codecs
import os
import six
from ipaddress import ip_address
from functools import wraps
from publicsuffix import PublicSuffixList, fetch
from six.moves import range
from six.moves.urllib.parse import urlparse
# We cache the Public Suffix List in the system temp directory
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')


def get_psl(location=PSL_CACHE_LOC):
    """
    Return a PublicSuffixList built from the cached list at `location`.

    If no file exists at `location`, the list is first downloaded with
    `fetch()` and written there as UTF-8, so later calls reuse the cache.

    :param location: path of the cache file (defaults to PSL_CACHE_LOC).
    :return: a `PublicSuffixList` instance.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        try:
            with codecs.open(location, 'w', encoding='utf8') as f:
                f.write(psl_file.read())
        finally:
            # Release the download stream even if writing the cache fails.
            # NOTE(review): assumes the object returned by fetch() exposes
            # close() -- confirm against the publicsuffix version in use.
            psl_file.close()
    # NOTE(review): assumes PublicSuffixList reads the whole file inside its
    # constructor, so the handle can be closed on return -- confirm.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
def load_psl(function):
    """
    Decorator that supplies a `psl` keyword argument when the caller omits it.

    The list is loaded with `get_psl()` the first time it is needed and then
    kept on the wrapper's `psl` attribute, so each decorated function loads
    it at most once. A caller-supplied `psl` is passed through untouched.
    """
    @wraps(function)
    def wrapper(*args, **kwargs):
        # An explicitly supplied list always wins: call straight through.
        if 'psl' in kwargs:
            return function(*args, **kwargs)
        # First call without an explicit list: load it once and remember it.
        if wrapper.psl is None:
            wrapper.psl = get_psl()
        return function(*args, psl=wrapper.psl, **kwargs)

    # Per-function cache slot, filled lazily by `wrapper`.
    wrapper.psl = None
    return wrapper
def is_ip_address(hostname):
    """
    Tell whether `hostname` parses as an IP address.

    The value is converted to text and handed to `ip_address`; a
    `ValueError` from either step means it is not an IP address.

    :return: True if parsing succeeds, False otherwise.
    """
    try:
        # Keep the text conversion inside the `try` so a ValueError raised
        # by either call is reported as "not an IP address".
        ip_address(six.text_type(hostname))
    except ValueError:
        return False
    else:
        return True
@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Return the PS+1 (public suffix plus one label) of `url`.

    If the hostname of the url is a valid IP address, the IP address itself
    is returned. If the url has no hostname at all, None is returned.

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")

    hostname = urlparse(url).hostname

    # No hostname. Possible reasons, `url` is:
    #   * malformed
    #   * a relative url
    #   * a `javascript:` or `data:` url
    #   * many others
    if hostname is None:
        return None

    # IP addresses have no public suffix; hand them back unchanged.
    if is_ip_address(hostname):
        return hostname

    return kwargs['psl'].get_public_suffix(hostname)
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Return a list of slices of a url's hostname down to the PS+1.

    If `include_ps` is set, the hostname slices will include the public
    suffix. For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    Special cases:
      * a url whose hostname is an IP address yields `[hostname]`;
      * a url with no hostname (relative, `data:`, malformed, ...) or whose
        PS+1 contains no '.' yields an empty list.

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.

    :raises ValueError: if no 'psl' keyword argument reaches the function.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # Without a hostname there is nothing to slice. Return the same empty
    # result used for other malformed urls instead of handing None to the
    # public suffix lookup.
    if hostname is None:
        return []

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    # Everything left of ".<ps_plus_1>" is the subdomain part; it is empty
    # when the hostname already equals the PS+1.
    prefix = hostname[:-(len(ps_plus_1) + 1)]
    labels = prefix.split('.') if prefix else []

    subparts = [
        '.'.join(labels[i:]) + '.' + ps_plus_1 for i in range(len(labels))
    ]
    subparts.append(ps_plus_1)

    if include_ps:
        # ps_plus_1 is known to contain a '.', so the split always yields
        # two pieces; the second one is the public suffix.
        subparts.append(ps_plus_1.split('.', 1)[1])
    return subparts
def get_stripped_url(url, scheme=False):
    """
    Return `url` stripped to (scheme)?+hostname+path.

    Query string, fragment, port and credentials are dropped.

    :param url: the url to strip.
    :param scheme: if true, prefix the result with the url's scheme
        followed by '://'.
    :return: the stripped url. Urls without a hostname (for example
        relative urls) contribute only their path instead of raising.
    """
    purl = urlparse(url)
    # `hostname` is None when the url has no network location; treat that
    # as an empty hostname so the concatenation below cannot fail.
    hostname = purl.hostname or ''
    prefix = purl.scheme + '://' if scheme else ''
    return prefix + hostname + purl.path
def get_stripped_urls(urls, scheme=False):
    """
    Return a set (or list) of urls stripped to (scheme)?+hostname+path.

    Each url is stripped with `get_stripped_url`.

    :param urls: iterable of urls.
    :param scheme: passed through to `get_stripped_url`.
    :return: a set if `urls` is a set, otherwise a list in input order.
    """
    # Collect the stripped urls; the result of each call must be kept,
    # otherwise the returned collection is always empty.
    new_urls = [get_stripped_url(url, scheme) for url in urls]
    if isinstance(urls, set):
        return set(new_urls)
    return new_urls

Просмотреть файл

@ -1,9 +1,8 @@
boto3 boto3
ipaddress domain_utils==0.3.0
jsbeautifier jsbeautifier
pandas pandas
plyvel plyvel
publicsuffix
pyarrow pyarrow
pyspark pyspark
s3fs s3fs

Просмотреть файл

@ -1,2 +0,0 @@
pytest==3.7.4
pytest-flake8==1.0.2

Просмотреть файл

@ -2,8 +2,6 @@ from setuptools import setup
with open('requirements.txt') as f: with open('requirements.txt') as f:
requirements = f.read().splitlines() requirements = f.read().splitlines()
with open('requirements_test.txt') as f:
test_requirements = f.read().splitlines()
setup( setup(
# Meta # Meta
@ -18,8 +16,7 @@ setup(
# Dependencies # Dependencies
install_requires=requirements, install_requires=requirements,
tests_require=test_requirements, setup_requires=['setuptools_scm',],
setup_requires=['setuptools_scm', 'pytest-runner'],
# Packaging # Packaging
include_package_data=True, include_package_data=True,

Просмотреть файл

@ -1,14 +0,0 @@
import pytest
from crawl_utils.domain import (
get_ps_plus_1,
)
def test_get_ps_plus_one_cloudfront():
    """Check the PS+1 returned for an https cloudfront.net url."""
    url = 'https://my.domain.cloudfront.net'
    assert get_ps_plus_1(url) == 'domain.cloudfront.net'
@pytest.mark.skip(reason="Currently not supported")
def test_get_ps_plus_one_no_https():
    """Same check for a hostname given without a scheme (skipped)."""
    bare_host = 'my.domain.cloudfront.net'
    assert get_ps_plus_1(bare_host) == 'domain.cloudfront.net'