Domain utils moved to separate package.

Fixes #15.
2020-03-25 17:18:01 -05:00 · 2020-03-25 17:18:01 -05:00 · e9e883b662
--- a/README.md
+++ b/README.md
@ -1,11 +1,15 @@

-## Build and run tests
+# `openwpm_utils`

-    $ python setup.py test
+A collection of utilities for working with OpenWPM datasets.
+
+The domain utilities are available as a standalone package: [domain_utils](https://github.com/mozilla/domain_utils)
+
+
+## Installation
+
+    $ pip install openwpm-utils

 Or

-    $ pip install -r requirements.txt
-    $ pip install -r requirements_test.txt
-    $ py.test
-
+    $ python setup.py install
--- a/openwpm_utils/domain.py
+++ b/openwpm_utils/domain.py
@ -1,146 +1,2 @@
-from __future__ import absolute_import
-from __future__ import print_function
-import tempfile
-import codecs
-import os
-import six
-
-from ipaddress import ip_address
-from functools import wraps
-from publicsuffix import PublicSuffixList, fetch
-from six.moves import range
-from six.moves.urllib.parse import urlparse
-
-# We cache the Public Suffix List in temp directory
-PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')
-
-
-def get_psl(location=PSL_CACHE_LOC):
-    """
-    Grabs an updated public suffix list.
-    """
-    if not os.path.isfile(location):
-        psl_file = fetch()
-        with codecs.open(location, 'w', encoding='utf8') as f:
-            f.write(psl_file.read())
-    psl_cache = codecs.open(location, encoding='utf8')
-    return PublicSuffixList(psl_cache)
-
-
-def load_psl(function):
-    @wraps(function)
-    def wrapper(*args, **kwargs):
-        if 'psl' not in kwargs:
-            if wrapper.psl is None:
-                wrapper.psl = get_psl()
-            return function(*args, psl=wrapper.psl, **kwargs)
-        else:
-            return function(*args, **kwargs)
-    wrapper.psl = None
-    return wrapper
-
-
-def is_ip_address(hostname):
-    """
-    Check if the given string is a valid IP address
-    """
-    try:
-        ip_address(six.text_type(hostname))
-        return True
-    except ValueError:
-        return False
-
-
-@load_psl
-def get_ps_plus_1(url, **kwargs):
-    """
-    Returns the PS+1 of the url. This will also return
-    an IP address if the hostname of the url is a valid
-    IP address.
-
-    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
-    otherwise a version cached in the system temp directory is used.
-    """
-    if 'psl' not in kwargs:
-        raise ValueError(
-            "A PublicSuffixList must be passed as a keyword argument.")
-    hostname = urlparse(url).hostname
-    if is_ip_address(hostname):
-        return hostname
-    elif hostname is None:
-        # Possible reasons hostname is None, `url` is:
-        # * malformed
-        # * a relative url
-        # * a `javascript:` or `data:` url
-        # * many others
-        return
-    else:
-        return kwargs['psl'].get_public_suffix(hostname)
-
-
-@load_psl
-def hostname_subparts(url, include_ps=False, **kwargs):
-    """
-    Returns a list of slices of a url's hostname down to the PS+1
-
-    If `include_ps` is set, the hostname slices will include the public suffix
-
-    For example: http://a.b.c.d.com/path?query#frag would yield:
-        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
-        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True
-
-    An (optional) PublicSuffixList object can be passed with keyword arg 'psl'.
-    otherwise a version cached in the system temp directory is used.
-    """
-    if 'psl' not in kwargs:
-        raise ValueError(
-            "A PublicSuffixList must be passed as a keyword argument.")
-    hostname = urlparse(url).hostname
-
-    # If an IP address, just return a single item list with the IP
-    if is_ip_address(hostname):
-        return [hostname]
-
-    subparts = list()
-    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)
-
-    # We expect all ps_plus_1s to have at least one '.'
-    # If they don't, the url was likely malformed, so we'll just return an
-    # empty list
-    if '.' not in ps_plus_1:
-        return []
-    subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
-    if subdomains == ['']:
-        subdomains = []
-    for i in range(len(subdomains)):
-        subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
-    subparts.append(ps_plus_1)
-    if include_ps:
-        try:
-            subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
-        except Exception:
-            pass
-    return subparts
-
-
-def get_stripped_url(url, scheme=False):
-    """Returns a url stripped to (scheme)?+hostname+path"""
-    purl = urlparse(url)
-    surl = ''
-    if scheme:
-        surl += purl.scheme + '://'
-    try:
-        surl += purl.hostname + purl.path
-    except TypeError:
-        surl += purl.hostname
-    return surl
-
-
-def get_stripped_urls(urls, scheme=False):
-    """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
-    new_urls = list()
-    for url in urls:
-        get_stripped_url(url, scheme)
-    if type(urls) == set:
-        return set(new_urls)
-    return new_urls
+# Leave for compatibility
+from domain_utils import *
--- a/requirements.txt
+++ b/requirements.txt
@ -1,9 +1,8 @@
 boto3
-ipaddress
+domain_utils==0.3.0
 jsbeautifier
 pandas
 plyvel
-publicsuffix
 pyarrow
 pyspark
 s3fs
--- a/requirements_test.txt
+++ b/requirements_test.txt
@ -1,2 +0,0 @@
-pytest==3.7.4
-pytest-flake8==1.0.2
--- a/setup.py
+++ b/setup.py
@ -2,8 +2,6 @@ from setuptools import setup

 with open('requirements.txt') as f:
    requirements = f.read().splitlines()
-with open('requirements_test.txt') as f:
-    test_requirements = f.read().splitlines()

 setup(
    # Meta
@ -18,8 +16,7 @@ setup(

    # Dependencies
    install_requires=requirements,
-    tests_require=test_requirements,
-    setup_requires=['setuptools_scm', 'pytest-runner'],
+    setup_requires=['setuptools_scm',],

    # Packaging
    include_package_data=True,
--- a/tests/test_domain.py
+++ b/tests/test_domain.py
@ -1,14 +0,0 @@
-import pytest
-from crawl_utils.domain import (
-    get_ps_plus_1,
-)
-
-
-def test_get_ps_plus_one_cloudfront():
-    assert get_ps_plus_1(
-        'https://my.domain.cloudfront.net') == 'domain.cloudfront.net'
-
-
-@pytest.mark.skip(reason="Currently not supported")
-def test_get_ps_plus_one_no_https():
-    assert get_ps_plus_1('my.domain.cloudfront.net') == 'domain.cloudfront.net'