Fix browse command to use PS+1, minor bugfix, and additional tests. Closes #85 Closes #78 Closes #82

englehardt 2016-09-08 23:24:28 -04:00
Parent 1141a893d8
Commit ae21cd26c2
8 changed files with 185 additions and 14 deletions

View file

@@ -160,7 +160,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
links = filter(lambda x: x.is_displayed() == True, links)
if len(links) == 0:
break
r = int(random.random()*len(links)-1)
r = int(random.random()*len(links))
logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], links[r].get_attribute("href")))
try:
@@ -171,7 +171,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
bot_mitigation(webdriver)
webdriver.back()
wait_until_loaded(webdriver, 300)
except Exception, e:
except Exception:
pass
def dump_flash_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
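The index change near the top of this diff (dropping the `-1` from `int(random.random()*len(links)-1)`) is the "minor bugfix" in the commit title: because `int()` truncates toward zero, the old expression maps everything in [-1, 1) to index 0, so the first link was chosen about twice as often and the last link essentially never. A small standalone sketch, not part of the commit, that makes the bias visible:

import random
from collections import Counter

# Sample both index formulas over five dummy links and count the indices.
links = ['a', 'b', 'c', 'd', 'e']
trials = 100000

old_counts = Counter(int(random.random() * len(links) - 1) for _ in range(trials))
new_counts = Counter(int(random.random() * len(links)) for _ in range(trials))

# Old formula: int() truncates toward zero, so all of [-1, 1) maps to index 0;
# index 4 is never produced (and -1 only if random() returns exactly 0.0).
print(sorted(old_counts.items()))

# New formula: indices 0..4, each with roughly equal frequency.
print(sorted(new_counts.items()))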

View file

@@ -0,0 +1,132 @@
from publicsuffix import PublicSuffixList, fetch
from ipaddress import ip_address
from urlparse import urlparse
from functools import wraps
import tempfile
import codecs
import os

# We cache the Public Suffix List in temp directory
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')


def get_psl():
    """
    Grabs an updated public suffix list.
    """
    if not os.path.isfile(PSL_CACHE_LOC):
        print "%s does not exist, downloading a copy." % PSL_CACHE_LOC
        psl_file = fetch()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    print "Using psl from cache: %s" % PSL_CACHE_LOC
    psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8')
    return PublicSuffixList(psl_cache)


def load_psl(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
        if not kwargs.has_key('psl'):
            if wrapper.psl is None:
                wrapper.psl = get_psl()
            return function(*args, psl=wrapper.psl, **kwargs)
        else:
            return function(*args, **kwargs)
    wrapper.psl = None
    return wrapper


def is_ip_address(hostname):
    """
    Check if the given string is a valid IP address
    """
    try:
        ip_address(unicode(hostname))
        return True
    except ValueError:
        return False


@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Returns the PS+1 of the url. This will also return
    an IP address if the hostname of the url is a valid
    IP address.

    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
    otherwise a version cached in the system temp directory is used.
    """
    if not kwargs.has_key('psl'):
        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname
    if is_ip_address(hostname):
        return hostname
    elif hostname is None:
        # Possible reasons hostname is None, `url` is:
        # * malformed
        # * a relative url
        # * a `javascript:` or `data:` url
        # * many others
        return
    else:
        return kwargs['psl'].get_public_suffix(hostname)


@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Returns a list of slices of a url's hostname down to the PS+1
    (or PS if include_ps)

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
    otherwise a version cached in the system temp directory is used.
    """
    if not kwargs.has_key('psl'):
        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    subparts = list()
    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
    if subdomains == ['']:
        subdomains = []
    for i in range(len(subdomains)):
        subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
    subparts.append(ps_plus_1)
    if include_ps:
        try:
            subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
        except:
            pass
    return subparts


def get_stripped_url(url, scheme=False):
    """Returns a url stripped to (scheme)?+hostname+path"""
    purl = urlparse(url)
    surl = ''
    if scheme:
        surl += purl.scheme + '://'
    try:
        surl += purl.hostname + purl.path
    except TypeError:
        surl += purl.hostname
    return surl


def get_stripped_urls(urls, scheme=False):
    """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
    new_urls = list()
    for url in urls:
        new_urls.append(get_stripped_url(url, scheme))
    if type(urls) == set:
        return set(new_urls)
    return new_urls
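A brief usage sketch of the new domain_utils helpers, not part of the diff; it assumes the `publicsuffix` and `ipaddress` packages imported above are installed and that the first call can fetch and cache the suffix list. Expected values follow the docstrings above:

import domain_utils as du

# PS+1 ("eTLD+1") of a few URLs; the suffix list is fetched and cached in the
# system temp directory on first use.
print(du.get_ps_plus_1("http://a.b.c.d.com/path?query#frag"))  # d.com
print(du.get_ps_plus_1("http://192.168.1.1/admin"))            # 192.168.1.1 (IPs returned as-is)
print(du.get_ps_plus_1("javascript:alert(1)"))                 # None (no hostname)

# All hostname slices down to the PS+1, optionally including the public suffix.
print(du.hostname_subparts("http://a.b.c.d.com/path"))
# ['a.b.c.d.com', 'b.c.d.com', 'c.d.com', 'd.com']
print(du.hostname_subparts("http://a.b.c.d.com/path", include_ps=True))
# ['a.b.c.d.com', 'b.c.d.com', 'c.d.com', 'd.com', 'com']

# Strip a URL down to hostname + path (optionally keeping the scheme).
print(du.get_stripped_url("http://example.com/test.html?localtest.me"))
# example.com/test.html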

View file

@@ -5,10 +5,11 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from urlparse import urlparse
from urlparse import urljoin
import random
import time
import domain_utils as du
import XPathUtil
#### Basic functions
@@ -35,8 +36,10 @@ def wait_until_loaded(webdriver, timeout, period=0.25):
return False
def get_intra_links(webdriver, url):
domain = urlparse(url).hostname
links = filter(lambda x: (x.get_attribute("href") and x.get_attribute("href").find(domain) > 0 and x.get_attribute("href").find("http") == 0), webdriver.find_elements_by_tag_name("a"))
ps1 = du.get_ps_plus_1(url)
links = filter(lambda x: (x.get_attribute("href") and
du.get_ps_plus_1(urljoin(url, x.get_attribute("href"))) == ps1),
webdriver.find_elements_by_tag_name("a"))
return links
##### Search/Block Functions
@@ -106,7 +109,7 @@ def is_clickable(driver, full_xpath, xpath, timeout = 1):
w = WebDriverWait(driver, timeout)
w.until(EC.element_to_be_clickable(('xpath',xpath)))
return XPathUtil.is_clickable(full_xpath)
except TimeoutException, ElementNotVisibleException:
except (TimeoutException, ElementNotVisibleException):
return False
#TODO Update this. No direct access to DB right now
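The rewritten get_intra_links above replaces a substring check on the raw href with a PS+1 comparison on the resolved link, which is what the new links on the test page below exercise: the old `href.find(domain) > 0` test ignored relative links and treated any URL that merely mentions the domain in its query string as internal. A rough sketch of the difference, not part of the diff; it assumes domain_utils is importable and the suffix list can be cached as above:

from urlparse import urljoin  # urllib.parse on Python 3
import domain_utils as du

page_url = "http://localtest.me:8000/test_pages/simple_a.html"
hrefs = [
    "http://localtest.me:8000/test_pages/simple_c.html",  # absolute, same site
    "simple_d.html",                                      # relative, same site
    "javascript:alert(1)",                                # no hostname
    "https://www.google.com",                             # different site
    "http://example.com/test.html?localtest.me",          # domain only in the query string
]

ps1 = du.get_ps_plus_1(page_url)
for href in hrefs:
    # Old heuristic: substring match on the raw href (misses the relative link,
    # accepts the query-string false positive).
    old_internal = href.find("localtest.me") > 0 and href.find("http") == 0
    # New check: resolve the href against the page and compare PS+1 values.
    new_internal = du.get_ps_plus_1(urljoin(page_url, href)) == ps1
    print("%-55s old=%s new=%s" % (href, old_internal, new_internal))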

Binary data
automation/Extension/firefox/@openwpm-0.0.1.xpi

Binary file not shown.

View file

@@ -11,4 +11,8 @@
</head>
<body onload="set_cookie()">
<a href="http://localtest.me:8000/test_pages/simple_c.html">Click me!</a>
<a href="simple_d.html">Click me also!</a>
<a href="javascript:alert(1)">Click me for a JS alert!</a>
<a href="https://www.google.com">Go to google.com</a>
<a href="http://example.com/test.html?localtest.me">Go to example.com</a>
</body></html>

View file

@@ -0,0 +1,12 @@
<!doctype html>
<html>
<head>
<title>Simple Page D</title>
<script type="application/javascript">
function set_cookie() {
document.cookie = 'test_cookie=Test-Page-D; expires=Tue, 31 Dec 2030 00:00:00 UTC; path=/';
}
</script>
</head>
<body onload="set_cookie()">
</body></html>

View file

@@ -49,7 +49,7 @@ class TestProfile():
manager_params, browser_params = self.get_config(str(tmpdir))
browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
with pytest.raises(ProfileLoadError):
manager = TaskManager.TaskManager(manager_params, browser_params)
manager = TaskManager.TaskManager(manager_params, browser_params) # noqa
def test_profile_saved_when_launch_crashes(self, tmpdir):
manager_params, browser_params = self.get_config(str(tmpdir))

View file

@@ -1,5 +1,4 @@
import pytest
import time
import pytest # noqa
import os
import utilities
from ..automation import CommandSequence
@@ -8,6 +7,7 @@ from ..automation import TaskManager
url_a = utilities.BASE_TEST_URL + '/simple_a.html'
url_b = utilities.BASE_TEST_URL + '/simple_b.html'
url_c = utilities.BASE_TEST_URL + '/simple_c.html'
url_d = utilities.BASE_TEST_URL + '/simple_d.html'
class TestSimpleCommands():
"""Test correctness of simple commands and check
@@ -120,14 +120,19 @@
assert qry_res[1][0] == url_b
def test_browse_http_table_valid(self, tmpdir):
"""Check that 'browse' works and populates http tables correctly."""
"""Check that 'browse' works and populates http tables correctly.
NOTE: Since the browse command is choosing links randomly, there is a
(very small -- 2*0.5^20) chance this test will fail with valid
code.
"""
# Run the test crawl
manager_params, browser_params = self.get_config(str(tmpdir))
manager = TaskManager.TaskManager(manager_params, browser_params)
# Set up two sequential browse commands to two URLs
cs_a = CommandSequence.CommandSequence(url_a)
cs_a.browse(num_links=1, sleep=1)
cs_a.browse(num_links=20, sleep=1)
cs_b = CommandSequence.CommandSequence(url_b)
cs_b.browse(num_links=1, sleep=1)
@@ -163,11 +168,26 @@
" WHERE url = ?", (url_b,))
assert qry_res[0][0] == visit_ids[url_b]
# Page simple_a.html has a link to simple_c.html. This request should
# be keyed to the site visit for simple_a.html
# Page simple_a.html has five links:
# 1) An absolute link to simple_c.html
# 2) A relative link to simple_d.html
# 3) A javascript: link
# 4) A link to www.google.com
# 5) A link to example.com?localtest.me
# We should see page visits for 1 and 2, but not 3-5.
qry_res = utilities.query_db(manager_params['db'],
"SELECT visit_id FROM http_responses"
" WHERE url = ?", (url_c,))
assert len(qry_res) == 1
assert qry_res[0][0] == visit_ids[url_a]
qry_res = utilities.query_db(manager_params['db'],
"SELECT visit_id FROM http_responses"
" WHERE url = ?", (url_d,))
assert qry_res[0][0] == visit_ids[url_a]
# We expect 4 urls: a,c,d and a favicon request
qry_res = utilities.query_db(manager_params['db'],
"SELECT COUNT(DISTINCT url) FROM http_responses"
" WHERE visit_id = ?", (visit_ids[url_a],))
assert qry_res[0][0] == 4
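For reference, the failure probability quoted in the new docstring is just the chance that 20 uniform picks over the page's two crawlable links (simple_c.html and simple_d.html) never select one of them:

# Chance that 20 uniform choices between two links miss simple_c.html entirely,
# plus the same for simple_d.html (union bound, as in the docstring).
p_fail = 2 * 0.5 ** 20
print(p_fail)  # ~1.9e-06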