Fix browse command to use PS+1, minor bugfix, and additional tests. Closes #85 Closes #78 Closes #82

englehardt 2016-09-08 23:24:28 -04:00
Parent 1141a893d8
Commit ae21cd26c2
8 changed files with 185 additions and 14 deletions

View file

@@ -160,7 +160,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
links = filter(lambda x: x.is_displayed() == True, links)
if len(links) == 0:
break
r = int(random.random()*len(links)-1)
r = int(random.random()*len(links))
logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], links[r].get_attribute("href")))
try:
@@ -171,7 +171,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
bot_mitigation(webdriver)
webdriver.back()
wait_until_loaded(webdriver, 300)
except Exception, e:
except Exception:
pass
def dump_flash_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
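The index change near the top of this diff (dropping the `-1` from `int(random.random()*len(links)-1)`) is the "minor bugfix" in the commit title: because `int()` truncates toward zero, the old expression maps everything in [-1, 1) to index 0, so the first link was chosen about twice as often and the last link essentially never. A small standalone sketch, not part of the commit, that makes the bias visible:

import random
from collections import Counter

# Sample both index formulas over five dummy links and count the indices.
links = ['a', 'b', 'c', 'd', 'e']
trials = 100000

old_counts = Counter(int(random.random() * len(links) - 1) for _ in range(trials))
new_counts = Counter(int(random.random() * len(links)) for _ in range(trials))

# Old formula: int() truncates toward zero, so all of [-1, 1) maps to index 0;
# index 4 is never produced (and -1 only if random() returns exactly 0.0).
print(sorted(old_counts.items()))

# New formula: indices 0..4, each with roughly equal frequency.
print(sorted(new_counts.items()))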

View file

@@ -0,0 +1,132 @@
from publicsuffix import PublicSuffixList, fetch
from ipaddress import ip_address
from urlparse import urlparse
from functools import wraps
import tempfile
import codecs
import os

# We cache the Public Suffix List in temp directory
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')


def get_psl():
    """
    Grabs an updated public suffix list.
    """
    if not os.path.isfile(PSL_CACHE_LOC):
        print "%s does not exist, downloading a copy." % PSL_CACHE_LOC
        psl_file = fetch()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    print "Using psl from cache: %s" % PSL_CACHE_LOC
    psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8')
    return PublicSuffixList(psl_cache)


def load_psl(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
        if not kwargs.has_key('psl'):
            if wrapper.psl is None:
                wrapper.psl = get_psl()
            return function(*args, psl=wrapper.psl, **kwargs)
        else:
            return function(*args, **kwargs)
    wrapper.psl = None
    return wrapper


def is_ip_address(hostname):
    """
    Check if the given string is a valid IP address
    """
    try:
        ip_address(unicode(hostname))
        return True
    except ValueError:
        return False


@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Returns the PS+1 of the url. This will also return
    an IP address if the hostname of the url is a valid
    IP address.

    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
    otherwise a version cached in the system temp directory is used.
    """
    if not kwargs.has_key('psl'):
        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname
    if is_ip_address(hostname):
        return hostname
    elif hostname is None:
        # Possible reasons hostname is None, `url` is:
        # * malformed
        # * a relative url
        # * a `javascript:` or `data:` url
        # * many others
        return
    else:
        return kwargs['psl'].get_public_suffix(hostname)


@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Returns a list of slices of a url's hostname down to the PS+1
    (or PS if include_ps)

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
    otherwise a version cached in the system temp directory is used.
    """
    if not kwargs.has_key('psl'):
        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    subparts = list()
    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
    if subdomains == ['']:
        subdomains = []
    for i in range(len(subdomains)):
        subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
    subparts.append(ps_plus_1)
    if include_ps:
        try:
            subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
        except:
            pass
    return subparts


def get_stripped_url(url, scheme=False):
    """Returns a url stripped to (scheme)?+hostname+path"""
    purl = urlparse(url)
    surl = ''
    if scheme:
        surl += purl.scheme + '://'
    try:
        surl += purl.hostname + purl.path
    except TypeError:
        surl += purl.hostname
    return surl


def get_stripped_urls(urls, scheme=False):
    """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
    new_urls = list()
    for url in urls:
        new_urls.append(get_stripped_url(url, scheme))
    if type(urls) == set:
        return set(new_urls)
    return new_urls
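A brief usage sketch of the new domain_utils helpers, not part of the diff; it assumes the `publicsuffix` and `ipaddress` packages imported above are installed and that the first call can fetch and cache the suffix list. Expected values follow the docstrings above:

import domain_utils as du

# PS+1 ("eTLD+1") of a few URLs; the suffix list is fetched and cached in the
# system temp directory on first use.
print(du.get_ps_plus_1("http://a.b.c.d.com/path?query#frag"))  # d.com
print(du.get_ps_plus_1("http://192.168.1.1/admin"))            # 192.168.1.1 (IPs returned as-is)
print(du.get_ps_plus_1("javascript:alert(1)"))                 # None (no hostname)

# All hostname slices down to the PS+1, optionally including the public suffix.
print(du.hostname_subparts("http://a.b.c.d.com/path"))
# ['a.b.c.d.com', 'b.c.d.com', 'c.d.com', 'd.com']
print(du.hostname_subparts("http://a.b.c.d.com/path", include_ps=True))
# ['a.b.c.d.com', 'b.c.d.com', 'c.d.com', 'd.com', 'com']

# Strip a URL down to hostname + path (optionally keeping the scheme).
print(du.get_stripped_url("http://example.com/test.html?localtest.me"))
# example.com/test.html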

View file

@@ -5,10 +5,11 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from urlparse import urlparse
from urlparse import urljoin
import random
import time
import domain_utils as du
import XPathUtil
#### Basic functions
@@ -35,8 +36,10 @@ def wait_until_loaded(webdriver, timeout, period=0.25):
return False
def get_intra_links(webdriver, url):
domain = urlparse(url).hostname
links = filter(lambda x: (x.get_attribute("href") and x.get_attribute("href").find(domain) > 0 and x.get_attribute("href").find("http") == 0), webdriver.find_elements_by_tag_name("a"))
ps1 = du.get_ps_plus_1(url)
links = filter(lambda x: (x.get_attribute("href") and
du.get_ps_plus_1(urljoin(url, x.get_attribute("href"))) == ps1),
webdriver.find_elements_by_tag_name("a"))
return links
##### Search/Block Functions
@@ -106,7 +109,7 @@ def is_clickable(driver, full_xpath, xpath, timeout = 1):
w = WebDriverWait(driver, timeout)
w.until(EC.element_to_be_clickable(('xpath',xpath)))
return XPathUtil.is_clickable(full_xpath)
except TimeoutException, ElementNotVisibleException:
except (TimeoutException, ElementNotVisibleException):
return False
#TODO Update this. No direct access to DB right now
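The rewritten get_intra_links above replaces a substring check on the raw href with a PS+1 comparison on the resolved link, which is what the new links on the test page below exercise: the old `href.find(domain) > 0` test ignored relative links and treated any URL that merely mentions the domain in its query string as internal. A rough sketch of the difference, not part of the diff; it assumes domain_utils is importable and the suffix list can be cached as above:

from urlparse import urljoin  # urllib.parse on Python 3
import domain_utils as du

page_url = "http://localtest.me:8000/test_pages/simple_a.html"
hrefs = [
    "http://localtest.me:8000/test_pages/simple_c.html",  # absolute, same site
    "simple_d.html",                                      # relative, same site
    "javascript:alert(1)",                                # no hostname
    "https://www.google.com",                             # different site
    "http://example.com/test.html?localtest.me",          # domain only in the query string
]

ps1 = du.get_ps_plus_1(page_url)
for href in hrefs:
    # Old heuristic: substring match on the raw href (misses the relative link,
    # accepts the query-string false positive).
    old_internal = href.find("localtest.me") > 0 and href.find("http") == 0
    # New check: resolve the href against the page and compare PS+1 values.
    new_internal = du.get_ps_plus_1(urljoin(page_url, href)) == ps1
    print("%-55s old=%s new=%s" % (href, old_internal, new_internal))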

Binary data
automation/Extension/firefox/@openwpm-0.0.1.xpi

Binary file not shown.

View file

@@ -11,4 +11,8 @@
</head>
<body onload="set_cookie()">
<a href="http://localtest.me:8000/test_pages/simple_c.html">Click me!</a>
<a href="simple_d.html">Click me also!</a>
<a href="javascript:alert(1)">Click me for a JS alert!</a>
<a href="https://www.google.com">Go to google.com</a>
<a href="http://example.com/test.html?localtest.me">Go to example.com</a>
</body></html>

View file

@@ -0,0 +1,12 @@
<!doctype html>
<html>
<head>
<title>Simple Page D</title>
<script type="application/javascript">
function set_cookie() {
document.cookie = 'test_cookie=Test-Page-D; expires=Tue, 31 Dec 2030 00:00:00 UTC; path=/';
}
</script>
</head>
<body onload="set_cookie()">
</body></html>

View file

@@ -49,7 +49,7 @@ class TestProfile():
manager_params, browser_params = self.get_config(str(tmpdir))
browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
with pytest.raises(ProfileLoadError):
manager = TaskManager.TaskManager(manager_params, browser_params)
manager = TaskManager.TaskManager(manager_params, browser_params) # noqa
def test_profile_saved_when_launch_crashes(self, tmpdir):
manager_params, browser_params = self.get_config(str(tmpdir))

View file

@@ -1,5 +1,4 @@
import pytest
import time
import pytest # noqa
import os
import utilities
from ..automation import CommandSequence
@@ -8,6 +7,7 @@ from ..automation import TaskManager
url_a = utilities.BASE_TEST_URL + '/simple_a.html'
url_b = utilities.BASE_TEST_URL + '/simple_b.html'
url_c = utilities.BASE_TEST_URL + '/simple_c.html'
url_d = utilities.BASE_TEST_URL + '/simple_d.html'
class TestSimpleCommands():
"""Test correctness of simple commands and check
@@ -120,14 +120,19 @@
assert qry_res[1][0] == url_b
def test_browse_http_table_valid(self, tmpdir):
"""Check that 'browse' works and populates http tables correctly."""
"""Check that 'browse' works and populates http tables correctly.
NOTE: Since the browse command is choosing links randomly, there is a
(very small -- 2*0.5^20) chance this test will fail with valid
code.
"""
# Run the test crawl
manager_params, browser_params = self.get_config(str(tmpdir))
manager = TaskManager.TaskManager(manager_params, browser_params)
# Set up two sequential browse commands to two URLs
cs_a = CommandSequence.CommandSequence(url_a)
cs_a.browse(num_links=1, sleep=1)
cs_a.browse(num_links=20, sleep=1)
cs_b = CommandSequence.CommandSequence(url_b)
cs_b.browse(num_links=1, sleep=1)
@@ -163,11 +168,26 @@
" WHERE url = ?", (url_b,))
assert qry_res[0][0] == visit_ids[url_b]
# Page simple_a.html has a link to simple_c.html. This request should
# be keyed to the site visit for simple_a.html
# Page simple_a.html has five links:
# 1) An absolute link to simple_c.html
# 2) A relative link to simple_d.html
# 3) A javascript: link
# 4) A link to www.google.com
# 5) A link to example.com?localtest.me
# We should see page visits for 1 and 2, but not 3-5.
qry_res = utilities.query_db(manager_params['db'],
"SELECT visit_id FROM http_responses"
" WHERE url = ?", (url_c,))
assert len(qry_res) == 1
assert qry_res[0][0] == visit_ids[url_a]
qry_res = utilities.query_db(manager_params['db'],
"SELECT visit_id FROM http_responses"
" WHERE url = ?", (url_d,))
assert qry_res[0][0] == visit_ids[url_a]
# We expect 4 urls: a,c,d and a favicon request
qry_res = utilities.query_db(manager_params['db'],
"SELECT COUNT(DISTINCT url) FROM http_responses"
" WHERE visit_id = ?", (visit_ids[url_a],))
assert qry_res[0][0] == 4
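For reference, the failure probability quoted in the new docstring is just the chance that 20 uniform picks over the page's two crawlable links (simple_c.html and simple_d.html) never select one of them:

# Chance that 20 uniform choices between two links miss simple_c.html entirely,
# plus the same for simple_d.html (union bound, as in the docstring).
p_fail = 2 * 0.5 ** 20
print(p_fail)  # ~1.9e-06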