Mirror of https://github.com/openwpm/OpenWPM.git

Fix browse command to use PS+1, minor bugfix, and additional tests. Closes #85 Closes #78 Closes #82

This commit is contained in:
Parent: 1141a893d8
Commit: ae21cd26c2
@@ -160,7 +160,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
         links = filter(lambda x: x.is_displayed() == True, links)
         if len(links) == 0:
             break
-        r = int(random.random()*len(links)-1)
+        r = int(random.random()*len(links))
         logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], links[r].get_attribute("href")))

         try:
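Note: the one-character change above is likely the "minor bugfix" from the commit message. Because int() truncates toward zero, the old expression int(random.random()*len(links)-1) can only return -1 if random.random() returns exactly 0.0; what it actually does is pick index 0 twice as often and never select the last link. A standalone sketch with a hypothetical link count:

import random

n = 5  # hypothetical number of candidate links
old = [int(random.random()*n - 1) for _ in range(100000)]
new = [int(random.random()*n) for _ in range(100000)]

# Old: random.random()*n - 1 lies in [-1, n-1) and int() truncates toward
# zero, so values land in 0..n-2 -- the last link is unreachable.
assert n - 1 not in old
# New: uniform over 0..n-1; every link is reachable (holds with
# overwhelming probability at this sample size).
assert set(new) == set(range(n))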
@@ -171,7 +171,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
             bot_mitigation(webdriver)
             webdriver.back()
             wait_until_loaded(webdriver, 300)
-        except Exception, e:
+        except Exception:
             pass

 def dump_flash_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
@@ -0,0 +1,132 @@
+from publicsuffix import PublicSuffixList, fetch
+from ipaddress import ip_address
+from urlparse import urlparse
+from functools import wraps
+import tempfile
+import codecs
+import os
+
+# We cache the Public Suffix List in the system temp directory
+PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')
+
+def get_psl():
+    """
+    Grabs an updated public suffix list.
+    """
+    if not os.path.isfile(PSL_CACHE_LOC):
+        print "%s does not exist, downloading a copy." % PSL_CACHE_LOC
+        psl_file = fetch()
+        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
+            f.write(psl_file.read())
+    print "Using psl from cache: %s" % PSL_CACHE_LOC
+    psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8')
+    return PublicSuffixList(psl_cache)
+
+def load_psl(function):
+    @wraps(function)
+    def wrapper(*args, **kwargs):
+        if not kwargs.has_key('psl'):
+            if wrapper.psl is None:
+                wrapper.psl = get_psl()
+            return function(*args, psl=wrapper.psl, **kwargs)
+        else:
+            return function(*args, **kwargs)
+    wrapper.psl = None
+    return wrapper
+
+def is_ip_address(hostname):
+    """
+    Check if the given string is a valid IP address
+    """
+    try:
+        ip_address(unicode(hostname))
+        return True
+    except ValueError:
+        return False
+
+@load_psl
+def get_ps_plus_1(url, **kwargs):
+    """
+    Returns the PS+1 of the url. This will also return
+    an IP address if the hostname of the url is a valid
+    IP address.
+
+    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
+    otherwise a version cached in the system temp directory is used.
+    """
+    if not kwargs.has_key('psl'):
+        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
+    hostname = urlparse(url).hostname
+    if is_ip_address(hostname):
+        return hostname
+    elif hostname is None:
+        # Possible reasons hostname is None, `url` is:
+        # * malformed
+        # * a relative url
+        # * a `javascript:` or `data:` url
+        # * many others
+        return
+    else:
+        return kwargs['psl'].get_public_suffix(hostname)
+
+@load_psl
+def hostname_subparts(url, include_ps=False, **kwargs):
+    """
+    Returns a list of slices of a url's hostname down to the PS+1 (or PS if include_ps)
+
+    For example: http://a.b.c.d.com/path?query#frag would yield:
+    [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
+    [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True
+
+    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
+    otherwise a version cached in the system temp directory is used.
+    """
+    if not kwargs.has_key('psl'):
+        raise ValueError("A PublicSuffixList must be passed as a keyword argument.")
+    hostname = urlparse(url).hostname
+
+    # If an IP address, just return a single item list with the IP
+    if is_ip_address(hostname):
+        return [hostname]
+
+    subparts = list()
+    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)
+
+    # We expect all ps_plus_1s to have at least one '.'
+    # If they don't, the url was likely malformed, so we'll just return an
+    # empty list
+    if '.' not in ps_plus_1:
+        return []
+    subdomains = hostname[:-(len(ps_plus_1)+1)].split('.')
+    if subdomains == ['']:
+        subdomains = []
+    for i in range(len(subdomains)):
+        subparts.append('.'.join(subdomains[i:])+'.'+ps_plus_1)
+    subparts.append(ps_plus_1)
+    if include_ps:
+        try:
+            subparts.append(ps_plus_1[ps_plus_1.index('.')+1:])
+        except:
+            pass
+    return subparts
+
+def get_stripped_url(url, scheme=False):
+    """Returns a url stripped to (scheme)?+hostname+path"""
+    purl = urlparse(url)
+    surl = ''
+    if scheme:
+        surl += purl.scheme + '://'
+    try:
+        surl += purl.hostname + purl.path
+    except TypeError:
+        surl += purl.hostname
+    return surl
+
+def get_stripped_urls(urls, scheme=False):
+    """ Returns a set (or list) of urls stripped to (scheme)?+hostname+path """
+    new_urls = list()
+    for url in urls:
+        new_urls.append(get_stripped_url(url, scheme))
+    if type(urls) == set:
+        return set(new_urls)
+    return new_urls
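Note: a quick sketch of how the new helpers compose (Python 2, to match the module above); the literal results follow the docstrings and code comments, assuming a current public suffix list:

import domain_utils as du

print du.get_ps_plus_1('http://a.b.c.d.com/path?query#frag')  # 'd.com'
print du.get_ps_plus_1('http://192.168.1.1/index.html')       # '192.168.1.1'
print du.get_ps_plus_1('javascript:alert(1)')                 # None (no hostname)
print du.hostname_subparts('http://a.b.c.d.com/', include_ps=True)
# ['a.b.c.d.com', 'b.c.d.com', 'c.d.com', 'd.com', 'com']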
@@ -5,10 +5,12 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.common.exceptions import TimeoutException
 from selenium.common.exceptions import ElementNotVisibleException
 from selenium.common.exceptions import NoSuchElementException
 from urlparse import urlparse
+from urlparse import urljoin
 import random
 import time

+import domain_utils as du
 import XPathUtil

 #### Basic functions
@@ -35,8 +36,10 @@ def wait_until_loaded(webdriver, timeout, period=0.25):
     return False

 def get_intra_links(webdriver, url):
-    domain = urlparse(url).hostname
-    links = filter(lambda x: (x.get_attribute("href") and x.get_attribute("href").find(domain) > 0 and x.get_attribute("href").find("http") == 0), webdriver.find_elements_by_tag_name("a"))
+    ps1 = du.get_ps_plus_1(url)
+    links = filter(lambda x: (x.get_attribute("href") and
+                              du.get_ps_plus_1(urljoin(url, x.get_attribute("href"))) == ps1),
+                   webdriver.find_elements_by_tag_name("a"))
     return links

 ##### Search/Block Functions
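Note: the rewrite above makes "internal" mean "same PS+1" instead of "page hostname appears as a substring of an absolute http URL". A sketch of the difference on hypothetical hrefs (du is the new domain_utils module):

from urlparse import urljoin
import domain_utils as du

page = 'http://news.example.com/story.html'
hrefs = [
    'http://example.com/about',           # other subdomain: internal now
    'relative/page.html',                 # relative link: internal now
    'http://evil.com/?news.example.com',  # substring trick: excluded now
    'javascript:void(0)',                 # no hostname: PS+1 is None
]
ps1 = du.get_ps_plus_1(page)  # 'example.com'
internal = [h for h in hrefs
            if du.get_ps_plus_1(urljoin(page, h)) == ps1]
# internal == ['http://example.com/about', 'relative/page.html']

The old check missed subdomain and relative links and could be fooled by a foreign URL carrying the hostname in its query string, which is exactly what the example.com?localtest.me link in the updated test page exercises.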
@@ -106,7 +109,7 @@ def is_clickable(driver, full_xpath, xpath, timeout = 1):
         w = WebDriverWait(driver, timeout)
         w.until(EC.element_to_be_clickable(('xpath',xpath)))
         return XPathUtil.is_clickable(full_xpath)
-    except TimeoutException, ElementNotVisibleException:
+    except (TimeoutException, ElementNotVisibleException):
         return False

 #TODO Update this. No direct access to DB right now
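Note: the except fix above changes behavior, not just style. In Python 2, `except TimeoutException, ElementNotVisibleException:` catches only TimeoutException and rebinds the name ElementNotVisibleException to the caught instance, so a not-visible element would have escaped the handler. A standalone sketch with stand-in exception classes:

class SlowError(Exception):
    pass

class HiddenError(Exception):
    pass

def old_clause(exc):
    # Same shape as `except TimeoutException, ElementNotVisibleException:`
    # -- the comma means "catch SlowError, bind it to the name caught".
    try:
        raise exc
    except SlowError, caught:
        return 'caught %r' % caught

def new_clause(exc):
    # Same shape as the fixed `except (TimeoutException, ElementNotVisibleException):`
    try:
        raise exc
    except (SlowError, HiddenError):
        return 'caught'

print new_clause(HiddenError())  # 'caught'
print old_clause(SlowError())    # "caught SlowError()"
print old_clause(HiddenError())  # HiddenError propagates: the old form missed it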
Binary data: automation/Extension/firefox/@openwpm-0.0.1.xpi
Binary file not shown.
@@ -11,4 +11,8 @@
 </head>
 <body onload="set_cookie()">
 <a href="http://localtest.me:8000/test_pages/simple_c.html">Click me!</a>
+<a href="simple_d.html">Click me also!</a>
+<a href="javascript:alert(1)">Click me for a JS alert!</a>
+<a href="https://www.google.com">Go to google.com</a>
+<a href="http://example.com/test.html?localtest.me">Go to example.com</a>
 </body></html>
@@ -0,0 +1,12 @@
+<!doctype html>
+<html>
+<head>
+<title>Simple Page D</title>
+<script type="application/javascript">
+function set_cookie() {
+    document.cookie = 'test_cookie=Test-Page-D; expires=Tue, 31 Dec 2030 00:00:00 UTC; path=/';
+}
+</script>
+</head>
+<body onload="set_cookie()">
+</body></html>
@@ -49,7 +49,7 @@ class TestProfile():
         manager_params, browser_params = self.get_config(str(tmpdir))
         browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
         with pytest.raises(ProfileLoadError):
-            manager = TaskManager.TaskManager(manager_params, browser_params)
+            manager = TaskManager.TaskManager(manager_params, browser_params)  # noqa

     def test_profile_saved_when_launch_crashes(self, tmpdir):
         manager_params, browser_params = self.get_config(str(tmpdir))
@@ -1,5 +1,4 @@
-import pytest
-import time
+import pytest  # noqa
 import os
 import utilities
 from ..automation import CommandSequence
@@ -8,6 +7,7 @@ from ..automation import TaskManager
 url_a = utilities.BASE_TEST_URL + '/simple_a.html'
 url_b = utilities.BASE_TEST_URL + '/simple_b.html'
 url_c = utilities.BASE_TEST_URL + '/simple_c.html'
+url_d = utilities.BASE_TEST_URL + '/simple_d.html'

 class TestSimpleCommands():
     """Test correctness of simple commands and check
@@ -120,14 +120,19 @@ class TestSimpleCommands():
         assert qry_res[1][0] == url_b

     def test_browse_http_table_valid(self, tmpdir):
-        """Check that 'browse' works and populates http tables correctly."""
+        """Check that 'browse' works and populates http tables correctly.
+
+        NOTE: Since the browse command is choosing links randomly, there is a
+        (very small -- 2*0.5^20) chance this test will fail with valid
+        code.
+        """
         # Run the test crawl
         manager_params, browser_params = self.get_config(str(tmpdir))
         manager = TaskManager.TaskManager(manager_params, browser_params)

         # Set up two sequential browse commands to two URLS
         cs_a = CommandSequence.CommandSequence(url_a)
-        cs_a.browse(num_links=1, sleep=1)
+        cs_a.browse(num_links=20, sleep=1)
         cs_b = CommandSequence.CommandSequence(url_b)
         cs_b.browse(num_links=1, sleep=1)
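Note: the docstring's 2*0.5^20 is a union bound. simple_a.html exposes two followable links (simple_c and simple_d), each of the 20 browse iterations picks one uniformly at random, and the assertions in the next hunk need every link to be hit at least once. The chance a given link is never chosen is 0.5^20, so:

# Quick arithmetic check of the docstring's failure bound.
p_miss_one = 0.5 ** 20   # a specific link unchosen in all 20 draws
bound = 2 * p_miss_one   # union bound over the two links
print bound              # ~1.9e-06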
@@ -163,11 +168,26 @@ class TestSimpleCommands():
                                      " WHERE url = ?", (url_b,))
         assert qry_res[0][0] == visit_ids[url_b]

-        # Page simple_a.html has a link to simple_c.html. This request should
-        # be keyed to the site visit for simple_a.html
+        # Page simple_a.html has five links:
+        # 1) An absolute link to simple_c.html
+        # 2) A relative link to simple_d.html
+        # 3) A javascript: link
+        # 4) A link to www.google.com
+        # 5) A link to example.com?localtest.me
+        # We should see page visits for 1 and 2, but not 3-5.
         qry_res = utilities.query_db(manager_params['db'],
                                      "SELECT visit_id FROM http_responses"
                                      " WHERE url = ?", (url_c,))
+        assert len(qry_res) == 1
         assert qry_res[0][0] == visit_ids[url_a]
+        qry_res = utilities.query_db(manager_params['db'],
+                                     "SELECT visit_id FROM http_responses"
+                                     " WHERE url = ?", (url_d,))
+        assert qry_res[0][0] == visit_ids[url_a]
+
+        # We expect 4 urls: a, c, d and a favicon request
+        qry_res = utilities.query_db(manager_params['db'],
+                                     "SELECT COUNT(DISTINCT url) FROM http_responses"
+                                     " WHERE visit_id = ?", (visit_ids[url_a],))
+        assert qry_res[0][0] == 4