Mirror of https://github.com/openwpm/OpenWPM.git
Update profile tests (#893)
* Re-enable test_profile_saved_when_launch_crashes

  Update `test_profile_saved_when_launch_crashes` so that it no longer depends on the no-longer-supported proxy to make browser restarts fail; instead, set the `FIREFOX_BINARY` environment variable to point to a nonexistent path. Also fix a bug in `kill_browser_manager()` that, whenever a browser restart failed to launch geckodriver, caused OpenWPM to crash with a `psutil.NoSuchProcess` exception without archiving the browser profile. Finally, make `kill_process_and_children()` use the timeout passed via its arguments, which it previously ignored.

* Update docstring of dump_profile

  Add a note telling callers that they should make sure Firefox is closed, to avoid ending up with a corrupted database in the archived profile.

* Update test_browser_profile_coverage

  Remove the buggy and outdated for loop that determined whether a url was expected to be missing from the places.sqlite database of the browser profile, as we have not observed any missing urls when running this test.
This commit is contained in:
Parent: a159496e22
Commit: 3619b55682
@@ -574,9 +574,15 @@ class BrowserManagerHandle:
         """`geckodriver_pid` is the geckodriver process. We first kill
         the child processes (i.e. firefox) and then kill the geckodriver
         process."""
-        kill_process_and_children(
-            psutil.Process(pid=self.geckodriver_pid), self.logger
-        )
+        try:
+            geckodriver_process = psutil.Process(pid=self.geckodriver_pid)
+        except psutil.NoSuchProcess:
+            self.logger.debug(
+                "BROWSER %i: geckodriver process with pid %i has already"
+                " exited" % (self.browser_id, self.geckodriver_pid)
+            )
+            return
+        kill_process_and_children(geckodriver_process, self.logger)

     def shutdown_browser(self, during_init: bool, force: bool = False) -> None:
         """ Runs the closing tasks for this Browser/BrowserManager """
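Note: `psutil` can raise `NoSuchProcess` both when constructing a `Process` for a pid that has already exited and when signaling a process that exits in the meantime, which is why this hunk and the `kill_process_and_children` hunk below each guard for it. A minimal standalone sketch of the pattern (illustrative only, not OpenWPM code):

```python
import psutil

def kill_if_running(pid: int) -> None:
    # Process() raises NoSuchProcess if the pid is already gone...
    try:
        proc = psutil.Process(pid=pid)
    except psutil.NoSuchProcess:
        return
    # ...and kill() can raise it too, if the process exits in between.
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        pass
```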
@@ -21,7 +21,12 @@ def dump_profile(
     compress: bool,
     browser_params: BrowserParamsInternal,
 ) -> None:
-    """Dumps a browser profile to a tar file."""
+    """Dumps a browser profile to a tar file.
+
+    Should only be called when the browser is closed, to prevent
+    database corruption in the archived profile (see section 1.2
+    of https://www.sqlite.org/howtocorrupt.html).
+    """
     assert browser_params.browser_id is not None

     # Creating the folders if need be
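Note: the corruption risk the new docstring points to arises when a SQLite file is copied while another connection still has uncommitted writes. A self-contained toy illustration of the unsafe pattern (not OpenWPM code):

```python
import shutil
import sqlite3
import tempfile
from pathlib import Path

workdir = Path(tempfile.mkdtemp())
db = workdir / "places.sqlite"

conn = sqlite3.connect(str(db))
conn.execute("CREATE TABLE moz_places (url TEXT)")
conn.execute("INSERT INTO moz_places VALUES ('https://example.com')")
# The INSERT is still inside an open, uncommitted transaction here, so
# copying the file now is exactly the unsafe pattern the docstring warns
# about: the bytes on disk need not form a consistent database.
shutil.copy(db, workdir / "archived.sqlite")
conn.commit()
conn.close()
```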
@@ -60,7 +60,7 @@ def kill_process_and_children(
             kill_process_and_children(child, logger)

         parent_process.kill()
-        parent_process.wait(timeout=20)
+        parent_process.wait(timeout=timeout)
     except psutil.NoSuchProcess:
         logger.debug("Process %i has already exited", parent_process.pid)
         pass
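Note: for readers without the surrounding file, here is a sketch of the whole function after this fix, assuming its signature is `kill_process_and_children(parent_process, logger, timeout=20)` (the signature line itself is outside the hunk):

```python
import logging

import psutil

def kill_process_and_children(
    parent_process: psutil.Process,
    logger: logging.Logger,
    timeout: int = 20,
) -> None:
    """Recursively kill a process tree, waiting up to `timeout` seconds."""
    try:
        # Kill the children first, so firefox dies before geckodriver.
        for child in parent_process.children():
            kill_process_and_children(child, logger)

        parent_process.kill()
        # The fix above: honor the caller's timeout instead of a
        # hard-coded 20 seconds.
        parent_process.wait(timeout=timeout)
    except psutil.NoSuchProcess:
        logger.debug("Process %i has already exited", parent_process.pid)
```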
@@ -47,8 +47,8 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
     """Test the coverage of the browser's profile.

     This verifies that Firefox's places.sqlite database contains all
-    visited sites (with a few exceptions). If it does not, it is likely
-    the profile is lost at some point during the crawl.
+    visited sites. If it does not, it is likely the profile is lost at
+    some point during the crawl.
     """
     # Run the test crawl
     manager_params, browser_params = default_params
@@ -77,17 +77,14 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
         req_ps.add(du.get_ps_plus_1(url))

     hist_ps = set()  # visited domains from crawl_history Table
-    statuses = dict()
     rows = db_utils.query_db(
         crawl_db,
-        "SELECT arguments, command_status FROM crawl_history WHERE"
-        " command='GetCommand'",
+        "SELECT arguments FROM crawl_history WHERE command='GetCommand'",
     )
-    for arguments, command_status in rows:
+    for (arguments,) in rows:
         url = json.loads(arguments)["url"]
         ps = du.get_ps_plus_1(url)
         hist_ps.add(ps)
-        statuses[ps] = command_status

     # Grab urls from Firefox database
     profile_ps = set()  # visited domains from firefox profile
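Note: `du.get_ps_plus_1` reduces a url to its registrable domain ("public suffix plus one"), so pages and subdomains of one site compare equal. A quick illustration, assuming `du` is the standalone domain_utils package that OpenWPM's tests import:

```python
import domain_utils as du  # assumed: the standalone domain-utils package

# eTLD+1 collapses subdomains and paths onto the registrable domain.
assert du.get_ps_plus_1("http://www.example.com/page") == "example.com"
assert du.get_ps_plus_1("https://blog.example.co.uk/post") == "example.co.uk"
```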
@@ -101,42 +98,12 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
     # We expect a url to be in the Firefox profile if:
     # 1. We've made requests to it
     # 2. The url is a top_url we entered into the address bar
+    #
+    # Previously, we expected some missing urls if the following
+    # conditions were not met, but this is no longer the case:
     # 3. The url successfully loaded (see: Issue #40)
     # 4. The site does not respond to the initial request with a 204
     #    (won't show in FF DB)
+    # See PR #893 to restore this behavior in case this test fails.
     missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
-    unexpected_missing_urls = set()
-    for url in missing_urls:
-        if command_status[url] != "ok":
-            continue
-
-        # Get the visit id for the url
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT visit_id FROM site_visits WHERE site_url = ?",
-            ("http://" + url,),
-        )
-        visit_id = rows[0]
-
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?",
-            (visit_id,),
-        )
-        if rows[0] > 1:
-            continue
-
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT response_status, location FROM "
-            "http_responses WHERE visit_id = ?",
-            (visit_id,),
-        )
-        response_status, location = rows[0]
-        if response_status == 204:
-            continue
-        if location == "http://":  # site returned a blank redirect
-            continue
-        unexpected_missing_urls.add(url)
-
-    assert len(unexpected_missing_urls) == 0
+    assert len(missing_urls) == 0
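Note: the assertion that remains is plain set algebra: a domain counts as missing only if it appears in both the request log and the command history yet is absent from the profile. A toy illustration:

```python
req_ps = {"example.com", "tracker.net", "cdn.net"}  # domains with requests
hist_ps = {"example.com", "tracker.net"}            # domains from GetCommand
profile_ps = {"example.com"}                        # domains in places.sqlite

# Missing = visited according to both sources, but absent from the profile.
missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
assert missing_urls == {"tracker.net"}
```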
@@ -52,26 +52,23 @@ def test_profile_error(default_params, task_manager_creator):
         task_manager_creator((manager_params, browser_params[:1]))


-@pytest.mark.skip(reason="proxy no longer supported, need to update")
-def test_profile_saved_when_launch_crashes(default_params, task_manager_creator):
+def test_profile_saved_when_launch_crashes(
+    monkeypatch, default_params, task_manager_creator
+):
     manager_params, browser_params = default_params
     manager_params.num_browsers = 1
     browser_params[0].profile_archive_dir = (
         manager_params.data_directory / "browser_profile"
     )
-    browser_params[0].proxy = True
     browser_params[0].save_content = "script"
     manager, _ = task_manager_creator((manager_params, browser_params[:1]))
     manager.get(BASE_TEST_URL)

-    # Kill the LevelDBAggregator
-    # This will cause the proxy launch to crash
-    manager.ldb_status_queue.put("DIE")
+    # This will cause browser restarts to fail
+    monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")
     manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
     manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
-    manager.get("example.com")  # Cause a selenium crash
+    manager.get("example.com")  # Cause a selenium crash to force browser to restart

-    # The browser will fail to launch due to the proxy crashes
     try:
         manager.get(BASE_TEST_URL)
     except CommandExecutionError:
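Note: pytest's `monkeypatch.setenv` scopes the environment change to the test, so `FIREFOX_BINARY` is restored automatically during teardown. A minimal self-contained illustration:

```python
import os

def test_firefox_binary_is_patched(monkeypatch):
    # The change is visible inside the test and undone by pytest afterwards.
    monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")
    assert os.environ["FIREFOX_BINARY"] == "/tmp/NOTREAL"
```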