* Re-enable test_profile_saved_when_launch_crashes

Update `test_profile_saved_when_launch_crashes` so that it no longer
depends on the (no longer supported) proxy to make browser restarts
fail. Instead, set the `FIREFOX_BINARY` environment variable to point
to a nonexistent path.
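
For reference, a minimal pytest sketch of the same technique (the test
name and assertion are illustrative, not part of OpenWPM):

```python
import os


def test_env_override_is_scoped(monkeypatch):
    # monkeypatch.setenv applies only for the duration of this test;
    # pytest restores the original environment on teardown, so other
    # tests still resolve the real Firefox binary.
    monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")
    assert os.environ["FIREFOX_BINARY"] == "/tmp/NOTREAL"
```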

Also, fix a bug in `kill_browser_manager()` that caused OpenWPM to
crash with a `psutil.NoSuchProcess` exception, without archiving the
browser profile, whenever a browser restart failed to launch
geckodriver.
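
The underlying pattern, as a hypothetical standalone helper (the name
and signature are illustrative, not OpenWPM code):

```python
from typing import Optional

import psutil


def process_or_none(pid: int) -> Optional[psutil.Process]:
    # psutil.Process() raises NoSuchProcess when the pid is already
    # gone, which is exactly the state a failed geckodriver launch
    # leaves behind; treat it as "nothing left to kill".
    try:
        return psutil.Process(pid=pid)
    except psutil.NoSuchProcess:
        return None
```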

Finally, make `kill_process_and_children()` honor the timeout passed
via its arguments, which it previously ignored in favor of a hardcoded
value.
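
A self-contained sketch of the fixed helper, assuming the signature
implied by the diff below (the default of 20 seconds is taken from the
removed hardcoded value):

```python
import logging

import psutil


def kill_process_and_children(
    parent_process: psutil.Process,
    logger: logging.Logger,
    timeout: int = 20,
) -> None:
    """Recursively kill a process tree, waiting up to `timeout` seconds."""
    try:
        for child in parent_process.children():
            kill_process_and_children(child, logger)
        parent_process.kill()
        # The fix: use the caller-supplied timeout, not a hardcoded one.
        parent_process.wait(timeout=timeout)
    except psutil.NoSuchProcess:
        logger.debug("Process %i has already exited", parent_process.pid)
```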

* Update docstring of dump_profile

Add a note for callers that they should make sure Firefox is closed
before calling, to avoid ending up with a corrupted database in the
archived profile.
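
A hypothetical illustration of the calling discipline this note asks
for (none of these names are OpenWPM APIs):

```python
import tarfile
from pathlib import Path

import psutil


def archive_closed_profile(
    profile_dir: Path, tar_path: Path, firefox_pid: int
) -> None:
    # Wait for Firefox to exit first, so places.sqlite and its -wal/-shm
    # journal files are quiescent when copied (see section 1.2 of
    # https://www.sqlite.org/howtocorrupt.html).
    try:
        psutil.Process(pid=firefox_pid).wait(timeout=30)
    except psutil.NoSuchProcess:
        pass  # already exited; safe to archive
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(profile_dir, arcname=profile_dir.name)
```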

* Update test_browser_profile_coverage

Remove the buggy and outdated for loop that determined whether a url
was expected to be missing from the places.sqlite database of the
browser profile; we have not observed any missing urls when running
this test.
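
With the loop gone, the check reduces to a plain set difference; a toy
illustration with made-up domains:

```python
req_ps = {"example.com", "mozilla.org"}  # domains we made requests to
hist_ps = {"example.com", "mozilla.org"}  # domains from crawl_history
profile_ps = {"example.com", "mozilla.org"}  # domains in places.sqlite

# Every domain that was both requested and visited via a GetCommand
# must now appear in the profile, with no exceptions.
missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
assert len(missing_urls) == 0
```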
This commit is contained in:
Georgia Kokkinou 2021-04-28 14:04:10 +03:00 committed by GitHub
Parent a159496e22
Commit 3619b55682
5 changed files with 31 additions and 56 deletions


@@ -574,9 +574,15 @@ class BrowserManagerHandle:
         """`geckodriver_pid` is the geckodriver process. We first kill
         the child processes (i.e. firefox) and then kill the geckodriver
         process."""
-        kill_process_and_children(
-            psutil.Process(pid=self.geckodriver_pid), self.logger
-        )
+        try:
+            geckodriver_process = psutil.Process(pid=self.geckodriver_pid)
+        except psutil.NoSuchProcess:
+            self.logger.debug(
+                "BROWSER %i: geckodriver process with pid %i has already"
+                " exited" % (self.browser_id, self.geckodriver_pid)
+            )
+            return
+        kill_process_and_children(geckodriver_process, self.logger)

     def shutdown_browser(self, during_init: bool, force: bool = False) -> None:
         """ Runs the closing tasks for this Browser/BrowserManager """


@@ -21,7 +21,12 @@ def dump_profile(
     compress: bool,
     browser_params: BrowserParamsInternal,
 ) -> None:
-    """Dumps a browser profile to a tar file."""
+    """Dumps a browser profile to a tar file.
+
+    Should only be called when the browser is closed, to prevent
+    database corruption in the archived profile (see section 1.2
+    of https://www.sqlite.org/howtocorrupt.html).
+    """
     assert browser_params.browser_id is not None

     # Creating the folders if need be


@@ -60,7 +60,7 @@ def kill_process_and_children(
         for child in parent_process.children():
             kill_process_and_children(child, logger)
         parent_process.kill()
-        parent_process.wait(timeout=20)
+        parent_process.wait(timeout=timeout)
     except psutil.NoSuchProcess:
         logger.debug("Process %i has already exited", parent_process.pid)
         pass


@@ -47,8 +47,8 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
     """Test the coverage of the browser's profile.

     This verifies that Firefox's places.sqlite database contains all
-    visited sites (with a few exceptions). If it does not, it is likely
-    the profile is lost at some point during the crawl.
+    visited sites. If it does not, it is likely the profile is lost at
+    some point during the crawl.
     """
     # Run the test crawl
     manager_params, browser_params = default_params
@@ -77,17 +77,14 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
             req_ps.add(du.get_ps_plus_1(url))

     hist_ps = set()  # visited domains from crawl_history Table
-    statuses = dict()
     rows = db_utils.query_db(
         crawl_db,
-        "SELECT arguments, command_status FROM crawl_history WHERE"
-        " command='GetCommand'",
+        "SELECT arguments FROM crawl_history WHERE command='GetCommand'",
     )
-    for arguments, command_status in rows:
+    for (arguments,) in rows:
         url = json.loads(arguments)["url"]
         ps = du.get_ps_plus_1(url)
         hist_ps.add(ps)
-        statuses[ps] = command_status

     # Grab urls from Firefox database
     profile_ps = set()  # visited domains from firefox profile
@@ -101,42 +98,12 @@ def test_browser_profile_coverage(default_params, task_manager_creator):
     # We expect a url to be in the Firefox profile if:
     # 1. We've made requests to it
     # 2. The url is a top_url we entered into the address bar
+    #
+    # Previously, we expected some missing urls if the following
+    # conditions were not met, but this is no longer the case:
     # 3. The url successfully loaded (see: Issue #40)
     # 4. The site does not respond to the initial request with a 204
     #    (won't show in FF DB)
+    # See PR #893 to restore this behavior in case this test fails.
     missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
-    unexpected_missing_urls = set()
-    for url in missing_urls:
-        if command_status[url] != "ok":
-            continue
-
-        # Get the visit id for the url
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT visit_id FROM site_visits WHERE site_url = ?",
-            ("http://" + url,),
-        )
-        visit_id = rows[0]
-
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?",
-            (visit_id,),
-        )
-        if rows[0] > 1:
-            continue
-
-        rows = db_utils.query_db(
-            crawl_db,
-            "SELECT response_status, location FROM "
-            "http_responses WHERE visit_id = ?",
-            (visit_id,),
-        )
-        response_status, location = rows[0]
-        if response_status == 204:
-            continue
-        if location == "http://":  # site returned a blank redirect
-            continue
-
-        unexpected_missing_urls.add(url)
-
-    assert len(unexpected_missing_urls) == 0
+    assert len(missing_urls) == 0


@@ -52,26 +52,23 @@ def test_profile_error(default_params, task_manager_creator):
         task_manager_creator((manager_params, browser_params[:1]))


-@pytest.mark.skip(reason="proxy no longer supported, need to update")
-def test_profile_saved_when_launch_crashes(default_params, task_manager_creator):
+def test_profile_saved_when_launch_crashes(
+    monkeypatch, default_params, task_manager_creator
+):
     manager_params, browser_params = default_params
     manager_params.num_browsers = 1
     browser_params[0].profile_archive_dir = (
         manager_params.data_directory / "browser_profile"
     )
-    browser_params[0].proxy = True
     browser_params[0].save_content = "script"
     manager, _ = task_manager_creator((manager_params, browser_params[:1]))
     manager.get(BASE_TEST_URL)

-    # Kill the LevelDBAggregator
-    # This will cause the proxy launch to crash
-    manager.ldb_status_queue.put("DIE")
+    # This will cause browser restarts to fail
+    monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")
     manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
-    manager.get("example.com")  # Cause a selenium crash
+    manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
+    manager.get("example.com")  # Cause a selenium crash to force browser to restart

-    # The browser will fail to launch due to the proxy crashes
     try:
         manager.get(BASE_TEST_URL)
     except CommandExecutionError: