From daa6dba4e3c9cd2b5b6ccd4fb1d01b174aef7c7d Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 1 Mar 2021 17:18:00 +0200 Subject: [PATCH 01/22] Enable stateful crawling and tests Reenable stateful crawling and profile tests. Also, update the docs now that stateful crawling is supported. Currently, stateful crawling is broken, as geckodriver deletes the browser profile when closing or crashing before we can archive it. --- demo.py | 1 - docs/Configuration.md | 6 - openwpm/browser_manager.py | 86 ++++++------ openwpm/command_sequence.py | 13 +- openwpm/commands/profile_commands.py | 18 ++- openwpm/config.py | 4 +- openwpm/task_manager.py | 15 +-- test/test_crawl.py | 195 ++++++++++++--------------- test/test_profile.py | 130 +++++++++--------- 9 files changed, 215 insertions(+), 253 deletions(-) diff --git a/demo.py b/demo.py index 4267faf4..4c6ef23a 100644 --- a/demo.py +++ b/demo.py @@ -65,7 +65,6 @@ with TaskManager( command_sequence = CommandSequence( site, site_rank=index, - reset=True, callback=callback, ) diff --git a/docs/Configuration.md b/docs/Configuration.md index f6c954c1..4d556a81 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -249,11 +249,6 @@ TODO # Browser Profile Support -**WARNING: Stateful crawls are currently not supported. Attempts to run -stateful crawls will throw `NotImplementedError`s. The work required to -restore support is tracked in -[this project](https://github.com/mozilla/OpenWPM/projects/2).** - ## Stateful vs Stateless crawls By default OpenWPM performs a "stateful" crawl, in that it keeps a consistent @@ -323,7 +318,6 @@ but will not be used during crash recovery. Specifically: profile specified by `seed_tar`. If OpenWPM determines that Firefox needs to restart for some reason during the crawl, it will use the profile from the most recent page visit (pre-crash) rather than the `seed_tar` profile. -Note that stateful crawls are currently [unsupported](https://github.com/mozilla/OpenWPM/projects/2)). 
* For stateless crawls, the initial `seed_tar` will be loaded during each new page visit. Note that this means the profile will very likely be _incomplete_, as cookies or storage may have been set or changed during the diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 544b8d81..a342957f 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -5,9 +5,11 @@ import pickle import shutil import signal import sys +import tempfile import threading import time import traceback +from pathlib import Path from queue import Empty as EmptyQueue from typing import Optional, Union @@ -16,6 +18,7 @@ from multiprocess import Queue from selenium.common.exceptions import WebDriverException from tblib import pickling_support +from .commands.profile_commands import DumpProfileCommand from .commands.types import BaseCommand, ShutdownSignal from .config import BrowserParamsInternal, ManagerParamsInternal from .deploy_browsers import deploy_firefox @@ -33,7 +36,7 @@ pickling_support.install() class Browser: """ - The Browser class is responsbile for holding all of the + The Browser class is responsible for holding all of the configuration and status information on BrowserManager process it corresponds to. It also includes a set of methods for managing the BrowserManager process and its child processes/threads. @@ -52,7 +55,7 @@ class Browser: self._UNSUCCESSFUL_SPAWN_LIMIT = 4 # manager parameters - self.current_profile_path = None + self.current_profile_path: Optional[Path] = None self.db_socket_address = manager_params.storage_controller_address assert browser_params.browser_id is not None self.browser_id: BrowserId = browser_params.browser_id @@ -97,29 +100,33 @@ class Browser: sets up the BrowserManager and gets the process id, browser pid and, if applicable, screen pid. loads associated user profile if necessary """ - # Unsupported. 
See https://github.com/mozilla/OpenWPM/projects/2 # if this is restarting from a crash, update the tar location # to be a tar of the crashed browser's history - """ if self.current_profile_path is not None: # tar contents of crashed profile to a temp dir - tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/" - profile_commands.dump_profile( - self.current_profile_path, - self.manager_params, - self.browser_params, - tempdir, - close_webdriver=False, + tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + tar_path = Path(tempdir) / "profile.tar.gz" + + self.browser_params.profile_path = self.current_profile_path + dump_profile_command = DumpProfileCommand( + tar_path=tar_path, close_webdriver=False, compress=True ) + dump_profile_command.execute( + webdriver=None, + browser_params=self.browser_params, + manager_params=self.manager_params, + extension_socket=None, + ) + # make sure browser loads crashed profile - self.browser_params.recovery_tar = tempdir + self.browser_params.recovery_tar = tar_path crash_recovery = True else: - """ + tempdir = None + crash_recovery = False + self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) - tempdir = None - crash_recovery = False self.is_fresh = not crash_recovery # Try to spawn the browser within the timelimit @@ -210,7 +217,7 @@ class Browser: if success: self.logger.debug("BROWSER %i: Browser spawn sucessful!" 
% self.browser_id) previous_profile_path = self.current_profile_path - self.current_profile_path = driver_profile_path + self.current_profile_path = Path(driver_profile_path) if driver_profile_path != spawned_profile_path: shutil.rmtree(spawned_profile_path, ignore_errors=True) if previous_profile_path is not None: @@ -394,33 +401,32 @@ class Browser: self.close_browser_manager(force=force) # Archive browser profile (if requested) - if not during_init and self.browser_params.profile_archive_dir is not None: - self.logger.warning( - "BROWSER %i: Archiving the browser profile directory is " - "currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2" % self.browser_id - ) - """ self.logger.debug( - "BROWSER %i: during_init=%s | profile_archive_dir=%s" % ( - self.browser_id, str(during_init), - self.browser_params.profile_archive_dir) - ) - if (not during_init and - self.browser_params.profile_archive_dir is not None): - self.logger.debug( - "BROWSER %i: Archiving browser profile directory to %s" % ( - self.browser_id, - self.browser_params.profile_archive_dir)) - profile_commands.dump_profile( - self.current_profile_path, - self.manager_params, - self.browser_params, + "BROWSER %i: during_init=%s | profile_archive_dir=%s" + % ( + self.browser_id, + str(during_init), self.browser_params.profile_archive_dir, - close_webdriver=False, - compress=True ) - """ + ) + if not during_init and self.browser_params.profile_archive_dir is not None: + self.logger.debug( + "BROWSER %i: Archiving browser profile directory to %s" + % (self.browser_id, self.browser_params.profile_archive_dir) + ) + + self.browser_params.profile_path = self.current_profile_path + dump_profile_command = DumpProfileCommand( + tar_path=self.browser_params.profile_archive_dir, + close_webdriver=False, + compress=True, + ) + dump_profile_command.execute( + webdriver=None, + browser_params=self.browser_params, + manager_params=self.manager_params, + extension_socket=None, + ) # Clean up 
temporary files if self.current_profile_path is not None: diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index a5eca5b0..18b9a150 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -10,6 +10,7 @@ from .commands.browser_commands import ( SaveScreenshotCommand, ScreenshotFullPageCommand, ) +from .commands.profile_commands import DumpProfileCommand from .commands.types import BaseCommand from .errors import CommandExecutionError @@ -86,16 +87,10 @@ class CommandSequence: self._commands_with_timeout.append((command, timeout)) self.contains_get_or_browse = True - def dump_profile( - self, dump_folder, close_webdriver=False, compress=True, timeout=120 - ): + def dump_profile(self, tar_path, close_webdriver=False, compress=True, timeout=120): """ dumps from the profile path to a given file (absolute path) """ - raise NotImplementedError( - "Profile saving is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." - ) self.total_timeout += timeout - command = DumpProfCommand(dump_folder, close_webdriver, compress) + command = DumpProfileCommand(tar_path, close_webdriver, compress) self._commands_with_timeout.append((command, timeout)) def save_screenshot(self, suffix="", timeout=30): @@ -131,7 +126,7 @@ class CommandSequence: self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the dump page source command", self, ) command = ScreenshotFullPageCommand(suffix) diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index d44781d4..7a6ff9db 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -2,6 +2,7 @@ import logging import shutil import tarfile from pathlib import Path +from typing import Optional from selenium.webdriver import Firefox @@ -25,13 +26,9 @@ class 
DumpProfileCommand(BaseCommand): self.tar_path = tar_path self.close_webdriver = close_webdriver self.compress = compress - raise NotImplementedError( - "Profile dumping is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." - ) def __repr__(self) -> str: - return "DumpProfCommand({},{},{})".format( + return "DumpProfileCommand({},{},{})".format( self.tar_path, self.close_webdriver, self.compress ) @@ -40,10 +37,11 @@ class DumpProfileCommand(BaseCommand): webdriver: Firefox, browser_params: BrowserParamsInternal, manager_params: ManagerParamsInternal, - extension_socket: ClientSocket, + extension_socket: Optional[ClientSocket], ) -> None: browser_profile_folder = browser_params.profile_path assert browser_profile_folder is not None + assert browser_params.browser_id is not None # Creating the folders if need be self.tar_path.parent.mkdir(exist_ok=True, parents=True) @@ -65,7 +63,7 @@ class DumpProfileCommand(BaseCommand): logger.debug( "BROWSER %i: Backing up full profile from %s to %s" % ( - self.browser_id, + browser_params.browser_id, browser_profile_folder, self.tar_path, ) @@ -94,7 +92,7 @@ class DumpProfileCommand(BaseCommand): ): logger.critical( "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (self.browser_id, full_path) + % (browser_params.browser_id, full_path) ) elif not full_path.is_file() and ( full_path.name.endswith("shm") or full_path.name.endswith("wal") @@ -106,7 +104,7 @@ class DumpProfileCommand(BaseCommand): if not full_path.is_dir(): logger.warning( "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (self.browser_id, full_path) + % (browser_params.browser_id, full_path) ) continue tar.add(full_path, arcname=item) @@ -125,9 +123,9 @@ def load_profile( The tar will remain unmodified. 
""" - assert tar_path.is_file() assert browser_params.browser_id is not None try: + assert tar_path.is_file() # Copy and untar the loaded profile logger.debug( "BROWSER %i: Copying profile tar from %s to %s" diff --git a/openwpm/config.py b/openwpm/config.py index d8f4a477..4b64f578 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -97,7 +97,9 @@ class BrowserParams(DataClassJsonMixin): prefs: dict = field(default_factory=dict) tp_cookies: str = "always" bot_mitigation: bool = False - profile_archive_dir: Optional[str] = None + profile_archive_dir: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) + ) recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index e61d98f0..934b4707 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -305,8 +305,8 @@ class TaskManager: Parameters ---------- during_init : - flag to indicator if this shutdown is occuring during - the TaskManager initialization + flag to indicate if this shutdown is occuring during + the TaskManager initialization relaxed : If `True` the function will wait for all active `CommandSequences` to finish before shutting down @@ -434,17 +434,6 @@ class TaskManager: assert browser.browser_id is not None assert browser.curr_visit_id is not None reset = command_sequence.reset - if not reset: - self.logger.warning( - "BROWSER %i: Browser will not reset after CommandSequence " - "executes. OpenWPM does not currently support stateful crawls " - "(see: https://github.com/mozilla/OpenWPM/projects/2). " - "The next command issued to this browser may or may not " - "use the same profile (depending on the failure status of " - "this command). To prevent this warning, initialize the " - "CommandSequence with `reset` set to `True` to use a fresh " - "profile for each command." 
% browser.browser_id - ) self.logger.info( "Starting to work on CommandSequence with " "visit_id %d on browser with id %d", diff --git a/test/test_crawl.py b/test/test_crawl.py index c65a2ab2..563f2e6d 100644 --- a/test/test_crawl.py +++ b/test/test_crawl.py @@ -1,18 +1,17 @@ -# type:ignore -# As this file is no longer maintained, mypy shouldn't check this -import os +"""Runs a short test crawl. + +This should be used to test any features that require real crawl data. +This should be avoided if possible, as controlled tests will be easier +to debug. +""" + import tarfile -from pathlib import Path -from typing import List, Tuple import domain_utils as du import pytest -from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest - TEST_SITES = [ "http://google.com", "http://facebook.com", @@ -42,117 +41,99 @@ def get_public_suffix(url): return url_parts[-1] -class TestCrawl(OpenWPMTest): - """Runs a short test crawl. +@pytest.mark.slow +def test_browser_profile_coverage(default_params, task_manager_creator): + """Test the coverage of the browser's profile. - This should be used to test any features that require real - crawl data. This should be avoided if possible, as controlled - tests will be easier to debug + This verifies that Firefox's places.sqlite database contains all + visited sites (with a few exceptions). If it does not, it is likely + the profile is lost at some point during the crawl. 
""" + # Run the test crawl + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + browser_params[0].http_instrument = True + manager, crawl_db = task_manager_creator((manager_params, browser_params[:1])) + for site in TEST_SITES: + manager.get(site) + manager.close() - def get_config( - self, data_dir: Path = None - ) -> Tuple[ManagerParams, List[BrowserParams]]: - manager_params, browser_params = self.get_test_config(data_dir) - browser_params[0].profile_archive_dir = os.path.join( - manager_params.data_directory, "browser_profile" - ) - browser_params[0].http_instrument = True - return manager_params, browser_params + # Extract crawl profile + ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz" + with tarfile.open(ff_db_tar) as tar: + tar.extractall(browser_params[0].profile_archive_dir) - @pytest.mark.xfail(run=False) - @pytest.mark.slow - def test_browser_profile_coverage(self, tmpdir: Path, task_manager_creator) -> None: - """Test the coverage of the browser's profile + # Output databases + ff_db = browser_params[0].profile_archive_dir / "places.sqlite" - This verifies that Firefox's places.sqlite database contains - all visited sites (with a few exceptions). 
If it does not, - it is likely the profile is lost at some point during the crawl - """ - # Run the test crawl - data_dir = tmpdir / "data_dir" - manager_params, browser_params = self.get_config(data_dir) - manager, crawl_db = task_manager_creator((manager_params, browser_params)) - for site in TEST_SITES: - manager.get(site) - ff_db_tar = os.path.join( - browser_params[0].profile_archive_dir, "profile.tar.gz" - ) - manager.close() + # Grab urls from crawl database + rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") + req_ps = set() # visited domains from http_requests table + for (url,) in rows: + req_ps.add(get_public_suffix(url)) - # Extract crawl profile - with tarfile.open(ff_db_tar) as tar: - tar.extractall(browser_params[0].profile_archive_dir) + hist_ps = set() # visited domains from crawl_history Table + statuses = dict() + rows = db_utils.query_db( + crawl_db, + "SELECT arguments, command_status FROM crawl_history WHERE command='GET'", + ) + for url, command_status in rows: + ps = get_public_suffix(url) + hist_ps.add(ps) + statuses[ps] = command_status - # Output databases - ff_db = os.path.join(browser_params[0].profile_archive_dir, "places.sqlite") + # Grab urls from Firefox database + profile_ps = set() # visited domains from firefox profile + rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") + for (host,) in rows: + try: + profile_ps.add(get_public_suffix(host)) + except AttributeError: + pass - # Grab urls from crawl database - rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") - req_ps = set() # visited domains from http_requests table - for (url,) in rows: - req_ps.add(get_public_suffix(url)) + # We expect a url to be in the Firefox profile if: + # 1. We've made requests to it + # 2. The url is a top_url we entered into the address bar + # 3. The url successfully loaded (see: Issue #40) + # 4. 
The site does not respond to the initial request with a 204 + # (won't show in FF DB) + missing_urls = req_ps.intersection(hist_ps).difference(profile_ps) + unexpected_missing_urls = set() + for url in missing_urls: + if command_status[url] != "ok": + continue - hist_ps = set() # visited domains from crawl_history Table - statuses = dict() + # Get the visit id for the url rows = db_utils.query_db( crawl_db, - "SELECT arguments, command_status " - "FROM crawl_history WHERE command='GET'", + "SELECT visit_id FROM site_visits WHERE site_url = ?", + ("http://" + url,), ) - for url, command_status in rows: - ps = get_public_suffix(url) - hist_ps.add(ps) - statuses[ps] = command_status + visit_id = rows[0] - # Grab urls from Firefox database - profile_ps = set() # visited domains from firefox profile - rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") - for (host,) in rows: - try: - profile_ps.add(get_public_suffix(host)) - except AttributeError: - pass + rows = db_utils.query_db( + crawl_db, + "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?", + (visit_id,), + ) + if rows[0] > 1: + continue - # We expect urls to be in the Firefox profile if: - # 1. We've made requests to it - # 2. The url is a top_url we entered into the address bar - # 3. The url successfully loaded (see: Issue #40) - # 4. 
The site does not respond to the initial request with a 204 - # (won't show in FF DB) - missing_urls = req_ps.intersection(hist_ps).difference(profile_ps) - unexpected_missing_urls = set() - for url in missing_urls: - if command_status[url] != "ok": - continue + rows = db_utils.query_db( + crawl_db, + "SELECT response_status, location FROM " + "http_responses WHERE visit_id = ?", + (visit_id,), + ) + response_status, location = rows[0] + if response_status == 204: + continue + if location == "http://": # site returned a blank redirect + continue + unexpected_missing_urls.add(url) - # Get the visit id for the url - rows = db_utils.query_db( - crawl_db, - "SELECT visit_id FROM site_visits " "WHERE site_url = ?", - ("http://" + url,), - ) - visit_id = rows[0] - - rows = db_utils.query_db( - crawl_db, - "SELECT COUNT(*) FROM http_responses " "WHERE visit_id = ?", - (visit_id,), - ) - if rows[0] > 1: - continue - - rows = db_utils.query_db( - crawl_db, - "SELECT response_status, location FROM " - "http_responses WHERE visit_id = ?", - (visit_id,), - ) - response_status, location = rows[0] - if response_status == 204: - continue - if location == "http://": # site returned a blank redirect - continue - unexpected_missing_urls.add(url) - - assert len(unexpected_missing_urls) == 0 + assert len(unexpected_missing_urls) == 0 diff --git a/test/test_profile.py b/test/test_profile.py index 85c05c80..41166e48 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -1,83 +1,81 @@ -from os.path import isfile, join from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any import pytest from openwpm.command_sequence import CommandSequence from openwpm.commands.types import BaseCommand -from openwpm.config import BrowserParams, ManagerParams from openwpm.errors import CommandExecutionError, ProfileLoadError -from openwpm.task_manager import TaskManager from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest - # TODO 
update these tests to make use of blocking commands -class TestProfile(OpenWPMTest): - def get_config( - self, data_dir: Optional[Path] - ) -> Tuple[ManagerParams, List[BrowserParams]]: - manager_params, browser_params = self.get_test_config(data_dir) - browser_params[0].profile_archive_dir = join( - manager_params.data_directory, "browser_profile" - ) - return manager_params, browser_params +def test_saving(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + manager.get("http://example.com") + manager.close() + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() - @pytest.mark.xfail(run=False) - def test_saving(self): - manager_params, browser_params = self.get_config() - manager = TaskManager(manager_params, browser_params) + +def test_crash_profile(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 2 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + try: + manager.get("http://example.com") # So we have a profile + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Requires two commands to shut down + except CommandExecutionError: + pass + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() + + +def test_profile_error(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].seed_tar = 
Path("/tmp/NOTREAL") + with pytest.raises(ProfileLoadError): + task_manager_creator((manager_params, browser_params[:1])) + + +@pytest.mark.skip(reason="proxy no longer supported, need to update") +def test_profile_saved_when_launch_crashes(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + browser_params[0].proxy = True + browser_params[0].save_content = "script" + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + manager.get("http://example.com") + + # Kill the LevelDBAggregator + # This will cause the proxy launch to crash + manager.ldb_status_queue.put("DIE") + manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly + manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout + manager.get("example.com") # Cause a selenium crash + + # The browser will fail to launch due to the proxy crashes + try: manager.get("http://example.com") - manager.close() - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - - @pytest.mark.xfail(run=False) - def test_crash_profile(self): - manager_params, browser_params = self.get_config() - manager_params.failure_limit = 2 - manager = TaskManager(manager_params, browser_params) - try: - manager.get("http://example.com") # So we have a profile - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Requires two commands to shut down - except CommandExecutionError: - pass - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - - @pytest.mark.xfail(run=False) - def test_profile_error(self): - manager_params, browser_params = self.get_config() - browser_params[0].seed_tar = "/tmp/NOTREAL" - with 
pytest.raises(ProfileLoadError): - TaskManager(manager_params, browser_params) # noqa - - @pytest.mark.skip(reason="proxy no longer supported, need to update") - def test_profile_saved_when_launch_crashes(self): - manager_params, browser_params = self.get_config() - browser_params[0].proxy = True - browser_params[0].save_content = "script" - manager = TaskManager(manager_params, browser_params) - manager.get("http://example.com") - - # Kill the LevelDBAggregator - # This will cause the proxy launch to crash - manager.ldb_status_queue.put("DIE") - manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly - manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout - manager.get("example.com") # Cause a selenium crash - - # The browser will fail to launch due to the proxy crashes - try: - manager.get("http://example.com") - except CommandExecutionError: - pass - manager.close() - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) + except CommandExecutionError: + pass + manager.close() + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() def test_seed_persistance(default_params, task_manager_creator): From 51c0849cbd5c23ee42e3c1c21394e6ab4de2003d Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Tue, 2 Mar 2021 14:23:40 +0200 Subject: [PATCH 02/22] Use custom browser profile Use a custom profile by setting it as an argument via the Options class, instead of using the FirefoxProfile class. This way geckodriver does not delete it when crashing or closing. Also, remove some unused arguments from the function that configures privacy settings in Firefox. Finally, remove the code that clears driver.profile before calling driver.quit(), as driver.profile is always None when using a custom profile. 
--- openwpm/browser_manager.py | 16 ++++----------- openwpm/deploy_browsers/configure_firefox.py | 2 +- openwpm/deploy_browsers/deploy_firefox.py | 21 ++++++++++++-------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index a342957f..8e10c044 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -217,7 +217,7 @@ class Browser: if success: self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.browser_id) previous_profile_path = self.current_profile_path - self.current_profile_path = Path(driver_profile_path) + self.current_profile_path = driver_profile_path if driver_profile_path != spawned_profile_path: shutil.rmtree(spawned_profile_path, ignore_errors=True) if previous_profile_path is not None: @@ -414,10 +414,10 @@ class Browser: "BROWSER %i: Archiving browser profile directory to %s" % (self.browser_id, self.browser_params.profile_archive_dir) ) - + tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz" self.browser_params.profile_path = self.current_profile_path dump_profile_command = DumpProfileCommand( - tar_path=self.browser_params.profile_archive_dir, + tar_path=tar_path, close_webdriver=False, compress=True, ) @@ -450,8 +450,6 @@ def BrowserManager( driver, prof_folder, display = deploy_firefox.deploy_firefox( status_queue, browser_params, manager_params, crash_recovery ) - if prof_folder[-1] != "/": - prof_folder += "/" # Read the extension port -- if extension is enabled # TODO: Initial communication from extension to TM should use sockets @@ -462,7 +460,7 @@ def BrowserManager( ) elapsed = 0 port = None - ep_filename = os.path.join(prof_folder, "extension_port.txt") + ep_filename = prof_folder / "extension_port.txt" while elapsed < 5: try: with open(ep_filename, "rt") as f: @@ -504,12 +502,6 @@ def BrowserManager( command: Union[ShutdownSignal, BaseCommand] = command_queue.get() if type(command) is ShutdownSignal: - # Geckodriver 
creates a copy of the profile (and the original - # temp file created by FirefoxProfile() is deleted). - # We clear the profile attribute here to prevent prints from: - # https://github.com/SeleniumHQ/selenium/blob/4e4160dd3d2f93757cafb87e2a1c20d6266f5554/py/selenium/webdriver/firefox/webdriver.py#L193-L199 - if driver.profile and not os.path.isdir(driver.profile.path): - driver.profile = None driver.quit() status_queue.put("OK") return diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index c8367d16..e38b721a 100644 --- a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -1,7 +1,7 @@ """ Set prefs and load extensions in Firefox """ -def privacy(browser_params, fp, fo, root_dir, browser_profile_path): +def privacy(browser_params, fo): """ Configure the privacy settings in Firefox. This includes: * DNT diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 86252568..697ae276 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -1,6 +1,7 @@ import json import logging import os.path +import tempfile from pathlib import Path from typing import Any, Dict, Optional, Tuple @@ -8,7 +9,6 @@ from easyprocess import EasyProcessError from multiprocess import Queue from pyvirtualdisplay import Display from selenium import webdriver -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from ..commands.profile_commands import load_profile from ..config import BrowserParamsInternal, ConfigEncoder, ManagerParamsInternal @@ -25,7 +25,7 @@ def deploy_firefox( browser_params: BrowserParamsInternal, manager_params: ManagerParamsInternal, crash_recovery: bool, -) -> Tuple[webdriver.Firefox, str, Optional[Display]]: +) -> Tuple[webdriver.Firefox, Path, Optional[Display]]: """ launches a firefox instance with parameters set by the input dictionary """ @@ -33,14 +33,20 
@@ def deploy_firefox( root_dir = os.path.dirname(__file__) # directory of this file - fp = FirefoxProfile() - browser_profile_path = Path(fp.path) + browser_profile_path = Path(tempfile.mkdtemp(".firefox_profile")) status_queue.put(("STATUS", "Profile Created", browser_profile_path)) # Use Options instead of FirefoxProfile to set preferences since the # Options method has no "frozen"/restricted options. # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039 fo = Options() + # Set a custom profile that is used in-place and is not deleted by geckodriver. + # https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html + # Using FirefoxProfile breaks stateful crawling: + # https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093 + fo.add_argument("-profile") + fo.add_argument(str(browser_profile_path)) + assert browser_params.browser_id is not None if browser_params.seed_tar and not crash_recovery: logger.info( @@ -111,7 +117,7 @@ def deploy_firefox( # fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all") # Configure privacy settings - configure_firefox.privacy(browser_params, fp, fo, root_dir, browser_profile_path) + configure_firefox.privacy(browser_params, fo) # Set various prefs to improve speed and eliminate traffic to Mozilla configure_firefox.optimize_prefs(fo) @@ -135,9 +141,8 @@ def deploy_firefox( status_queue.put(("STATUS", "Launch Attempted", None)) fb = FirefoxBinary(firefox_path=firefox_binary_path) driver = webdriver.Firefox( - firefox_profile=fp, firefox_binary=fb, - firefox_options=fo, + options=fo, log_path=interceptor.fifo, ) @@ -165,4 +170,4 @@ def deploy_firefox( status_queue.put(("STATUS", "Browser Launched", int(pid))) - return driver, driver.capabilities["moz:profile"], display + return driver, Path(driver.capabilities["moz:profile"]), display From 7f51e50f4462002987d88c5d92a963526cd63847 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Tue, 2 Mar 2021 14:47:21 +0200 
Subject: [PATCH 03/22] Pass service_args to geckodriver Fix a bug in PatchedGeckoDriverService that caused geckodriver not to receive the service_args passed when starting the browser. PatchedGeckoDriverService is a modified version of Selenium's Service class and this bug has been fixed in the original version. --- openwpm/deploy_browsers/selenium_firefox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openwpm/deploy_browsers/selenium_firefox.py b/openwpm/deploy_browsers/selenium_firefox.py index 67d1450b..6081d78d 100644 --- a/openwpm/deploy_browsers/selenium_firefox.py +++ b/openwpm/deploy_browsers/selenium_firefox.py @@ -129,7 +129,7 @@ class PatchedGeckoDriverService(BaseService): self.service_args = service_args or [] def command_line_args(self): - return ["--port", "%d" % self.port] + return ["--port", "%d" % self.port] + self.service_args def send_remote_shutdown_command(self): pass From a355dc840d9b28762138fa8c92f9ae02ceb44047 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Wed, 3 Mar 2021 12:54:14 +0200 Subject: [PATCH 04/22] Create user.js manually in custom profile Geckodriver has a bug that makes it write the browser preferences we set, as well as its own default browser preferences, to a user.js file in the wrong profile directory when using a custom profile: https://github.com/mozilla/geckodriver/issues/1844. As a temporary workaround until this issue gets fixed, we create the user.js file ourselves. In order to do this, we keep a copy of geckodriver's default preferences in our code. 
Closes #423 --- openwpm/deploy_browsers/configure_firefox.py | 268 +++++++++++++------ openwpm/deploy_browsers/deploy_firefox.py | 28 +- 2 files changed, 207 insertions(+), 89 deletions(-) diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index e38b721a..e22a9cc0 100644 --- a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -1,7 +1,103 @@ """ Set prefs and load extensions in Firefox """ +import json +import re +from pathlib import Path +from typing import Any, Dict -def privacy(browser_params, fo): +# TODO: Remove hardcoded geckodriver default preferences once +# https://github.com/mozilla/geckodriver/issues/1844 is fixed. +# Source of preferences: +# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/prefs.rs +# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/marionette.rs +DEFAULT_GECKODRIVER_PREFS = { + "app.normandy.api_url": "", + "app.update.checkInstallTime": False, + "app.update.disabledForTesting": True, + "app.update.auto": False, + "browser.dom.window.dump.enabled": True, + "devtools.console.stdout.chrome": True, + "browser.safebrowsing.blockedURIs.enabled": False, + "browser.safebrowsing.downloads.enabled": False, + "browser.safebrowsing.passwords.enabled": False, + "browser.safebrowsing.malware.enabled": False, + "browser.safebrowsing.phishing.enabled": False, + "browser.sessionstore.resume_from_crash": False, + "browser.shell.checkDefaultBrowser": False, + "browser.startup.homepage_override.mstone": "ignore", + "browser.startup.page": 0, + "browser.tabs.closeWindowWithLastTab": False, + "browser.tabs.warnOnClose": False, + "browser.uitour.enabled": False, + "browser.warnOnQuit": False, + "datareporting.healthreport.documentServerURI": "http://%(server)s/dummy/healthreport/", + "datareporting.healthreport.logging.consoleEnabled": False, + "datareporting.healthreport.service.enabled": False, + 
"datareporting.healthreport.service.firstRun": False, + "datareporting.healthreport.uploadEnabled": False, + "datareporting.policy.dataSubmissionEnabled": False, + "datareporting.policy.dataSubmissionPolicyBypassNotification": True, + "dom.ipc.reportProcessHangs": False, + "extensions.autoDisableScopes": 0, + "extensions.enabledScopes": 5, + "extensions.installDistroAddons": False, + "extensions.update.enabled": False, + "extensions.update.notifyUser": False, + "focusmanager.testmode": True, + "general.useragent.updates.enabled": False, + "geo.provider.testing": True, + "geo.wifi.scan": False, + "hangmonitor.timeout": 0, + "idle.lastDailyNotification": -1, + "javascript.options.showInConsole": True, + "media.gmp-manager.updateEnabled": False, + "media.sanity-test.disabled": True, + "network.http.phishy-userpass-length": 255, + "network.manage-offline-status": False, + "network.sntp.pools": "%(server)s", + "plugin.state.flash": 0, + "security.certerrors.mitm.priming.enabled": False, + "services.settings.server": "http://%(server)s/dummy/blocklist/", + "startup.homepage_welcome_url": "about:blank", + "startup.homepage_welcome_url.additional": "", + "toolkit.startup.max_resumed_crashes": -1, + "marionette.log.level": "Info", +} + + +def load_existing_prefs(browser_profile_path: Path) -> Dict[str, Any]: + """Load existing user preferences. + + If the browser profile contains a user.js file, load the preferences + specified inside it into a dictionary. 
+ """ + prefs: Dict[str, Any] = {} + prefs_path = browser_profile_path / "user.js" + if not prefs_path.is_file(): + return prefs + # Regular expression from https://stackoverflow.com/a/24563687 + r = re.compile(r"\s*user_pref\(([\"'])(.+?)\1,\s*(.+?)\);") + with open(prefs_path, "r") as f: + for line in f: + m = r.match(line) + if m: + key, value = m.group(2), m.group(3) + prefs[key] = json.loads(value) + return prefs + + +def save_prefs_to_profile(prefs: Dict[str, Any], browser_profile_path: Path) -> None: + """Save all preferences to the browser profile. + + Write preferences from the prefs dictionary to a user.js file in the + profile directory. + """ + with open(browser_profile_path / "user.js", "w") as f: + for key, value in prefs.items(): + f.write('user_pref("%s", %s);\n' % (key, json.dumps(value))) + + +def privacy(browser_params, prefs): """ Configure the privacy settings in Firefox. This includes: * DNT @@ -12,15 +108,15 @@ def privacy(browser_params, fo): # Turns on Do Not Track if browser_params.donottrack: - fo.set_preference("privacy.donottrackheader.enabled", True) + prefs["privacy.donottrackheader.enabled"] = True # Sets the third party cookie setting if browser_params.tp_cookies.lower() == "never": - fo.set_preference("network.cookie.cookieBehavior", 1) + prefs["network.cookie.cookieBehavior"] = 1 elif browser_params.tp_cookies.lower() == "from_visited": - fo.set_preference("network.cookie.cookieBehavior", 3) + prefs["network.cookie.cookieBehavior"] = 3 else: # always allow third party cookies - fo.set_preference("network.cookie.cookieBehavior", 0) + prefs["network.cookie.cookieBehavior"] = 0 # Tracking Protection if browser_params.tracking_protection: @@ -31,7 +127,7 @@ def privacy(browser_params, fo): ) -def optimize_prefs(fo): +def optimize_prefs(prefs): """ Disable various features and checks the browser will do on startup. Some of these (e.g. 
disabling the newtab page) are required to prevent @@ -42,113 +138,113 @@ def optimize_prefs(fo): * https://github.com/pyllyukko/user.js/blob/master/user.js """ # noqa # Startup / Speed - fo.set_preference("browser.shell.checkDefaultBrowser", False) - fo.set_preference("browser.slowStartup.notificationDisabled", True) - fo.set_preference("browser.slowStartup.maxSamples", 0) - fo.set_preference("browser.slowStartup.samples", 0) - fo.set_preference("extensions.checkCompatibility.nightly", False) - fo.set_preference("browser.rights.3.shown", True) - fo.set_preference("reader.parse-on-load.enabled", False) - fo.set_preference("browser.pagethumbnails.capturing_disabled", True) - fo.set_preference("browser.uitour.enabled", False) - fo.set_preference("dom.flyweb.enabled", False) + prefs["browser.shell.checkDefaultBrowser"] = False + prefs["browser.slowStartup.notificationDisabled"] = True + prefs["browser.slowStartup.maxSamples"] = 0 + prefs["browser.slowStartup.samples"] = 0 + prefs["extensions.checkCompatibility.nightly"] = False + prefs["browser.rights.3.shown"] = True + prefs["reader.parse-on-load.enabled"] = False + prefs["browser.pagethumbnails.capturing_disabled"] = True + prefs["browser.uitour.enabled"] = False + prefs["dom.flyweb.enabled"] = False # Disable health reports / telemetry / crash reports - fo.set_preference("datareporting.policy.dataSubmissionEnabled", False) - fo.set_preference("datareporting.healthreport.uploadEnabled", False) - fo.set_preference("datareporting.healthreport.service.enabled", False) - fo.set_preference("toolkit.telemetry.archive.enabled", False) - fo.set_preference("toolkit.telemetry.enabled", False) - fo.set_preference("toolkit.telemetry.unified", False) - fo.set_preference("breakpad.reportURL", "") - fo.set_preference("dom.ipc.plugins.reportCrashURL", False) - fo.set_preference("browser.selfsupport.url", "") - fo.set_preference("browser.tabs.crashReporting.sendReport", False) - 
fo.set_preference("browser.crashReports.unsubmittedCheck.enabled", False) - fo.set_preference("dom.ipc.plugins.flash.subprocess.crashreporter.enabled", False) + prefs["datareporting.policy.dataSubmissionEnabled"] = False + prefs["datareporting.healthreport.uploadEnabled"] = False + prefs["datareporting.healthreport.service.enabled"] = False + prefs["toolkit.telemetry.archive.enabled"] = False + prefs["toolkit.telemetry.enabled"] = False + prefs["toolkit.telemetry.unified"] = False + prefs["breakpad.reportURL"] = "" + prefs["dom.ipc.plugins.reportCrashURL"] = False + prefs["browser.selfsupport.url"] = "" + prefs["browser.tabs.crashReporting.sendReport"] = False + prefs["browser.crashReports.unsubmittedCheck.enabled"] = False + prefs["dom.ipc.plugins.flash.subprocess.crashreporter.enabled"] = False # Predictive Actions / Prefetch - fo.set_preference("network.predictor.enabled", False) - fo.set_preference("network.dns.disablePrefetch", True) - fo.set_preference("network.prefetch-next", False) - fo.set_preference("browser.search.suggest.enabled", False) - fo.set_preference("network.http.speculative-parallel-limit", 0) - fo.set_preference("keyword.enabled", False) # location bar using search - fo.set_preference("browser.urlbar.userMadeSearchSuggestionsChoice", True) - fo.set_preference("browser.casting.enabled", False) + prefs["network.predictor.enabled"] = False + prefs["network.dns.disablePrefetch"] = True + prefs["network.prefetch-next"] = False + prefs["browser.search.suggest.enabled"] = False + prefs["network.http.speculative-parallel-limit"] = 0 + prefs["keyword.enabled"] = False # location bar using search + prefs["browser.urlbar.userMadeSearchSuggestionsChoice"] = True + prefs["browser.casting.enabled"] = False # Disable pinging Mozilla for geoip - fo.set_preference("browser.search.geoip.url", "") - fo.set_preference("browser.search.countryCode", "US") - fo.set_preference("browser.search.region", "US") + prefs["browser.search.geoip.url"] = "" + 
prefs["browser.search.countryCode"] = "US" + prefs["browser.search.region"] = "US" # Disable pinging Mozilla for geo-specific search - fo.set_preference("browser.search.geoSpecificDefaults", False) - fo.set_preference("browser.search.geoSpecificDefaults.url", "") + prefs["browser.search.geoSpecificDefaults"] = False + prefs["browser.search.geoSpecificDefaults.url"] = "" # Disable auto-updating - fo.set_preference("app.update.enabled", False) # browser - fo.set_preference("app.update.url", "") # browser - fo.set_preference("browser.search.update", False) # search - fo.set_preference("extensions.update.enabled", False) # extensions - fo.set_preference("extensions.update.autoUpdateDefault", False) - fo.set_preference("extensions.getAddons.cache.enabled", False) - fo.set_preference("lightweightThemes.update.enabled", False) # Personas + prefs["app.update.enabled"] = False # browser + prefs["app.update.url"] = "" # browser + prefs["browser.search.update"] = False # search + prefs["extensions.update.enabled"] = False # extensions + prefs["extensions.update.autoUpdateDefault"] = False + prefs["extensions.getAddons.cache.enabled"] = False + prefs["lightweightThemes.update.enabled"] = False # Personas # Disable Safebrowsing and other security features # that require on remote content - fo.set_preference("browser.safebrowsing.phising.enabled", False) - fo.set_preference("browser.safebrowsing.malware.enabled", False) - fo.set_preference("browser.safebrowsing.downloads.enabled", False) - fo.set_preference("browser.safebrowsing.downloads.remote.enabled", False) - fo.set_preference("browser.safebrowsing.blockedURIs.enabled", False) - fo.set_preference("browser.safebrowsing.provider.mozilla.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.google.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.google4.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.mozilla.updateURL", "") - 
fo.set_preference("browser.safebrowsing.provider.google.updateURL", "") - fo.set_preference("browser.safebrowsing.provider.google4.updateURL", "") - fo.set_preference("browser.safebrowsing.provider.mozilla.lists", "") # TP - fo.set_preference("browser.safebrowsing.provider.google.lists", "") # TP - fo.set_preference("browser.safebrowsing.provider.google4.lists", "") # TP - fo.set_preference("extensions.blocklist.enabled", False) # extensions - fo.set_preference("security.OCSP.enabled", 0) + prefs["browser.safebrowsing.phising.enabled"] = False + prefs["browser.safebrowsing.malware.enabled"] = False + prefs["browser.safebrowsing.downloads.enabled"] = False + prefs["browser.safebrowsing.downloads.remote.enabled"] = False + prefs["browser.safebrowsing.blockedURIs.enabled"] = False + prefs["browser.safebrowsing.provider.mozilla.gethashURL"] = "" + prefs["browser.safebrowsing.provider.google.gethashURL"] = "" + prefs["browser.safebrowsing.provider.google4.gethashURL"] = "" + prefs["browser.safebrowsing.provider.mozilla.updateURL"] = "" + prefs["browser.safebrowsing.provider.google.updateURL"] = "" + prefs["browser.safebrowsing.provider.google4.updateURL"] = "" + prefs["browser.safebrowsing.provider.mozilla.lists"] = "" # TP + prefs["browser.safebrowsing.provider.google.lists"] = "" # TP + prefs["browser.safebrowsing.provider.google4.lists"] = "" # TP + prefs["extensions.blocklist.enabled"] = False # extensions + prefs["security.OCSP.enabled"] = 0 # Disable Content Decryption Module and OpenH264 related downloads - fo.set_preference("media.gmp-manager.url", "") - fo.set_preference("media.gmp-provider.enabled", False) - fo.set_preference("media.gmp-widevinecdm.enabled", False) - fo.set_preference("media.gmp-widevinecdm.visible", False) - fo.set_preference("media.gmp-gmpopenh264.enabled", False) + prefs["media.gmp-manager.url"] = "" + prefs["media.gmp-provider.enabled"] = False + prefs["media.gmp-widevinecdm.enabled"] = False + prefs["media.gmp-widevinecdm.visible"] = 
False + prefs["media.gmp-gmpopenh264.enabled"] = False # Disable Experiments - fo.set_preference("experiments.enabled", False) - fo.set_preference("experiments.manifest.uri", "") - fo.set_preference("experiments.supported", False) - fo.set_preference("experiments.activeExperiment", False) - fo.set_preference("network.allow-experiments", False) + prefs["experiments.enabled"] = False + prefs["experiments.manifest.uri"] = "" + prefs["experiments.supported"] = False + prefs["experiments.activeExperiment"] = False + prefs["network.allow-experiments"] = False # Disable pinging Mozilla for newtab - fo.set_preference("browser.newtabpage.directory.ping", "") - fo.set_preference("browser.newtabpage.directory.source", "") - fo.set_preference("browser.newtabpage.enabled", False) - fo.set_preference("browser.newtabpage.enhanced", False) - fo.set_preference("browser.newtabpage.introShown", True) - fo.set_preference("browser.aboutHomeSnippets.updateUrl", "") + prefs["browser.newtabpage.directory.ping"] = "" + prefs["browser.newtabpage.directory.source"] = "" + prefs["browser.newtabpage.enabled"] = False + prefs["browser.newtabpage.enhanced"] = False + prefs["browser.newtabpage.introShown"] = True + prefs["browser.aboutHomeSnippets.updateUrl"] = "" # Disable Pocket - fo.set_preference("extensions.pocket.enabled", False) + prefs["extensions.pocket.enabled"] = False # Disable Shield - fo.set_preference("app.shield.optoutstudies.enabled", False) - fo.set_preference("extensions.shield-recipe-client.enabled", False) + prefs["app.shield.optoutstudies.enabled"] = False + prefs["extensions.shield-recipe-client.enabled"] = False # Disable Source Pragams # As per https://bugzilla.mozilla.org/show_bug.cgi?id=1628853 # sourceURL can be used to obfuscate the original origin of # a script, we disable it. 
- fo.set_preference("javascript.options.source_pragmas", False) + prefs["javascript.options.source_pragmas"] = False # Enable extensions and disable extension signing - fo.set_preference("extensions.experiments.enabled", True) - fo.set_preference("xpinstall.signatures.required", False) + prefs["extensions.experiments.enabled"] = True + prefs["xpinstall.signatures.required"] = False diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 697ae276..f16d138f 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -1,6 +1,7 @@ import json import logging import os.path +import socket import tempfile from pathlib import Path from typing import Any, Dict, Optional, Tuple @@ -116,11 +117,26 @@ def deploy_firefox( # TODO restore detailed logging # fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all") + # Geckodriver currently places the user.js file in the wrong profile + # directory, so we have to create it manually here. + # TODO: Remove this workaround once + # https://github.com/mozilla/geckodriver/issues/1844 is fixed. + # Load existing preferences from the profile's user.js file + prefs = configure_firefox.load_existing_prefs(browser_profile_path) + # Load default geckodriver preferences + prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS) + # Pick an available port for Marionette (https://stackoverflow.com/a/2838309) + s = socket.socket() + s.bind(("", 0)) + marionette_port = s.getsockname()[1] + s.close() + prefs["marionette.port"] = marionette_port + # Configure privacy settings - configure_firefox.privacy(browser_params, fo) + configure_firefox.privacy(browser_params, prefs) # Set various prefs to improve speed and eliminate traffic to Mozilla - configure_firefox.optimize_prefs(fo) + configure_firefox.optimize_prefs(prefs) # Intercept logging at the Selenium level and redirect it to the # main logger. 
This will also inform us where the real profile @@ -135,7 +151,10 @@ def deploy_firefox( "BROWSER %i: Setting custom preference: %s = %s" % (browser_params.browser_id, name, value) ) - fo.set_preference(name, value) + prefs[name] = value + + # Write all preferences to the profile's user.js file + configure_firefox.save_prefs_to_profile(prefs, browser_profile_path) # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) @@ -144,6 +163,9 @@ def deploy_firefox( firefox_binary=fb, options=fo, log_path=interceptor.fifo, + # TODO: Remove when https://github.com/mozilla/geckodriver/issues/1844 + # is fixed + service_args=["--marionette-port", str(marionette_port)], ) # Add extension From 2237822eab3e5890d6575350371833a4f98ab72e Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 8 Mar 2021 09:58:48 +0200 Subject: [PATCH 05/22] Do not intercept profile location from logs --- openwpm/deploy_browsers/deploy_firefox.py | 5 ++--- openwpm/deploy_browsers/selenium_firefox.py | 11 ++--------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index f16d138f..4ce8949f 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -139,9 +139,8 @@ def deploy_firefox( configure_firefox.optimize_prefs(prefs) # Intercept logging at the Selenium level and redirect it to the - # main logger. This will also inform us where the real profile - # directory is hiding. - interceptor = FirefoxLogInterceptor(browser_params.browser_id, browser_profile_path) + # main logger. + interceptor = FirefoxLogInterceptor(browser_params.browser_id) interceptor.start() # Set custom prefs. 
These are set after all of the default prefs to allow diff --git a/openwpm/deploy_browsers/selenium_firefox.py b/openwpm/deploy_browsers/selenium_firefox.py index 6081d78d..29ba3a36 100644 --- a/openwpm/deploy_browsers/selenium_firefox.py +++ b/openwpm/deploy_browsers/selenium_firefox.py @@ -46,15 +46,13 @@ class FirefoxLogInterceptor(threading.Thread): """ Intercept logs from Selenium and/or geckodriver, using a named pipe and a detached thread, and feed them to the primary logger for this - instance. Also responsible for extracting the _real_ profile location - from geckodriver's log output (geckodriver copies the profile). + instance. """ - def __init__(self, browser_id, profile_path): + def __init__(self, browser_id): threading.Thread.__init__(self, name="log-interceptor-%i" % browser_id) self.browser_id = browser_id self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_") - self.profile_path = profile_path self.daemon = True self.logger = logging.getLogger("openwpm") @@ -68,11 +66,6 @@ class FirefoxLogInterceptor(threading.Thread): self.logger.debug( "BROWSER %i: driver: %s" % (self.browser_id, line.strip()) ) - if "Using profile path" in line: - self.profile_path = line.partition("Using profile path")[ - -1 - ].strip() - if self.fifo is not None: os.unlink(self.fifo) self.fifo = None From 403185a38a1547c61f426c0fd78709ea865aea37 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 11:03:10 +0200 Subject: [PATCH 06/22] Simplify profile location handling 1. In `deploy_firefox` do not use `driver.capabilities["moz:profile"]` to get the profile location. Custom profiles, unlike profiles created via `FirefoxProfile`, are used in-place, so we already know the location. 2. In `launch_browser_manager`, `spawned_profile_path` and `driver_profile_path` point to the same location now that we are using a custom profile. Replace them with a single `browser_profile_path` variable. 3. 
Rename `prof_folder` and `browser_profile_folder` to `browser_profile_path` for consistency. 4. Improve naming of the temporary Firefox profile. --- openwpm/browser_manager.py | 25 ++++++++---------- openwpm/commands/profile_commands.py | 31 +++++++++++------------ openwpm/deploy_browsers/deploy_firefox.py | 4 +-- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 8e10c044..bd675483 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -166,8 +166,8 @@ class Browser: # Read success status of browser manager launch_status = dict() try: - # 1. Selenium profile created - spawned_profile_path = check_queue(launch_status) + # 1. Browser profile created + browser_profile_path = check_queue(launch_status) # 2. Profile tar loaded (if necessary) check_queue(launch_status) # 3. Display launched (if necessary) @@ -177,7 +177,7 @@ class Browser: # 5. Browser launched self.geckodriver_pid = check_queue(launch_status) - (driver_profile_path, ready) = check_queue(launch_status) + ready = check_queue(launch_status) if ready != "READY": self.logger.error( "BROWSER %i: Mismatch of status queue return values, " @@ -209,7 +209,7 @@ class Browser: ) self.close_browser_manager() if "Profile Created" in launch_status: - shutil.rmtree(spawned_profile_path, ignore_errors=True) + shutil.rmtree(browser_profile_path, ignore_errors=True) # If the browser spawned successfully, we should update the # current profile path class variable and clean up the tempdir @@ -217,9 +217,7 @@ class Browser: if success: self.logger.debug("BROWSER %i: Browser spawn sucessful!" 
% self.browser_id) previous_profile_path = self.current_profile_path - self.current_profile_path = driver_profile_path - if driver_profile_path != spawned_profile_path: - shutil.rmtree(spawned_profile_path, ignore_errors=True) + self.current_profile_path = browser_profile_path if previous_profile_path is not None: shutil.rmtree(previous_profile_path, ignore_errors=True) if tempdir is not None: @@ -447,7 +445,7 @@ def BrowserManager( display = None try: # Start Xvfb (if necessary), webdriver, and browser - driver, prof_folder, display = deploy_firefox.deploy_firefox( + driver, browser_profile_path, display = deploy_firefox.deploy_firefox( status_queue, browser_params, manager_params, crash_recovery ) @@ -456,11 +454,11 @@ def BrowserManager( if browser_params.extension_enabled: logger.debug( "BROWSER %i: Looking for extension port information " - "in %s" % (browser_params.browser_id, prof_folder) + "in %s" % (browser_params.browser_id, browser_profile_path) ) elapsed = 0 port = None - ep_filename = prof_folder / "extension_port.txt" + ep_filename = browser_profile_path / "extension_port.txt" while elapsed < 5: try: with open(ep_filename, "rt") as f: @@ -487,10 +485,9 @@ def BrowserManager( logger.debug("BROWSER %i: BrowserManager ready." 
% browser_params.browser_id) - # passes the profile folder back to the - # TaskManager to signal a successful startup - status_queue.put(("STATUS", "Browser Ready", (prof_folder, "READY"))) - browser_params.profile_path = prof_folder + # passes "READY" to the TaskManager to signal a successful startup + status_queue.put(("STATUS", "Browser Ready", "READY")) + browser_params.profile_path = browser_profile_path # starts accepting arguments until told to die while True: diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index 7a6ff9db..d86573cf 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -18,8 +18,8 @@ logger = logging.getLogger("openwpm") class DumpProfileCommand(BaseCommand): """ - Dumps a browser profile currently stored in <browser_profile_folder> to - <tar_path> + Dumps a browser profile currently stored in <browser_profile_path> to + <tar_path>. """ def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None: @@ -39,8 +39,8 @@ class DumpProfileCommand(BaseCommand): manager_params: ManagerParamsInternal, extension_socket: Optional[ClientSocket], ) -> None: - browser_profile_folder = browser_params.profile_path - assert browser_profile_folder is not None + browser_profile_path = browser_params.profile_path + assert browser_profile_path is not None assert browser_params.browser_id is not None # Creating the folders if need be @@ -53,7 +53,7 @@ class DumpProfileCommand(BaseCommand): # if this is a dump on close, close the webdriver and wait for checkpoint if self.close_webdriver: webdriver.close() - sleep_until_sqlite_checkpoint(browser_profile_folder) + sleep_until_sqlite_checkpoint(browser_profile_path) # backup and tar profile if self.compress: @@ -64,7 +64,7 @@ class DumpProfileCommand(BaseCommand): "BROWSER %i: Backing up full profile from %s to %s" % ( browser_params.browser_id, - browser_profile_folder, + browser_profile_path, self.tar_path, ) ) @@ -84,7 +84,7 @@ class DumpProfileCommand(BaseCommand): "storage", #
directory for IndexedDB ] for item in storage_vector_files: - full_path = browser_profile_folder / item + full_path = browser_profile_path / item if ( not full_path.is_file() and not full_path.name.endswith("shm") @@ -100,7 +100,7 @@ class DumpProfileCommand(BaseCommand): continue # These are just checkpoint files tar.add(full_path, arcname=item) for item in storage_vector_dirs: - full_path = browser_profile_folder / item + full_path = browser_profile_path / item if not full_path.is_dir(): logger.warning( "BROWSER %i: %s NOT FOUND IN profile folder, skipping." @@ -112,15 +112,14 @@ class DumpProfileCommand(BaseCommand): def load_profile( - browser_profile_folder: Path, + browser_profile_path: Path, manager_params: ManagerParamsInternal, browser_params: BrowserParamsInternal, tar_path: Path, ) -> None: """ - loads a zipped cookie-based profile stored at <tar_path> and - unzips it to <browser_profile_folder>. - The tar will remain unmodified. + Loads a zipped cookie-based profile stored at <tar_path> and unzips + it to <browser_profile_path>. The tar will remain unmodified.
""" assert browser_params.browser_id is not None @@ -132,16 +131,16 @@ def load_profile( % ( browser_params.browser_id, tar_path, - browser_profile_folder, + browser_profile_path, ) ) - shutil.copy(tar_path, browser_profile_folder) - tar_path = browser_profile_folder / tar_path.name + shutil.copy(tar_path, browser_profile_path) + tar_path = browser_profile_path / tar_path.name if tar_path.name.endswith("tar.gz"): f = tarfile.open(tar_path, "r:gz", errorlevel=1) else: f = tarfile.open(tar_path, "r", errorlevel=1) - f.extractall(browser_profile_folder) + f.extractall(browser_profile_path) f.close() tar_path.unlink() logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id) diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 4ce8949f..94553ead 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -34,7 +34,7 @@ def deploy_firefox( root_dir = os.path.dirname(__file__) # directory of this file - browser_profile_path = Path(tempfile.mkdtemp(".firefox_profile")) + browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_")) status_queue.put(("STATUS", "Profile Created", browser_profile_path)) # Use Options instead of FirefoxProfile to set preferences since the @@ -191,4 +191,4 @@ def deploy_firefox( status_queue.put(("STATUS", "Browser Launched", int(pid))) - return driver, Path(driver.capabilities["moz:profile"]), display + return driver, browser_profile_path, display From d2aff836f440e52703a812ead60ef2f47ab1a222 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 11:09:44 +0200 Subject: [PATCH 07/22] Remove unused status string "Proxy Ready" --- openwpm/browser_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index bd675483..ea1bd874 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -190,7 +190,6 @@ class Browser: 
unsuccessful_spawns += 1 error_string = "" status_strings = [ - "Proxy Ready", "Profile Created", "Profile Tar", "Display", From 1d3de72292d861e6f530b692459fd6d302ac5928 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 12:23:06 +0200 Subject: [PATCH 08/22] Reference our own issue instead of geckodriver's --- openwpm/deploy_browsers/configure_firefox.py | 4 ++-- openwpm/deploy_browsers/deploy_firefox.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index e22a9cc0..9cc2cd09 100644 --- a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -5,8 +5,8 @@ import re from pathlib import Path from typing import Any, Dict -# TODO: Remove hardcoded geckodriver default preferences once -# https://github.com/mozilla/geckodriver/issues/1844 is fixed. +# TODO: Remove hardcoded geckodriver default preferences. See +# https://github.com/mozilla/OpenWPM/issues/867 # Source of preferences: # https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/prefs.rs # https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/marionette.rs diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 94553ead..18d4fec1 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -119,8 +119,8 @@ def deploy_firefox( # Geckodriver currently places the user.js file in the wrong profile # directory, so we have to create it manually here. - # TODO: Remove this workaround once - # https://github.com/mozilla/geckodriver/issues/1844 is fixed. + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when + # to remove this workaround. 
# Load existing preferences from the profile's user.js file prefs = configure_firefox.load_existing_prefs(browser_profile_path) # Load default geckodriver preferences @@ -162,8 +162,8 @@ def deploy_firefox( firefox_binary=fb, options=fo, log_path=interceptor.fifo, - # TODO: Remove when https://github.com/mozilla/geckodriver/issues/1844 - # is fixed + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for + # when to remove this service_args=["--marionette-port", str(marionette_port)], ) From f5bacaed84ae624b0a9d602bd4f28770976b8164 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 13:44:36 +0200 Subject: [PATCH 09/22] Update manual_test.py Running manual_test.py resulted in an error because the `xpi()` fixture was called directly. Apply the fix suggested in https://docs.pytest.org/en/stable/deprecations.html#calling-fixtures-directly Also, use a custom profile instead of `FirefoxProfile` and update some docstrings. --- test/conftest.py | 6 +++++- test/manual_test.py | 40 +++++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index e3380193..fe9cd9d7 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -24,13 +24,17 @@ EXTENSION_DIR = os.path.join( pytest_plugins = "test.storage.fixtures" -@pytest.fixture(scope="session") def xpi(): # Creates a new xpi using npm run build. 
print("Building new xpi") subprocess.check_call(["npm", "run", "build"], cwd=EXTENSION_DIR) +@pytest.fixture(name="xpi", scope="session") +def xpi_fixture(): + return xpi() + + @pytest.fixture(scope="session") def server(): """Run an HTTP server during the tests.""" diff --git a/test/manual_test.py b/test/manual_test.py index 0f5afb34..3fcdf9c1 100644 --- a/test/manual_test.py +++ b/test/manual_test.py @@ -1,11 +1,15 @@ import atexit +import shutil import subprocess +import tempfile from os.path import dirname, join, realpath +from pathlib import Path import click import IPython from selenium import webdriver from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.options import Options from openwpm import js_instrumentation as jsi from openwpm.config import BrowserParams @@ -88,7 +92,7 @@ def start_webdriver( Set to True to load browser_params browser_params_file : string Specify the browser_params.json to load. - If None, default params form openwpm/config.py::BrowserParams will be loaded. + If None, default params from openwpm/config.py::BrowserParams will be loaded. 
Returns ------- @@ -110,16 +114,35 @@ def start_webdriver( print("...server shutdown") driver.quit() print("...webdriver closed") + shutil.rmtree(driver.capabilities["moz:profile"], ignore_errors=True) + print("...browser profile removed") atexit.register(cleanup_server) return driver - fp = webdriver.FirefoxProfile() + browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_")) + fo = Options() + fo.add_argument("-profile") + fo.add_argument(str(browser_profile_path)) + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when + # to remove manually creating user.js + prefs = configure_firefox.load_existing_prefs(browser_profile_path) + prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS) + if with_extension: # TODO: Restore preference for log level in a way that works in Fx 57+ # fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all") - configure_firefox.optimize_prefs(fp) - driver = webdriver.Firefox(firefox_binary=fb, firefox_profile=fp) + configure_firefox.optimize_prefs(prefs) + + configure_firefox.save_prefs_to_profile(prefs, browser_profile_path) + driver = webdriver.Firefox( + firefox_binary=fb, + options=fo, + # Use the default Marionette port. + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for + # when to remove this + service_args=["--marionette-port", "2828"], + ) if load_browser_params is True: # There's probably more we could do here # to set more preferences and better emulate @@ -134,8 +157,7 @@ def start_webdriver( js_request_as_string = jsi.clean_js_instrumentation_settings(js_request) browser_params.js_instrument_settings = js_request_as_string - profile_dir = driver.capabilities["moz:profile"] - with open(join(profile_dir, "browser_params.json"), "w") as f: + with open(browser_profile_path / "browser_params.json", "w") as f: f.write(browser_params.to_json()) if with_extension: @@ -192,9 +214,9 @@ def start_webext(): "--browser-params-file", help=""" Specify a browser_params.json file. 
If none provided and - --browser-params is enabled. Default browser_params.json - will be used. Pass an absolute path or a path relative - to the test directory.""", + --browser-params is enabled the default params from + openwpm/config.py::BrowserParams will be loaded. Pass an + absolute path or a path relative to the test directory.""", ) def main(selenium, no_extension, browser_params, browser_params_file): From aa1de922c9b69f19d0400bd4f01a09b4aed47acb Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 15:15:28 +0200 Subject: [PATCH 10/22] Add reminder to update geckodriver prefs --- docs/Release-Checklist.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/Release-Checklist.md b/docs/Release-Checklist.md index e9bd5ee6..1161d24d 100644 --- a/docs/Release-Checklist.md +++ b/docs/Release-Checklist.md @@ -1,6 +1,6 @@ # Release Checklist -We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release +We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release: 1. Upgrade Firefox to the newest version. 1. Go to: https://hg.mozilla.org/releases/mozilla-release/tags. @@ -10,12 +10,13 @@ We aim to release a new version of OpenWPM with each new Firefox release (~1 rel 1. Run `npm update` in `openwpm/Extension/firefox`. 2. Run `npm update` in `openwpm/Extension/webext-instrumentation`. 3. Update python and system dependencies by following the ["managing requirements" instructions](../CONTRIBUTING.md#managing-requirements). -4. Increment the version number in [VERSION](../VERSION) -5. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md) -6. Squash and merge the release PR to master. -7. Publish a new release from https://github.com/mozilla/OpenWPM/releases: +4. 
If a new version of geckodriver is used, check whether the default geckodriver browser preferences in [`openwpm/deploy_browsers/configure_firefox.py`](../openwpm/deploy_browsers/configure_firefox.py#L8L65) need to be updated. +5. Increment the version number in [VERSION](../VERSION) +6. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md) +7. Squash and merge the release PR to master. +8. Publish a new release from https://github.com/mozilla/OpenWPM/releases: 1. Click "Draft a new release". 2. Enter the "Tag version" and "Release title" as `vX.X.X`. 3. In the description: 1. Include the text `Updates OpenWPM to Firefox X` if this release is also a new FF version. - 2. Include a link to the CHANGELOG, e.g. `See the [CHANGELOG]() for details.`. \ No newline at end of file + 2. Include a link to the CHANGELOG, e.g. `See the [CHANGELOG]() for details.`. From 9ea8e8a051a9894c08c2b9ce7a6ac4233cf2ceb5 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 15:37:12 +0200 Subject: [PATCH 11/22] Rename temp dir of crashed browser's profile tar --- openwpm/browser_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index ea1bd874..996068a2 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -104,7 +104,7 @@ class Browser: # to be a tar of the crashed browser's history if self.current_profile_path is not None: # tar contents of crashed profile to a temp dir - tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_") tar_path = Path(tempdir) / "profile.tar.gz" self.browser_params.profile_path = self.current_profile_path From 3f7efc249088c11c32a4740de8a2203b185f0728 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 15 Mar 2021 16:01:47 +0200 Subject: [PATCH 12/22] Skip test_browser_profile_coverage locally --- test/test_crawl.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/test/test_crawl.py b/test/test_crawl.py index 563f2e6d..2857dc2a 100644 --- a/test/test_crawl.py +++ b/test/test_crawl.py @@ -5,6 +5,7 @@ This should be avoided if possible, as controlled tests will be easier to debug. """ +import os import tarfile import domain_utils as du @@ -41,6 +42,10 @@ def get_public_suffix(url): return url_parts[-1] +@pytest.mark.skipif( + "CI" not in os.environ or os.environ["CI"] == "false", + reason="Makes remote connections", +) @pytest.mark.slow def test_browser_profile_coverage(default_params, task_manager_creator): """Test the coverage of the browser's profile. From 1e165133703482c58f40372452ed5edcf06e5d56 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Tue, 16 Mar 2021 17:10:50 +0200 Subject: [PATCH 13/22] Improve profile dumping logic Move the core implementation of profile dumping into a `dump_profile` function, which can be used both internally when closing or restarting a crashed browser and from the `execute()` method of `DumpProfileCommand`. Also, make compression the default in `DumpProfileCommand`. Finally, do not compress the tar archive of the crashed browser's profile when restarting from a crash. We should avoid the extra compression/ decompression step as this is a short-lived tar file. 
--- openwpm/browser_manager.py | 27 ++--- openwpm/commands/profile_commands.py | 149 +++++++++++++++------------ 2 files changed, 90 insertions(+), 86 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 996068a2..662f0043 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -18,7 +18,7 @@ from multiprocess import Queue from selenium.common.exceptions import WebDriverException from tblib import pickling_support -from .commands.profile_commands import DumpProfileCommand +from .commands.profile_commands import dump_profile from .commands.types import BaseCommand, ShutdownSignal from .config import BrowserParamsInternal, ManagerParamsInternal from .deploy_browsers import deploy_firefox @@ -105,17 +105,13 @@ class Browser: if self.current_profile_path is not None: # tar contents of crashed profile to a temp dir tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_") - tar_path = Path(tempdir) / "profile.tar.gz" + tar_path = Path(tempdir) / "profile.tar" - self.browser_params.profile_path = self.current_profile_path - dump_profile_command = DumpProfileCommand( - tar_path=tar_path, close_webdriver=False, compress=True - ) - dump_profile_command.execute( - webdriver=None, + dump_profile( + browser_profile_path=self.current_profile_path, + tar_path=tar_path, + compress=False, browser_params=self.browser_params, - manager_params=self.manager_params, - extension_socket=None, ) # make sure browser loads crashed profile @@ -412,17 +408,12 @@ class Browser: % (self.browser_id, self.browser_params.profile_archive_dir) ) tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz" - self.browser_params.profile_path = self.current_profile_path - dump_profile_command = DumpProfileCommand( + assert self.current_profile_path is not None + dump_profile( + browser_profile_path=self.current_profile_path, tar_path=tar_path, - close_webdriver=False, compress=True, - ) - dump_profile_command.execute( - webdriver=None, 
browser_params=self.browser_params, - manager_params=self.manager_params, - extension_socket=None, ) # Clean up temporary files diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index d86573cf..ea29206f 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -2,7 +2,6 @@ import logging import shutil import tarfile from pathlib import Path -from typing import Optional from selenium.webdriver import Firefox @@ -16,13 +15,85 @@ from .utils.firefox_profile import sleep_until_sqlite_checkpoint logger = logging.getLogger("openwpm") +def dump_profile( + browser_profile_path: Path, + tar_path: Path, + compress: bool, + browser_params: BrowserParamsInternal, +) -> None: + """Dumps a browser profile to a tar file.""" + assert browser_params.browser_id is not None + + # Creating the folders if need be + tar_path.parent.mkdir(exist_ok=True, parents=True) + + # see if this file exists first + # if it does, delete it before we try to save the current session + if tar_path.exists(): + tar_path.unlink() + + # backup and tar profile + if compress: + tar = tarfile.open(tar_path, "w:gz", errorlevel=1) + else: + tar = tarfile.open(tar_path, "w", errorlevel=1) + logger.debug( + "BROWSER %i: Backing up full profile from %s to %s" + % (browser_params.browser_id, browser_profile_path, tar_path) + ) + + storage_vector_files = [ + "cookies.sqlite", # cookies + "cookies.sqlite-shm", + "cookies.sqlite-wal", + "places.sqlite", # history + "places.sqlite-shm", + "places.sqlite-wal", + "webappsstore.sqlite", # localStorage + "webappsstore.sqlite-shm", + "webappsstore.sqlite-wal", + ] + storage_vector_dirs = [ + "webapps", # related to localStorage? 
+ "storage", # directory for IndexedDB + ] + for item in storage_vector_files: + full_path = browser_profile_path / item + if ( + not full_path.is_file() + and not full_path.name.endswith("shm") + and not full_path.name.endswith("wal") + ): + logger.critical( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." + % (browser_params.browser_id, full_path) + ) + elif not full_path.is_file() and ( + full_path.name.endswith("shm") or full_path.name.endswith("wal") + ): + continue # These are just checkpoint files + tar.add(full_path, arcname=item) + for item in storage_vector_dirs: + full_path = browser_profile_path / item + if not full_path.is_dir(): + logger.warning( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." + % (browser_params.browser_id, full_path) + ) + continue + tar.add(full_path, arcname=item) + tar.close() + + class DumpProfileCommand(BaseCommand): """ Dumps a browser profile currently stored in <browser_params.profile_path> to <tar_path>. """ - def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None: + def __init__( + self, tar_path: Path, close_webdriver: bool, compress: bool = True + ) -> None: self.tar_path = tar_path self.close_webdriver = close_webdriver self.compress = compress @@ -37,78 +108,20 @@ class DumpProfileCommand(BaseCommand): webdriver: Firefox, browser_params: BrowserParamsInternal, manager_params: ManagerParamsInternal, - extension_socket: Optional[ClientSocket], + extension_socket: ClientSocket, ) -> None: - browser_profile_path = browser_params.profile_path - assert browser_profile_path is not None - assert browser_params.browser_id is not None - - # Creating the folders if need be - self.tar_path.parent.mkdir(exist_ok=True, parents=True) - - # see if this file exists first - # if it does, delete it before we try to save the current session - if self.tar_path.exists(): - self.tar_path.unlink() # IDK why it's called like this # if this is a dump on close, close the webdriver and wait for checkpoint if self.close_webdriver: 
webdriver.close() - sleep_until_sqlite_checkpoint(browser_profile_path) + sleep_until_sqlite_checkpoint(browser_params.profile_path) - # backup and tar profile - if self.compress: - tar = tarfile.open(self.tar_path, "w:gz", errorlevel=1) - else: - tar = tarfile.open(self.tar_path, "w", errorlevel=1) - logger.debug( - "BROWSER %i: Backing up full profile from %s to %s" - % ( - browser_params.browser_id, - browser_profile_path, - self.tar_path, - ) + assert browser_params.profile_path is not None + dump_profile( + browser_params.profile_path, + self.tar_path, + self.compress, + browser_params, ) - storage_vector_files = [ - "cookies.sqlite", # cookies - "cookies.sqlite-shm", - "cookies.sqlite-wal", - "places.sqlite", # history - "places.sqlite-shm", - "places.sqlite-wal", - "webappsstore.sqlite", # localStorage - "webappsstore.sqlite-shm", - "webappsstore.sqlite-wal", - ] - storage_vector_dirs = [ - "webapps", # related to localStorage? - "storage", # directory for IndexedDB - ] - for item in storage_vector_files: - full_path = browser_profile_path / item - if ( - not full_path.is_file() - and not full_path.name.endswith("shm") - and not full_path.name.endswith("wal") - ): - logger.critical( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (browser_params.browser_id, full_path) - ) - elif not full_path.is_file() and ( - full_path.name.endswith("shm") or full_path.name.endswith("wal") - ): - continue # These are just checkpoint files - tar.add(full_path, arcname=item) - for item in storage_vector_dirs: - full_path = browser_profile_path / item - if not full_path.is_dir(): - logger.warning( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." 
- % (browser_params.browser_id, full_path) - ) - continue - tar.add(full_path, arcname=item) - tar.close() def load_profile( From 3b4219d0f98049d0689987f84e81f3957a4d8faa Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Fri, 19 Mar 2021 21:09:20 +0200 Subject: [PATCH 14/22] Add some type annotations --- openwpm/command_sequence.py | 9 ++++++++- openwpm/deploy_browsers/configure_firefox.py | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index 18b9a150..2cb951c9 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Callable, List, Tuple from .commands.browser_commands import ( @@ -87,7 +88,13 @@ class CommandSequence: self._commands_with_timeout.append((command, timeout)) self.contains_get_or_browse = True - def dump_profile(self, tar_path, close_webdriver=False, compress=True, timeout=120): + def dump_profile( + self, + tar_path: Path, + close_webdriver: bool = False, + compress: bool = True, + timeout: int = 120, + ) -> None: """ dumps from the profile path to a given file (absolute path) """ self.total_timeout += timeout command = DumpProfileCommand(tar_path, close_webdriver, compress) diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index 9cc2cd09..5756f2a1 100644 --- a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -5,6 +5,8 @@ import re from pathlib import Path from typing import Any, Dict +from ..config import BrowserParams + # TODO: Remove hardcoded geckodriver default preferences. 
See # https://github.com/mozilla/OpenWPM/issues/867 # Source of preferences: @@ -97,7 +99,7 @@ def save_prefs_to_profile(prefs: Dict[str, Any], browser_profile_path: Path) -> f.write('user_pref("%s", %s);\n' % (key, json.dumps(value))) -def privacy(browser_params, prefs): +def privacy(browser_params: BrowserParams, prefs: Dict[str, Any]) -> None: """ Configure the privacy settings in Firefox. This includes: * DNT @@ -127,7 +129,7 @@ def privacy(browser_params, prefs): ) -def optimize_prefs(prefs): +def optimize_prefs(prefs: Dict[str, Any]) -> None: """ Disable various features and checks the browser will do on startup. Some of these (e.g. disabling the newtab page) are required to prevent From a19b12478be0c3fe96118bc861666c7c3d5043ae Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Fri, 19 Mar 2021 21:14:51 +0200 Subject: [PATCH 15/22] Remove `reset=True` from tests --- test/test_callback.py | 2 +- test/test_profile.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_callback.py b/test/test_callback.py index 47230763..48746a25 100644 --- a/test/test_callback.py +++ b/test/test_callback.py @@ -17,7 +17,7 @@ def test_local_callbacks(default_params, task_manager_creator): my_list: List[int] = [] sequence = CommandSequence( - TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list) + TEST_SITE, blocking=True, callback=partial(callback, my_list) ) sequence.get() diff --git a/test/test_profile.py b/test/test_profile.py index 41166e48..4a3d3505 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -87,7 +87,7 @@ def test_seed_persistance(default_params, task_manager_creator): command_sequences = [] for _ in range(2): - cs = CommandSequence(url="https://example.com", reset=True) + cs = CommandSequence(url="https://example.com") cs.get() cs.append_command(AssertConfigSetCommand("test_pref", True)) command_sequences.append(cs) From c51f9e56bffd12ab57a8930b9afde01c6dfdfd85 Mon Sep 17 00:00:00 2001 From: Georgia 
Kokkinou Date: Fri, 19 Mar 2021 22:17:06 +0200 Subject: [PATCH 16/22] Fix minor typos --- demo.py | 2 +- openwpm/browser_manager.py | 10 +++++----- openwpm/command_sequence.py | 12 ++++++------ openwpm/deploy_browsers/configure_firefox.py | 2 +- test/test_callback.py | 2 +- test/test_profile.py | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/demo.py b/demo.py index 4c6ef23a..7feaac7e 100644 --- a/demo.py +++ b/demo.py @@ -73,5 +73,5 @@ with TaskManager( # Have a look at custom_command.py to see how to implement your own command command_sequence.append_command(LinkCountingCommand()) - # Run commands across the three browsers (simple parallelization) + # Run commands across all browsers (simple parallelization) manager.execute_command_sequence(command_sequence) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 662f0043..1341b743 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -65,7 +65,7 @@ class Browser: # Queues and process IDs for BrowserManager - # thread to run commands issues from TaskManager + # thread to run commands issued from TaskManager self.command_thread: Optional[threading.Thread] = None # queue for passing command tuples to BrowserManager self.command_queue: Optional[Queue] = None @@ -78,7 +78,7 @@ class Browser: # the port of the display for the Xvfb display (if it exists) self.display_port: Optional[int] = None - # boolean that says if the BrowserManager new (to optimize restarts) + # boolean that says if the BrowserManager is new (to optimize restarts) self.is_fresh = True # boolean indicating if the browser should be restarted self.restart_required = False @@ -210,7 +210,7 @@ class Browser: # current profile path class variable and clean up the tempdir # and previous profile path. if success: - self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.browser_id) + self.logger.debug("BROWSER %i: Browser spawn successful!" 
% self.browser_id) previous_profile_path = self.current_profile_path self.current_profile_path = browser_profile_path if previous_profile_path is not None: @@ -360,7 +360,7 @@ class Browser: os.kill(self.display_pid, signal.SIGKILL) except OSError: self.logger.debug( - "BROWSER %i: Display process does not " "exit" % self.browser_id + "BROWSER %i: Display process does not exit" % self.browser_id ) pass except TypeError: @@ -368,7 +368,7 @@ class Browser: "BROWSER %i: PID may not be the correct " "type %s" % (self.browser_id, str(self.display_pid)) ) - if self.display_port is not None: # xvfb diplay lock + if self.display_port is not None: # xvfb display lock lockfile = "/tmp/.X%s-lock" % self.display_port try: os.remove(lockfile) diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index 2cb951c9..ac79dce5 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -20,7 +20,7 @@ class CommandSequence: """A CommandSequence wraps a series of commands to be performed on a visit to one top-level site into one logical "site visit," keyed by a visit id. 
An example of a CommandSequence - that visits a page and dumps cookies modified on that visit would be: + that visits a page and saves a screenshot of it would be: sequence = CommandSequence(url) sequence.get() @@ -105,7 +105,7 @@ class CommandSequence: self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the save screenshot command", + "No get or browse request preceding the save screenshot command", self, ) command = SaveScreenshotCommand(suffix) @@ -133,7 +133,7 @@ class CommandSequence: self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding the dump page source command", + "No get or browse request preceding the screenshot full page command", self, ) command = ScreenshotFullPageCommand(suffix) @@ -144,7 +144,7 @@ class CommandSequence: self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the dump page source command", self, ) command = DumpPageSourceCommand(suffix) @@ -173,7 +173,8 @@ class CommandSequence: self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the recursive dump" + " page source command", self, ) command = RecursiveDumpPageSourceCommand(suffix) @@ -190,7 +191,6 @@ class CommandSequence: """Returns a list of all commands in the command_sequence appended by a finalize command """ - commands = list(self._commands_with_timeout) commands.insert(0, (InitializeCommand(), 10)) commands.append((FinalizeCommand(sleep=5), 10)) diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index 5756f2a1..ce6d39fe 100644 --- 
a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -241,7 +241,7 @@ def optimize_prefs(prefs: Dict[str, Any]) -> None: prefs["app.shield.optoutstudies.enabled"] = False prefs["extensions.shield-recipe-client.enabled"] = False - # Disable Source Pragams + # Disable Source Pragmas # As per https://bugzilla.mozilla.org/show_bug.cgi?id=1628853 # sourceURL can be used to obfuscate the original origin of # a script, we disable it. diff --git a/test/test_callback.py b/test/test_callback.py index 48746a25..51a6c351 100644 --- a/test/test_callback.py +++ b/test/test_callback.py @@ -7,7 +7,7 @@ from .utilities import BASE_TEST_URL def test_local_callbacks(default_params, task_manager_creator): - """Test test the storage controller as well as the entire callback machinery + """Test the storage controller as well as the entire callback machinery to see if all callbacks get correctly called""" manager, _ = task_manager_creator(default_params) TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" diff --git a/test/test_profile.py b/test/test_profile.py index 4a3d3505..a9364f26 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -78,7 +78,7 @@ def test_profile_saved_when_launch_crashes(default_params, task_manager_creator) assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() -def test_seed_persistance(default_params, task_manager_creator): +def test_seed_persistence(default_params, task_manager_creator): manager_params, browser_params = default_params p = Path("profile.tar.gz") for browser_param in browser_params: From 9e8298c455f184560598d13ab3abc84673490ee6 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Sun, 21 Mar 2021 13:57:06 +0200 Subject: [PATCH 17/22] Fix test_browser_profile_coverage Use the public suffix + 1 instead of the public suffix when comparing the domains in the crawl database with those in the profile history. 
Also, update an incorrectly formed query to the crawl database. --- test/test_crawl.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/test/test_crawl.py b/test/test_crawl.py index 2857dc2a..d9235ede 100644 --- a/test/test_crawl.py +++ b/test/test_crawl.py @@ -5,6 +5,7 @@ This should be avoided if possible, as controlled tests will be easier to debug. """ +import json import os import tarfile @@ -37,11 +38,6 @@ TEST_SITES = [ ] -def get_public_suffix(url): - url_parts = du.hostname_subparts(url, include_ps=True) - return url_parts[-1] - - @pytest.mark.skipif( "CI" not in os.environ or os.environ["CI"] == "false", reason="Makes remote connections", @@ -78,16 +74,18 @@ def test_browser_profile_coverage(default_params, task_manager_creator): rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") req_ps = set() # visited domains from http_requests table for (url,) in rows: - req_ps.add(get_public_suffix(url)) + req_ps.add(du.get_ps_plus_1(url)) hist_ps = set() # visited domains from crawl_history Table statuses = dict() rows = db_utils.query_db( crawl_db, - "SELECT arguments, command_status FROM crawl_history WHERE command='GET'", + "SELECT arguments, command_status FROM crawl_history WHERE" + " command='GetCommand'", ) - for url, command_status in rows: - ps = get_public_suffix(url) + for arguments, command_status in rows: + url = json.loads(arguments)["url"] + ps = du.get_ps_plus_1(url) hist_ps.add(ps) statuses[ps] = command_status @@ -96,7 +94,7 @@ def test_browser_profile_coverage(default_params, task_manager_creator): rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") for (host,) in rows: try: - profile_ps.add(get_public_suffix(host)) + profile_ps.add(du.get_ps_plus_1(host)) except AttributeError: pass From e536c630cc6da5e5d32cd5a56d10f42e266348d7 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Sun, 21 Mar 2021 14:43:18 +0200 Subject: [PATCH 18/22] Add comment for Marionette port race condition 
--- openwpm/deploy_browsers/deploy_firefox.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 18d4fec1..826b8830 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -126,6 +126,8 @@ def deploy_firefox( # Load default geckodriver preferences prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS) # Pick an available port for Marionette (https://stackoverflow.com/a/2838309) + # This has a race condition, as another process may get the port + # before Marionette, but we don't expect it to happen often s = socket.socket() s.bind(("", 0)) marionette_port = s.getsockname()[1] From 06b83596ab35646f0dddea85ae2bea0a6511f06c Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Sun, 21 Mar 2021 22:10:36 +0200 Subject: [PATCH 19/22] Do not copy tar before extracting in load_profile --- openwpm/commands/profile_commands.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index ea29206f..f4483e9d 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -134,28 +134,16 @@ def load_profile( Loads a zipped cookie-based profile stored at and unzips it to . The tar will remain unmodified. 
""" - assert browser_params.browser_id is not None try: assert tar_path.is_file() - # Copy and untar the loaded profile - logger.debug( - "BROWSER %i: Copying profile tar from %s to %s" - % ( - browser_params.browser_id, - tar_path, - browser_profile_path, - ) - ) - shutil.copy(tar_path, browser_profile_path) - tar_path = browser_profile_path / tar_path.name + # Untar the loaded profile if tar_path.name.endswith("tar.gz"): f = tarfile.open(tar_path, "r:gz", errorlevel=1) else: f = tarfile.open(tar_path, "r", errorlevel=1) f.extractall(browser_profile_path) f.close() - tar_path.unlink() logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id) except Exception as ex: From 49965db164e424f174dd749a87902ad95e10db63 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 22 Mar 2021 12:54:24 +0200 Subject: [PATCH 20/22] Use local test server in profile tests --- test/test_profile.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_profile.py b/test/test_profile.py index a9364f26..32839fa4 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -8,6 +8,8 @@ from openwpm.commands.types import BaseCommand from openwpm.errors import CommandExecutionError, ProfileLoadError from openwpm.utilities import db_utils +from .utilities import BASE_TEST_URL + # TODO update these tests to make use of blocking commands @@ -18,7 +20,7 @@ def test_saving(default_params, task_manager_creator): manager_params.data_directory / "browser_profile" ) manager, _ = task_manager_creator((manager_params, browser_params[:1])) - manager.get("http://example.com") + manager.get(BASE_TEST_URL) manager.close() assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() @@ -32,7 +34,7 @@ def test_crash_profile(default_params, task_manager_creator): ) manager, _ = task_manager_creator((manager_params, browser_params[:1])) try: - manager.get("http://example.com") # So we have a profile + manager.get(BASE_TEST_URL) # So we 
have a profile manager.get("example.com") # Selenium requires scheme prefix manager.get("example.com") # Selenium requires scheme prefix manager.get("example.com") # Selenium requires scheme prefix @@ -60,7 +62,7 @@ def test_profile_saved_when_launch_crashes(default_params, task_manager_creator) browser_params[0].proxy = True browser_params[0].save_content = "script" manager, _ = task_manager_creator((manager_params, browser_params[:1])) - manager.get("http://example.com") + manager.get(BASE_TEST_URL) # Kill the LevelDBAggregator # This will cause the proxy launch to crash @@ -71,7 +73,7 @@ def test_profile_saved_when_launch_crashes(default_params, task_manager_creator) # The browser will fail to launch due to the proxy crashes try: - manager.get("http://example.com") + manager.get(BASE_TEST_URL) except CommandExecutionError: pass manager.close() @@ -87,7 +89,7 @@ def test_seed_persistence(default_params, task_manager_creator): command_sequences = [] for _ in range(2): - cs = CommandSequence(url="https://example.com") + cs = CommandSequence(url=BASE_TEST_URL) cs.get() cs.append_command(AssertConfigSetCommand("test_pref", True)) command_sequences.append(cs) From 9a21d86e8f283dbf9bd733566e01348ada7681c4 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 22 Mar 2021 15:57:01 +0200 Subject: [PATCH 21/22] Simplify PatchedGeckoDriverService class Make `PatchedGeckoDriverService` class subclass selenium.webdriver.firefox.service.Service instead of selenium.webdriver.common.service.Service, so that we only have to keep track of the changes in the `__init__()` method of the former class. 
--- openwpm/deploy_browsers/selenium_firefox.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/openwpm/deploy_browsers/selenium_firefox.py b/openwpm/deploy_browsers/selenium_firefox.py index 29ba3a36..18a89b6d 100644 --- a/openwpm/deploy_browsers/selenium_firefox.py +++ b/openwpm/deploy_browsers/selenium_firefox.py @@ -13,6 +13,7 @@ from selenium.webdriver.common.service import Service as BaseService from selenium.webdriver.firefox import webdriver as FirefoxDriverModule from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.service import Service as FirefoxService __all__ = ["FirefoxBinary", "FirefoxLogInterceptor", "Options"] @@ -76,7 +77,7 @@ class FirefoxLogInterceptor(threading.Thread): self.fifo = None -class PatchedGeckoDriverService(BaseService): +class PatchedGeckoDriverService(FirefoxService): """Object that manages the starting and stopping of the GeckoDriver. 
Modified from the original (selenium.webdriver.firefox.service.Service) for Py3 compat in the presence of log FIFOs, and for potential future @@ -121,11 +122,5 @@ class PatchedGeckoDriverService(BaseService): ) self.service_args = service_args or [] - def command_line_args(self): - return ["--port", "%d" % self.port] + self.service_args - - def send_remote_shutdown_command(self): - pass - FirefoxDriverModule.Service = PatchedGeckoDriverService From 37271ba62d9307df7f9403ff0f2a857813617c85 Mon Sep 17 00:00:00 2001 From: Georgia Kokkinou Date: Mon, 22 Mar 2021 19:22:17 +0200 Subject: [PATCH 22/22] Remove unnecessary import --- openwpm/deploy_browsers/selenium_firefox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openwpm/deploy_browsers/selenium_firefox.py b/openwpm/deploy_browsers/selenium_firefox.py index 18a89b6d..77938c68 100644 --- a/openwpm/deploy_browsers/selenium_firefox.py +++ b/openwpm/deploy_browsers/selenium_firefox.py @@ -13,7 +13,6 @@ from selenium.webdriver.common.service import Service as BaseService from selenium.webdriver.firefox import webdriver as FirefoxDriverModule from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.options import Options -from selenium.webdriver.firefox.service import Service as FirefoxService __all__ = ["FirefoxBinary", "FirefoxLogInterceptor", "Options"] @@ -77,7 +76,7 @@ class FirefoxLogInterceptor(threading.Thread): self.fifo = None -class PatchedGeckoDriverService(FirefoxService): +class PatchedGeckoDriverService(FirefoxDriverModule.Service): """Object that manages the starting and stopping of the GeckoDriver. Modified from the original (selenium.webdriver.firefox.service.Service) for Py3 compat in the presence of log FIFOs, and for potential future