зеркало из https://github.com/openwpm/OpenWPM.git
Merge pull request #864 from boolean5/restore-stateful-crawls
Restore stateful crawling support
This commit is contained in:
Коммит
bfc4644a71
3
demo.py
3
demo.py
|
@ -65,7 +65,6 @@ with TaskManager(
|
|||
command_sequence = CommandSequence(
|
||||
site,
|
||||
site_rank=index,
|
||||
reset=True,
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
|
@ -74,5 +73,5 @@ with TaskManager(
|
|||
# Have a look at custom_command.py to see how to implement your own command
|
||||
command_sequence.append_command(LinkCountingCommand())
|
||||
|
||||
# Run commands across the three browsers (simple parallelization)
|
||||
# Run commands across all browsers (simple parallelization)
|
||||
manager.execute_command_sequence(command_sequence)
|
||||
|
|
|
@ -249,11 +249,6 @@ TODO
|
|||
|
||||
# Browser Profile Support
|
||||
|
||||
**WARNING: Stateful crawls are currently not supported. Attempts to run
|
||||
stateful crawls will throw `NotImplementedError`s. The work required to
|
||||
restore support is tracked in
|
||||
[this project](https://github.com/mozilla/OpenWPM/projects/2).**
|
||||
|
||||
## Stateful vs Stateless crawls
|
||||
|
||||
By default OpenWPM performs a "stateful" crawl, in that it keeps a consistent
|
||||
|
@ -329,7 +324,6 @@ but will not be used during crash recovery. Specifically:
|
|||
profile specified by `seed_tar`. If OpenWPM determines that Firefox needs to
|
||||
restart for some reason during the crawl, it will use the profile from
|
||||
the most recent page visit (pre-crash) rather than the `seed_tar` profile.
|
||||
Note that stateful crawls are currently [unsupported](https://github.com/mozilla/OpenWPM/projects/2)).
|
||||
* For stateless crawls, the initial `seed_tar` will be loaded during each
|
||||
new page visit. Note that this means the profile will very likely be
|
||||
_incomplete_, as cookies or storage may have been set or changed during the
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Release Checklist
|
||||
|
||||
We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release
|
||||
We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release:
|
||||
|
||||
1. Upgrade Firefox to the newest version.
|
||||
1. Go to: https://hg.mozilla.org/releases/mozilla-release/tags.
|
||||
|
@ -11,10 +11,11 @@ We aim to release a new version of OpenWPM with each new Firefox release (~1 rel
|
|||
2. Run `npm update` in `openwpm/Extension/webext-instrumentation`.
|
||||
3. Run `npm update` in the base directory
|
||||
3. Update python and system dependencies by following the ["managing requirements" instructions](../CONTRIBUTING.md#managing-requirements).
|
||||
4. Increment the version number in [VERSION](../VERSION)
|
||||
5. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md)
|
||||
6. Squash and merge the release PR to master.
|
||||
7. Publish a new release from https://github.com/mozilla/OpenWPM/releases:
|
||||
4. If a new version of geckodriver is used, check whether the default geckodriver browser preferences in [`openwpm/deploy_browsers/configure_firefox.py`](../openwpm/deploy_browsers/configure_firefox.py#L8L65) need to be updated.
|
||||
5. Increment the version number in [VERSION](../VERSION)
|
||||
6. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md)
|
||||
7. Squash and merge the release PR to master.
|
||||
8. Publish a new release from https://github.com/mozilla/OpenWPM/releases:
|
||||
1. Click "Draft a new release".
|
||||
2. Enter the "Tag version" and "Release title" as `vX.X.X`.
|
||||
3. In the description:
|
||||
|
|
|
@ -5,9 +5,11 @@ import pickle
|
|||
import shutil
|
||||
import signal
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from queue import Empty as EmptyQueue
|
||||
from typing import Optional, Union
|
||||
|
||||
|
@ -16,6 +18,7 @@ from multiprocess import Queue
|
|||
from selenium.common.exceptions import WebDriverException
|
||||
from tblib import pickling_support
|
||||
|
||||
from .commands.profile_commands import dump_profile
|
||||
from .commands.types import BaseCommand, ShutdownSignal
|
||||
from .config import BrowserParamsInternal, ManagerParamsInternal
|
||||
from .deploy_browsers import deploy_firefox
|
||||
|
@ -33,7 +36,7 @@ pickling_support.install()
|
|||
|
||||
class Browser:
|
||||
"""
|
||||
The Browser class is responsbile for holding all of the
|
||||
The Browser class is responsible for holding all of the
|
||||
configuration and status information on BrowserManager process
|
||||
it corresponds to. It also includes a set of methods for managing
|
||||
the BrowserManager process and its child processes/threads.
|
||||
|
@ -52,7 +55,7 @@ class Browser:
|
|||
self._UNSUCCESSFUL_SPAWN_LIMIT = 4
|
||||
|
||||
# manager parameters
|
||||
self.current_profile_path = None
|
||||
self.current_profile_path: Optional[Path] = None
|
||||
self.db_socket_address = manager_params.storage_controller_address
|
||||
assert browser_params.browser_id is not None
|
||||
self.browser_id: BrowserId = browser_params.browser_id
|
||||
|
@ -62,7 +65,7 @@ class Browser:
|
|||
|
||||
# Queues and process IDs for BrowserManager
|
||||
|
||||
# thread to run commands issues from TaskManager
|
||||
# thread to run commands issued from TaskManager
|
||||
self.command_thread: Optional[threading.Thread] = None
|
||||
# queue for passing command tuples to BrowserManager
|
||||
self.command_queue: Optional[Queue] = None
|
||||
|
@ -75,7 +78,7 @@ class Browser:
|
|||
# the port of the display for the Xvfb display (if it exists)
|
||||
self.display_port: Optional[int] = None
|
||||
|
||||
# boolean that says if the BrowserManager new (to optimize restarts)
|
||||
# boolean that says if the BrowserManager is new (to optimize restarts)
|
||||
self.is_fresh = True
|
||||
# boolean indicating if the browser should be restarted
|
||||
self.restart_required = False
|
||||
|
@ -97,29 +100,29 @@ class Browser:
|
|||
sets up the BrowserManager and gets the process id, browser pid and,
|
||||
if applicable, screen pid. loads associated user profile if necessary
|
||||
"""
|
||||
# Unsupported. See https://github.com/mozilla/OpenWPM/projects/2
|
||||
# if this is restarting from a crash, update the tar location
|
||||
# to be a tar of the crashed browser's history
|
||||
"""
|
||||
if self.current_profile_path is not None:
|
||||
# tar contents of crashed profile to a temp dir
|
||||
tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
|
||||
profile_commands.dump_profile(
|
||||
self.current_profile_path,
|
||||
self.manager_params,
|
||||
self.browser_params,
|
||||
tempdir,
|
||||
close_webdriver=False,
|
||||
tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_")
|
||||
tar_path = Path(tempdir) / "profile.tar"
|
||||
|
||||
dump_profile(
|
||||
browser_profile_path=self.current_profile_path,
|
||||
tar_path=tar_path,
|
||||
compress=False,
|
||||
browser_params=self.browser_params,
|
||||
)
|
||||
|
||||
# make sure browser loads crashed profile
|
||||
self.browser_params.recovery_tar = tempdir
|
||||
self.browser_params.recovery_tar = tar_path
|
||||
|
||||
crash_recovery = True
|
||||
else:
|
||||
"""
|
||||
tempdir = None
|
||||
crash_recovery = False
|
||||
|
||||
self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
|
||||
tempdir = None
|
||||
crash_recovery = False
|
||||
self.is_fresh = not crash_recovery
|
||||
|
||||
# Try to spawn the browser within the timelimit
|
||||
|
@ -159,8 +162,8 @@ class Browser:
|
|||
# Read success status of browser manager
|
||||
launch_status = dict()
|
||||
try:
|
||||
# 1. Selenium profile created
|
||||
spawned_profile_path = check_queue(launch_status)
|
||||
# 1. Browser profile created
|
||||
browser_profile_path = check_queue(launch_status)
|
||||
# 2. Profile tar loaded (if necessary)
|
||||
check_queue(launch_status)
|
||||
# 3. Display launched (if necessary)
|
||||
|
@ -170,7 +173,7 @@ class Browser:
|
|||
# 5. Browser launched
|
||||
self.geckodriver_pid = check_queue(launch_status)
|
||||
|
||||
(driver_profile_path, ready) = check_queue(launch_status)
|
||||
ready = check_queue(launch_status)
|
||||
if ready != "READY":
|
||||
self.logger.error(
|
||||
"BROWSER %i: Mismatch of status queue return values, "
|
||||
|
@ -183,7 +186,6 @@ class Browser:
|
|||
unsuccessful_spawns += 1
|
||||
error_string = ""
|
||||
status_strings = [
|
||||
"Proxy Ready",
|
||||
"Profile Created",
|
||||
"Profile Tar",
|
||||
"Display",
|
||||
|
@ -202,17 +204,15 @@ class Browser:
|
|||
)
|
||||
self.close_browser_manager()
|
||||
if "Profile Created" in launch_status:
|
||||
shutil.rmtree(spawned_profile_path, ignore_errors=True)
|
||||
shutil.rmtree(browser_profile_path, ignore_errors=True)
|
||||
|
||||
# If the browser spawned successfully, we should update the
|
||||
# current profile path class variable and clean up the tempdir
|
||||
# and previous profile path.
|
||||
if success:
|
||||
self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.browser_id)
|
||||
self.logger.debug("BROWSER %i: Browser spawn successful!" % self.browser_id)
|
||||
previous_profile_path = self.current_profile_path
|
||||
self.current_profile_path = driver_profile_path
|
||||
if driver_profile_path != spawned_profile_path:
|
||||
shutil.rmtree(spawned_profile_path, ignore_errors=True)
|
||||
self.current_profile_path = browser_profile_path
|
||||
if previous_profile_path is not None:
|
||||
shutil.rmtree(previous_profile_path, ignore_errors=True)
|
||||
if tempdir is not None:
|
||||
|
@ -360,7 +360,7 @@ class Browser:
|
|||
os.kill(self.display_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
self.logger.debug(
|
||||
"BROWSER %i: Display process does not " "exit" % self.browser_id
|
||||
"BROWSER %i: Display process does not exit" % self.browser_id
|
||||
)
|
||||
pass
|
||||
except TypeError:
|
||||
|
@ -368,7 +368,7 @@ class Browser:
|
|||
"BROWSER %i: PID may not be the correct "
|
||||
"type %s" % (self.browser_id, str(self.display_pid))
|
||||
)
|
||||
if self.display_port is not None: # xvfb diplay lock
|
||||
if self.display_port is not None: # xvfb display lock
|
||||
lockfile = "/tmp/.X%s-lock" % self.display_port
|
||||
try:
|
||||
os.remove(lockfile)
|
||||
|
@ -394,33 +394,27 @@ class Browser:
|
|||
self.close_browser_manager(force=force)
|
||||
|
||||
# Archive browser profile (if requested)
|
||||
if not during_init and self.browser_params.profile_archive_dir is not None:
|
||||
self.logger.warning(
|
||||
"BROWSER %i: Archiving the browser profile directory is "
|
||||
"currently unsupported. "
|
||||
"See: https://github.com/mozilla/OpenWPM/projects/2" % self.browser_id
|
||||
)
|
||||
"""
|
||||
self.logger.debug(
|
||||
"BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
|
||||
self.browser_id, str(during_init),
|
||||
self.browser_params.profile_archive_dir)
|
||||
)
|
||||
if (not during_init and
|
||||
self.browser_params.profile_archive_dir is not None):
|
||||
self.logger.debug(
|
||||
"BROWSER %i: Archiving browser profile directory to %s" % (
|
||||
self.browser_id,
|
||||
self.browser_params.profile_archive_dir))
|
||||
profile_commands.dump_profile(
|
||||
self.current_profile_path,
|
||||
self.manager_params,
|
||||
self.browser_params,
|
||||
"BROWSER %i: during_init=%s | profile_archive_dir=%s"
|
||||
% (
|
||||
self.browser_id,
|
||||
str(during_init),
|
||||
self.browser_params.profile_archive_dir,
|
||||
close_webdriver=False,
|
||||
compress=True
|
||||
)
|
||||
"""
|
||||
)
|
||||
if not during_init and self.browser_params.profile_archive_dir is not None:
|
||||
self.logger.debug(
|
||||
"BROWSER %i: Archiving browser profile directory to %s"
|
||||
% (self.browser_id, self.browser_params.profile_archive_dir)
|
||||
)
|
||||
tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz"
|
||||
assert self.current_profile_path is not None
|
||||
dump_profile(
|
||||
browser_profile_path=self.current_profile_path,
|
||||
tar_path=tar_path,
|
||||
compress=True,
|
||||
browser_params=self.browser_params,
|
||||
)
|
||||
|
||||
# Clean up temporary files
|
||||
if self.current_profile_path is not None:
|
||||
|
@ -441,22 +435,20 @@ def BrowserManager(
|
|||
display = None
|
||||
try:
|
||||
# Start Xvfb (if necessary), webdriver, and browser
|
||||
driver, prof_folder, display = deploy_firefox.deploy_firefox(
|
||||
driver, browser_profile_path, display = deploy_firefox.deploy_firefox(
|
||||
status_queue, browser_params, manager_params, crash_recovery
|
||||
)
|
||||
if prof_folder[-1] != "/":
|
||||
prof_folder += "/"
|
||||
|
||||
# Read the extension port -- if extension is enabled
|
||||
# TODO: Initial communication from extension to TM should use sockets
|
||||
if browser_params.extension_enabled:
|
||||
logger.debug(
|
||||
"BROWSER %i: Looking for extension port information "
|
||||
"in %s" % (browser_params.browser_id, prof_folder)
|
||||
"in %s" % (browser_params.browser_id, browser_profile_path)
|
||||
)
|
||||
elapsed = 0
|
||||
port = None
|
||||
ep_filename = os.path.join(prof_folder, "extension_port.txt")
|
||||
ep_filename = browser_profile_path / "extension_port.txt"
|
||||
while elapsed < 5:
|
||||
try:
|
||||
with open(ep_filename, "rt") as f:
|
||||
|
@ -483,10 +475,9 @@ def BrowserManager(
|
|||
|
||||
logger.debug("BROWSER %i: BrowserManager ready." % browser_params.browser_id)
|
||||
|
||||
# passes the profile folder back to the
|
||||
# TaskManager to signal a successful startup
|
||||
status_queue.put(("STATUS", "Browser Ready", (prof_folder, "READY")))
|
||||
browser_params.profile_path = prof_folder
|
||||
# passes "READY" to the TaskManager to signal a successful startup
|
||||
status_queue.put(("STATUS", "Browser Ready", "READY"))
|
||||
browser_params.profile_path = browser_profile_path
|
||||
|
||||
# starts accepting arguments until told to die
|
||||
while True:
|
||||
|
@ -498,12 +489,6 @@ def BrowserManager(
|
|||
command: Union[ShutdownSignal, BaseCommand] = command_queue.get()
|
||||
|
||||
if type(command) is ShutdownSignal:
|
||||
# Geckodriver creates a copy of the profile (and the original
|
||||
# temp file created by FirefoxProfile() is deleted).
|
||||
# We clear the profile attribute here to prevent prints from:
|
||||
# https://github.com/SeleniumHQ/selenium/blob/4e4160dd3d2f93757cafb87e2a1c20d6266f5554/py/selenium/webdriver/firefox/webdriver.py#L193-L199
|
||||
if driver.profile and not os.path.isdir(driver.profile.path):
|
||||
driver.profile = None
|
||||
driver.quit()
|
||||
status_queue.put("OK")
|
||||
return
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from pathlib import Path
|
||||
from typing import Callable, List, Tuple
|
||||
|
||||
from .commands.browser_commands import (
|
||||
|
@ -10,6 +11,7 @@ from .commands.browser_commands import (
|
|||
SaveScreenshotCommand,
|
||||
ScreenshotFullPageCommand,
|
||||
)
|
||||
from .commands.profile_commands import DumpProfileCommand
|
||||
from .commands.types import BaseCommand
|
||||
from .errors import CommandExecutionError
|
||||
|
||||
|
@ -18,7 +20,7 @@ class CommandSequence:
|
|||
"""A CommandSequence wraps a series of commands to be performed
|
||||
on a visit to one top-level site into one logical
|
||||
"site visit," keyed by a visit id. An example of a CommandSequence
|
||||
that visits a page and dumps cookies modified on that visit would be:
|
||||
that visits a page and saves a screenshot of it would be:
|
||||
|
||||
sequence = CommandSequence(url)
|
||||
sequence.get()
|
||||
|
@ -87,15 +89,15 @@ class CommandSequence:
|
|||
self.contains_get_or_browse = True
|
||||
|
||||
def dump_profile(
|
||||
self, dump_folder, close_webdriver=False, compress=True, timeout=120
|
||||
):
|
||||
self,
|
||||
tar_path: Path,
|
||||
close_webdriver: bool = False,
|
||||
compress: bool = True,
|
||||
timeout: int = 120,
|
||||
) -> None:
|
||||
""" dumps from the profile path to a given file (absolute path) """
|
||||
raise NotImplementedError(
|
||||
"Profile saving is currently unsupported. "
|
||||
"See: https://github.com/mozilla/OpenWPM/projects/2."
|
||||
)
|
||||
self.total_timeout += timeout
|
||||
command = DumpProfCommand(dump_folder, close_webdriver, compress)
|
||||
command = DumpProfileCommand(tar_path, close_webdriver, compress)
|
||||
self._commands_with_timeout.append((command, timeout))
|
||||
|
||||
def save_screenshot(self, suffix="", timeout=30):
|
||||
|
@ -103,7 +105,7 @@ class CommandSequence:
|
|||
self.total_timeout += timeout
|
||||
if not self.contains_get_or_browse:
|
||||
raise CommandExecutionError(
|
||||
"No get or browse request preceding " "the save screenshot command",
|
||||
"No get or browse request preceding the save screenshot command",
|
||||
self,
|
||||
)
|
||||
command = SaveScreenshotCommand(suffix)
|
||||
|
@ -131,7 +133,7 @@ class CommandSequence:
|
|||
self.total_timeout += timeout
|
||||
if not self.contains_get_or_browse:
|
||||
raise CommandExecutionError(
|
||||
"No get or browse request preceding " "the dump page source command",
|
||||
"No get or browse request preceding the screenshot full page command",
|
||||
self,
|
||||
)
|
||||
command = ScreenshotFullPageCommand(suffix)
|
||||
|
@ -142,7 +144,7 @@ class CommandSequence:
|
|||
self.total_timeout += timeout
|
||||
if not self.contains_get_or_browse:
|
||||
raise CommandExecutionError(
|
||||
"No get or browse request preceding " "the dump page source command",
|
||||
"No get or browse request preceding the dump page source command",
|
||||
self,
|
||||
)
|
||||
command = DumpPageSourceCommand(suffix)
|
||||
|
@ -171,7 +173,8 @@ class CommandSequence:
|
|||
self.total_timeout += timeout
|
||||
if not self.contains_get_or_browse:
|
||||
raise CommandExecutionError(
|
||||
"No get or browse request preceding " "the dump page source command",
|
||||
"No get or browse request preceding the recursive dump"
|
||||
" page source command",
|
||||
self,
|
||||
)
|
||||
command = RecursiveDumpPageSourceCommand(suffix)
|
||||
|
@ -188,7 +191,6 @@ class CommandSequence:
|
|||
"""Returns a list of all commands in the command_sequence
|
||||
appended by a finalize command
|
||||
"""
|
||||
|
||||
commands = list(self._commands_with_timeout)
|
||||
commands.insert(0, (InitializeCommand(), 10))
|
||||
commands.append((FinalizeCommand(sleep=5), 10))
|
||||
|
|
|
@ -15,23 +15,91 @@ from .utils.firefox_profile import sleep_until_sqlite_checkpoint
|
|||
logger = logging.getLogger("openwpm")
|
||||
|
||||
|
||||
def dump_profile(
    browser_profile_path: Path,
    tar_path: Path,
    compress: bool,
    browser_params: BrowserParamsInternal,
) -> None:
    """Dump the storage-related parts of a browser profile to a tar file.

    Only the files and directories that hold browser state (cookies,
    history, localStorage, IndexedDB) are archived, each stored under its
    name relative to the profile root.

    Args:
        browser_profile_path: Root directory of the browser profile to dump.
        tar_path: Destination archive. Missing parent directories are
            created and an already existing file at this path is replaced.
        compress: Write a gzip-compressed archive if True, a plain tar
            otherwise.
        browser_params: Only ``browser_id`` is read, to tag log messages.
    """
    assert browser_params.browser_id is not None

    # Creating the folders if need be
    tar_path.parent.mkdir(exist_ok=True, parents=True)

    # see if this file exists first
    # if it does, delete it before we try to save the current session
    if tar_path.exists():
        tar_path.unlink()

    storage_vector_files = [
        "cookies.sqlite",  # cookies
        "cookies.sqlite-shm",
        "cookies.sqlite-wal",
        "places.sqlite",  # history
        "places.sqlite-shm",
        "places.sqlite-wal",
        "webappsstore.sqlite",  # localStorage
        "webappsstore.sqlite-shm",
        "webappsstore.sqlite-wal",
    ]
    storage_vector_dirs = [
        "webapps",  # related to localStorage?
        "storage",  # directory for IndexedDB
    ]

    # backup and tar profile
    # The context manager guarantees the archive is closed (and its file
    # handle released) even if adding one of the members fails.
    mode = "w:gz" if compress else "w"
    with tarfile.open(tar_path, mode, errorlevel=1) as tar:
        logger.debug(
            "BROWSER %i: Backing up full profile from %s to %s"
            % (browser_params.browser_id, browser_profile_path, tar_path)
        )

        for item in storage_vector_files:
            full_path = browser_profile_path / item
            if not full_path.is_file():
                # -shm / -wal are just checkpoint files and may legitimately
                # be absent; a missing main database is worth reporting.
                if not item.endswith(("shm", "wal")):
                    logger.critical(
                        "BROWSER %i: %s NOT FOUND IN profile folder, skipping."
                        % (browser_params.browser_id, full_path)
                    )
                # Actually skip the missing file, as the log message says,
                # instead of letting tar.add() fail on a nonexistent path.
                continue
            tar.add(full_path, arcname=item)

        for item in storage_vector_dirs:
            full_path = browser_profile_path / item
            if not full_path.is_dir():
                logger.warning(
                    "BROWSER %i: %s NOT FOUND IN profile folder, skipping."
                    % (browser_params.browser_id, full_path)
                )
                continue
            tar.add(full_path, arcname=item)
|
||||
|
||||
|
||||
class DumpProfileCommand(BaseCommand):
|
||||
"""
|
||||
Dumps a browser profile currently stored in <browser_profile_folder> to
|
||||
<tar_path>
|
||||
Dumps a browser profile currently stored in <browser_params.profile_path> to
|
||||
<tar_path>.
|
||||
"""
|
||||
|
||||
def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None:
|
||||
def __init__(
|
||||
self, tar_path: Path, close_webdriver: bool, compress: bool = True
|
||||
) -> None:
|
||||
self.tar_path = tar_path
|
||||
self.close_webdriver = close_webdriver
|
||||
self.compress = compress
|
||||
raise NotImplementedError(
|
||||
"Profile dumping is currently unsupported. "
|
||||
"See: https://github.com/mozilla/OpenWPM/projects/2."
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "DumpProfCommand({},{},{})".format(
|
||||
return "DumpProfileCommand({},{},{})".format(
|
||||
self.tar_path, self.close_webdriver, self.compress
|
||||
)
|
||||
|
||||
|
@ -42,110 +110,40 @@ class DumpProfileCommand(BaseCommand):
|
|||
manager_params: ManagerParamsInternal,
|
||||
extension_socket: ClientSocket,
|
||||
) -> None:
|
||||
browser_profile_folder = browser_params.profile_path
|
||||
assert browser_profile_folder is not None
|
||||
|
||||
# Creating the folders if need be
|
||||
self.tar_path.parent.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
# see if this file exists first
|
||||
# if it does, delete it before we try to save the current session
|
||||
if self.tar_path.exists():
|
||||
self.tar_path.unlink() # IDK why it's called like this
|
||||
# if this is a dump on close, close the webdriver and wait for checkpoint
|
||||
if self.close_webdriver:
|
||||
webdriver.close()
|
||||
sleep_until_sqlite_checkpoint(browser_profile_folder)
|
||||
sleep_until_sqlite_checkpoint(browser_params.profile_path)
|
||||
|
||||
# backup and tar profile
|
||||
if self.compress:
|
||||
tar = tarfile.open(self.tar_path, "w:gz", errorlevel=1)
|
||||
else:
|
||||
tar = tarfile.open(self.tar_path, "w", errorlevel=1)
|
||||
logger.debug(
|
||||
"BROWSER %i: Backing up full profile from %s to %s"
|
||||
% (
|
||||
self.browser_id,
|
||||
browser_profile_folder,
|
||||
self.tar_path,
|
||||
)
|
||||
assert browser_params.profile_path is not None
|
||||
dump_profile(
|
||||
browser_params.profile_path,
|
||||
self.tar_path,
|
||||
self.compress,
|
||||
browser_params,
|
||||
)
|
||||
storage_vector_files = [
|
||||
"cookies.sqlite", # cookies
|
||||
"cookies.sqlite-shm",
|
||||
"cookies.sqlite-wal",
|
||||
"places.sqlite", # history
|
||||
"places.sqlite-shm",
|
||||
"places.sqlite-wal",
|
||||
"webappsstore.sqlite", # localStorage
|
||||
"webappsstore.sqlite-shm",
|
||||
"webappsstore.sqlite-wal",
|
||||
]
|
||||
storage_vector_dirs = [
|
||||
"webapps", # related to localStorage?
|
||||
"storage", # directory for IndexedDB
|
||||
]
|
||||
for item in storage_vector_files:
|
||||
full_path = browser_profile_folder / item
|
||||
if (
|
||||
not full_path.is_file()
|
||||
and not full_path.name.endswith("shm")
|
||||
and not full_path.name.endswith("wal")
|
||||
):
|
||||
logger.critical(
|
||||
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
|
||||
% (self.browser_id, full_path)
|
||||
)
|
||||
elif not full_path.is_file() and (
|
||||
full_path.name.endswith("shm") or full_path.name.endswith("wal")
|
||||
):
|
||||
continue # These are just checkpoint files
|
||||
tar.add(full_path, arcname=item)
|
||||
for item in storage_vector_dirs:
|
||||
full_path = browser_profile_folder / item
|
||||
if not full_path.is_dir():
|
||||
logger.warning(
|
||||
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
|
||||
% (self.browser_id, full_path)
|
||||
)
|
||||
continue
|
||||
tar.add(full_path, arcname=item)
|
||||
tar.close()
|
||||
|
||||
|
||||
def load_profile(
|
||||
browser_profile_folder: Path,
|
||||
browser_profile_path: Path,
|
||||
manager_params: ManagerParamsInternal,
|
||||
browser_params: BrowserParamsInternal,
|
||||
tar_path: Path,
|
||||
) -> None:
|
||||
"""
|
||||
loads a zipped cookie-based profile stored at <tar_location> and
|
||||
unzips it to <browser_profile_folder>.
|
||||
The tar will remain unmodified.
|
||||
Loads a zipped cookie-based profile stored at <tar_path> and unzips
|
||||
it to <browser_profile_path>. The tar will remain unmodified.
|
||||
"""
|
||||
|
||||
assert tar_path.is_file()
|
||||
assert browser_params.browser_id is not None
|
||||
try:
|
||||
# Copy and untar the loaded profile
|
||||
logger.debug(
|
||||
"BROWSER %i: Copying profile tar from %s to %s"
|
||||
% (
|
||||
browser_params.browser_id,
|
||||
tar_path,
|
||||
browser_profile_folder,
|
||||
)
|
||||
)
|
||||
shutil.copy(tar_path, browser_profile_folder)
|
||||
tar_path = browser_profile_folder / tar_path.name
|
||||
assert tar_path.is_file()
|
||||
# Untar the loaded profile
|
||||
if tar_path.name.endswith("tar.gz"):
|
||||
f = tarfile.open(tar_path, "r:gz", errorlevel=1)
|
||||
else:
|
||||
f = tarfile.open(tar_path, "r", errorlevel=1)
|
||||
f.extractall(browser_profile_folder)
|
||||
f.extractall(browser_profile_path)
|
||||
f.close()
|
||||
tar_path.unlink()
|
||||
logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id)
|
||||
|
||||
except Exception as ex:
|
||||
|
|
|
@ -97,7 +97,9 @@ class BrowserParams(DataClassJsonMixin):
|
|||
prefs: dict = field(default_factory=dict)
|
||||
tp_cookies: str = "always"
|
||||
bot_mitigation: bool = False
|
||||
profile_archive_dir: Optional[str] = None
|
||||
profile_archive_dir: Optional[Path] = field(
|
||||
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
|
||||
)
|
||||
recovery_tar: Optional[Path] = None
|
||||
donottrack: bool = False
|
||||
tracking_protection: bool = False
|
||||
|
|
|
@ -1,7 +1,105 @@
|
|||
""" Set prefs and load extensions in Firefox """
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
|
||||
from ..config import BrowserParams
|
||||
|
||||
# TODO: Remove hardcoded geckodriver default preferences. See
|
||||
# https://github.com/mozilla/OpenWPM/issues/867
|
||||
# Source of preferences:
|
||||
# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/prefs.rs
|
||||
# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/marionette.rs
|
||||
DEFAULT_GECKODRIVER_PREFS = {
|
||||
"app.normandy.api_url": "",
|
||||
"app.update.checkInstallTime": False,
|
||||
"app.update.disabledForTesting": True,
|
||||
"app.update.auto": False,
|
||||
"browser.dom.window.dump.enabled": True,
|
||||
"devtools.console.stdout.chrome": True,
|
||||
"browser.safebrowsing.blockedURIs.enabled": False,
|
||||
"browser.safebrowsing.downloads.enabled": False,
|
||||
"browser.safebrowsing.passwords.enabled": False,
|
||||
"browser.safebrowsing.malware.enabled": False,
|
||||
"browser.safebrowsing.phishing.enabled": False,
|
||||
"browser.sessionstore.resume_from_crash": False,
|
||||
"browser.shell.checkDefaultBrowser": False,
|
||||
"browser.startup.homepage_override.mstone": "ignore",
|
||||
"browser.startup.page": 0,
|
||||
"browser.tabs.closeWindowWithLastTab": False,
|
||||
"browser.tabs.warnOnClose": False,
|
||||
"browser.uitour.enabled": False,
|
||||
"browser.warnOnQuit": False,
|
||||
"datareporting.healthreport.documentServerURI": "http://%(server)s/dummy/healthreport/",
|
||||
"datareporting.healthreport.logging.consoleEnabled": False,
|
||||
"datareporting.healthreport.service.enabled": False,
|
||||
"datareporting.healthreport.service.firstRun": False,
|
||||
"datareporting.healthreport.uploadEnabled": False,
|
||||
"datareporting.policy.dataSubmissionEnabled": False,
|
||||
"datareporting.policy.dataSubmissionPolicyBypassNotification": True,
|
||||
"dom.ipc.reportProcessHangs": False,
|
||||
"extensions.autoDisableScopes": 0,
|
||||
"extensions.enabledScopes": 5,
|
||||
"extensions.installDistroAddons": False,
|
||||
"extensions.update.enabled": False,
|
||||
"extensions.update.notifyUser": False,
|
||||
"focusmanager.testmode": True,
|
||||
"general.useragent.updates.enabled": False,
|
||||
"geo.provider.testing": True,
|
||||
"geo.wifi.scan": False,
|
||||
"hangmonitor.timeout": 0,
|
||||
"idle.lastDailyNotification": -1,
|
||||
"javascript.options.showInConsole": True,
|
||||
"media.gmp-manager.updateEnabled": False,
|
||||
"media.sanity-test.disabled": True,
|
||||
"network.http.phishy-userpass-length": 255,
|
||||
"network.manage-offline-status": False,
|
||||
"network.sntp.pools": "%(server)s",
|
||||
"plugin.state.flash": 0,
|
||||
"security.certerrors.mitm.priming.enabled": False,
|
||||
"services.settings.server": "http://%(server)s/dummy/blocklist/",
|
||||
"startup.homepage_welcome_url": "about:blank",
|
||||
"startup.homepage_welcome_url.additional": "",
|
||||
"toolkit.startup.max_resumed_crashes": -1,
|
||||
"marionette.log.level": "Info",
|
||||
}
|
||||
|
||||
|
||||
def load_existing_prefs(browser_profile_path: Path) -> Dict[str, Any]:
    """Load existing user preferences.

    If the browser profile contains a user.js file, load the preferences
    specified inside it into a dictionary.

    Args:
        browser_profile_path: Root directory of the browser profile.

    Returns:
        A mapping from preference name to its value, decoded with
        ``json.loads``. Empty if the profile has no user.js file. Lines
        that are not ``user_pref(...)`` statements are ignored.

    Raises:
        json.JSONDecodeError: If a preference value is not valid JSON.
    """
    prefs: Dict[str, Any] = {}
    prefs_path = browser_profile_path / "user.js"
    if not prefs_path.is_file():
        return prefs
    # Regular expression from https://stackoverflow.com/a/24563687
    r = re.compile(r"\s*user_pref\(([\"'])(.+?)\1,\s*(.+?)\);")
    # Read explicitly as UTF-8 so that non-ASCII preference values are
    # decoded the same way regardless of the platform's locale encoding.
    with open(prefs_path, "r", encoding="utf-8") as f:
        for line in f:
            m = r.match(line)
            if m:
                key, value = m.group(2), m.group(3)
                prefs[key] = json.loads(value)
    return prefs
|
||||
|
||||
|
||||
def save_prefs_to_profile(prefs: Dict[str, Any], browser_profile_path: Path) -> None:
    """Persist a preference dictionary into the browser profile.

    Every entry of ``prefs`` becomes one ``user_pref("<name>", <value>);``
    line in the profile's user.js file, with the value serialized by
    ``json.dumps``. An existing user.js is overwritten.
    """
    user_js_path = browser_profile_path / "user.js"
    with open(user_js_path, "w") as out:
        # Lines are generated lazily so they are serialized and written one
        # preference at a time, in dictionary order.
        out.writelines(
            'user_pref("%s", %s);\n' % (name, json.dumps(setting))
            for name, setting in prefs.items()
        )
|
||||
|
||||
|
||||
def privacy(browser_params: BrowserParams, prefs: Dict[str, Any]) -> None:
|
||||
"""
|
||||
Configure the privacy settings in Firefox. This includes:
|
||||
* DNT
|
||||
|
@ -12,15 +110,15 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
|
|||
|
||||
# Turns on Do Not Track
|
||||
if browser_params.donottrack:
|
||||
fo.set_preference("privacy.donottrackheader.enabled", True)
|
||||
prefs["privacy.donottrackheader.enabled"] = True
|
||||
|
||||
# Sets the third party cookie setting
|
||||
if browser_params.tp_cookies.lower() == "never":
|
||||
fo.set_preference("network.cookie.cookieBehavior", 1)
|
||||
prefs["network.cookie.cookieBehavior"] = 1
|
||||
elif browser_params.tp_cookies.lower() == "from_visited":
|
||||
fo.set_preference("network.cookie.cookieBehavior", 3)
|
||||
prefs["network.cookie.cookieBehavior"] = 3
|
||||
else: # always allow third party cookies
|
||||
fo.set_preference("network.cookie.cookieBehavior", 0)
|
||||
prefs["network.cookie.cookieBehavior"] = 0
|
||||
|
||||
# Tracking Protection
|
||||
if browser_params.tracking_protection:
|
||||
|
@ -31,7 +129,7 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
|
|||
)
|
||||
|
||||
|
||||
def optimize_prefs(fo):
|
||||
def optimize_prefs(prefs: Dict[str, Any]) -> None:
|
||||
"""
|
||||
Disable various features and checks the browser will do on startup.
|
||||
Some of these (e.g. disabling the newtab page) are required to prevent
|
||||
|
@ -42,113 +140,113 @@ def optimize_prefs(fo):
|
|||
* https://github.com/pyllyukko/user.js/blob/master/user.js
|
||||
""" # noqa
|
||||
# Startup / Speed
|
||||
fo.set_preference("browser.shell.checkDefaultBrowser", False)
|
||||
fo.set_preference("browser.slowStartup.notificationDisabled", True)
|
||||
fo.set_preference("browser.slowStartup.maxSamples", 0)
|
||||
fo.set_preference("browser.slowStartup.samples", 0)
|
||||
fo.set_preference("extensions.checkCompatibility.nightly", False)
|
||||
fo.set_preference("browser.rights.3.shown", True)
|
||||
fo.set_preference("reader.parse-on-load.enabled", False)
|
||||
fo.set_preference("browser.pagethumbnails.capturing_disabled", True)
|
||||
fo.set_preference("browser.uitour.enabled", False)
|
||||
fo.set_preference("dom.flyweb.enabled", False)
|
||||
prefs["browser.shell.checkDefaultBrowser"] = False
|
||||
prefs["browser.slowStartup.notificationDisabled"] = True
|
||||
prefs["browser.slowStartup.maxSamples"] = 0
|
||||
prefs["browser.slowStartup.samples"] = 0
|
||||
prefs["extensions.checkCompatibility.nightly"] = False
|
||||
prefs["browser.rights.3.shown"] = True
|
||||
prefs["reader.parse-on-load.enabled"] = False
|
||||
prefs["browser.pagethumbnails.capturing_disabled"] = True
|
||||
prefs["browser.uitour.enabled"] = False
|
||||
prefs["dom.flyweb.enabled"] = False
|
||||
|
||||
# Disable health reports / telemetry / crash reports
|
||||
fo.set_preference("datareporting.policy.dataSubmissionEnabled", False)
|
||||
fo.set_preference("datareporting.healthreport.uploadEnabled", False)
|
||||
fo.set_preference("datareporting.healthreport.service.enabled", False)
|
||||
fo.set_preference("toolkit.telemetry.archive.enabled", False)
|
||||
fo.set_preference("toolkit.telemetry.enabled", False)
|
||||
fo.set_preference("toolkit.telemetry.unified", False)
|
||||
fo.set_preference("breakpad.reportURL", "")
|
||||
fo.set_preference("dom.ipc.plugins.reportCrashURL", False)
|
||||
fo.set_preference("browser.selfsupport.url", "")
|
||||
fo.set_preference("browser.tabs.crashReporting.sendReport", False)
|
||||
fo.set_preference("browser.crashReports.unsubmittedCheck.enabled", False)
|
||||
fo.set_preference("dom.ipc.plugins.flash.subprocess.crashreporter.enabled", False)
|
||||
prefs["datareporting.policy.dataSubmissionEnabled"] = False
|
||||
prefs["datareporting.healthreport.uploadEnabled"] = False
|
||||
prefs["datareporting.healthreport.service.enabled"] = False
|
||||
prefs["toolkit.telemetry.archive.enabled"] = False
|
||||
prefs["toolkit.telemetry.enabled"] = False
|
||||
prefs["toolkit.telemetry.unified"] = False
|
||||
prefs["breakpad.reportURL"] = ""
|
||||
prefs["dom.ipc.plugins.reportCrashURL"] = False
|
||||
prefs["browser.selfsupport.url"] = ""
|
||||
prefs["browser.tabs.crashReporting.sendReport"] = False
|
||||
prefs["browser.crashReports.unsubmittedCheck.enabled"] = False
|
||||
prefs["dom.ipc.plugins.flash.subprocess.crashreporter.enabled"] = False
|
||||
|
||||
# Predictive Actions / Prefetch
|
||||
fo.set_preference("network.predictor.enabled", False)
|
||||
fo.set_preference("network.dns.disablePrefetch", True)
|
||||
fo.set_preference("network.prefetch-next", False)
|
||||
fo.set_preference("browser.search.suggest.enabled", False)
|
||||
fo.set_preference("network.http.speculative-parallel-limit", 0)
|
||||
fo.set_preference("keyword.enabled", False) # location bar using search
|
||||
fo.set_preference("browser.urlbar.userMadeSearchSuggestionsChoice", True)
|
||||
fo.set_preference("browser.casting.enabled", False)
|
||||
prefs["network.predictor.enabled"] = False
|
||||
prefs["network.dns.disablePrefetch"] = True
|
||||
prefs["network.prefetch-next"] = False
|
||||
prefs["browser.search.suggest.enabled"] = False
|
||||
prefs["network.http.speculative-parallel-limit"] = 0
|
||||
prefs["keyword.enabled"] = False # location bar using search
|
||||
prefs["browser.urlbar.userMadeSearchSuggestionsChoice"] = True
|
||||
prefs["browser.casting.enabled"] = False
|
||||
|
||||
# Disable pinging Mozilla for geoip
|
||||
fo.set_preference("browser.search.geoip.url", "")
|
||||
fo.set_preference("browser.search.countryCode", "US")
|
||||
fo.set_preference("browser.search.region", "US")
|
||||
prefs["browser.search.geoip.url"] = ""
|
||||
prefs["browser.search.countryCode"] = "US"
|
||||
prefs["browser.search.region"] = "US"
|
||||
|
||||
# Disable pinging Mozilla for geo-specific search
|
||||
fo.set_preference("browser.search.geoSpecificDefaults", False)
|
||||
fo.set_preference("browser.search.geoSpecificDefaults.url", "")
|
||||
prefs["browser.search.geoSpecificDefaults"] = False
|
||||
prefs["browser.search.geoSpecificDefaults.url"] = ""
|
||||
|
||||
# Disable auto-updating
|
||||
fo.set_preference("app.update.enabled", False) # browser
|
||||
fo.set_preference("app.update.url", "") # browser
|
||||
fo.set_preference("browser.search.update", False) # search
|
||||
fo.set_preference("extensions.update.enabled", False) # extensions
|
||||
fo.set_preference("extensions.update.autoUpdateDefault", False)
|
||||
fo.set_preference("extensions.getAddons.cache.enabled", False)
|
||||
fo.set_preference("lightweightThemes.update.enabled", False) # Personas
|
||||
prefs["app.update.enabled"] = False # browser
|
||||
prefs["app.update.url"] = "" # browser
|
||||
prefs["browser.search.update"] = False # search
|
||||
prefs["extensions.update.enabled"] = False # extensions
|
||||
prefs["extensions.update.autoUpdateDefault"] = False
|
||||
prefs["extensions.getAddons.cache.enabled"] = False
|
||||
prefs["lightweightThemes.update.enabled"] = False # Personas
|
||||
|
||||
# Disable Safebrowsing and other security features
|
||||
# that rely on remote content
|
||||
fo.set_preference("browser.safebrowsing.phising.enabled", False)
|
||||
fo.set_preference("browser.safebrowsing.malware.enabled", False)
|
||||
fo.set_preference("browser.safebrowsing.downloads.enabled", False)
|
||||
fo.set_preference("browser.safebrowsing.downloads.remote.enabled", False)
|
||||
fo.set_preference("browser.safebrowsing.blockedURIs.enabled", False)
|
||||
fo.set_preference("browser.safebrowsing.provider.mozilla.gethashURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.google.gethashURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.google4.gethashURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.mozilla.updateURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.google.updateURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.google4.updateURL", "")
|
||||
fo.set_preference("browser.safebrowsing.provider.mozilla.lists", "") # TP
|
||||
fo.set_preference("browser.safebrowsing.provider.google.lists", "") # TP
|
||||
fo.set_preference("browser.safebrowsing.provider.google4.lists", "") # TP
|
||||
fo.set_preference("extensions.blocklist.enabled", False) # extensions
|
||||
fo.set_preference("security.OCSP.enabled", 0)
|
||||
# Firefox spells this pref "phishing"; the misspelled key was silently ignored.
prefs["browser.safebrowsing.phishing.enabled"] = False
|
||||
prefs["browser.safebrowsing.malware.enabled"] = False
|
||||
prefs["browser.safebrowsing.downloads.enabled"] = False
|
||||
prefs["browser.safebrowsing.downloads.remote.enabled"] = False
|
||||
prefs["browser.safebrowsing.blockedURIs.enabled"] = False
|
||||
prefs["browser.safebrowsing.provider.mozilla.gethashURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.google.gethashURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.google4.gethashURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.mozilla.updateURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.google.updateURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.google4.updateURL"] = ""
|
||||
prefs["browser.safebrowsing.provider.mozilla.lists"] = "" # TP
|
||||
prefs["browser.safebrowsing.provider.google.lists"] = "" # TP
|
||||
prefs["browser.safebrowsing.provider.google4.lists"] = "" # TP
|
||||
prefs["extensions.blocklist.enabled"] = False # extensions
|
||||
prefs["security.OCSP.enabled"] = 0
|
||||
|
||||
# Disable Content Decryption Module and OpenH264 related downloads
|
||||
fo.set_preference("media.gmp-manager.url", "")
|
||||
fo.set_preference("media.gmp-provider.enabled", False)
|
||||
fo.set_preference("media.gmp-widevinecdm.enabled", False)
|
||||
fo.set_preference("media.gmp-widevinecdm.visible", False)
|
||||
fo.set_preference("media.gmp-gmpopenh264.enabled", False)
|
||||
prefs["media.gmp-manager.url"] = ""
|
||||
prefs["media.gmp-provider.enabled"] = False
|
||||
prefs["media.gmp-widevinecdm.enabled"] = False
|
||||
prefs["media.gmp-widevinecdm.visible"] = False
|
||||
prefs["media.gmp-gmpopenh264.enabled"] = False
|
||||
|
||||
# Disable Experiments
|
||||
fo.set_preference("experiments.enabled", False)
|
||||
fo.set_preference("experiments.manifest.uri", "")
|
||||
fo.set_preference("experiments.supported", False)
|
||||
fo.set_preference("experiments.activeExperiment", False)
|
||||
fo.set_preference("network.allow-experiments", False)
|
||||
prefs["experiments.enabled"] = False
|
||||
prefs["experiments.manifest.uri"] = ""
|
||||
prefs["experiments.supported"] = False
|
||||
prefs["experiments.activeExperiment"] = False
|
||||
prefs["network.allow-experiments"] = False
|
||||
|
||||
# Disable pinging Mozilla for newtab
|
||||
fo.set_preference("browser.newtabpage.directory.ping", "")
|
||||
fo.set_preference("browser.newtabpage.directory.source", "")
|
||||
fo.set_preference("browser.newtabpage.enabled", False)
|
||||
fo.set_preference("browser.newtabpage.enhanced", False)
|
||||
fo.set_preference("browser.newtabpage.introShown", True)
|
||||
fo.set_preference("browser.aboutHomeSnippets.updateUrl", "")
|
||||
prefs["browser.newtabpage.directory.ping"] = ""
|
||||
prefs["browser.newtabpage.directory.source"] = ""
|
||||
prefs["browser.newtabpage.enabled"] = False
|
||||
prefs["browser.newtabpage.enhanced"] = False
|
||||
prefs["browser.newtabpage.introShown"] = True
|
||||
prefs["browser.aboutHomeSnippets.updateUrl"] = ""
|
||||
|
||||
# Disable Pocket
|
||||
fo.set_preference("extensions.pocket.enabled", False)
|
||||
prefs["extensions.pocket.enabled"] = False
|
||||
|
||||
# Disable Shield
|
||||
fo.set_preference("app.shield.optoutstudies.enabled", False)
|
||||
fo.set_preference("extensions.shield-recipe-client.enabled", False)
|
||||
prefs["app.shield.optoutstudies.enabled"] = False
|
||||
prefs["extensions.shield-recipe-client.enabled"] = False
|
||||
|
||||
# Disable Source Pragams
|
||||
# Disable Source Pragmas
|
||||
# As per https://bugzilla.mozilla.org/show_bug.cgi?id=1628853
|
||||
# sourceURL can be used to obfuscate the original origin of
|
||||
# a script, we disable it.
|
||||
fo.set_preference("javascript.options.source_pragmas", False)
|
||||
prefs["javascript.options.source_pragmas"] = False
|
||||
|
||||
# Enable extensions and disable extension signing
|
||||
fo.set_preference("extensions.experiments.enabled", True)
|
||||
fo.set_preference("xpinstall.signatures.required", False)
|
||||
prefs["extensions.experiments.enabled"] = True
|
||||
prefs["xpinstall.signatures.required"] = False
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import json
|
||||
import logging
|
||||
import os.path
|
||||
import socket
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
|
@ -8,7 +10,6 @@ from easyprocess import EasyProcessError
|
|||
from multiprocess import Queue
|
||||
from pyvirtualdisplay import Display
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
|
||||
|
||||
from ..commands.profile_commands import load_profile
|
||||
from ..config import BrowserParamsInternal, ConfigEncoder, ManagerParamsInternal
|
||||
|
@ -25,7 +26,7 @@ def deploy_firefox(
|
|||
browser_params: BrowserParamsInternal,
|
||||
manager_params: ManagerParamsInternal,
|
||||
crash_recovery: bool,
|
||||
) -> Tuple[webdriver.Firefox, str, Optional[Display]]:
|
||||
) -> Tuple[webdriver.Firefox, Path, Optional[Display]]:
|
||||
"""
|
||||
launches a firefox instance with parameters set by the input dictionary
|
||||
"""
|
||||
|
@ -33,14 +34,20 @@ def deploy_firefox(
|
|||
|
||||
root_dir = os.path.dirname(__file__) # directory of this file
|
||||
|
||||
fp = FirefoxProfile()
|
||||
browser_profile_path = Path(fp.path)
|
||||
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
|
||||
status_queue.put(("STATUS", "Profile Created", browser_profile_path))
|
||||
|
||||
# Use Options instead of FirefoxProfile to set preferences since the
|
||||
# Options method has no "frozen"/restricted options.
|
||||
# https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
|
||||
fo = Options()
|
||||
# Set a custom profile that is used in-place and is not deleted by geckodriver.
|
||||
# https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html
|
||||
# Using FirefoxProfile breaks stateful crawling:
|
||||
# https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093
|
||||
fo.add_argument("-profile")
|
||||
fo.add_argument(str(browser_profile_path))
|
||||
|
||||
assert browser_params.browser_id is not None
|
||||
if browser_params.seed_tar and not crash_recovery:
|
||||
logger.info(
|
||||
|
@ -110,16 +117,32 @@ def deploy_firefox(
|
|||
# TODO restore detailed logging
|
||||
# fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
|
||||
|
||||
# Geckodriver currently places the user.js file in the wrong profile
|
||||
# directory, so we have to create it manually here.
|
||||
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
|
||||
# to remove this workaround.
|
||||
# Load existing preferences from the profile's user.js file
|
||||
prefs = configure_firefox.load_existing_prefs(browser_profile_path)
|
||||
# Load default geckodriver preferences
|
||||
prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
|
||||
# Pick an available port for Marionette (https://stackoverflow.com/a/2838309)
|
||||
# This has a race condition, as another process may get the port
|
||||
# before Marionette, but we don't expect it to happen often
|
||||
s = socket.socket()
|
||||
s.bind(("", 0))
|
||||
marionette_port = s.getsockname()[1]
|
||||
s.close()
|
||||
prefs["marionette.port"] = marionette_port
|
||||
|
||||
# Configure privacy settings
|
||||
configure_firefox.privacy(browser_params, fp, fo, root_dir, browser_profile_path)
|
||||
configure_firefox.privacy(browser_params, prefs)
|
||||
|
||||
# Set various prefs to improve speed and eliminate traffic to Mozilla
|
||||
configure_firefox.optimize_prefs(fo)
|
||||
configure_firefox.optimize_prefs(prefs)
|
||||
|
||||
# Intercept logging at the Selenium level and redirect it to the
|
||||
# main logger. This will also inform us where the real profile
|
||||
# directory is hiding.
|
||||
interceptor = FirefoxLogInterceptor(browser_params.browser_id, browser_profile_path)
|
||||
# main logger.
|
||||
interceptor = FirefoxLogInterceptor(browser_params.browser_id)
|
||||
interceptor.start()
|
||||
|
||||
# Set custom prefs. These are set after all of the default prefs to allow
|
||||
|
@ -129,16 +152,21 @@ def deploy_firefox(
|
|||
"BROWSER %i: Setting custom preference: %s = %s"
|
||||
% (browser_params.browser_id, name, value)
|
||||
)
|
||||
fo.set_preference(name, value)
|
||||
prefs[name] = value
|
||||
|
||||
# Write all preferences to the profile's user.js file
|
||||
configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)
|
||||
|
||||
# Launch the webdriver
|
||||
status_queue.put(("STATUS", "Launch Attempted", None))
|
||||
fb = FirefoxBinary(firefox_path=firefox_binary_path)
|
||||
driver = webdriver.Firefox(
|
||||
firefox_profile=fp,
|
||||
firefox_binary=fb,
|
||||
firefox_options=fo,
|
||||
options=fo,
|
||||
log_path=interceptor.fifo,
|
||||
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
|
||||
# when to remove this
|
||||
service_args=["--marionette-port", str(marionette_port)],
|
||||
)
|
||||
|
||||
# Add extension
|
||||
|
@ -165,4 +193,4 @@ def deploy_firefox(
|
|||
|
||||
status_queue.put(("STATUS", "Browser Launched", int(pid)))
|
||||
|
||||
return driver, driver.capabilities["moz:profile"], display
|
||||
return driver, browser_profile_path, display
|
||||
|
|
|
@ -46,15 +46,13 @@ class FirefoxLogInterceptor(threading.Thread):
|
|||
"""
|
||||
Intercept logs from Selenium and/or geckodriver, using a named pipe
|
||||
and a detached thread, and feed them to the primary logger for this
|
||||
instance. Also responsible for extracting the _real_ profile location
|
||||
from geckodriver's log output (geckodriver copies the profile).
|
||||
instance.
|
||||
"""
|
||||
|
||||
def __init__(self, browser_id, profile_path):
|
||||
def __init__(self, browser_id):
|
||||
threading.Thread.__init__(self, name="log-interceptor-%i" % browser_id)
|
||||
self.browser_id = browser_id
|
||||
self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_")
|
||||
self.profile_path = profile_path
|
||||
self.daemon = True
|
||||
self.logger = logging.getLogger("openwpm")
|
||||
|
||||
|
@ -68,11 +66,6 @@ class FirefoxLogInterceptor(threading.Thread):
|
|||
self.logger.debug(
|
||||
"BROWSER %i: driver: %s" % (self.browser_id, line.strip())
|
||||
)
|
||||
if "Using profile path" in line:
|
||||
self.profile_path = line.partition("Using profile path")[
|
||||
-1
|
||||
].strip()
|
||||
|
||||
if self.fifo is not None:
|
||||
os.unlink(self.fifo)
|
||||
self.fifo = None
|
||||
|
@ -83,7 +76,7 @@ class FirefoxLogInterceptor(threading.Thread):
|
|||
self.fifo = None
|
||||
|
||||
|
||||
class PatchedGeckoDriverService(BaseService):
|
||||
class PatchedGeckoDriverService(FirefoxDriverModule.Service):
|
||||
"""Object that manages the starting and stopping of the GeckoDriver.
|
||||
Modified from the original (selenium.webdriver.firefox.service.Service)
|
||||
for Py3 compat in the presence of log FIFOs, and for potential future
|
||||
|
@ -128,11 +121,5 @@ class PatchedGeckoDriverService(BaseService):
|
|||
)
|
||||
self.service_args = service_args or []
|
||||
|
||||
def command_line_args(self):
|
||||
return ["--port", "%d" % self.port]
|
||||
|
||||
def send_remote_shutdown_command(self):
|
||||
pass
|
||||
|
||||
|
||||
FirefoxDriverModule.Service = PatchedGeckoDriverService
|
||||
|
|
|
@ -305,8 +305,8 @@ class TaskManager:
|
|||
Parameters
|
||||
----------
|
||||
during_init :
|
||||
flag to indicator if this shutdown is occuring during
|
||||
the TaskManager initialization
|
||||
flag to indicate if this shutdown is occurring during
|
||||
the TaskManager initialization
|
||||
relaxed :
|
||||
If `True` the function will wait for all active
|
||||
`CommandSequences` to finish before shutting down
|
||||
|
@ -434,17 +434,6 @@ class TaskManager:
|
|||
assert browser.browser_id is not None
|
||||
assert browser.curr_visit_id is not None
|
||||
reset = command_sequence.reset
|
||||
if not reset:
|
||||
self.logger.warning(
|
||||
"BROWSER %i: Browser will not reset after CommandSequence "
|
||||
"executes. OpenWPM does not currently support stateful crawls "
|
||||
"(see: https://github.com/mozilla/OpenWPM/projects/2). "
|
||||
"The next command issued to this browser may or may not "
|
||||
"use the same profile (depending on the failure status of "
|
||||
"this command). To prevent this warning, initialize the "
|
||||
"CommandSequence with `reset` set to `True` to use a fresh "
|
||||
"profile for each command." % browser.browser_id
|
||||
)
|
||||
self.logger.info(
|
||||
"Starting to work on CommandSequence with "
|
||||
"visit_id %d on browser with id %d",
|
||||
|
|
|
@ -24,13 +24,17 @@ EXTENSION_DIR = os.path.join(
|
|||
pytest_plugins = "test.storage.fixtures"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def xpi():
    """Build a new extension xpi by running ``npm run build`` in EXTENSION_DIR.

    Raises ``subprocess.CalledProcessError`` if the build command exits with a
    non-zero status.
    """
    build_command = ["npm", "run", "build"]
    print("Building new xpi")
    subprocess.check_call(build_command, cwd=EXTENSION_DIR)
|
||||
|
||||
|
||||
@pytest.fixture(name="xpi", scope="session")
def xpi_fixture():
    """Session-scoped fixture, exposed to tests as ``xpi``, that runs ``xpi()``."""
    build_result = xpi()
    return build_result
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def server():
|
||||
"""Run an HTTP server during the tests."""
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
import atexit
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from os.path import dirname, join, realpath
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import IPython
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
|
||||
from openwpm import js_instrumentation as jsi
|
||||
from openwpm.config import BrowserParams
|
||||
|
@ -88,7 +92,7 @@ def start_webdriver(
|
|||
Set to True to load browser_params
|
||||
browser_params_file : string
|
||||
Specify the browser_params.json to load.
|
||||
If None, default params form openwpm/config.py::BrowserParams will be loaded.
|
||||
If None, default params from openwpm/config.py::BrowserParams will be loaded.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
@ -110,16 +114,35 @@ def start_webdriver(
|
|||
print("...server shutdown")
|
||||
driver.quit()
|
||||
print("...webdriver closed")
|
||||
shutil.rmtree(driver.capabilities["moz:profile"], ignore_errors=True)
|
||||
print("...browser profile removed")
|
||||
|
||||
atexit.register(cleanup_server)
|
||||
return driver
|
||||
|
||||
fp = webdriver.FirefoxProfile()
|
||||
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
|
||||
fo = Options()
|
||||
fo.add_argument("-profile")
|
||||
fo.add_argument(str(browser_profile_path))
|
||||
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
|
||||
# to remove manually creating user.js
|
||||
prefs = configure_firefox.load_existing_prefs(browser_profile_path)
|
||||
prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
|
||||
|
||||
if with_extension:
|
||||
# TODO: Restore preference for log level in a way that works in Fx 57+
|
||||
# fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
|
||||
configure_firefox.optimize_prefs(fp)
|
||||
driver = webdriver.Firefox(firefox_binary=fb, firefox_profile=fp)
|
||||
configure_firefox.optimize_prefs(prefs)
|
||||
|
||||
configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)
|
||||
driver = webdriver.Firefox(
|
||||
firefox_binary=fb,
|
||||
options=fo,
|
||||
# Use the default Marionette port.
|
||||
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
|
||||
# when to remove this
|
||||
service_args=["--marionette-port", "2828"],
|
||||
)
|
||||
if load_browser_params is True:
|
||||
# There's probably more we could do here
|
||||
# to set more preferences and better emulate
|
||||
|
@ -134,8 +157,7 @@ def start_webdriver(
|
|||
js_request_as_string = jsi.clean_js_instrumentation_settings(js_request)
|
||||
browser_params.js_instrument_settings = js_request_as_string
|
||||
|
||||
profile_dir = driver.capabilities["moz:profile"]
|
||||
with open(join(profile_dir, "browser_params.json"), "w") as f:
|
||||
with open(browser_profile_path / "browser_params.json", "w") as f:
|
||||
f.write(browser_params.to_json())
|
||||
|
||||
if with_extension:
|
||||
|
@ -192,9 +214,9 @@ def start_webext():
|
|||
"--browser-params-file",
|
||||
help="""
|
||||
Specify a browser_params.json file. If none provided and
|
||||
--browser-params is enabled. Default browser_params.json
|
||||
will be used. Pass an absolute path or a path relative
|
||||
to the test directory.""",
|
||||
--browser-params is enabled the default params from
|
||||
openwpm/config.py::BrowserParams will be loaded. Pass an
|
||||
absolute path or a path relative to the test directory.""",
|
||||
)
|
||||
def main(selenium, no_extension, browser_params, browser_params_file):
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from .utilities import BASE_TEST_URL
|
|||
|
||||
|
||||
def test_local_callbacks(default_params, task_manager_creator):
|
||||
"""Test test the storage controller as well as the entire callback machinery
|
||||
"""Test the storage controller as well as the entire callback machinery
|
||||
to see if all callbacks get correctly called"""
|
||||
manager, _ = task_manager_creator(default_params)
|
||||
TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
|
||||
|
@ -17,7 +17,7 @@ def test_local_callbacks(default_params, task_manager_creator):
|
|||
|
||||
my_list: List[int] = []
|
||||
sequence = CommandSequence(
|
||||
TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
|
||||
TEST_SITE, blocking=True, callback=partial(callback, my_list)
|
||||
)
|
||||
sequence.get()
|
||||
|
||||
|
|
|
@ -1,18 +1,19 @@
|
|||
# type:ignore
|
||||
# As this file is no longer maintained, mypy shouldn't check this
|
||||
"""Runs a short test crawl.
|
||||
|
||||
This should be used to test any features that require real crawl data.
|
||||
This should be avoided if possible, as controlled tests will be easier
|
||||
to debug.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
import domain_utils as du
|
||||
import pytest
|
||||
|
||||
from openwpm.config import BrowserParams, ManagerParams
|
||||
from openwpm.utilities import db_utils
|
||||
|
||||
from .openwpmtest import OpenWPMTest
|
||||
|
||||
TEST_SITES = [
|
||||
"http://google.com",
|
||||
"http://facebook.com",
|
||||
|
@ -37,122 +38,105 @@ TEST_SITES = [
|
|||
]
|
||||
|
||||
|
||||
def get_public_suffix(url):
|
||||
url_parts = du.hostname_subparts(url, include_ps=True)
|
||||
return url_parts[-1]
|
||||
@pytest.mark.skipif(
    "CI" not in os.environ or os.environ["CI"] == "false",
    reason="Makes remote connections",
)
@pytest.mark.slow
def test_browser_profile_coverage(default_params, task_manager_creator):
    """Test the coverage of the browser's profile.

    This verifies that Firefox's places.sqlite database contains all
    visited sites (with a few exceptions). If it does not, it is likely
    the profile is lost at some point during the crawl.
    """
    # Run the test crawl with a single browser
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].profile_archive_dir = (
        manager_params.data_directory / "browser_profile"
    )
    browser_params[0].http_instrument = True
    manager, crawl_db = task_manager_creator((manager_params, browser_params[:1]))
    for site in TEST_SITES:
        manager.get(site)
    manager.close()

    # Extract crawl profile
    ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz"
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0].profile_archive_dir)

    # Output databases
    ff_db = browser_params[0].profile_archive_dir / "places.sqlite"

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for (url,) in rows:
        req_ps.add(du.get_ps_plus_1(url))

    hist_ps = set()  # visited domains from crawl_history Table
    statuses = dict()  # command status of the GetCommand, keyed by domain
    rows = db_utils.query_db(
        crawl_db,
        "SELECT arguments, command_status FROM crawl_history WHERE"
        " command='GetCommand'",
    )
    for arguments, command_status in rows:
        url = json.loads(arguments)["url"]
        ps = du.get_ps_plus_1(url)
        hist_ps.add(ps)
        statuses[ps] = command_status

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for (host,) in rows:
        try:
            profile_ps.add(du.get_ps_plus_1(host))
        except AttributeError:
            pass

    # We expect a url to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        # Look the status up in the per-domain dict; `command_status` is only
        # the leftover loop variable from the crawl_history query above.
        if statuses[url] != "ok":
            continue

        # Get the visit id for the url
        rows = db_utils.query_db(
            crawl_db,
            "SELECT visit_id FROM site_visits WHERE site_url = ?",
            ("http://" + url,),
        )
        # Each row is a one-column record; take the scalar so it can be
        # bound as a query parameter below.
        visit_id = rows[0][0]

        rows = db_utils.query_db(
            crawl_db,
            "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?",
            (visit_id,),
        )
        # Compare the count itself, not the row that wraps it.
        if rows[0][0] > 1:
            continue

        rows = db_utils.query_db(
            crawl_db,
            "SELECT response_status, location FROM "
            "http_responses WHERE visit_id = ?",
            (visit_id,),
        )
        response_status, location = rows[0]
        if response_status == 204:
            continue
        if location == "http://":  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0
|
||||
|
|
|
@ -1,86 +1,86 @@
|
|||
from os.path import isfile, join
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from openwpm.command_sequence import CommandSequence
|
||||
from openwpm.commands.types import BaseCommand
|
||||
from openwpm.config import BrowserParams, ManagerParams
|
||||
from openwpm.errors import CommandExecutionError, ProfileLoadError
|
||||
from openwpm.task_manager import TaskManager
|
||||
from openwpm.utilities import db_utils
|
||||
|
||||
from .openwpmtest import OpenWPMTest
|
||||
from .utilities import BASE_TEST_URL
|
||||
|
||||
# TODO update these tests to make use of blocking commands
|
||||
|
||||
|
||||
class TestProfile(OpenWPMTest):
    """Tests covering the archiving and loading of browser profiles.

    Each test builds its configuration through ``get_config`` so that the
    first browser archives its profile under
    ``<data_directory>/browser_profile``.
    """

    def get_config(
        self, data_dir: Optional[Path] = None
    ) -> Tuple[ManagerParams, List[BrowserParams]]:
        """Return test parameters with profile archiving enabled.

        ``data_dir`` defaults to ``None`` because every test in this class
        calls ``self.get_config()`` without arguments; without the default
        those calls would raise ``TypeError``.

        NOTE(review): ``get_test_config`` is defined outside this block; it is
        assumed to accept ``None`` as the ``Optional`` annotation suggests.
        """
        manager_params, browser_params = self.get_test_config(data_dir)
        browser_params[0].profile_archive_dir = join(
            manager_params.data_directory, "browser_profile"
        )
        return manager_params, browser_params

    # xfail(run=False): the test is reported as an expected failure
    # without being executed.
    @pytest.mark.xfail(run=False)
    def test_saving(self):
        """A profile archive exists after visiting a page and closing."""
        manager_params, browser_params = self.get_config()
        manager = TaskManager(manager_params, browser_params)
        manager.get("http://example.com")
        manager.close()
        assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))

    @pytest.mark.xfail(run=False)
    def test_crash_profile(self):
        """A profile archive exists after repeated failing commands.

        The scheme-less URLs are intended to make the commands fail; a
        ``CommandExecutionError`` raised along the way is tolerated.
        """
        manager_params, browser_params = self.get_config()
        manager_params.failure_limit = 2
        manager = TaskManager(manager_params, browser_params)
        try:
            manager.get("http://example.com")  # So we have a profile
            manager.get("example.com")  # Selenium requires scheme prefix
            manager.get("example.com")  # Selenium requires scheme prefix
            manager.get("example.com")  # Selenium requires scheme prefix
            manager.get("example.com")  # Requires two commands to shut down
        except CommandExecutionError:
            pass
        assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))

    @pytest.mark.xfail(run=False)
    def test_profile_error(self):
        """Creating a TaskManager with a bogus ``seed_tar`` must raise."""
        manager_params, browser_params = self.get_config()
        browser_params[0].seed_tar = "/tmp/NOTREAL"
        with pytest.raises(ProfileLoadError):
            TaskManager(manager_params, browser_params)  # noqa

    @pytest.mark.skip(reason="proxy no longer supported, need to update")
    def test_profile_saved_when_launch_crashes(self):
        """A profile archive exists even when the browser cannot relaunch."""
        manager_params, browser_params = self.get_config()
        browser_params[0].proxy = True
        browser_params[0].save_content = "script"
        manager = TaskManager(manager_params, browser_params)
        manager.get("http://example.com")

        # Kill the LevelDBAggregator
        # This will cause the proxy launch to crash
        manager.ldb_status_queue.put("DIE")
        manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
        manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
        manager.get("example.com")  # Cause a selenium crash

        # The browser will fail to launch due to the proxy crashes
        try:
            manager.get("http://example.com")
        except CommandExecutionError:
            pass
        manager.close()
        assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
|
||||
def test_saving(default_params, task_manager_creator):
    """Visiting a page and closing the manager leaves a profile archive.

    Runs a single browser whose ``profile_archive_dir`` is
    ``<data_directory>/browser_profile`` and checks that ``profile.tar.gz``
    is a file in that directory once the manager has been closed.
    """
    manager_params, browser_params = default_params
    first_browser = browser_params[0]

    manager_params.num_browsers = 1
    first_browser.profile_archive_dir = (
        manager_params.data_directory / "browser_profile"
    )

    # Only the first browser's parameters are handed to the manager.
    manager, _unused = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL)
    manager.close()

    archive = first_browser.profile_archive_dir / "profile.tar.gz"
    assert archive.is_file()
|
||||
|
||||
|
||||
def test_seed_persistance(default_params, task_manager_creator):
|
||||
def test_crash_profile(default_params, task_manager_creator):
    """A profile archive exists after a run of failing commands.

    With ``failure_limit`` set to 2, one valid visit is followed by several
    scheme-less URLs that are meant to fail. A ``CommandExecutionError``
    raised at any point of the sequence is tolerated; the test then checks
    that ``profile.tar.gz`` is a file in the archive directory.
    """
    manager_params, browser_params = default_params
    first_browser = browser_params[0]

    manager_params.num_browsers = 1
    manager_params.failure_limit = 2
    first_browser.profile_archive_dir = (
        manager_params.data_directory / "browser_profile"
    )
    manager, _unused = task_manager_creator((manager_params, browser_params[:1]))

    visits = (
        BASE_TEST_URL,  # So we have a profile
        "example.com",  # Selenium requires scheme prefix
        "example.com",  # Selenium requires scheme prefix
        "example.com",  # Selenium requires scheme prefix
        "example.com",  # Requires two commands to shut down
    )
    try:
        for url in visits:
            manager.get(url)
    except CommandExecutionError:
        pass

    archive = first_browser.profile_archive_dir / "profile.tar.gz"
    assert archive.is_file()
|
||||
|
||||
|
||||
def test_profile_error(default_params, task_manager_creator):
    """Creating a manager with a bogus ``seed_tar`` raises ProfileLoadError."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1

    # Path that is not expected to exist on the test machine.
    bogus_seed = Path("/tmp/NOTREAL")
    browser_params[0].seed_tar = bogus_seed

    with pytest.raises(ProfileLoadError):
        task_manager_creator((manager_params, browser_params[:1]))
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="proxy no longer supported, need to update")
def test_profile_saved_when_launch_crashes(default_params, task_manager_creator):
    """A profile archive exists even when the browser fails to relaunch.

    Currently skipped. The scenario: visit a page, kill the
    LevelDBAggregator so the proxy launch crashes, shorten the spawn limits,
    trigger a selenium crash, then attempt another visit (tolerating
    ``CommandExecutionError``) and close the manager.
    """
    manager_params, browser_params = default_params
    first_browser = browser_params[0]

    manager_params.num_browsers = 1
    first_browser.profile_archive_dir = (
        manager_params.data_directory / "browser_profile"
    )
    first_browser.proxy = True
    first_browser.save_content = "script"

    manager, _unused = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL)

    # Kill the LevelDBAggregator
    # This will cause the proxy launch to crash
    manager.ldb_status_queue.put("DIE")
    launched = manager.browsers[0]
    launched._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
    launched._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
    manager.get("example.com")  # Cause a selenium crash

    # The browser will fail to launch due to the proxy crashes
    try:
        manager.get(BASE_TEST_URL)
    except CommandExecutionError:
        pass
    manager.close()

    archive = first_browser.profile_archive_dir / "profile.tar.gz"
    assert archive.is_file()
|
||||
|
||||
|
||||
def test_seed_persistence(default_params, task_manager_creator):
|
||||
manager_params, browser_params = default_params
|
||||
p = Path("profile.tar.gz")
|
||||
for browser_param in browser_params:
|
||||
|
@ -89,7 +89,7 @@ def test_seed_persistance(default_params, task_manager_creator):
|
|||
|
||||
command_sequences = []
|
||||
for _ in range(2):
|
||||
cs = CommandSequence(url="https://example.com", reset=True)
|
||||
cs = CommandSequence(url=BASE_TEST_URL)
|
||||
cs.get()
|
||||
cs.append_command(AssertConfigSetCommand("test_pref", True))
|
||||
command_sequences.append(cs)
|
||||
|
|
Загрузка…
Ссылка в новой задаче