Merge pull request #864 from boolean5/restore-stateful-crawls

Restore stateful crawling support
Steven Englehardt 2021-03-29 11:34:05 -04:00 committed by GitHub
Parents 358c8a7337 37271ba62d
Commit bfc4644a71
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
16 changed files: 599 additions and 506 deletions

View file

@ -65,7 +65,6 @@ with TaskManager(
command_sequence = CommandSequence(
site,
site_rank=index,
reset=True,
callback=callback,
)
@ -74,5 +73,5 @@ with TaskManager(
# Have a look at custom_command.py to see how to implement your own command
command_sequence.append_command(LinkCountingCommand())
# Run commands across the three browsers (simple parallelization)
# Run commands across all browsers (simple parallelization)
manager.execute_command_sequence(command_sequence)
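With reset=True removed from the demo, the sequence above now runs statefully by default. A minimal sketch of the two modes (not part of the diff), assuming the demo's TaskManager is available as manager and that reset remains a per-sequence option:

# Stateful visit (default): cookies and storage persist to the next visit.
stateful = CommandSequence("https://example.com")
stateful.get()
manager.execute_command_sequence(stateful)

# Stateless visit: reset=True clears the browser profile after this visit.
stateless = CommandSequence("https://example.com", reset=True)
stateless.get()
manager.execute_command_sequence(stateless)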

View file

@ -249,11 +249,6 @@ TODO
# Browser Profile Support
**WARNING: Stateful crawls are currently not supported. Attempts to run
stateful crawls will throw `NotImplementedError`s. The work required to
restore support is tracked in
[this project](https://github.com/mozilla/OpenWPM/projects/2).**
## Stateful vs Stateless crawls
By default OpenWPM performs a "stateful" crawl, in that it keeps a consistent
@ -329,7 +324,6 @@ but will not be used during crash recovery. Specifically:
profile specified by `seed_tar`. If OpenWPM determines that Firefox needs to
restart for some reason during the crawl, it will use the profile from
the most recent page visit (pre-crash) rather than the `seed_tar` profile.
Note that stateful crawls are currently [unsupported](https://github.com/mozilla/OpenWPM/projects/2)).
* For stateless crawls, the initial `seed_tar` will be loaded during each
new page visit. Note that this means the profile will very likely be
_incomplete_, as cookies or storage may have been set or changed during the
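A minimal configuration sketch of the profile options described above (not part of the diff), assuming ManagerParams and BrowserParams can be constructed with their dataclass defaults; all paths are placeholders:

from pathlib import Path
from openwpm.config import BrowserParams, ManagerParams

manager_params = ManagerParams()
manager_params.num_browsers = 1
browser_params = [BrowserParams()]
# Load this tar into the fresh profile at browser launch; a stateful crawl
# then keeps using the evolving profile for subsequent visits.
browser_params[0].seed_tar = Path("profile.tar.gz")
# Archive the final profile as profile.tar.gz on clean shutdown.
browser_params[0].profile_archive_dir = Path("datadir/browser_profile")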

View file

@ -1,6 +1,6 @@
# Release Checklist
We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release
We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release:
1. Upgrade Firefox to the newest version.
1. Go to: https://hg.mozilla.org/releases/mozilla-release/tags.
@ -11,10 +11,11 @@ We aim to release a new version of OpenWPM with each new Firefox release (~1 rel
2. Run `npm update` in `openwpm/Extension/webext-instrumentation`.
3. Run `npm update` in the base directory
3. Update python and system dependencies by following the ["managing requirements" instructions](../CONTRIBUTING.md#managing-requirements).
4. Increment the version number in [VERSION](../VERSION)
5. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md)
6. Squash and merge the release PR to master.
7. Publish a new release from https://github.com/mozilla/OpenWPM/releases:
4. If a new version of geckodriver is used, check whether the default geckodriver browser preferences in [`openwpm/deploy_browsers/configure_firefox.py`](../openwpm/deploy_browsers/configure_firefox.py#L8L65) need to be updated.
5. Increment the version number in [VERSION](../VERSION)
6. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md)
7. Squash and merge the release PR to master.
8. Publish a new release from https://github.com/mozilla/OpenWPM/releases:
1. Click "Draft a new release".
2. Enter the "Tag version" and "Release title" as `vX.X.X`.
3. In the description:

View file

@ -5,9 +5,11 @@ import pickle
import shutil
import signal
import sys
import tempfile
import threading
import time
import traceback
from pathlib import Path
from queue import Empty as EmptyQueue
from typing import Optional, Union
@ -16,6 +18,7 @@ from multiprocess import Queue
from selenium.common.exceptions import WebDriverException
from tblib import pickling_support
from .commands.profile_commands import dump_profile
from .commands.types import BaseCommand, ShutdownSignal
from .config import BrowserParamsInternal, ManagerParamsInternal
from .deploy_browsers import deploy_firefox
@ -33,7 +36,7 @@ pickling_support.install()
class Browser:
"""
The Browser class is responsbile for holding all of the
The Browser class is responsible for holding all of the
configuration and status information on BrowserManager process
it corresponds to. It also includes a set of methods for managing
the BrowserManager process and its child processes/threads.
@ -52,7 +55,7 @@ class Browser:
self._UNSUCCESSFUL_SPAWN_LIMIT = 4
# manager parameters
self.current_profile_path = None
self.current_profile_path: Optional[Path] = None
self.db_socket_address = manager_params.storage_controller_address
assert browser_params.browser_id is not None
self.browser_id: BrowserId = browser_params.browser_id
@ -62,7 +65,7 @@ class Browser:
# Queues and process IDs for BrowserManager
# thread to run commands issues from TaskManager
# thread to run commands issued from TaskManager
self.command_thread: Optional[threading.Thread] = None
# queue for passing command tuples to BrowserManager
self.command_queue: Optional[Queue] = None
@ -75,7 +78,7 @@ class Browser:
# the port of the display for the Xvfb display (if it exists)
self.display_port: Optional[int] = None
# boolean that says if the BrowserManager new (to optimize restarts)
# boolean that says if the BrowserManager is new (to optimize restarts)
self.is_fresh = True
# boolean indicating if the browser should be restarted
self.restart_required = False
@ -97,29 +100,29 @@ class Browser:
sets up the BrowserManager and gets the process id, browser pid and,
if applicable, screen pid. loads associated user profile if necessary
"""
# Unsupported. See https://github.com/mozilla/OpenWPM/projects/2
# if this is restarting from a crash, update the tar location
# to be a tar of the crashed browser's history
"""
if self.current_profile_path is not None:
# tar contents of crashed profile to a temp dir
tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
profile_commands.dump_profile(
self.current_profile_path,
self.manager_params,
self.browser_params,
tempdir,
close_webdriver=False,
tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_")
tar_path = Path(tempdir) / "profile.tar"
dump_profile(
browser_profile_path=self.current_profile_path,
tar_path=tar_path,
compress=False,
browser_params=self.browser_params,
)
# make sure browser loads crashed profile
self.browser_params.recovery_tar = tempdir
self.browser_params.recovery_tar = tar_path
crash_recovery = True
else:
"""
tempdir = None
crash_recovery = False
self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
tempdir = None
crash_recovery = False
self.is_fresh = not crash_recovery
# Try to spawn the browser within the timelimit
@ -159,8 +162,8 @@ class Browser:
# Read success status of browser manager
launch_status = dict()
try:
# 1. Selenium profile created
spawned_profile_path = check_queue(launch_status)
# 1. Browser profile created
browser_profile_path = check_queue(launch_status)
# 2. Profile tar loaded (if necessary)
check_queue(launch_status)
# 3. Display launched (if necessary)
@ -170,7 +173,7 @@ class Browser:
# 5. Browser launched
self.geckodriver_pid = check_queue(launch_status)
(driver_profile_path, ready) = check_queue(launch_status)
ready = check_queue(launch_status)
if ready != "READY":
self.logger.error(
"BROWSER %i: Mismatch of status queue return values, "
@ -183,7 +186,6 @@ class Browser:
unsuccessful_spawns += 1
error_string = ""
status_strings = [
"Proxy Ready",
"Profile Created",
"Profile Tar",
"Display",
@ -202,17 +204,15 @@ class Browser:
)
self.close_browser_manager()
if "Profile Created" in launch_status:
shutil.rmtree(spawned_profile_path, ignore_errors=True)
shutil.rmtree(browser_profile_path, ignore_errors=True)
# If the browser spawned successfully, we should update the
# current profile path class variable and clean up the tempdir
# and previous profile path.
if success:
self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.browser_id)
self.logger.debug("BROWSER %i: Browser spawn successful!" % self.browser_id)
previous_profile_path = self.current_profile_path
self.current_profile_path = driver_profile_path
if driver_profile_path != spawned_profile_path:
shutil.rmtree(spawned_profile_path, ignore_errors=True)
self.current_profile_path = browser_profile_path
if previous_profile_path is not None:
shutil.rmtree(previous_profile_path, ignore_errors=True)
if tempdir is not None:
@ -360,7 +360,7 @@ class Browser:
os.kill(self.display_pid, signal.SIGKILL)
except OSError:
self.logger.debug(
"BROWSER %i: Display process does not " "exit" % self.browser_id
"BROWSER %i: Display process does not exit" % self.browser_id
)
pass
except TypeError:
@ -368,7 +368,7 @@ class Browser:
"BROWSER %i: PID may not be the correct "
"type %s" % (self.browser_id, str(self.display_pid))
)
if self.display_port is not None: # xvfb diplay lock
if self.display_port is not None: # xvfb display lock
lockfile = "/tmp/.X%s-lock" % self.display_port
try:
os.remove(lockfile)
@ -394,33 +394,27 @@ class Browser:
self.close_browser_manager(force=force)
# Archive browser profile (if requested)
if not during_init and self.browser_params.profile_archive_dir is not None:
self.logger.warning(
"BROWSER %i: Archiving the browser profile directory is "
"currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2" % self.browser_id
)
"""
self.logger.debug(
"BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
self.browser_id, str(during_init),
self.browser_params.profile_archive_dir)
)
if (not during_init and
self.browser_params.profile_archive_dir is not None):
self.logger.debug(
"BROWSER %i: Archiving browser profile directory to %s" % (
self.browser_id,
self.browser_params.profile_archive_dir))
profile_commands.dump_profile(
self.current_profile_path,
self.manager_params,
self.browser_params,
"BROWSER %i: during_init=%s | profile_archive_dir=%s"
% (
self.browser_id,
str(during_init),
self.browser_params.profile_archive_dir,
close_webdriver=False,
compress=True
)
"""
)
if not during_init and self.browser_params.profile_archive_dir is not None:
self.logger.debug(
"BROWSER %i: Archiving browser profile directory to %s"
% (self.browser_id, self.browser_params.profile_archive_dir)
)
tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz"
assert self.current_profile_path is not None
dump_profile(
browser_profile_path=self.current_profile_path,
tar_path=tar_path,
compress=True,
browser_params=self.browser_params,
)
# Clean up temporary files
if self.current_profile_path is not None:
@ -441,22 +435,20 @@ def BrowserManager(
display = None
try:
# Start Xvfb (if necessary), webdriver, and browser
driver, prof_folder, display = deploy_firefox.deploy_firefox(
driver, browser_profile_path, display = deploy_firefox.deploy_firefox(
status_queue, browser_params, manager_params, crash_recovery
)
if prof_folder[-1] != "/":
prof_folder += "/"
# Read the extension port -- if extension is enabled
# TODO: Initial communication from extension to TM should use sockets
if browser_params.extension_enabled:
logger.debug(
"BROWSER %i: Looking for extension port information "
"in %s" % (browser_params.browser_id, prof_folder)
"in %s" % (browser_params.browser_id, browser_profile_path)
)
elapsed = 0
port = None
ep_filename = os.path.join(prof_folder, "extension_port.txt")
ep_filename = browser_profile_path / "extension_port.txt"
while elapsed < 5:
try:
with open(ep_filename, "rt") as f:
@ -483,10 +475,9 @@ def BrowserManager(
logger.debug("BROWSER %i: BrowserManager ready." % browser_params.browser_id)
# passes the profile folder back to the
# TaskManager to signal a successful startup
status_queue.put(("STATUS", "Browser Ready", (prof_folder, "READY")))
browser_params.profile_path = prof_folder
# passes "READY" to the TaskManager to signal a successful startup
status_queue.put(("STATUS", "Browser Ready", "READY"))
browser_params.profile_path = browser_profile_path
# starts accepting arguments until told to die
while True:
@ -498,12 +489,6 @@ def BrowserManager(
command: Union[ShutdownSignal, BaseCommand] = command_queue.get()
if type(command) is ShutdownSignal:
# Geckodriver creates a copy of the profile (and the original
# temp file created by FirefoxProfile() is deleted).
# We clear the profile attribute here to prevent prints from:
# https://github.com/SeleniumHQ/selenium/blob/4e4160dd3d2f93757cafb87e2a1c20d6266f5554/py/selenium/webdriver/firefox/webdriver.py#L193-L199
if driver.profile and not os.path.isdir(driver.profile.path):
driver.profile = None
driver.quit()
status_queue.put("OK")
return

View file

@ -1,3 +1,4 @@
from pathlib import Path
from typing import Callable, List, Tuple
from .commands.browser_commands import (
@ -10,6 +11,7 @@ from .commands.browser_commands import (
SaveScreenshotCommand,
ScreenshotFullPageCommand,
)
from .commands.profile_commands import DumpProfileCommand
from .commands.types import BaseCommand
from .errors import CommandExecutionError
@ -18,7 +20,7 @@ class CommandSequence:
"""A CommandSequence wraps a series of commands to be performed
on a visit to one top-level site into one logical
"site visit," keyed by a visit id. An example of a CommandSequence
that visits a page and dumps cookies modified on that visit would be:
that visits a page and saves a screenshot of it would be:
sequence = CommandSequence(url)
sequence.get()
@ -87,15 +89,15 @@ class CommandSequence:
self.contains_get_or_browse = True
def dump_profile(
self, dump_folder, close_webdriver=False, compress=True, timeout=120
):
self,
tar_path: Path,
close_webdriver: bool = False,
compress: bool = True,
timeout: int = 120,
) -> None:
""" dumps from the profile path to a given file (absolute path) """
raise NotImplementedError(
"Profile saving is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2."
)
self.total_timeout += timeout
command = DumpProfCommand(dump_folder, close_webdriver, compress)
command = DumpProfileCommand(tar_path, close_webdriver, compress)
self._commands_with_timeout.append((command, timeout))
def save_screenshot(self, suffix="", timeout=30):
@ -103,7 +105,7 @@ class CommandSequence:
self.total_timeout += timeout
if not self.contains_get_or_browse:
raise CommandExecutionError(
"No get or browse request preceding " "the save screenshot command",
"No get or browse request preceding the save screenshot command",
self,
)
command = SaveScreenshotCommand(suffix)
@ -131,7 +133,7 @@ class CommandSequence:
self.total_timeout += timeout
if not self.contains_get_or_browse:
raise CommandExecutionError(
"No get or browse request preceding " "the dump page source command",
"No get or browse request preceding the screenshot full page command",
self,
)
command = ScreenshotFullPageCommand(suffix)
@ -142,7 +144,7 @@ class CommandSequence:
self.total_timeout += timeout
if not self.contains_get_or_browse:
raise CommandExecutionError(
"No get or browse request preceding " "the dump page source command",
"No get or browse request preceding the dump page source command",
self,
)
command = DumpPageSourceCommand(suffix)
@ -171,7 +173,8 @@ class CommandSequence:
self.total_timeout += timeout
if not self.contains_get_or_browse:
raise CommandExecutionError(
"No get or browse request preceding " "the dump page source command",
"No get or browse request preceding the recursive dump"
" page source command",
self,
)
command = RecursiveDumpPageSourceCommand(suffix)
@ -188,7 +191,6 @@ class CommandSequence:
"""Returns a list of all commands in the command_sequence
appended by a finalize command
"""
commands = list(self._commands_with_timeout)
commands.insert(0, (InitializeCommand(), 10))
commands.append((FinalizeCommand(sleep=5), 10))
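A hedged usage sketch of the restored dump_profile command (not part of the diff), assuming an already-configured TaskManager named manager; the tar path is a placeholder:

from pathlib import Path
from openwpm.command_sequence import CommandSequence

sequence = CommandSequence("https://example.com")
sequence.get()
# Dump the profile after the visit; compress=True produces a gzipped tar.
sequence.dump_profile(tar_path=Path("/tmp/profile.tar.gz"), compress=True)
manager.execute_command_sequence(sequence)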

View file

@ -15,23 +15,91 @@ from .utils.firefox_profile import sleep_until_sqlite_checkpoint
logger = logging.getLogger("openwpm")
def dump_profile(
browser_profile_path: Path,
tar_path: Path,
compress: bool,
browser_params: BrowserParamsInternal,
) -> None:
"""Dumps a browser profile to a tar file."""
assert browser_params.browser_id is not None
# Creating the folders if need be
tar_path.parent.mkdir(exist_ok=True, parents=True)
# see if this file exists first
# if it does, delete it before we try to save the current session
if tar_path.exists():
tar_path.unlink()
# backup and tar profile
if compress:
tar = tarfile.open(tar_path, "w:gz", errorlevel=1)
else:
tar = tarfile.open(tar_path, "w", errorlevel=1)
logger.debug(
"BROWSER %i: Backing up full profile from %s to %s"
% (browser_params.browser_id, browser_profile_path, tar_path)
)
storage_vector_files = [
"cookies.sqlite", # cookies
"cookies.sqlite-shm",
"cookies.sqlite-wal",
"places.sqlite", # history
"places.sqlite-shm",
"places.sqlite-wal",
"webappsstore.sqlite", # localStorage
"webappsstore.sqlite-shm",
"webappsstore.sqlite-wal",
]
storage_vector_dirs = [
"webapps", # related to localStorage?
"storage", # directory for IndexedDB
]
for item in storage_vector_files:
full_path = browser_profile_path / item
if (
not full_path.is_file()
and not full_path.name.endswith("shm")
and not full_path.name.endswith("wal")
):
logger.critical(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (browser_params.browser_id, full_path)
)
elif not full_path.is_file() and (
full_path.name.endswith("shm") or full_path.name.endswith("wal")
):
continue # These are just checkpoint files
tar.add(full_path, arcname=item)
for item in storage_vector_dirs:
full_path = browser_profile_path / item
if not full_path.is_dir():
logger.warning(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (browser_params.browser_id, full_path)
)
continue
tar.add(full_path, arcname=item)
tar.close()
class DumpProfileCommand(BaseCommand):
"""
Dumps a browser profile currently stored in <browser_profile_folder> to
<tar_path>
Dumps a browser profile currently stored in <browser_params.profile_path> to
<tar_path>.
"""
def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None:
def __init__(
self, tar_path: Path, close_webdriver: bool, compress: bool = True
) -> None:
self.tar_path = tar_path
self.close_webdriver = close_webdriver
self.compress = compress
raise NotImplementedError(
"Profile dumping is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2."
)
def __repr__(self) -> str:
return "DumpProfCommand({},{},{})".format(
return "DumpProfileCommand({},{},{})".format(
self.tar_path, self.close_webdriver, self.compress
)
@ -42,110 +110,40 @@ class DumpProfileCommand(BaseCommand):
manager_params: ManagerParamsInternal,
extension_socket: ClientSocket,
) -> None:
browser_profile_folder = browser_params.profile_path
assert browser_profile_folder is not None
# Creating the folders if need be
self.tar_path.parent.mkdir(exist_ok=True, parents=True)
# see if this file exists first
# if it does, delete it before we try to save the current session
if self.tar_path.exists():
self.tar_path.unlink() # IDK why it's called like this
# if this is a dump on close, close the webdriver and wait for checkpoint
if self.close_webdriver:
webdriver.close()
sleep_until_sqlite_checkpoint(browser_profile_folder)
sleep_until_sqlite_checkpoint(browser_params.profile_path)
# backup and tar profile
if self.compress:
tar = tarfile.open(self.tar_path, "w:gz", errorlevel=1)
else:
tar = tarfile.open(self.tar_path, "w", errorlevel=1)
logger.debug(
"BROWSER %i: Backing up full profile from %s to %s"
% (
self.browser_id,
browser_profile_folder,
self.tar_path,
)
assert browser_params.profile_path is not None
dump_profile(
browser_params.profile_path,
self.tar_path,
self.compress,
browser_params,
)
storage_vector_files = [
"cookies.sqlite", # cookies
"cookies.sqlite-shm",
"cookies.sqlite-wal",
"places.sqlite", # history
"places.sqlite-shm",
"places.sqlite-wal",
"webappsstore.sqlite", # localStorage
"webappsstore.sqlite-shm",
"webappsstore.sqlite-wal",
]
storage_vector_dirs = [
"webapps", # related to localStorage?
"storage", # directory for IndexedDB
]
for item in storage_vector_files:
full_path = browser_profile_folder / item
if (
not full_path.is_file()
and not full_path.name.endswith("shm")
and not full_path.name.endswith("wal")
):
logger.critical(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (self.browser_id, full_path)
)
elif not full_path.is_file() and (
full_path.name.endswith("shm") or full_path.name.endswith("wal")
):
continue # These are just checkpoint files
tar.add(full_path, arcname=item)
for item in storage_vector_dirs:
full_path = browser_profile_folder / item
if not full_path.is_dir():
logger.warning(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (self.browser_id, full_path)
)
continue
tar.add(full_path, arcname=item)
tar.close()
def load_profile(
browser_profile_folder: Path,
browser_profile_path: Path,
manager_params: ManagerParamsInternal,
browser_params: BrowserParamsInternal,
tar_path: Path,
) -> None:
"""
loads a zipped cookie-based profile stored at <tar_location> and
unzips it to <browser_profile_folder>.
The tar will remain unmodified.
Loads a zipped cookie-based profile stored at <tar_path> and unzips
it to <browser_profile_path>. The tar will remain unmodified.
"""
assert tar_path.is_file()
assert browser_params.browser_id is not None
try:
# Copy and untar the loaded profile
logger.debug(
"BROWSER %i: Copying profile tar from %s to %s"
% (
browser_params.browser_id,
tar_path,
browser_profile_folder,
)
)
shutil.copy(tar_path, browser_profile_folder)
tar_path = browser_profile_folder / tar_path.name
assert tar_path.is_file()
# Untar the loaded profile
if tar_path.name.endswith("tar.gz"):
f = tarfile.open(tar_path, "r:gz", errorlevel=1)
else:
f = tarfile.open(tar_path, "r", errorlevel=1)
f.extractall(browser_profile_folder)
f.extractall(browser_profile_path)
f.close()
tar_path.unlink()
logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id)
except Exception as ex:
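For reference, a sketch of calling the module-level helpers above directly (not part of the diff), assuming browser_params and manager_params stand in for the internal config objects OpenWPM builds for a running browser, and that the profile directory exists:

from pathlib import Path
from openwpm.commands.profile_commands import dump_profile, load_profile

profile_dir = Path("/tmp/firefox_profile_example")  # placeholder
tar_path = Path("/tmp/profile.tar.gz")              # placeholder

# Archive the current profile contents (cookies, history, storage).
dump_profile(
    browser_profile_path=profile_dir,
    tar_path=tar_path,
    compress=True,
    browser_params=browser_params,  # placeholder BrowserParamsInternal
)

# Later, unpack the same tar into a fresh profile directory.
load_profile(
    browser_profile_path=Path("/tmp/firefox_profile_new"),  # placeholder
    manager_params=manager_params,  # placeholder ManagerParamsInternal
    browser_params=browser_params,
    tar_path=tar_path,
)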

View file

@ -97,7 +97,9 @@ class BrowserParams(DataClassJsonMixin):
prefs: dict = field(default_factory=dict)
tp_cookies: str = "always"
bot_mitigation: bool = False
profile_archive_dir: Optional[str] = None
profile_archive_dir: Optional[Path] = field(
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
)
recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False

View file

@ -1,7 +1,105 @@
""" Set prefs and load extensions in Firefox """
import json
import re
from pathlib import Path
from typing import Any, Dict
def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
from ..config import BrowserParams
# TODO: Remove hardcoded geckodriver default preferences. See
# https://github.com/mozilla/OpenWPM/issues/867
# Source of preferences:
# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/prefs.rs
# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/marionette.rs
DEFAULT_GECKODRIVER_PREFS = {
"app.normandy.api_url": "",
"app.update.checkInstallTime": False,
"app.update.disabledForTesting": True,
"app.update.auto": False,
"browser.dom.window.dump.enabled": True,
"devtools.console.stdout.chrome": True,
"browser.safebrowsing.blockedURIs.enabled": False,
"browser.safebrowsing.downloads.enabled": False,
"browser.safebrowsing.passwords.enabled": False,
"browser.safebrowsing.malware.enabled": False,
"browser.safebrowsing.phishing.enabled": False,
"browser.sessionstore.resume_from_crash": False,
"browser.shell.checkDefaultBrowser": False,
"browser.startup.homepage_override.mstone": "ignore",
"browser.startup.page": 0,
"browser.tabs.closeWindowWithLastTab": False,
"browser.tabs.warnOnClose": False,
"browser.uitour.enabled": False,
"browser.warnOnQuit": False,
"datareporting.healthreport.documentServerURI": "http://%(server)s/dummy/healthreport/",
"datareporting.healthreport.logging.consoleEnabled": False,
"datareporting.healthreport.service.enabled": False,
"datareporting.healthreport.service.firstRun": False,
"datareporting.healthreport.uploadEnabled": False,
"datareporting.policy.dataSubmissionEnabled": False,
"datareporting.policy.dataSubmissionPolicyBypassNotification": True,
"dom.ipc.reportProcessHangs": False,
"extensions.autoDisableScopes": 0,
"extensions.enabledScopes": 5,
"extensions.installDistroAddons": False,
"extensions.update.enabled": False,
"extensions.update.notifyUser": False,
"focusmanager.testmode": True,
"general.useragent.updates.enabled": False,
"geo.provider.testing": True,
"geo.wifi.scan": False,
"hangmonitor.timeout": 0,
"idle.lastDailyNotification": -1,
"javascript.options.showInConsole": True,
"media.gmp-manager.updateEnabled": False,
"media.sanity-test.disabled": True,
"network.http.phishy-userpass-length": 255,
"network.manage-offline-status": False,
"network.sntp.pools": "%(server)s",
"plugin.state.flash": 0,
"security.certerrors.mitm.priming.enabled": False,
"services.settings.server": "http://%(server)s/dummy/blocklist/",
"startup.homepage_welcome_url": "about:blank",
"startup.homepage_welcome_url.additional": "",
"toolkit.startup.max_resumed_crashes": -1,
"marionette.log.level": "Info",
}
def load_existing_prefs(browser_profile_path: Path) -> Dict[str, Any]:
"""Load existing user preferences.
If the browser profile contains a user.js file, load the preferences
specified inside it into a dictionary.
"""
prefs: Dict[str, Any] = {}
prefs_path = browser_profile_path / "user.js"
if not prefs_path.is_file():
return prefs
# Regular expression from https://stackoverflow.com/a/24563687
r = re.compile(r"\s*user_pref\(([\"'])(.+?)\1,\s*(.+?)\);")
with open(prefs_path, "r") as f:
for line in f:
m = r.match(line)
if m:
key, value = m.group(2), m.group(3)
prefs[key] = json.loads(value)
return prefs
def save_prefs_to_profile(prefs: Dict[str, Any], browser_profile_path: Path) -> None:
"""Save all preferences to the browser profile.
Write preferences from the prefs dictionary to a user.js file in the
profile directory.
"""
with open(browser_profile_path / "user.js", "w") as f:
for key, value in prefs.items():
f.write('user_pref("%s", %s);\n' % (key, json.dumps(value)))
def privacy(browser_params: BrowserParams, prefs: Dict[str, Any]) -> None:
"""
Configure the privacy settings in Firefox. This includes:
* DNT
@ -12,15 +110,15 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
# Turns on Do Not Track
if browser_params.donottrack:
fo.set_preference("privacy.donottrackheader.enabled", True)
prefs["privacy.donottrackheader.enabled"] = True
# Sets the third party cookie setting
if browser_params.tp_cookies.lower() == "never":
fo.set_preference("network.cookie.cookieBehavior", 1)
prefs["network.cookie.cookieBehavior"] = 1
elif browser_params.tp_cookies.lower() == "from_visited":
fo.set_preference("network.cookie.cookieBehavior", 3)
prefs["network.cookie.cookieBehavior"] = 3
else: # always allow third party cookies
fo.set_preference("network.cookie.cookieBehavior", 0)
prefs["network.cookie.cookieBehavior"] = 0
# Tracking Protection
if browser_params.tracking_protection:
@ -31,7 +129,7 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path):
)
def optimize_prefs(fo):
def optimize_prefs(prefs: Dict[str, Any]) -> None:
"""
Disable various features and checks the browser will do on startup.
Some of these (e.g. disabling the newtab page) are required to prevent
@ -42,113 +140,113 @@ def optimize_prefs(fo):
* https://github.com/pyllyukko/user.js/blob/master/user.js
""" # noqa
# Startup / Speed
fo.set_preference("browser.shell.checkDefaultBrowser", False)
fo.set_preference("browser.slowStartup.notificationDisabled", True)
fo.set_preference("browser.slowStartup.maxSamples", 0)
fo.set_preference("browser.slowStartup.samples", 0)
fo.set_preference("extensions.checkCompatibility.nightly", False)
fo.set_preference("browser.rights.3.shown", True)
fo.set_preference("reader.parse-on-load.enabled", False)
fo.set_preference("browser.pagethumbnails.capturing_disabled", True)
fo.set_preference("browser.uitour.enabled", False)
fo.set_preference("dom.flyweb.enabled", False)
prefs["browser.shell.checkDefaultBrowser"] = False
prefs["browser.slowStartup.notificationDisabled"] = True
prefs["browser.slowStartup.maxSamples"] = 0
prefs["browser.slowStartup.samples"] = 0
prefs["extensions.checkCompatibility.nightly"] = False
prefs["browser.rights.3.shown"] = True
prefs["reader.parse-on-load.enabled"] = False
prefs["browser.pagethumbnails.capturing_disabled"] = True
prefs["browser.uitour.enabled"] = False
prefs["dom.flyweb.enabled"] = False
# Disable health reports / telemetry / crash reports
fo.set_preference("datareporting.policy.dataSubmissionEnabled", False)
fo.set_preference("datareporting.healthreport.uploadEnabled", False)
fo.set_preference("datareporting.healthreport.service.enabled", False)
fo.set_preference("toolkit.telemetry.archive.enabled", False)
fo.set_preference("toolkit.telemetry.enabled", False)
fo.set_preference("toolkit.telemetry.unified", False)
fo.set_preference("breakpad.reportURL", "")
fo.set_preference("dom.ipc.plugins.reportCrashURL", False)
fo.set_preference("browser.selfsupport.url", "")
fo.set_preference("browser.tabs.crashReporting.sendReport", False)
fo.set_preference("browser.crashReports.unsubmittedCheck.enabled", False)
fo.set_preference("dom.ipc.plugins.flash.subprocess.crashreporter.enabled", False)
prefs["datareporting.policy.dataSubmissionEnabled"] = False
prefs["datareporting.healthreport.uploadEnabled"] = False
prefs["datareporting.healthreport.service.enabled"] = False
prefs["toolkit.telemetry.archive.enabled"] = False
prefs["toolkit.telemetry.enabled"] = False
prefs["toolkit.telemetry.unified"] = False
prefs["breakpad.reportURL"] = ""
prefs["dom.ipc.plugins.reportCrashURL"] = False
prefs["browser.selfsupport.url"] = ""
prefs["browser.tabs.crashReporting.sendReport"] = False
prefs["browser.crashReports.unsubmittedCheck.enabled"] = False
prefs["dom.ipc.plugins.flash.subprocess.crashreporter.enabled"] = False
# Predictive Actions / Prefetch
fo.set_preference("network.predictor.enabled", False)
fo.set_preference("network.dns.disablePrefetch", True)
fo.set_preference("network.prefetch-next", False)
fo.set_preference("browser.search.suggest.enabled", False)
fo.set_preference("network.http.speculative-parallel-limit", 0)
fo.set_preference("keyword.enabled", False) # location bar using search
fo.set_preference("browser.urlbar.userMadeSearchSuggestionsChoice", True)
fo.set_preference("browser.casting.enabled", False)
prefs["network.predictor.enabled"] = False
prefs["network.dns.disablePrefetch"] = True
prefs["network.prefetch-next"] = False
prefs["browser.search.suggest.enabled"] = False
prefs["network.http.speculative-parallel-limit"] = 0
prefs["keyword.enabled"] = False # location bar using search
prefs["browser.urlbar.userMadeSearchSuggestionsChoice"] = True
prefs["browser.casting.enabled"] = False
# Disable pinging Mozilla for geoip
fo.set_preference("browser.search.geoip.url", "")
fo.set_preference("browser.search.countryCode", "US")
fo.set_preference("browser.search.region", "US")
prefs["browser.search.geoip.url"] = ""
prefs["browser.search.countryCode"] = "US"
prefs["browser.search.region"] = "US"
# Disable pinging Mozilla for geo-specific search
fo.set_preference("browser.search.geoSpecificDefaults", False)
fo.set_preference("browser.search.geoSpecificDefaults.url", "")
prefs["browser.search.geoSpecificDefaults"] = False
prefs["browser.search.geoSpecificDefaults.url"] = ""
# Disable auto-updating
fo.set_preference("app.update.enabled", False) # browser
fo.set_preference("app.update.url", "") # browser
fo.set_preference("browser.search.update", False) # search
fo.set_preference("extensions.update.enabled", False) # extensions
fo.set_preference("extensions.update.autoUpdateDefault", False)
fo.set_preference("extensions.getAddons.cache.enabled", False)
fo.set_preference("lightweightThemes.update.enabled", False) # Personas
prefs["app.update.enabled"] = False # browser
prefs["app.update.url"] = "" # browser
prefs["browser.search.update"] = False # search
prefs["extensions.update.enabled"] = False # extensions
prefs["extensions.update.autoUpdateDefault"] = False
prefs["extensions.getAddons.cache.enabled"] = False
prefs["lightweightThemes.update.enabled"] = False # Personas
# Disable Safebrowsing and other security features
# that require on remote content
fo.set_preference("browser.safebrowsing.phising.enabled", False)
fo.set_preference("browser.safebrowsing.malware.enabled", False)
fo.set_preference("browser.safebrowsing.downloads.enabled", False)
fo.set_preference("browser.safebrowsing.downloads.remote.enabled", False)
fo.set_preference("browser.safebrowsing.blockedURIs.enabled", False)
fo.set_preference("browser.safebrowsing.provider.mozilla.gethashURL", "")
fo.set_preference("browser.safebrowsing.provider.google.gethashURL", "")
fo.set_preference("browser.safebrowsing.provider.google4.gethashURL", "")
fo.set_preference("browser.safebrowsing.provider.mozilla.updateURL", "")
fo.set_preference("browser.safebrowsing.provider.google.updateURL", "")
fo.set_preference("browser.safebrowsing.provider.google4.updateURL", "")
fo.set_preference("browser.safebrowsing.provider.mozilla.lists", "") # TP
fo.set_preference("browser.safebrowsing.provider.google.lists", "") # TP
fo.set_preference("browser.safebrowsing.provider.google4.lists", "") # TP
fo.set_preference("extensions.blocklist.enabled", False) # extensions
fo.set_preference("security.OCSP.enabled", 0)
prefs["browser.safebrowsing.phising.enabled"] = False
prefs["browser.safebrowsing.malware.enabled"] = False
prefs["browser.safebrowsing.downloads.enabled"] = False
prefs["browser.safebrowsing.downloads.remote.enabled"] = False
prefs["browser.safebrowsing.blockedURIs.enabled"] = False
prefs["browser.safebrowsing.provider.mozilla.gethashURL"] = ""
prefs["browser.safebrowsing.provider.google.gethashURL"] = ""
prefs["browser.safebrowsing.provider.google4.gethashURL"] = ""
prefs["browser.safebrowsing.provider.mozilla.updateURL"] = ""
prefs["browser.safebrowsing.provider.google.updateURL"] = ""
prefs["browser.safebrowsing.provider.google4.updateURL"] = ""
prefs["browser.safebrowsing.provider.mozilla.lists"] = "" # TP
prefs["browser.safebrowsing.provider.google.lists"] = "" # TP
prefs["browser.safebrowsing.provider.google4.lists"] = "" # TP
prefs["extensions.blocklist.enabled"] = False # extensions
prefs["security.OCSP.enabled"] = 0
# Disable Content Decryption Module and OpenH264 related downloads
fo.set_preference("media.gmp-manager.url", "")
fo.set_preference("media.gmp-provider.enabled", False)
fo.set_preference("media.gmp-widevinecdm.enabled", False)
fo.set_preference("media.gmp-widevinecdm.visible", False)
fo.set_preference("media.gmp-gmpopenh264.enabled", False)
prefs["media.gmp-manager.url"] = ""
prefs["media.gmp-provider.enabled"] = False
prefs["media.gmp-widevinecdm.enabled"] = False
prefs["media.gmp-widevinecdm.visible"] = False
prefs["media.gmp-gmpopenh264.enabled"] = False
# Disable Experiments
fo.set_preference("experiments.enabled", False)
fo.set_preference("experiments.manifest.uri", "")
fo.set_preference("experiments.supported", False)
fo.set_preference("experiments.activeExperiment", False)
fo.set_preference("network.allow-experiments", False)
prefs["experiments.enabled"] = False
prefs["experiments.manifest.uri"] = ""
prefs["experiments.supported"] = False
prefs["experiments.activeExperiment"] = False
prefs["network.allow-experiments"] = False
# Disable pinging Mozilla for newtab
fo.set_preference("browser.newtabpage.directory.ping", "")
fo.set_preference("browser.newtabpage.directory.source", "")
fo.set_preference("browser.newtabpage.enabled", False)
fo.set_preference("browser.newtabpage.enhanced", False)
fo.set_preference("browser.newtabpage.introShown", True)
fo.set_preference("browser.aboutHomeSnippets.updateUrl", "")
prefs["browser.newtabpage.directory.ping"] = ""
prefs["browser.newtabpage.directory.source"] = ""
prefs["browser.newtabpage.enabled"] = False
prefs["browser.newtabpage.enhanced"] = False
prefs["browser.newtabpage.introShown"] = True
prefs["browser.aboutHomeSnippets.updateUrl"] = ""
# Disable Pocket
fo.set_preference("extensions.pocket.enabled", False)
prefs["extensions.pocket.enabled"] = False
# Disable Shield
fo.set_preference("app.shield.optoutstudies.enabled", False)
fo.set_preference("extensions.shield-recipe-client.enabled", False)
prefs["app.shield.optoutstudies.enabled"] = False
prefs["extensions.shield-recipe-client.enabled"] = False
# Disable Source Pragams
# Disable Source Pragmas
# As per https://bugzilla.mozilla.org/show_bug.cgi?id=1628853
# sourceURL can be used to obfuscate the original origin of
# a script, we disable it.
fo.set_preference("javascript.options.source_pragmas", False)
prefs["javascript.options.source_pragmas"] = False
# Enable extensions and disable extension signing
fo.set_preference("extensions.experiments.enabled", True)
fo.set_preference("xpinstall.signatures.required", False)
prefs["extensions.experiments.enabled"] = True
prefs["xpinstall.signatures.required"] = False

View file

@ -1,6 +1,8 @@
import json
import logging
import os.path
import socket
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
@ -8,7 +10,6 @@ from easyprocess import EasyProcessError
from multiprocess import Queue
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from ..commands.profile_commands import load_profile
from ..config import BrowserParamsInternal, ConfigEncoder, ManagerParamsInternal
@ -25,7 +26,7 @@ def deploy_firefox(
browser_params: BrowserParamsInternal,
manager_params: ManagerParamsInternal,
crash_recovery: bool,
) -> Tuple[webdriver.Firefox, str, Optional[Display]]:
) -> Tuple[webdriver.Firefox, Path, Optional[Display]]:
"""
launches a firefox instance with parameters set by the input dictionary
"""
@ -33,14 +34,20 @@ def deploy_firefox(
root_dir = os.path.dirname(__file__) # directory of this file
fp = FirefoxProfile()
browser_profile_path = Path(fp.path)
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
status_queue.put(("STATUS", "Profile Created", browser_profile_path))
# Use Options instead of FirefoxProfile to set preferences since the
# Options method has no "frozen"/restricted options.
# https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
fo = Options()
# Set a custom profile that is used in-place and is not deleted by geckodriver.
# https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html
# Using FirefoxProfile breaks stateful crawling:
# https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093
fo.add_argument("-profile")
fo.add_argument(str(browser_profile_path))
assert browser_params.browser_id is not None
if browser_params.seed_tar and not crash_recovery:
logger.info(
@ -110,16 +117,32 @@ def deploy_firefox(
# TODO restore detailed logging
# fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
# Geckodriver currently places the user.js file in the wrong profile
# directory, so we have to create it manually here.
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
# to remove this workaround.
# Load existing preferences from the profile's user.js file
prefs = configure_firefox.load_existing_prefs(browser_profile_path)
# Load default geckodriver preferences
prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
# Pick an available port for Marionette (https://stackoverflow.com/a/2838309)
# This has a race condition, as another process may get the port
# before Marionette, but we don't expect it to happen often
s = socket.socket()
s.bind(("", 0))
marionette_port = s.getsockname()[1]
s.close()
prefs["marionette.port"] = marionette_port
# Configure privacy settings
configure_firefox.privacy(browser_params, fp, fo, root_dir, browser_profile_path)
configure_firefox.privacy(browser_params, prefs)
# Set various prefs to improve speed and eliminate traffic to Mozilla
configure_firefox.optimize_prefs(fo)
configure_firefox.optimize_prefs(prefs)
# Intercept logging at the Selenium level and redirect it to the
# main logger. This will also inform us where the real profile
# directory is hiding.
interceptor = FirefoxLogInterceptor(browser_params.browser_id, browser_profile_path)
# main logger.
interceptor = FirefoxLogInterceptor(browser_params.browser_id)
interceptor.start()
# Set custom prefs. These are set after all of the default prefs to allow
@ -129,16 +152,21 @@ def deploy_firefox(
"BROWSER %i: Setting custom preference: %s = %s"
% (browser_params.browser_id, name, value)
)
fo.set_preference(name, value)
prefs[name] = value
# Write all preferences to the profile's user.js file
configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)
# Launch the webdriver
status_queue.put(("STATUS", "Launch Attempted", None))
fb = FirefoxBinary(firefox_path=firefox_binary_path)
driver = webdriver.Firefox(
firefox_profile=fp,
firefox_binary=fb,
firefox_options=fo,
options=fo,
log_path=interceptor.fifo,
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
# when to remove this
service_args=["--marionette-port", str(marionette_port)],
)
# Add extension
@ -165,4 +193,4 @@ def deploy_firefox(
status_queue.put(("STATUS", "Browser Launched", int(pid)))
return driver, driver.capabilities["moz:profile"], display
return driver, browser_profile_path, display
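Outside of OpenWPM, the profile-handling change above boils down to the following condensed sketch, assuming Selenium 3.x with geckodriver on PATH; directory names and port handling follow the diff:

import socket
import tempfile
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Create the profile directory ourselves and run Firefox on it in place,
# so geckodriver neither copies nor deletes it.
profile_dir = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
options = Options()
options.add_argument("-profile")
options.add_argument(str(profile_dir))

# Pick a free port for Marionette (racy, as the diff notes).
s = socket.socket()
s.bind(("", 0))
marionette_port = s.getsockname()[1]
s.close()

# geckodriver does not write prefs into an in-place profile, so set the
# Marionette port in user.js ourselves.
with open(profile_dir / "user.js", "w") as f:
    f.write('user_pref("marionette.port", %d);\n' % marionette_port)

driver = webdriver.Firefox(
    options=options,
    service_args=["--marionette-port", str(marionette_port)],
)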

View file

@ -46,15 +46,13 @@ class FirefoxLogInterceptor(threading.Thread):
"""
Intercept logs from Selenium and/or geckodriver, using a named pipe
and a detached thread, and feed them to the primary logger for this
instance. Also responsible for extracting the _real_ profile location
from geckodriver's log output (geckodriver copies the profile).
instance.
"""
def __init__(self, browser_id, profile_path):
def __init__(self, browser_id):
threading.Thread.__init__(self, name="log-interceptor-%i" % browser_id)
self.browser_id = browser_id
self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_")
self.profile_path = profile_path
self.daemon = True
self.logger = logging.getLogger("openwpm")
@ -68,11 +66,6 @@ class FirefoxLogInterceptor(threading.Thread):
self.logger.debug(
"BROWSER %i: driver: %s" % (self.browser_id, line.strip())
)
if "Using profile path" in line:
self.profile_path = line.partition("Using profile path")[
-1
].strip()
if self.fifo is not None:
os.unlink(self.fifo)
self.fifo = None
@ -83,7 +76,7 @@ class FirefoxLogInterceptor(threading.Thread):
self.fifo = None
class PatchedGeckoDriverService(BaseService):
class PatchedGeckoDriverService(FirefoxDriverModule.Service):
"""Object that manages the starting and stopping of the GeckoDriver.
Modified from the original (selenium.webdriver.firefox.service.Service)
for Py3 compat in the presence of log FIFOs, and for potential future
@ -128,11 +121,5 @@ class PatchedGeckoDriverService(BaseService):
)
self.service_args = service_args or []
def command_line_args(self):
return ["--port", "%d" % self.port]
def send_remote_shutdown_command(self):
pass
FirefoxDriverModule.Service = PatchedGeckoDriverService

View file

@ -305,8 +305,8 @@ class TaskManager:
Parameters
----------
during_init :
flag to indicator if this shutdown is occuring during
the TaskManager initialization
flag to indicate if this shutdown is occuring during
the TaskManager initialization
relaxed :
If `True` the function will wait for all active
`CommandSequences` to finish before shutting down
@ -434,17 +434,6 @@ class TaskManager:
assert browser.browser_id is not None
assert browser.curr_visit_id is not None
reset = command_sequence.reset
if not reset:
self.logger.warning(
"BROWSER %i: Browser will not reset after CommandSequence "
"executes. OpenWPM does not currently support stateful crawls "
"(see: https://github.com/mozilla/OpenWPM/projects/2). "
"The next command issued to this browser may or may not "
"use the same profile (depending on the failure status of "
"this command). To prevent this warning, initialize the "
"CommandSequence with `reset` set to `True` to use a fresh "
"profile for each command." % browser.browser_id
)
self.logger.info(
"Starting to work on CommandSequence with "
"visit_id %d on browser with id %d",

View file

@ -24,13 +24,17 @@ EXTENSION_DIR = os.path.join(
pytest_plugins = "test.storage.fixtures"
@pytest.fixture(scope="session")
def xpi():
# Creates a new xpi using npm run build.
print("Building new xpi")
subprocess.check_call(["npm", "run", "build"], cwd=EXTENSION_DIR)
@pytest.fixture(name="xpi", scope="session")
def xpi_fixture():
return xpi()
@pytest.fixture(scope="session")
def server():
"""Run an HTTP server during the tests."""

View file

@ -1,11 +1,15 @@
import atexit
import shutil
import subprocess
import tempfile
from os.path import dirname, join, realpath
from pathlib import Path
import click
import IPython
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from openwpm import js_instrumentation as jsi
from openwpm.config import BrowserParams
@ -88,7 +92,7 @@ def start_webdriver(
Set to True to load browser_params
browser_params_file : string
Specify the browser_params.json to load.
If None, default params form openwpm/config.py::BrowserParams will be loaded.
If None, default params from openwpm/config.py::BrowserParams will be loaded.
Returns
-------
@ -110,16 +114,35 @@ def start_webdriver(
print("...server shutdown")
driver.quit()
print("...webdriver closed")
shutil.rmtree(driver.capabilities["moz:profile"], ignore_errors=True)
print("...browser profile removed")
atexit.register(cleanup_server)
return driver
fp = webdriver.FirefoxProfile()
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
fo = Options()
fo.add_argument("-profile")
fo.add_argument(str(browser_profile_path))
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
# to remove manually creating user.js
prefs = configure_firefox.load_existing_prefs(browser_profile_path)
prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
if with_extension:
# TODO: Restore preference for log level in a way that works in Fx 57+
# fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
configure_firefox.optimize_prefs(fp)
driver = webdriver.Firefox(firefox_binary=fb, firefox_profile=fp)
configure_firefox.optimize_prefs(prefs)
configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)
driver = webdriver.Firefox(
firefox_binary=fb,
options=fo,
# Use the default Marionette port.
# TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
# when to remove this
service_args=["--marionette-port", "2828"],
)
if load_browser_params is True:
# There's probably more we could do here
# to set more preferences and better emulate
@ -134,8 +157,7 @@ def start_webdriver(
js_request_as_string = jsi.clean_js_instrumentation_settings(js_request)
browser_params.js_instrument_settings = js_request_as_string
profile_dir = driver.capabilities["moz:profile"]
with open(join(profile_dir, "browser_params.json"), "w") as f:
with open(browser_profile_path / "browser_params.json", "w") as f:
f.write(browser_params.to_json())
if with_extension:
@ -192,9 +214,9 @@ def start_webext():
"--browser-params-file",
help="""
Specify a browser_params.json file. If none provided and
--browser-params is enabled. Default browser_params.json
will be used. Pass an absolute path or a path relative
to the test directory.""",
--browser-params is enabled the default params from
openwpm/config.py::BrowserParams will be loaded. Pass an
absolute path or a path relative to the test directory.""",
)
def main(selenium, no_extension, browser_params, browser_params_file):

View file

@ -7,7 +7,7 @@ from .utilities import BASE_TEST_URL
def test_local_callbacks(default_params, task_manager_creator):
"""Test test the storage controller as well as the entire callback machinery
"""Test the storage controller as well as the entire callback machinery
to see if all callbacks get correctly called"""
manager, _ = task_manager_creator(default_params)
TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
@ -17,7 +17,7 @@ def test_local_callbacks(default_params, task_manager_creator):
my_list: List[int] = []
sequence = CommandSequence(
TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
TEST_SITE, blocking=True, callback=partial(callback, my_list)
)
sequence.get()

View file

@ -1,18 +1,19 @@
# type:ignore
# As this file is no longer maintained, mypy shouldn't check this
"""Runs a short test crawl.
This should be used to test any features that require real crawl data.
This should be avoided if possible, as controlled tests will be easier
to debug.
"""
import json
import os
import tarfile
from pathlib import Path
from typing import List, Tuple
import domain_utils as du
import pytest
from openwpm.config import BrowserParams, ManagerParams
from openwpm.utilities import db_utils
from .openwpmtest import OpenWPMTest
TEST_SITES = [
"http://google.com",
"http://facebook.com",
@ -37,122 +38,105 @@ TEST_SITES = [
]
def get_public_suffix(url):
url_parts = du.hostname_subparts(url, include_ps=True)
return url_parts[-1]
@pytest.mark.skipif(
"CI" not in os.environ or os.environ["CI"] == "false",
reason="Makes remote connections",
)
@pytest.mark.slow
def test_browser_profile_coverage(default_params, task_manager_creator):
"""Test the coverage of the browser's profile.
class TestCrawl(OpenWPMTest):
"""Runs a short test crawl.
This should be used to test any features that require real
crawl data. This should be avoided if possible, as controlled
tests will be easier to debug
This verifies that Firefox's places.sqlite database contains all
visited sites (with a few exceptions). If it does not, it is likely
the profile is lost at some point during the crawl.
"""
# Run the test crawl
manager_params, browser_params = default_params
manager_params.num_browsers = 1
browser_params[0].profile_archive_dir = (
manager_params.data_directory / "browser_profile"
)
browser_params[0].http_instrument = True
manager, crawl_db = task_manager_creator((manager_params, browser_params[:1]))
for site in TEST_SITES:
manager.get(site)
manager.close()
def get_config(
self, data_dir: Path = None
) -> Tuple[ManagerParams, List[BrowserParams]]:
manager_params, browser_params = self.get_test_config(data_dir)
browser_params[0].profile_archive_dir = os.path.join(
manager_params.data_directory, "browser_profile"
)
browser_params[0].http_instrument = True
return manager_params, browser_params
# Extract crawl profile
ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz"
with tarfile.open(ff_db_tar) as tar:
tar.extractall(browser_params[0].profile_archive_dir)
@pytest.mark.xfail(run=False)
@pytest.mark.slow
def test_browser_profile_coverage(self, tmpdir: Path, task_manager_creator) -> None:
"""Test the coverage of the browser's profile
# Output databases
ff_db = browser_params[0].profile_archive_dir / "places.sqlite"
This verifies that Firefox's places.sqlite database contains
all visited sites (with a few exceptions). If it does not,
it is likely the profile is lost at some point during the crawl
"""
# Run the test crawl
data_dir = tmpdir / "data_dir"
manager_params, browser_params = self.get_config(data_dir)
manager, crawl_db = task_manager_creator((manager_params, browser_params))
for site in TEST_SITES:
manager.get(site)
ff_db_tar = os.path.join(
browser_params[0].profile_archive_dir, "profile.tar.gz"
)
manager.close()
# Grab urls from crawl database
rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
req_ps = set() # visited domains from http_requests table
for (url,) in rows:
req_ps.add(du.get_ps_plus_1(url))
# Extract crawl profile
with tarfile.open(ff_db_tar) as tar:
tar.extractall(browser_params[0].profile_archive_dir)
hist_ps = set() # visited domains from crawl_history Table
statuses = dict()
rows = db_utils.query_db(
crawl_db,
"SELECT arguments, command_status FROM crawl_history WHERE"
" command='GetCommand'",
)
for arguments, command_status in rows:
url = json.loads(arguments)["url"]
ps = du.get_ps_plus_1(url)
hist_ps.add(ps)
statuses[ps] = command_status
# Output databases
ff_db = os.path.join(browser_params[0].profile_archive_dir, "places.sqlite")
# Grab urls from Firefox database
profile_ps = set() # visited domains from firefox profile
rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
for (host,) in rows:
try:
profile_ps.add(du.get_ps_plus_1(host))
except AttributeError:
pass
# Grab urls from crawl database
rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
req_ps = set() # visited domains from http_requests table
for (url,) in rows:
req_ps.add(get_public_suffix(url))
# We expect a url to be in the Firefox profile if:
# 1. We've made requests to it
# 2. The url is a top_url we entered into the address bar
# 3. The url successfully loaded (see: Issue #40)
# 4. The site does not respond to the initial request with a 204
# (won't show in FF DB)
missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
unexpected_missing_urls = set()
for url in missing_urls:
if command_status[url] != "ok":
continue
hist_ps = set() # visited domains from crawl_history Table
statuses = dict()
# Get the visit id for the url
rows = db_utils.query_db(
crawl_db,
"SELECT arguments, command_status "
"FROM crawl_history WHERE command='GET'",
"SELECT visit_id FROM site_visits WHERE site_url = ?",
("http://" + url,),
)
for url, command_status in rows:
ps = get_public_suffix(url)
hist_ps.add(ps)
statuses[ps] = command_status
visit_id = rows[0]
# Grab urls from Firefox database
profile_ps = set() # visited domains from firefox profile
rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
for (host,) in rows:
try:
profile_ps.add(get_public_suffix(host))
except AttributeError:
pass
rows = db_utils.query_db(
crawl_db,
"SELECT COUNT(*) FROM http_responses WHERE visit_id = ?",
(visit_id,),
)
if rows[0] > 1:
continue
# We expect urls to be in the Firefox profile if:
# 1. We've made requests to it
# 2. The url is a top_url we entered into the address bar
# 3. The url successfully loaded (see: Issue #40)
# 4. The site does not respond to the initial request with a 204
# (won't show in FF DB)
missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
unexpected_missing_urls = set()
for url in missing_urls:
if command_status[url] != "ok":
continue
rows = db_utils.query_db(
crawl_db,
"SELECT response_status, location FROM "
"http_responses WHERE visit_id = ?",
(visit_id,),
)
response_status, location = rows[0]
if response_status == 204:
continue
if location == "http://": # site returned a blank redirect
continue
unexpected_missing_urls.add(url)
# Get the visit id for the url
rows = db_utils.query_db(
crawl_db,
"SELECT visit_id FROM site_visits " "WHERE site_url = ?",
("http://" + url,),
)
visit_id = rows[0]
rows = db_utils.query_db(
crawl_db,
"SELECT COUNT(*) FROM http_responses " "WHERE visit_id = ?",
(visit_id,),
)
if rows[0] > 1:
continue
rows = db_utils.query_db(
crawl_db,
"SELECT response_status, location FROM "
"http_responses WHERE visit_id = ?",
(visit_id,),
)
response_status, location = rows[0]
if response_status == 204:
continue
if location == "http://": # site returned a blank redirect
continue
unexpected_missing_urls.add(url)
assert len(unexpected_missing_urls) == 0
assert len(unexpected_missing_urls) == 0

View file

@ -1,86 +1,86 @@
from os.path import isfile, join
from pathlib import Path
from typing import Any, List, Optional, Tuple
from typing import Any
import pytest
from openwpm.command_sequence import CommandSequence
from openwpm.commands.types import BaseCommand
from openwpm.config import BrowserParams, ManagerParams
from openwpm.errors import CommandExecutionError, ProfileLoadError
from openwpm.task_manager import TaskManager
from openwpm.utilities import db_utils
from .openwpmtest import OpenWPMTest
from .utilities import BASE_TEST_URL
# TODO update these tests to make use of blocking commands
class TestProfile(OpenWPMTest):
def get_config(
self, data_dir: Optional[Path]
) -> Tuple[ManagerParams, List[BrowserParams]]:
manager_params, browser_params = self.get_test_config(data_dir)
browser_params[0].profile_archive_dir = join(
manager_params.data_directory, "browser_profile"
)
return manager_params, browser_params
@pytest.mark.xfail(run=False)
def test_saving(self):
manager_params, browser_params = self.get_config()
manager = TaskManager(manager_params, browser_params)
manager.get("http://example.com")
manager.close()
assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
@pytest.mark.xfail(run=False)
def test_crash_profile(self):
manager_params, browser_params = self.get_config()
manager_params.failure_limit = 2
manager = TaskManager(manager_params, browser_params)
try:
manager.get("http://example.com") # So we have a profile
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Requires two commands to shut down
except CommandExecutionError:
pass
assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
@pytest.mark.xfail(run=False)
def test_profile_error(self):
manager_params, browser_params = self.get_config()
browser_params[0].seed_tar = "/tmp/NOTREAL"
with pytest.raises(ProfileLoadError):
TaskManager(manager_params, browser_params) # noqa
@pytest.mark.skip(reason="proxy no longer supported, need to update")
def test_profile_saved_when_launch_crashes(self):
manager_params, browser_params = self.get_config()
browser_params[0].proxy = True
browser_params[0].save_content = "script"
manager = TaskManager(manager_params, browser_params)
manager.get("http://example.com")
# Kill the LevelDBAggregator
# This will cause the proxy launch to crash
manager.ldb_status_queue.put("DIE")
manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly
manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout
manager.get("example.com") # Cause a selenium crash
# The browser will fail to launch due to the proxy crashes
try:
manager.get("http://example.com")
except CommandExecutionError:
pass
manager.close()
assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
def test_saving(default_params, task_manager_creator):
manager_params, browser_params = default_params
manager_params.num_browsers = 1
browser_params[0].profile_archive_dir = (
manager_params.data_directory / "browser_profile"
)
manager, _ = task_manager_creator((manager_params, browser_params[:1]))
manager.get(BASE_TEST_URL)
manager.close()
assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file()
def test_seed_persistance(default_params, task_manager_creator):
def test_crash_profile(default_params, task_manager_creator):
manager_params, browser_params = default_params
manager_params.num_browsers = 1
manager_params.failure_limit = 2
browser_params[0].profile_archive_dir = (
manager_params.data_directory / "browser_profile"
)
manager, _ = task_manager_creator((manager_params, browser_params[:1]))
try:
manager.get(BASE_TEST_URL) # So we have a profile
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Selenium requires scheme prefix
manager.get("example.com") # Requires two commands to shut down
except CommandExecutionError:
pass
assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file()
def test_profile_error(default_params, task_manager_creator):
manager_params, browser_params = default_params
manager_params.num_browsers = 1
browser_params[0].seed_tar = Path("/tmp/NOTREAL")
with pytest.raises(ProfileLoadError):
task_manager_creator((manager_params, browser_params[:1]))
@pytest.mark.skip(reason="proxy no longer supported, need to update")
def test_profile_saved_when_launch_crashes(default_params, task_manager_creator):
manager_params, browser_params = default_params
manager_params.num_browsers = 1
browser_params[0].profile_archive_dir = (
manager_params.data_directory / "browser_profile"
)
browser_params[0].proxy = True
browser_params[0].save_content = "script"
manager, _ = task_manager_creator((manager_params, browser_params[:1]))
manager.get(BASE_TEST_URL)
# Kill the LevelDBAggregator
# This will cause the proxy launch to crash
manager.ldb_status_queue.put("DIE")
manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly
manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout
manager.get("example.com") # Cause a selenium crash
# The browser will fail to launch due to the proxy crashes
try:
manager.get(BASE_TEST_URL)
except CommandExecutionError:
pass
manager.close()
assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file()
def test_seed_persistence(default_params, task_manager_creator):
manager_params, browser_params = default_params
p = Path("profile.tar.gz")
for browser_param in browser_params:
@ -89,7 +89,7 @@ def test_seed_persistance(default_params, task_manager_creator):
command_sequences = []
for _ in range(2):
cs = CommandSequence(url="https://example.com", reset=True)
cs = CommandSequence(url=BASE_TEST_URL)
cs.get()
cs.append_command(AssertConfigSetCommand("test_pref", True))
command_sequences.append(cs)