From e338bb29f0ab03861c371bf65c6783200c7c62a1 Mon Sep 17 00:00:00 2001 From: Stefan Zabka Date: Sat, 9 Jan 2021 11:15:01 +0100 Subject: [PATCH] Command refactoring (#750) * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * Ported SaveScreenshotCommand It now uses the new command.execute(...) syntax * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * Ported SaveScreenshotFullPage #763 * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * Ported DumpPageSource and RecursiveDumpPageSource (#767) * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * remove custom function command 
and format code * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * Ported SaveScreenshotCommand It now uses the new command.execute(...) syntax * Ported SaveScreenshotFullPage #763 * Ported DumpPageSource and RecursiveDumpPageSource (#767) * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * remove duplicate append_command * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * Ported SaveScreenshotCommand It now uses the new command.execute(...) 
syntax * Ported SaveScreenshotFullPage #763 * Ported DumpPageSource and RecursiveDumpPageSource (#767) * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * generate new xpi * Fixing tests * Fixing tests * Fixing up more tests * Removed type annotations * Fixing tests * Fixing tests * Removed command_executor * Moved Commands to commands * Fixing imports * Fixed skipped test * Removed duplicate append_command * docs: update adding command in usingOpenWPM * Forgot to save * Removed datadir * Cleaning up imports * Implemented simple command * Added documentation to simple_command.py * Renamed to custom_command.py * Moved docs around * Referencing BaseCommand.execute * Update docs/Using_OpenWPM.md Co-authored-by: Steven Englehardt Co-authored-by: Cyrus Co-authored-by: cyruskarsan <55566678+cyruskarsan@users.noreply.github.com> Co-authored-by: Steven Englehardt --- crawler.py | 2 +- custom_command.py | 43 +++ demo.py | 8 +- docs/Using_OpenWPM.md | 87 +---- environment.yaml | 14 +- openwpm/browser_manager.py | 14 +- openwpm/command_sequence.py | 18 +- openwpm/commands/browser_commands.py | 506 +++++++++++++++++---------- openwpm/commands/command_executor.py | 118 ------- openwpm/commands/profile_commands.py | 198 ++++++----- openwpm/commands/types.py | 145 +++----- openwpm/config.py | 17 + openwpm/task_manager.py | 5 +- test/manual_test.py 
| 1 - test/test_custom_function_command.py | 96 ++--- test/test_dataclass_validations.py | 2 - test/test_http_instrumentation.py | 31 +- test/test_profile.py | 62 ++-- test/test_timer.py | 3 +- test/test_webdriver_utils.py | 2 +- 20 files changed, 679 insertions(+), 693 deletions(-) create mode 100644 custom_command.py delete mode 100644 openwpm/commands/command_executor.py diff --git a/crawler.py b/crawler.py index 9550623f..77f1ec6d 100644 --- a/crawler.py +++ b/crawler.py @@ -13,7 +13,7 @@ import sentry_sdk from openwpm.command_sequence import CommandSequence from openwpm.config import BrowserParams, ManagerParams from openwpm.mp_logger import parse_config_from_env -from openwpm.task_manager import TaskManager, load_default_params +from openwpm.task_manager import TaskManager from openwpm.utilities import rediswq from test.utilities import LocalS3Session, local_s3_bucket diff --git a/custom_command.py b/custom_command.py new file mode 100644 index 00000000..508a4cff --- /dev/null +++ b/custom_command.py @@ -0,0 +1,43 @@ +""" This file aims to demonstrate how to write custom commands in OpenWPM + +Steps to have a custom command run as part of a CommandSequence + +1. Create a class that derives from BaseCommand +2. Implement the execute method +3. Append it to the CommandSequence +4. 
Execute the CommandSequence + +""" +import logging + +from selenium.webdriver import Firefox +from selenium.webdriver.common.by import By + +from openwpm.commands.types import BaseCommand +from openwpm.config import BrowserParams, ManagerParams +from openwpm.socket_interface import ClientSocket + + +class LinkCountingCommand(BaseCommand): + """This command logs how many links it found on any given page""" + + def __init__(self) -> None: + self.logger = logging.getLogger("openwpm") + + # While this is not strictly necessary, we use the repr of a command for logging + # So not having a proper repr will make your logs a lot less useful + def __repr__(self) -> str: + return "LinkCountingCommand" + + # Have a look at openwpm.commands.types.BaseCommand.execute to see + # an explanation of each parameter + def execute( + self, + webdriver: Firefox, + browser_params: BrowserParams, + manager_params: ManagerParams, + extension_socket: ClientSocket, + ) -> None: + current_url = webdriver.current_url + link_count = len(webdriver.find_elements(By.TAG_NAME, "a")) + self.logger.info("There are %d links on %s", link_count, current_url) diff --git a/demo.py b/demo.py index ab5cd89b..0e3edf88 100644 --- a/demo.py +++ b/demo.py @@ -1,4 +1,6 @@ +from custom_command import LinkCountingCommand from openwpm.command_sequence import CommandSequence +from openwpm.commands.browser_commands import GetCommand from openwpm.config import BrowserParams, ManagerParams from openwpm.task_manager import TaskManager @@ -10,9 +12,9 @@ sites = [ "http://citp.princeton.edu/", ] + # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams - manager_params = ManagerParams( num_browsers=NUM_BROWSERS ) # num_browsers is necessary to let TaskManager know how many browsers to spawn @@ -58,7 +60,9 @@ for site in sites: ) # Start by visiting the page - command_sequence.get(sleep=3, timeout=60) + command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) + # Have a 
look at custom_command.py to see how to implement your own command + command_sequence.append_command(LinkCountingCommand()) # Run commands across the three browsers (simple parallelization) manager.execute_command_sequence(command_sequence) diff --git a/docs/Using_OpenWPM.md b/docs/Using_OpenWPM.md index 1f0c7c34..8da2c2ea 100644 --- a/docs/Using_OpenWPM.md +++ b/docs/Using_OpenWPM.md @@ -6,7 +6,7 @@ In this section, we present three basic development demos for working on the Ope Have a look at [demo.py](../demo.py) Generally, measurement crawls should be able to be run using scripts with lengths on the order of 100 lines of code. -Even within this short script, there are several different options that a user can change. +Even within this short script, there are several options that a user can change. Users can change the settings for task manager and individual browsers so that, for instance, certain browsers can run headless while others do not. We provide a method to read the default configuration settings into classes that can be passed to the `TaskManager` instance. Note that browser configuration is **per-browser**, so this command will return a list of class instances. @@ -18,97 +18,28 @@ browser_params = [BrowserParams() for _ in range(manager_params.num_browsers)] ``` #### Loading Custom Browser or Manager configs -Users can load custom Browser and Platform/Manager configuration by writing them into a json file and then loading then into respective dataclasses. For example: +Users can load custom Browser and Platform/Manager configuration by writing them into a json file and then loading them into respective dataclasses. 
For example: ```py from openwpm.config import BrowserParams, ManagerParams -with open('.json', 'r') as f: +with open(".json", 'r') as f: manager_params = ManagerParams.from_json(f.read()) browser_params = list() for _ in range(num_browsers): - with open('.json', 'r') as file: + with open(".json", 'r') as file: browser_params.append(BrowserParams.from_json(file.read())) ``` -## Adding a new command +## Defining a new command -OpenWPM commands exist as part of a command sequence object, which allows one to string together a sequence of actions for a browser to take and deploy that sequence to the first available browser from the manager. Adding a command to a `CommandSequence` object will cause the browser to execute it immediately after the previously added command as long as the previous command does not time out or fail. - -Suppose we want to add a top-level command to cause the browser to jiggle the mouse a few times. We may want to have the browser visit a site, jiggle the mouse, and extract the links from the site. - -To add a new command you need to modify the following four files: - -1. Define all required paramters in a type in `openwpm/commands/types.py` - In our case this looks like this: - ```python - class JiggleCommand(BaseCommand): - def __init__(self, num_jiggles): - self.num_jiggles = num_jiggles - - def __repr__(self): - return "JiggleCommand({})".format(self.num_jiggles) - ``` - -2. Define the behaviour of our new command in `*_commands.py` in `openwpm/commands/`, - e.g. `browser_commands.py`. - Feel free to add a new module within `openwpm/commands/` for your own custom commands - In our case this looks like this: - ```python - from selenium.webdriver.common.action_chains import ActionChains - - def jiggle_mouse(webdriver, number_jiggles): - for i in xrange(0, number_jiggles): - x = random.randrange(0, 500) - y = random.randrange(0, 500) - action = ActionChains(webdriver) - action.move_by_offset(x, y) - action.perform() - ``` - -3. 
Make our function be called when the command_sequence reaches our Command, by adding it to the - `execute_command` function in `openwpm/commands/command_executer.py` - In our case this looks like this: - ```python - elif type(command) is JiggleCommand: - browser_commands.jiggle_mouse( - webdriver=webdriver, - number_jiggles=self.num_jiggles) - ``` - -4. Lastly we change ```openwpm/CommandSequence.py``` by adding a `jiggle_mouse` method to the `CommandSequence` - so we can add our command to the commands list - In our case this looks like this: - ```python - def jiggle_mouse(self, num_jiggles, timeout=60): - """ jiggles mouse times """ - self.total_timeout += timeout - if not self.contains_get_or_browse: - raise CommandExecutionError("No get or browse request preceding " - "the jiggle_mouse command", self) - command = JiggleCommand(num_jiggles) - self.commands_with_timeout.append((command, timeout)) - ``` - A timeout is given and set by default to 60 seconds. This is added to the overall sequence timeout. Finally, we check that the `CommandSequence` instance contains a `get` or a `browse` command prior to this command being added by checking `self.contains_get_or_browse`. This is necessary as it wouldn't make sense to have selenium jiggle the mouse before loading a page. +Please have a look at [`custom_command.py`](../custom_command.py). Note that custom commands must be +defined in a separate module and imported. They can't be defined within the main crawl script. +See [#837](https://github.com/mozilla/OpenWPM/issues/837). -Notice that any arguments to the command are added both to the command sequence top-level method, and are then stored in the `Command` object to be serialized and sent across the process boundary between the task manager and browser manager. 
- -Finally, the command sequence given to the Task Manager to visit a site, sleep for 10 seconds, jiggle the mouse 10 times, and takes a screenshot would look like this: - -```python -site = 'http://www.example.com' - -command_sequence = CommandSequence.CommandSequence(site) -command_sequence.get(sleep=10) -command_sequence.jiggle_mouse(10) -command_sequence.screenshot_full_page() - -manager.execute_command_sequence(command_sequence) -``` - ## Running a simple analysis Suppose that we ran the platform over some set of sites while logged into several sites while using a particular email. During the crawl, we turned on the proxy option to log HTTP traffic. One possible threat is, perhaps due to sloppy coding, the first-party leaks the user's email as plaintext over HTTP traffic. Given an OpenWPM database, the following script logs the first-party sites on which such a leakage occurs. @@ -133,7 +64,7 @@ for url, top_url in cur.execute("SELECT DISTINCT h.url, v.site_url " fp_sites.add(top_url) # outputs the results -print list(fp_sites) +print(list(fp_sites)) ```` The variety of data stored in OpenWPM databases (with all instrumentation enabled) allows the above script to easily be expanded into a larger study. For instance, one step would be to see which parties are the recipients of the email address. Do these recipients later place cookies containing the email? Besides the site on which the original email leak was made, on which other first parties do these recipients appear as a third party? All of these questions are answerable through OpenWPM database instances. 
diff --git a/environment.yaml b/environment.yaml index ea515a75..baf52264 100644 --- a/environment.yaml +++ b/environment.yaml @@ -5,7 +5,7 @@ dependencies: - beautifulsoup4=4.9.3 - black=20.8b1 - click=7.1.2 -- codecov=2.1.10 +- codecov=2.1.11 - dill=0.3.3 - geckodriver=0.28.0 - ipython=7.19.0 @@ -13,20 +13,20 @@ dependencies: - localstack=0.11.1.1 - multiprocess=0.70.11.1 - nodejs=14.15.1 -- pandas=1.1.4 +- pandas=1.1.5 - pillow=8.0.1 -- pip=20.2.4 -- pre-commit=2.9.2 -- psutil=5.7.3 +- pip=20.3.3 +- pre-commit=2.9.3 +- psutil=5.8.0 - pyarrow=2.0.0 - pytest-cov=2.10.1 -- pytest=6.1.2 +- pytest=6.2.1 - python=3.8.6 - pyvirtualdisplay=0.2.5 - redis-py=3.5.3 - s3fs=0.4.0 - selenium=3.141.0 -- sentry-sdk=0.19.4 +- sentry-sdk=0.19.5 - tabulate=0.8.7 - tblib=1.6.0 - wget=1.20.1 diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 4eefbf94..df8ae60a 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -9,15 +9,14 @@ import threading import time import traceback from queue import Empty as EmptyQueue -from typing import Optional +from typing import Optional, Union import psutil from multiprocess import Queue from selenium.common.exceptions import WebDriverException from tblib import pickling_support -from .commands import command_executor -from .commands.types import ShutdownCommand +from .commands.types import BaseCommand, ShutdownSignal from .deploy_browsers import deploy_firefox from .errors import BrowserConfigError, BrowserCrashError, ProfileLoadError from .socket_interface import ClientSocket @@ -286,7 +285,7 @@ class Browser: return # Send the shutdown command - command = ShutdownCommand() + command = ShutdownSignal() self.command_queue.put((command)) # Verify that webdriver has closed (30 second timeout) @@ -487,9 +486,9 @@ def BrowserManager( time.sleep(0.001) continue - command = command_queue.get() + command: Union[ShutdownSignal, BaseCommand] = command_queue.get() - if type(command) is ShutdownCommand: + if 
type(command) is ShutdownSignal: # Geckodriver creates a copy of the profile (and the original # temp file created by FirefoxProfile() is deleted). # We clear the profile attribute here to prevent prints from: @@ -509,8 +508,7 @@ def BrowserManager( # if command fails for whatever reason, tell the TaskManager to # kill and restart its worker processes try: - command_executor.execute_command( - command, + command.execute( driver, browser_params, manager_params, diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index 884f95b5..394c81c3 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -1,18 +1,16 @@ from typing import Callable, List, Tuple -from .commands.types import ( - BaseCommand, +from .commands.browser_commands import ( BrowseCommand, DumpPageSourceCommand, - DumpProfCommand, FinalizeCommand, GetCommand, InitializeCommand, RecursiveDumpPageSourceCommand, - RunCustomFunctionCommand, SaveScreenshotCommand, ScreenshotFullPageCommand, ) +from .commands.types import BaseCommand from .errors import CommandExecutionError @@ -39,7 +37,7 @@ class CommandSequence: retry_number: int = None, site_rank: int = None, callback: Callable[[bool], None] = None, - ): + ) -> None: """Initialize command sequence. 
Parameters @@ -179,15 +177,7 @@ class CommandSequence: command = RecursiveDumpPageSourceCommand(suffix) self._commands_with_timeout.append((command, timeout)) - def run_custom_function(self, function_handle, func_args=(), timeout=30): - """Run a custom by passing the function handle""" - self.total_timeout += timeout - if not self.contains_get_or_browse: - raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", - self, - ) - command = RunCustomFunctionCommand(function_handle, func_args) + def append_command(self, command: BaseCommand, timeout: int = 30) -> None: self._commands_with_timeout.append((command, timeout)) def mark_done(self, success: bool): diff --git a/openwpm/commands/browser_commands.py b/openwpm/commands/browser_commands.py index f5fd484f..b3eadd08 100644 --- a/openwpm/commands/browser_commands.py +++ b/openwpm/commands/browser_commands.py @@ -16,11 +16,10 @@ from selenium.common.exceptions import ( WebDriverException, ) from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.remote.webdriver import WebDriver from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from ..socket_interface import ClientSocket +from .types import BaseCommand from .utils.webdriver_utils import ( execute_in_all_frames, execute_script_with_retry, @@ -31,6 +30,7 @@ from .utils.webdriver_utils import ( ) # Constants for bot mitigation + NUM_MOUSE_MOVES = 10 # Times to randomly move the mouse RANDOM_SLEEP_LOW = 1 # low (in sec) for random sleep between page loads RANDOM_SLEEP_HIGH = 7 # high (in sec) for random sleep between page loads @@ -111,93 +111,141 @@ def tab_restart_browser(webdriver): webdriver.switch_to_window(webdriver.window_handles[0]) -def get_website( - url, sleep, visit_id, webdriver, browser_params, extension_socket: ClientSocket -): +class GetCommand(BaseCommand): """ goes to using the given instance """ - 
tab_restart_browser(webdriver) + def __init__(self, url, sleep): + self.url = url + self.sleep = sleep - if extension_socket is not None: - extension_socket.send(visit_id) + def __repr__(self): + return "GetCommand({},{})".format(self.url, self.sleep) - # Execute a get through selenium - try: - webdriver.get(url) - except TimeoutException: - pass + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + tab_restart_browser(webdriver) - # Sleep after get returns - time.sleep(sleep) - - # Close modal dialog if exists - try: - WebDriverWait(webdriver, 0.5).until(EC.alert_is_present()) - alert = webdriver.switch_to_alert() - alert.dismiss() - time.sleep(1) - except (TimeoutException, WebDriverException): - pass - - close_other_windows(webdriver) - - if browser_params.bot_mitigation: - bot_mitigation(webdriver) - - -def browse_website( - url, - num_links, - sleep, - visit_id, - webdriver, - browser_params, - manager_params, - extension_socket, -): - """Calls get_website before visiting present on the page. - - Note: the site_url in the site_visits table for the links visited will - be the site_url of the original page and NOT the url of the links visited. 
- """ - # First get the site - get_website(url, sleep, visit_id, webdriver, browser_params, extension_socket) - - # Then visit a few subpages - for _ in range(num_links): - links = [x for x in get_intra_links(webdriver, url) if is_displayed(x) is True] - if not links: - break - r = int(random.random() * len(links)) - logger.info( - "BROWSER %i: visiting internal link %s" - % (browser_params.browser_id, links[r].get_attribute("href")) - ) + if extension_socket is not None: + extension_socket.send(self.visit_id) + # Execute a get through selenium try: - links[r].click() - wait_until_loaded(webdriver, 300) - time.sleep(max(1, sleep)) - if browser_params.bot_mitigation: - bot_mitigation(webdriver) - webdriver.back() - wait_until_loaded(webdriver, 300) - except Exception: + webdriver.get(self.url) + except TimeoutException: pass + # Sleep after get returns + time.sleep(self.sleep) -def save_screenshot(visit_id, browser_id, driver, manager_params, suffix=""): - """ Save a screenshot of the current viewport""" - if suffix != "": - suffix = "-" + suffix + # Close modal dialog if exists + try: + WebDriverWait(webdriver, 0.5).until(EC.alert_is_present()) + alert = webdriver.switch_to_alert() + alert.dismiss() + time.sleep(1) + except (TimeoutException, WebDriverException): + pass - urlhash = md5(driver.current_url.encode("utf-8")).hexdigest() - outname = os.path.join( - manager_params.screenshot_path, "%i-%s%s.png" % (visit_id, urlhash, suffix) - ) - driver.save_screenshot(outname) + close_other_windows(webdriver) + + if browser_params.bot_mitigation: + bot_mitigation(webdriver) + + +class BrowseCommand(BaseCommand): + def __init__(self, url, num_links, sleep): + self.url = url + self.num_links = num_links + self.sleep = sleep + + def __repr__(self): + return "BrowseCommand({},{},{})".format(self.url, self.num_links, self.sleep) + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + """Calls get_website before visiting present on 
the page. + + Note: the site_url in the site_visits table for the links visited will + be the site_url of the original page and NOT the url of the links visited. + """ + # First get the site + get_command = GetCommand(self.url, self.sleep) + get_command.set_visit_browser_id(self.visit_id, self.browser_id) + get_command.execute( + webdriver, + browser_params, + manager_params, + extension_socket, + ) + + # Then visit a few subpages + for _ in range(self.num_links): + links = [ + x + for x in get_intra_links(webdriver, self.url) + if is_displayed(x) is True + ] + if not links: + break + r = int(random.random() * len(links)) + logger.info( + "BROWSER %i: visiting internal link %s" + % (browser_params.browser_id, links[r].get_attribute("href")) + ) + + try: + links[r].click() + wait_until_loaded(webdriver, 300) + time.sleep(max(1, self.sleep)) + if browser_params.bot_mitigation: + bot_mitigation(webdriver) + webdriver.back() + wait_until_loaded(webdriver, 300) + except Exception as e: + logger.error( + "BROWSER %i: Error visitit internal link %s", + browser_params.browser_id, + links[r].get_attribute("href"), + exc_info=e, + ) + pass + + +class SaveScreenshotCommand(BaseCommand): + def __init__(self, suffix): + self.suffix = suffix + + def __repr__(self): + return "SaveScreenshotCommand({})".format(self.suffix) + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + if self.suffix != "": + self.suffix = "-" + self.suffix + + urlhash = md5(webdriver.current_url.encode("utf-8")).hexdigest() + outname = os.path.join( + manager_params.screenshot_path, + "%i-%s%s.png" % (self.visit_id, urlhash, self.suffix), + ) + webdriver.save_screenshot(outname) def _stitch_screenshot_parts(visit_id, browser_id, manager_params): @@ -262,127 +310,209 @@ def _stitch_screenshot_parts(visit_id, browser_id, manager_params): pass -def screenshot_full_page(visit_id, browser_id, driver, manager_params, suffix=""): +class 
ScreenshotFullPageCommand(BaseCommand): + def __init__(self, suffix): + self.suffix = suffix - outdir = os.path.join(manager_params.screenshot_path, "parts") - if not os.path.isdir(outdir): - os.mkdir(outdir) - if suffix != "": - suffix = "-" + suffix - urlhash = md5(driver.current_url.encode("utf-8")).hexdigest() - outname = os.path.join( - outdir, "%i-%s%s-part-%%i-%%i.png" % (visit_id, urlhash, suffix) - ) + def __repr__(self): + return "ScreenshotFullPageCommand({})".format(self.suffix) - try: - part = 0 - max_height = execute_script_with_retry( - driver, "return document.body.scrollHeight;" + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + self.outdir = os.path.join(manager_params.screenshot_path, "parts") + if not os.path.isdir(self.outdir): + os.mkdir(self.outdir) + if self.suffix != "": + self.suffix = "-" + self.suffix + urlhash = md5(webdriver.current_url.encode("utf-8")).hexdigest() + outname = os.path.join( + self.outdir, + "%i-%s%s-part-%%i-%%i.png" % (self.visit_id, urlhash, self.suffix), ) - inner_height = execute_script_with_retry(driver, "return window.innerHeight;") - curr_scrollY = execute_script_with_retry(driver, "return window.scrollY;") - prev_scrollY = -1 - driver.save_screenshot(outname % (part, curr_scrollY)) - while ( - curr_scrollY + inner_height - ) < max_height and curr_scrollY != prev_scrollY: - # Scroll down to bottom of previous viewport - try: - driver.execute_script("window.scrollBy(0, window.innerHeight)") - except WebDriverException: - logger.info( - "BROWSER %i: WebDriverException while scrolling, " - "screenshot may be misaligned!" 
% browser_id + try: + part = 0 + max_height = execute_script_with_retry( + webdriver, "return document.body.scrollHeight;" + ) + inner_height = execute_script_with_retry( + webdriver, "return window.innerHeight;" + ) + curr_scrollY = execute_script_with_retry( + webdriver, "return window.scrollY;" + ) + prev_scrollY = -1 + webdriver.save_screenshot(outname % (part, curr_scrollY)) + while ( + curr_scrollY + inner_height + ) < max_height and curr_scrollY != prev_scrollY: + + # Scroll down to bottom of previous viewport + try: + webdriver.execute_script("window.scrollBy(0, window.innerHeight)") + except WebDriverException: + logger.info( + "BROWSER %i: WebDriverException while scrolling, " + "screenshot may be misaligned!" % self.browser_id + ) + pass + + # Update control variables + part += 1 + prev_scrollY = curr_scrollY + curr_scrollY = execute_script_with_retry( + webdriver, "return window.scrollY;" ) - pass - # Update control variables - part += 1 - prev_scrollY = curr_scrollY - curr_scrollY = execute_script_with_retry(driver, "return window.scrollY;") - - # Save screenshot - driver.save_screenshot(outname % (part, curr_scrollY)) - except WebDriverException: - excp = traceback.format_exception(*sys.exc_info()) - logger.error( - "BROWSER %i: Exception while taking full page screenshot \n %s" - % (browser_id, "".join(excp)) - ) - return - - _stitch_screenshot_parts(visit_id, browser_id, manager_params) - - -def dump_page_source(visit_id, driver, manager_params, suffix=""): - if suffix != "": - suffix = "-" + suffix - - outname = md5(driver.current_url.encode("utf-8")).hexdigest() - outfile = os.path.join( - manager_params.source_dump_path, "%i-%s%s.html" % (visit_id, outname, suffix) - ) - - with open(outfile, "wb") as f: - f.write(driver.page_source.encode("utf8")) - f.write(b"\n") - - -def recursive_dump_page_source(visit_id, driver, manager_params, suffix=""): - """Dump a compressed html tree for the current page visit""" - if suffix != "": - suffix = "-" + 
suffix - - outname = md5(driver.current_url.encode("utf-8")).hexdigest() - outfile = os.path.join( - manager_params.source_dump_path, - "%i-%s%s.json.gz" % (visit_id, outname, suffix), - ) - - def collect_source(driver, frame_stack, rv={}): - is_top_frame = len(frame_stack) == 1 - - # Gather frame information - doc_url = driver.execute_script("return window.document.URL;") - if is_top_frame: - page_source = rv - else: - page_source = dict() - page_source["doc_url"] = doc_url - source = driver.page_source - if type(source) != str: - source = str(source, "utf-8") - page_source["source"] = source - page_source["iframes"] = dict() - - # Store frame info in correct area of return value - if is_top_frame: + # Save screenshot + webdriver.save_screenshot(outname % (part, curr_scrollY)) + except WebDriverException: + excp = traceback.format_exception(*sys.exc_info()) + logger.error( + "BROWSER %i: Exception while taking full page screenshot \n %s" + % (self.browser_id, "".join(excp)) + ) return - out_dict = rv["iframes"] - for frame in frame_stack[1:-1]: - out_dict = out_dict[frame.id]["iframes"] - out_dict[frame_stack[-1].id] = page_source - page_source = dict() - execute_in_all_frames(driver, collect_source, {"rv": page_source}) - - with gzip.GzipFile(outfile, "wb") as f: - f.write(json.dumps(page_source).encode("utf-8")) + _stitch_screenshot_parts(self.visit_id, self.browser_id, manager_params) -def finalize( - visit_id: int, webdriver: WebDriver, extension_socket: ClientSocket, sleep: int -) -> None: - """ Informs the extension that a visit is done """ - tab_restart_browser(webdriver) - # This doesn't immediately stop data saving from the current - # visit so we sleep briefly before unsetting the visit_id. 
- time.sleep(sleep) - msg = {"action": "Finalize", "visit_id": visit_id} - extension_socket.send(msg) +class DumpPageSourceCommand(BaseCommand): + def __init__(self, suffix): + self.suffix = suffix + + def __repr__(self): + return "DumpPageSourceCommand({})".format(self.suffix) + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + + if self.suffix != "": + self.suffix = "-" + self.suffix + + outname = md5(webdriver.current_url.encode("utf-8")).hexdigest() + outfile = os.path.join( + manager_params.source_dump_path, + "%i-%s%s.html" % (self.visit_id, outname, self.suffix), + ) + + with open(outfile, "wb") as f: + f.write(webdriver.page_source.encode("utf8")) + f.write(b"\n") -def initialize(visit_id: int, extension_socket: ClientSocket) -> None: - msg = {"action": "Initialize", "visit_id": visit_id} - extension_socket.send(msg) +class RecursiveDumpPageSourceCommand(BaseCommand): + def __init__(self, suffix): + self.suffix = suffix + + def __repr__(self): + return "RecursiveDumpPageSourceCommand({})".format(self.suffix) + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + + """Dump a compressed html tree for the current page visit""" + if self.suffix != "": + self.suffix = "-" + self.suffix + + outname = md5(webdriver.current_url.encode("utf-8")).hexdigest() + outfile = os.path.join( + manager_params.source_dump_path, + "%i-%s%s.json.gz" % (self.visit_id, outname, self.suffix), + ) + + def collect_source(webdriver, frame_stack, rv={}): + is_top_frame = len(frame_stack) == 1 + + # Gather frame information + doc_url = webdriver.execute_script("return window.document.URL;") + if is_top_frame: + page_source = rv + else: + page_source = dict() + page_source["doc_url"] = doc_url + source = webdriver.page_source + if type(source) != str: + source = str(source, "utf-8") + page_source["source"] = source + page_source["iframes"] = dict() + + # Store frame info in correct area of 
return value + if is_top_frame: + return + out_dict = rv["iframes"] + for frame in frame_stack[1:-1]: + out_dict = out_dict[frame.id]["iframes"] + out_dict[frame_stack[-1].id] = page_source + + page_source = dict() + execute_in_all_frames(webdriver, collect_source, {"rv": page_source}) + + with gzip.GzipFile(outfile, "wb") as f: + f.write(json.dumps(page_source).encode("utf-8")) + + +class FinalizeCommand(BaseCommand): + """This command is automatically appended to the end of a CommandSequence + Its appearance means there won't be any more commands for this + visit_id + """ + + def __init__(self, sleep): + self.sleep = sleep + + def __repr__(self): + return f"FinalizeCommand({self.sleep})" + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + + """ Informs the extension that a visit is done """ + tab_restart_browser(webdriver) + # This doesn't immediately stop data saving from the current + # visit so we sleep briefly before unsetting the visit_id. + time.sleep(self.sleep) + msg = {"action": "Finalize", "visit_id": self.visit_id} + extension_socket.send(msg) + + +class InitializeCommand(BaseCommand): + """The command is automatically prepended to the beginning of a + CommandSequence + It initializes state both in the extensions as well as in the + Aggregator + """ + + def __repr__(self): + return "IntitializeCommand()" + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ): + + msg = {"action": "Initialize", "visit_id": self.visit_id} + extension_socket.send(msg) diff --git a/openwpm/commands/command_executor.py b/openwpm/commands/command_executor.py deleted file mode 100644 index ed1dece7..00000000 --- a/openwpm/commands/command_executor.py +++ /dev/null @@ -1,118 +0,0 @@ -from ..errors import CommandExecutionError -from . 
import browser_commands, profile_commands -from .types import ( - BrowseCommand, - DumpPageSourceCommand, - DumpProfCommand, - FinalizeCommand, - GetCommand, - InitializeCommand, - RecursiveDumpPageSourceCommand, - RunCustomFunctionCommand, - SaveScreenshotCommand, - ScreenshotFullPageCommand, -) - - -def execute_command( - command, - webdriver, - browser_params, - manager_params, - extension_socket, -): - """Executes BrowserManager commands - commands are of form (COMMAND, ARG0, ARG1, ...) - """ - if type(command) is GetCommand: - browser_commands.get_website( - url=command.url, - sleep=command.sleep, - visit_id=command.visit_id, - webdriver=webdriver, - browser_params=browser_params, - extension_socket=extension_socket, - ) - - elif type(command) is BrowseCommand: - browser_commands.browse_website( - url=command.url, - num_links=command.num_links, - sleep=command.sleep, - visit_id=command.visit_id, - webdriver=webdriver, - browser_params=browser_params, - manager_params=manager_params, - extension_socket=extension_socket, - ) - - elif type(command) is DumpProfCommand: - profile_commands.dump_profile( - browser_profile_folder=browser_params.profile_path, - manager_params=manager_params, - browser_params=browser_params, - tar_location=command.dump_folder, - close_webdriver=command.close_webdriver, - webdriver=webdriver, - compress=command.compress, - ) - - elif type(command) is DumpPageSourceCommand: - browser_commands.dump_page_source( - visit_id=command.visit_id, - driver=webdriver, - manager_params=manager_params, - suffix=command.suffix, - ) - - elif type(command) is RecursiveDumpPageSourceCommand: - browser_commands.recursive_dump_page_source( - visit_id=command.visit_id, - driver=webdriver, - manager_params=manager_params, - suffix=command.suffix, - ) - - elif type(command) is SaveScreenshotCommand: - browser_commands.save_screenshot( - visit_id=command.visit_id, - browser_id=command.browser_id, - driver=webdriver, - manager_params=manager_params, - 
suffix=command.suffix, - ) - - elif type(command) is ScreenshotFullPageCommand: - browser_commands.screenshot_full_page( - visit_id=command.visit_id, - browser_id=command.browser_id, - driver=webdriver, - manager_params=manager_params, - suffix=command.suffix, - ) - - elif type(command) is RunCustomFunctionCommand: - arg_dict = { - "command": command, - "driver": webdriver, - "browser_params": browser_params, - "manager_params": manager_params, - "extension_socket": extension_socket, - } - command.function_handle(*command.func_args, **arg_dict) - - elif type(command) is FinalizeCommand: - browser_commands.finalize( - visit_id=command.visit_id, - sleep=command.sleep, - webdriver=webdriver, - extension_socket=extension_socket, - ) - - elif type(command) is InitializeCommand: - browser_commands.initialize( - visit_id=command.visit_id, extension_socket=extension_socket - ) - - else: - raise CommandExecutionError("Invalid Command", command) diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index 19157147..9a2d7ba7 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -1,113 +1,129 @@ import logging import os -import pickle import shutil import tarfile +from selenium.webdriver import Firefox + +from openwpm.config import BrowserParams, ManagerParams + from ..errors import ProfileLoadError +from ..socket_interface import ClientSocket +from .types import BaseCommand from .utils.firefox_profile import sleep_until_sqlite_checkpoint logger = logging.getLogger("openwpm") -def dump_profile( - browser_profile_folder, - manager_params, - browser_params, - tar_location, - close_webdriver, - webdriver=None, - compress=False, -): +class DumpProfileCommand(BaseCommand): """ dumps a browser profile currently stored in to in which both folders are absolute paths. """ - logger.debug( - "BROWSER %i: Profile dumping is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." 
- % browser_params.browser_id - ) - return - # ensures that folder paths end with slashes - if browser_profile_folder[-1] != "/": - browser_profile_folder = browser_profile_folder + "/" - if tar_location[-1] != "/": - tar_location = tar_location + "/" + def __init__(self, dump_folder, close_webdriver, compress): + self.dump_folder = dump_folder + self.close_webdriver = close_webdriver + self.compress = compress - if not os.path.exists(tar_location): - os.makedirs(tar_location) - - if compress: - tar_name = "profile.tar.gz" - else: - tar_name = "profile.tar" - - # see if this file exists first - # if it does, delete it before we try to save the current session - if os.path.isfile(tar_location + tar_name): - os.remove(tar_location + tar_name) - - # if this is a dump on close, close the webdriver and wait for checkpoint - if close_webdriver: - webdriver.close() - sleep_until_sqlite_checkpoint(browser_profile_folder) - - # backup and tar profile - if compress: - tar = tarfile.open(tar_location + tar_name, "w:gz", errorlevel=1) - else: - tar = tarfile.open(tar_location + tar_name, "w", errorlevel=1) - logger.debug( - "BROWSER %i: Backing up full profile from %s to %s" - % ( - browser_params.browser_id, - browser_profile_folder, - tar_location + tar_name, + def __repr__(self): + return "DumpProfCommand({},{},{})".format( + self.dump_folder, self.close_webdriver, self.compress ) - ) - storage_vector_files = [ - "cookies.sqlite", # cookies - "cookies.sqlite-shm", - "cookies.sqlite-wal", - "places.sqlite", # history - "places.sqlite-shm", - "places.sqlite-wal", - "webappsstore.sqlite", # localStorage - "webappsstore.sqlite-shm", - "webappsstore.sqlite-wal", - ] - storage_vector_dirs = [ - "webapps", # related to localStorage? 
- "storage", # directory for IndexedDB - ] - for item in storage_vector_files: - full_path = os.path.join(browser_profile_folder, item) - if ( - not os.path.isfile(full_path) - and full_path[-3:] != "shm" - and full_path[-3:] != "wal" - ): - logger.critical( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (browser_params.browser_id, full_path) + + def execute( + self, + webdriver: Firefox, + browser_params: BrowserParams, + manager_params: ManagerParams, + extension_socket: ClientSocket, + ) -> None: + logger.debug( + "BROWSER %i: Profile dumping is currently unsupported. " + "See: https://github.com/mozilla/OpenWPM/projects/2." + % browser_params.browser_id + ) + return + browser_profile_folder = browser_params.profile_path + + # ensures that folder paths end with slashes + if browser_profile_folder[-1] != "/": + browser_profile_folder = browser_profile_folder + "/" + if tar_location[-1] != "/": + tar_location = tar_location + "/" + + if not os.path.exists(tar_location): + os.makedirs(tar_location) + + if compress: + tar_name = "profile.tar.gz" + else: + tar_name = "profile.tar" + + # see if this file exists first + # if it does, delete it before we try to save the current session + if os.path.isfile(tar_location + tar_name): + os.remove(tar_location + tar_name) + + # if this is a dump on close, close the webdriver and wait for checkpoint + if close_webdriver: + webdriver.close() + sleep_until_sqlite_checkpoint(browser_profile_folder) + + # backup and tar profile + if compress: + tar = tarfile.open(tar_location + tar_name, "w:gz", errorlevel=1) + else: + tar = tarfile.open(tar_location + tar_name, "w", errorlevel=1) + logger.debug( + "BROWSER %i: Backing up full profile from %s to %s" + % ( + browser_params.browser_id, + browser_profile_folder, + tar_location + tar_name, ) - elif not os.path.isfile(full_path) and ( - full_path[-3:] == "shm" or full_path[-3:] == "wal" - ): - continue # These are just checkpoint files - tar.add(full_path, arcname=item) 
- for item in storage_vector_dirs: - full_path = os.path.join(browser_profile_folder, item) - if not os.path.isdir(full_path): - logger.warning( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (browser_params.browser_id, full_path) - ) - continue - tar.add(full_path, arcname=item) - tar.close() + ) + storage_vector_files = [ + "cookies.sqlite", # cookies + "cookies.sqlite-shm", + "cookies.sqlite-wal", + "places.sqlite", # history + "places.sqlite-shm", + "places.sqlite-wal", + "webappsstore.sqlite", # localStorage + "webappsstore.sqlite-shm", + "webappsstore.sqlite-wal", + ] + storage_vector_dirs = [ + "webapps", # related to localStorage? + "storage", # directory for IndexedDB + ] + for item in storage_vector_files: + full_path = os.path.join(browser_profile_folder, item) + if ( + not os.path.isfile(full_path) + and full_path[-3:] != "shm" + and full_path[-3:] != "wal" + ): + logger.critical( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." + % (browser_params.browser_id, full_path) + ) + elif not os.path.isfile(full_path) and ( + full_path[-3:] == "shm" or full_path[-3:] == "wal" + ): + continue # These are just checkpoint files + tar.add(full_path, arcname=item) + for item in storage_vector_dirs: + full_path = os.path.join(browser_profile_folder, item) + if not os.path.isdir(full_path): + logger.warning( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." 
+ % (browser_params.browser_id, full_path) + ) + continue + tar.add(full_path, arcname=item) + tar.close() def load_profile(browser_profile_folder, manager_params, browser_params, tar_location): diff --git a/openwpm/commands/types.py b/openwpm/commands/types.py index 5899a1db..597afcc5 100644 --- a/openwpm/commands/types.py +++ b/openwpm/commands/types.py @@ -1,4 +1,20 @@ -class BaseCommand: +from abc import ABC, abstractmethod + +from selenium.webdriver import Firefox + +from ..config import BrowserParams, ManagerParams +from ..socket_interface import ClientSocket + + +class BaseCommand(ABC): + """ + Base class for all Commands in OpenWPM + + See `custom_command.py` for instructions on how + to implement your own and `openwpm/commands` for + all commands that are already implemented + """ + def set_visit_browser_id(self, visit_id, browser_id): self.visit_id = visit_id self.browser_id = browser_id @@ -6,105 +22,34 @@ class BaseCommand: def set_start_time(self, start_time): self.start_time = start_time + @abstractmethod + def execute( + self, + webdriver: Firefox, + browser_params: BrowserParams, + manager_params: ManagerParams, + extension_socket: ClientSocket, + ) -> None: + """ + This method gets called in the Browser process + :parameter webdriver: + WebDriver is a Selenium class used to control + browser. + You can simulate arbitrary interactions and extract almost all browser state + with the tools that Selenium gives you + :parameter browser_params: + Contains the per browser configuration + E.g. which instruments are enabled + :parameter manager_params: + Per crawl parameters + E.g. where to store files + :parameter extension_socket: Communication channel to the storage provider + TODO: Further document this once the StorageProvider PR has landed + This allows you to send data to be persisted to storage. 
+ """ + pass -class GetCommand(BaseCommand): - def __init__(self, url, sleep): - self.url = url - self.sleep = sleep +class ShutdownSignal: def __repr__(self): - return "GetCommand({},{})".format(self.url, self.sleep) - - -class BrowseCommand(BaseCommand): - def __init__(self, url, num_links, sleep): - self.url = url - self.num_links = num_links - self.sleep = sleep - - def __repr__(self): - return "BrowseCommand({},{},{})".format(self.url, self.num_links, self.sleep) - - -class DumpProfCommand(BaseCommand): - def __init__(self, dump_folder, close_webdriver, compress): - self.dump_folder = dump_folder - self.close_webdriver = close_webdriver - self.compress = compress - - def __repr__(self): - return "DumpProfCommand({},{},{})".format( - self.dump_folder, self.close_webdriver, self.compress - ) - - -class DumpPageSourceCommand(BaseCommand): - def __init__(self, suffix): - self.suffix = suffix - - def __repr__(self): - return "DumpPageSourceCommand({})".format(self.suffix) - - -class RecursiveDumpPageSourceCommand(BaseCommand): - def __init__(self, suffix): - self.suffix = suffix - - def __repr__(self): - return "RecursiveDumpPageSourceCommand({})".format(self.suffix) - - -class SaveScreenshotCommand(BaseCommand): - def __init__(self, suffix): - self.suffix = suffix - - def __repr__(self): - return "SaveScreenshotCommand({})".format(self.suffix) - - -class ScreenshotFullPageCommand(BaseCommand): - def __init__(self, suffix): - self.suffix = suffix - - def __repr__(self): - return "ScreenshotFullPageCommand({})".format(self.suffix) - - -class RunCustomFunctionCommand(BaseCommand): - def __init__(self, function_handle, func_args): - self.function_handle = function_handle - self.func_args = func_args - - def __repr__(self): - return "RunCustomFunctionCommand({},{})".format( - self.function_handle, self.func_args - ) - - -class ShutdownCommand(BaseCommand): - def __repr__(self): - return "ShutdownCommand()" - - -class FinalizeCommand(BaseCommand): - """This command is 
automatically appended to the end of a CommandSequence - It's apperance means there won't be any more commands for this - visit_id - """ - - def __init__(self, sleep): - self.sleep = sleep - - def __repr__(self): - return f"FinalizeCommand({self.sleep})" - - -class InitializeCommand(BaseCommand): - """The command is automatically prepended to the beginning of a - CommandSequence - It initializes state both in the extensions as well in as the - Aggregator - """ - - def __repr__(self): - return "IntitializeCommand()" + return "ShutdownSignal()" diff --git a/openwpm/config.py b/openwpm/config.py index 1bfe07cd..e42a620a 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -57,6 +57,14 @@ ALL_RESOURCE_TYPES = { @dataclass_json @dataclass class BrowserParams: + """ + Configuration that might differ per browser + + OpenWPM allows you to run multiple browsers with different + configurations in parallel and this class allows you + to customize behaviour of an individual browser + """ + extension_enabled: bool = True cookie_instrument: bool = True js_instrument: bool = False @@ -83,6 +91,15 @@ class BrowserParams: @dataclass_json @dataclass class ManagerParams: + """ + Configuration for the TaskManager + + The configuration will be the same for all browsers running on the same + TaskManager. 
+ It can be used to control storage locations or which watchdogs should + run + """ + data_directory: str = "~/openwpm/" log_directory: str = "~/openwpm/" screenshot_path: Optional[str] = None diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index 7c5164a8..f852d23b 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -1,4 +1,3 @@ -import copy import json import logging import os @@ -418,7 +417,7 @@ class TaskManager: self, browser: Browser, command_sequence: CommandSequence ) -> None: """ - sends command tuple to the BrowserManager + Sends CommandSequence to the BrowserManager one command at a time """ browser.is_fresh = False @@ -508,7 +507,7 @@ class TaskManager: { "browser_id": browser.browser_id, "visit_id": browser.curr_visit_id, - "command": type(command), + "command": type(command).__name__, "arguments": json.dumps( command.__dict__, default=lambda x: repr(x) ).encode("utf-8"), diff --git a/test/manual_test.py b/test/manual_test.py index abd0f103..bf23316d 100644 --- a/test/manual_test.py +++ b/test/manual_test.py @@ -1,5 +1,4 @@ import atexit -import json import subprocess from os.path import dirname, join, realpath diff --git a/test/test_custom_function_command.py b/test/test_custom_function_command.py index 38928979..81d9a455 100644 --- a/test/test_custom_function_command.py +++ b/test/test_custom_function_command.py @@ -1,4 +1,9 @@ +from selenium.webdriver import Firefox + from openwpm import command_sequence, task_manager +from openwpm.commands.types import BaseCommand +from openwpm.config import BrowserParams, ManagerParams +from openwpm.socket_interface import ClientSocket from openwpm.utilities import db_utils from . 
import utilities @@ -22,6 +27,54 @@ PAGE_LINKS = { } +class CollectLinksCommand(BaseCommand): + """ Collect links with `scheme` and save in table `table_name` """ + + def __init__(self, scheme, table_name) -> None: + self.scheme = scheme + self.table_name = table_name + + def execute( + self, + webdriver: Firefox, + browser_params: BrowserParams, + manager_params: ManagerParams, + extension_socket: ClientSocket, + ) -> None: + link_urls = [ + x + for x in ( + element.get_attribute("href") + for element in webdriver.find_elements_by_tag_name("a") + ) + if x.startswith(self.scheme + "://") + ] + current_url = webdriver.current_url + + sock = ClientSocket() + sock.connect(*manager_params.aggregator_address) + + query = ( + "CREATE TABLE IF NOT EXISTS %s (" + "top_url TEXT, link TEXT, " + "visit_id INTEGER, browser_id INTEGER);" % self.table_name + ) + sock.send(("create_table", query)) + + for link in link_urls: + query = ( + self.table_name, + { + "top_url": current_url, + "link": link, + "visit_id": self.visit_id, + "browser_id": self.browser_id, + }, + ) + sock.send(query) + sock.close() + + class TestCustomFunctionCommand(OpenWPMTest): """Test `custom_function` command's ability to handle inline functions""" @@ -31,52 +84,11 @@ class TestCustomFunctionCommand(OpenWPMTest): def test_custom_function(self): """ Test `custom_function` with an inline func that collects links """ - from openwpm.socket_interface import ClientSocket - - def collect_links(table_name, scheme, **kwargs): - """ Collect links with `scheme` and save in table `table_name` """ - driver = kwargs["driver"] - manager_params = kwargs["manager_params"] - browser_id = kwargs["command"].browser_id - visit_id = kwargs["command"].visit_id - link_urls = [ - x - for x in ( - element.get_attribute("href") - for element in driver.find_elements_by_tag_name("a") - ) - if x.startswith(scheme + "://") - ] - current_url = driver.current_url - - sock = ClientSocket() - 
sock.connect(*manager_params.aggregator_address) - - query = ( - "CREATE TABLE IF NOT EXISTS %s (" - "top_url TEXT, link TEXT, " - "visit_id INTEGER, browser_id INTEGER);" % table_name - ) - sock.send(("create_table", query)) - - for link in link_urls: - query = ( - table_name, - { - "top_url": current_url, - "link": link, - "visit_id": visit_id, - "browser_id": browser_id, - }, - ) - sock.send(query) - sock.close() - manager_params, browser_params = self.get_config() manager = task_manager.TaskManager(manager_params, browser_params) cs = command_sequence.CommandSequence(url_a) cs.get(sleep=0, timeout=60) - cs.run_custom_function(collect_links, ("page_links", "http")) + cs.append_command(CollectLinksCommand("http", "page_links")) manager.execute_command_sequence(cs) manager.close() query_result = db_utils.query_db( diff --git a/test/test_dataclass_validations.py b/test/test_dataclass_validations.py index ce3af3cf..9f32da3a 100644 --- a/test/test_dataclass_validations.py +++ b/test/test_dataclass_validations.py @@ -9,8 +9,6 @@ from openwpm.config import ( ) from openwpm.errors import ConfigError -from .openwpmtest import OpenWPMTest - def test_display_mode(): browser_params = BrowserParams() diff --git a/test/test_http_instrumentation.py b/test/test_http_instrumentation.py index 2c299435..538735be 100644 --- a/test/test_http_instrumentation.py +++ b/test/test_http_instrumentation.py @@ -11,6 +11,7 @@ from urllib.parse import urlparse import pytest from openwpm import command_sequence, task_manager +from openwpm.commands.types import BaseCommand from openwpm.utilities import db_utils from . 
import utilities @@ -997,21 +998,12 @@ class TestPOSTInstrument(OpenWPMTest): img_file_path = os.path.abspath("test_pages/shared/test_image.png") css_file_path = os.path.abspath("test_pages/shared/test_style.css") - def type_filenames_into_form(**kwargs): - """Simulate typing into the file upload input fields.""" - driver = kwargs["driver"] - img_file_upload_element = driver.find_element_by_id("upload-img") - css_file_upload_element = driver.find_element_by_id("upload-css") - img_file_upload_element.send_keys(img_file_path) - css_file_upload_element.send_keys(css_file_path) - sleep(5) # wait for the form submission (3 sec after onload) - manager_params, browser_params = self.get_config() manager = task_manager.TaskManager(manager_params, browser_params) test_url = utilities.BASE_TEST_URL + "/post_file_upload.html" cs = command_sequence.CommandSequence(test_url) cs.get(sleep=0, timeout=60) - cs.run_custom_function(type_filenames_into_form, ()) + cs.append_command(FilenamesIntoFormCommand(img_file_path, css_file_path)) manager.execute_command_sequence(cs) manager.close() @@ -1029,3 +1021,22 @@ class TestPOSTInstrument(OpenWPMTest): u"upload-img": img_file_content, } assert expected_body == post_body_decoded + + +class FilenamesIntoFormCommand(BaseCommand): + def __init__(self, img_file_path, css_file_path) -> None: + self.img_file_path = img_file_path + self.css_file_path = css_file_path + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ) -> None: + img_file_upload_element = webdriver.find_element_by_id("upload-img") + css_file_upload_element = webdriver.find_element_by_id("upload-css") + img_file_upload_element.send_keys(self.img_file_path) + css_file_upload_element.send_keys(self.css_file_path) + sleep(5) # wait for the form submission (3 sec after onload) diff --git a/test/test_profile.py b/test/test_profile.py index aba8994f..f344955e 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -2,9 +2,10 @@ from 
os.path import isfile, join import pytest -from openwpm import task_manager from openwpm.command_sequence import CommandSequence +from openwpm.commands.types import BaseCommand from openwpm.errors import CommandExecutionError, ProfileLoadError +from openwpm.task_manager import TaskManager from openwpm.utilities import db_utils from .openwpmtest import OpenWPMTest @@ -23,7 +24,7 @@ class TestProfile(OpenWPMTest): @pytest.mark.xfail(run=False) def test_saving(self): manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) + manager = TaskManager(manager_params, browser_params) manager.get("http://example.com") manager.close() assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) @@ -32,7 +33,7 @@ class TestProfile(OpenWPMTest): def test_crash(self): manager_params, browser_params = self.get_config() manager_params.failure_limit = 0 - manager = task_manager.TaskManager(manager_params, browser_params) + manager = TaskManager(manager_params, browser_params) with pytest.raises(CommandExecutionError): manager.get("http://example.com") # So we have a profile manager.get("example.com") # Selenium requires scheme prefix @@ -42,7 +43,7 @@ class TestProfile(OpenWPMTest): def test_crash_profile(self): manager_params, browser_params = self.get_config() manager_params.failure_limit = 2 - manager = task_manager.TaskManager(manager_params, browser_params) + manager = TaskManager(manager_params, browser_params) try: manager.get("http://example.com") # So we have a profile manager.get("example.com") # Selenium requires scheme prefix @@ -58,14 +59,14 @@ class TestProfile(OpenWPMTest): manager_params, browser_params = self.get_config() browser_params[0].seed_tar = "/tmp/NOTREAL" with pytest.raises(ProfileLoadError): - task_manager.TaskManager(manager_params, browser_params) # noqa + TaskManager(manager_params, browser_params) # noqa @pytest.mark.skip(reason="proxy no longer supported, need to update") 
def test_profile_saved_when_launch_crashes(self): manager_params, browser_params = self.get_config() browser_params[0].proxy = True browser_params[0].save_content = "script" - manager = task_manager.TaskManager(manager_params, browser_params) + manager = TaskManager(manager_params, browser_params) manager.get("http://example.com") # Kill the LevelDBAggregator @@ -84,32 +85,15 @@ class TestProfile(OpenWPMTest): assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) def test_seed_persistance(self): - def test_config_is_set(*args, **kwargs): - driver = kwargs["driver"] - driver.get("about:config") - result = driver.execute_script( - """ - var prefs = Components - .classes["@mozilla.org/preferences-service;1"] - .getService(Components.interfaces.nsIPrefBranch); - try { - return prefs.getBoolPref("test_pref") - } catch (e) { - return false; - } - """ - ) - assert result - manager_params, browser_params = self.get_test_config(num_browsers=1) browser_params[0].seed_tar = "." 
command_sequences = [] for _ in range(2): cs = CommandSequence(url="https://example.com", reset=True) cs.get() - cs.run_custom_function(test_config_is_set) + cs.append_command(TestConfigSetCommand("test_pref", True)) command_sequences.append(cs) - manager = task_manager.TaskManager(manager_params, browser_params) + manager = TaskManager(manager_params, browser_params) for cs in command_sequences: manager.execute_command_sequence(cs) manager.close() @@ -120,3 +104,31 @@ class TestProfile(OpenWPMTest): assert len(query_result) > 0 for row in query_result: assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok" + + +class TestConfigSetCommand(BaseCommand): + def __init__(self, pref_name, expected_value) -> None: + self.pref_name = pref_name + self.expected_value = expected_value + + def execute( + self, + webdriver, + browser_params, + manager_params, + extension_socket, + ) -> None: + webdriver.get("about:config") + result = webdriver.execute_script( + f""" + var prefs = Components + .classes["@mozilla.org/preferences-service;1"] + .getService(Components.interfaces.nsIPrefBranch); + try {{ + return prefs.getBoolPref("{self.pref_name}") + }} catch (e) {{ + return false; + }} + """ + ) + assert result == self.expected_value diff --git a/test/test_timer.py b/test/test_timer.py index 732f521e..125fc4db 100644 --- a/test/test_timer.py +++ b/test/test_timer.py @@ -17,10 +17,9 @@ class TestCommandDuration(OpenWPMTest): manager = task_manager.TaskManager(manager_params, browser_params) manager.get(url=TEST_URL, sleep=5) manager.close() - get_command = db_utils.query_db( manager_params.database_name, - "SELECT duration FROM crawl_history WHERE command = \"\"", + "SELECT duration FROM crawl_history WHERE command = 'GetCommand'", as_tuple=True, )[0] diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index d4bc25ee..a8e83de9 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -28,7 +28,7 @@ class 
TestCustomFunctionCommand(OpenWPMTest): get_command = db_utils.query_db( manager_params.database_name, - "SELECT command_status, error FROM crawl_history WHERE command = \"\"", + "SELECT command_status, error FROM crawl_history WHERE command = 'GetCommand'", as_tuple=True, )[0]