# OpenWPM/openwpm/command_sequence.py
#
# 198 lines
# 7.9 KiB
# Python

from pathlib import Path
from typing import Callable, List, Optional, Tuple
from .commands.browser_commands import (
BrowseCommand,
DumpPageSourceCommand,
FinalizeCommand,
GetCommand,
InitializeCommand,
RecursiveDumpPageSourceCommand,
SaveScreenshotCommand,
ScreenshotFullPageCommand,
)
from .commands.profile_commands import DumpProfileCommand
from .commands.types import BaseCommand
from .errors import CommandExecutionError
class CommandSequence:
    """A CommandSequence wraps a series of commands to be performed
    on a visit to one top-level site into one logical
    "site visit," keyed by a visit id.

    An example of a CommandSequence that visits a page and saves a
    screenshot of it would be::

        sequence = CommandSequence(url)
        sequence.get()
        sequence.save_screenshot()
        task_manager.execute_command_sequence(sequence)

    CommandSequence guarantees that a series of commands will be performed
    by a single browser instance.
    """

    def __init__(
        self,
        url: str,
        reset: bool = False,
        blocking: bool = False,
        retry_number: Optional[int] = None,
        site_rank: Optional[int] = None,
        callback: Optional[Callable[[bool], None]] = None,
    ) -> None:
        """Initialize command sequence.

        Parameters
        ----------
        url : string
            url of page visit the command sequence should execute on
        reset : bool, optional
            True if browser should clear state and restart after sequence
        blocking : bool, optional
            True if sequence should block parent process during execution
        retry_number : int, optional
            Integer denoting the number of attempts that have been made to
            execute this command. Will be saved in `crawl_history`.
        site_rank : int, optional
            Integer indicating the ranking of the page to visit, saved
            to `site_visits`
        callback :
            A callback to be invoked once all data regarding this
            CommandSequence has been saved out or it has been interrupted.
            If the command sequence completes and all data is saved
            successfully, `True` will be passed to the callback.
            Otherwise `False` will be passed. A value of `False` indicates
            that the data saved from the site visit may be incomplete or empty.
        """
        self.url = url
        self.reset = reset
        self.blocking = blocking
        self.retry_number = retry_number
        # Queued (command, timeout) pairs in the order they were added.
        self._commands_with_timeout: List[Tuple[BaseCommand, int]] = []
        # Running sum of the timeouts of every queued command.
        self.total_timeout = 0
        # Set once `get` or `browse` has been queued; the page-dependent
        # commands (screenshots, source dumps) refuse to queue before that.
        self.contains_get_or_browse = False
        self.site_rank = site_rank
        self.callback = callback

    def _require_get_or_browse(self, command_name: str) -> None:
        """Raise CommandExecutionError unless a get/browse was queued first.

        `command_name` is the human-readable name inserted into the error
        message (e.g. "save screenshot").
        """
        if not self.contains_get_or_browse:
            raise CommandExecutionError(
                f"No get or browse request preceding the {command_name} command",
                self,
            )

    def _enqueue(self, command: BaseCommand, timeout: int) -> None:
        """Queue `command` with `timeout` and account for it in total_timeout.

        All public queueing methods funnel through here so the command list
        and `total_timeout` cannot drift apart.
        """
        self._commands_with_timeout.append((command, timeout))
        self.total_timeout += timeout

    def get(self, sleep=0, timeout=60) -> None:
        """goes to a url"""
        self._enqueue(GetCommand(self.url, sleep), timeout)
        self.contains_get_or_browse = True

    def browse(self, num_links=2, sleep=0, timeout=60) -> None:
        """browse a website and visit <num_links> links on the page"""
        self._enqueue(BrowseCommand(self.url, num_links, sleep), timeout)
        self.contains_get_or_browse = True

    def dump_profile(
        self,
        tar_path: Path,
        close_webdriver: bool = False,
        compress: bool = True,
        timeout: int = 120,
    ) -> None:
        """dumps from the profile path to a given file (absolute path)"""
        self._enqueue(
            DumpProfileCommand(tar_path, close_webdriver, compress), timeout
        )

    def save_screenshot(self, suffix="", timeout=30) -> None:
        """Save a screenshot of the current viewport.

        Raises CommandExecutionError if no get or browse command has been
        queued before this one; in that case nothing is queued and
        `total_timeout` is left untouched.
        """
        # Validate before touching any state so a rejected call is a no-op.
        self._require_get_or_browse("save screenshot")
        self._enqueue(SaveScreenshotCommand(suffix), timeout)

    def screenshot_full_page(self, suffix="", timeout=30) -> None:
        """Save a screenshot of the entire page.

        NOTE: geckodriver v0.15 only supports viewport screenshots. To
        screenshot the entire page we scroll the page using javascript and take
        a viewport screenshot at each location. This method will save the
        parts and a stitched version in the `screenshot_path`. We only scroll
        vertically, so pages that are wider than the viewport will be clipped.
        See: https://github.com/mozilla/geckodriver/issues/570

        The screenshot produced will only include the area originally
        loaded at the start of the command. Sites which dynamically expand as
        the page is scrolled (i.e. infinite scroll) will only go as far as the
        original height.

        NOTE: In geckodriver v0.15 doing any scrolling (or having devtools
        open) seems to break element-only screenshots. So using this command
        will cause any future element-only screenshots to be mis-aligned

        Raises CommandExecutionError if no get or browse command has been
        queued before this one; in that case nothing is queued and
        `total_timeout` is left untouched.
        """
        self._require_get_or_browse("screenshot full page")
        self._enqueue(ScreenshotFullPageCommand(suffix), timeout)

    def dump_page_source(self, suffix="", timeout=30) -> None:
        """Dumps rendered source of current page to 'sources' directory.

        Raises CommandExecutionError if no get or browse command has been
        queued before this one; in that case nothing is queued and
        `total_timeout` is left untouched.
        """
        self._require_get_or_browse("dump page source")
        self._enqueue(DumpPageSourceCommand(suffix), timeout)

    def recursive_dump_page_source(self, suffix="", timeout=30) -> None:
        """Dumps rendered source of current page visit to 'sources' dir.

        Unlike `dump_page_source`, this includes iframe sources. Archive is
        stored in `manager_params.source_dump_path` and is keyed by the
        current `visit_id` and top-level url. The source dump is a gzipped json
        file with the following structure::

            {
                'document_url': "http://example.com",
                'source': "<html> ... </html>",
                'iframes': {
                    'frame_1': {'document_url': ...,
                                'source': ...,
                                'iframes': { ... }},
                    'frame_2': {'document_url': ...,
                                'source': ...,
                                'iframes': { ... }},
                    'frame_3': { ... }
                }
            }

        Raises CommandExecutionError if no get or browse command has been
        queued before this one; in that case nothing is queued and
        `total_timeout` is left untouched.
        """
        self._require_get_or_browse("recursive dump page source")
        self._enqueue(RecursiveDumpPageSourceCommand(suffix), timeout)

    def append_command(self, command: BaseCommand, timeout: int = 30) -> None:
        """Queue an arbitrary command with the given timeout.

        The timeout is added to `total_timeout`, like every other queueing
        method on this class.

        NOTE(review): this does not set `contains_get_or_browse`, even if
        `command` itself navigates to a page — confirm whether callers that
        append their own get/browse command expect the page-dependent
        helpers to work afterwards.
        """
        self._enqueue(command, timeout)

    def mark_done(self, success: bool) -> None:
        """Invoke the completion callback with `success`, if one was given."""
        if self.callback is not None:
            self.callback(success)

    def get_commands_with_timeout(self) -> List[Tuple[BaseCommand, int]]:
        """Return the queued commands wrapped in initialize/finalize steps.

        The result is a new list: an InitializeCommand first, then every
        queued (command, timeout) pair in insertion order, then a
        FinalizeCommand. Both wrapper commands get a timeout of 10, which is
        not included in `total_timeout`. The internal queue is not modified.
        """
        return [
            (InitializeCommand(), 10),
            *self._commands_with_timeout,
            (FinalizeCommand(sleep=5), 10),
        ]