# Mirror of https://github.com/openwpm/OpenWPM.git
# 351 lines, 12 KiB, Python
import tempfile
|
|
from dataclasses import dataclass, field
|
|
from json import JSONEncoder
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
|
|
from dataclasses_json import DataClassJsonMixin
|
|
from dataclasses_json import config as DCJConfig
|
|
|
|
from .errors import ConfigError
|
|
from .types import BrowserId
|
|
|
|
# Accepted values used when validating the configuration objects below.
BOOL_TYPE_VALIDATION_LIST = [True, False]
DISPLAY_MODE_VALIDATION_LIST = ["native", "headless", "xvfb"]
# Kept as a list rather than a single string so that further browsers can be
# added later without touching the validation logic.
SUPPORTED_BROWSER_LIST = ["firefox"]
TP_COOKIES_OPTIONALS_LIST = ["always", "never", "from_visited"]
LOG_EXTENSION_TYPE_LIST = [".log"]

# Message templates for ConfigError; placeholders are filled via str.format.
# Expects: value, parameter_name, value_list.
CONFIG_ERROR_STRING = (
    "Found {value} as value for {parameter_name} in BrowserParams. "
    "Supported values are {value_list}. "
    "Please look at docs/Configuration.md#browser-configuration-options "
    "for more information"
)
# Expects: extension, parameter_name, value_list.
EXTENSION_ERROR_STRING = (
    "Found {extension} extension for {parameter_name} in ManagerParams "
    "supported extensions are {value_list}. "
    "Please look at docs/Configuration.md#platform-configuration-options "
    "for more information"
)
# Expects: value, parameter_name, params_type.
GENERAL_ERROR_STRING = (
    "Found invalid value `{value}` for {parameter_name} in {params_type}. "
    "Please look at docs/Configuration.md "
    "for more information"
)
|
|
|
|
# Resource type names accepted in a comma-separated
# ``BrowserParams.save_content`` string.
ALL_RESOURCE_TYPES = {
    "beacon", "csp_report", "font", "image", "imageset",
    "main_frame", "media", "object", "object_subrequest", "ping",
    "script", "stylesheet", "sub_frame", "web_manifest", "websocket",
    "xml_dtd", "xmlhttprequest", "xslt", "other",
}
|
|
|
|
|
|
def str_to_path(string: Optional[str]) -> Optional[Path]:
    """Turn an optional string into an optional ``Path``.

    ``None`` is passed through unchanged; every other value is handed to
    the ``Path`` constructor as-is.
    """
    return None if string is None else Path(string)
|
|
|
|
|
|
def path_to_str(path: Optional[Path]) -> Optional[str]:
    """Turn an optional ``Path`` into an optional string.

    ``None`` is passed through unchanged; otherwise the path is resolved
    first and the resolved path is returned as a string.
    """
    if path is None:
        return None
    resolved = path.resolve()
    return str(resolved)
|
|
|
|
|
|
@dataclass
class BrowserParams(DataClassJsonMixin):
    """
    Per-browser configuration.

    OpenWPM can run several browsers in parallel, each with its own
    configuration. One instance of this class customises the behaviour
    of a single browser.
    """

    cookie_instrument: bool = True
    js_instrument: bool = False
    js_instrument_settings: List[Union[str, dict]] = field(
        default_factory=lambda: ["collection_fingerprinting"]
    )
    http_instrument: bool = False
    navigation_instrument: bool = False
    save_content: Union[bool, str] = False
    callstack_instrument: bool = False
    dns_instrument: bool = False
    # Path fields are (de)serialised through path_to_str / str_to_path.
    seed_tar: Optional[Path] = field(
        default=None,
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    display_mode: Literal["native", "headless", "xvfb"] = "native"
    browser: str = "firefox"
    prefs: dict = field(default_factory=dict)
    tp_cookies: str = "always"
    bot_mitigation: bool = False
    profile_archive_dir: Optional[Path] = field(
        default=None,
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )

    tmp_profile_dir: Path = field(
        default=Path(tempfile.gettempdir()),
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """
    Where the generated browser profiles and residual files are stored.
    Defaults to the OS's temporary file folder (typically /tmp).
    """

    maximum_profile_size: Optional[int] = None
    """
    Upper bound, in bytes, on the total on-disk space the generated
    browser profiles and residual files may consume.
    If this option is not set, no checks will be performed.

    Rationale
    ---------
    This option can serve as a happy medium between killing a browser
    after each crawl and allowing the application to still perform
    quickly.

    It is a way to save space in a limited environment with minimal
    detriment to speed.

    If the maximum_profile_size is exceeded after a CommandSequence
    is completed, the browser will be shut down and a new one will
    be created. **Even with this setting you may temporarily have
    more disk usage than the sum of all maximum_profile_sizes.**
    However, this also ensures that a CommandSequence is allowed to
    complete without undue interruptions.

    Sample values
    -------------
    * 1073741824: 1GB
    * 20971520: 20MB - for testing purposes
    * 52428800: 50MB
    * 73400320: 70MB
    * 104857600: 100MB - IDEAL for 10+ browsers

    """

    # NOTE(review): unlike seed_tar, no DCJConfig encoder/decoder is attached
    # to recovery_tar - confirm whether that is intentional.
    recovery_tar: Optional[Path] = None
    donottrack: bool = False
    tracking_protection: bool = False
    custom_params: Dict[Any, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ManagerParams(DataClassJsonMixin):
    """
    Configuration for the TaskManager.

    These settings are shared by all browsers running on the same
    TaskManager. They can be used to control storage locations or which
    watchdogs should run.
    """

    data_directory: Path = field(
        default=Path.home().joinpath("openwpm"),
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """The directory into which screenshots and page dumps will be saved"""

    log_path: Path = field(
        default=Path.home().joinpath("openwpm", "openwpm.log"),
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """The path to the file in which OpenWPM will log. The
    directory given will be created if it does not exist."""

    testing: bool = False
    """A platform wide flag that can be used to only run certain functionality
    while testing. For example, the Javascript instrumentation"""

    memory_watchdog: bool = False
    """A watchdog that tries to ensure that no Firefox instance takes up too
    much memory. It is mostly useful for long running cloud crawls"""

    process_watchdog: bool = False
    """Creates another thread that kills off `GeckoDriver` (or `Xvfb`)
    instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
    Selenium to control Firefox, and Xvfb is a "virtual display" so we
    simulate having graphics when running on a server).
    """

    num_browsers: int = 1

    _failure_limit: Optional[int] = None
    """The number of command failures the platform will tolerate before
    raising a `CommandExecutionError` exception. If unset, the default is
    2 x the number of browsers plus 10. The failure counter is reset at the
    end of each successfully completed command sequence.
    For non-blocking command sequences that cause the number of failures to
    exceed `failure_limit` the `CommandExecutionError` is raised when
    attempting to execute the next command sequence."""

    @property
    def failure_limit(self) -> int:
        """Return the explicit limit, or ``2 * num_browsers + 10`` if unset."""
        explicit = self._failure_limit
        return explicit if explicit is not None else 2 * self.num_browsers + 10

    @failure_limit.setter
    def failure_limit(self, value: int) -> None:
        """Store ``value`` as the explicit failure limit."""
        self._failure_limit = value
|
|
|
|
|
|
@dataclass
class BrowserParamsInternal(BrowserParams):
    """``BrowserParams`` extended with three additional optional fields.

    Every extra field defaults to ``None``.
    NOTE(review): the name suggests these are populated by the platform
    rather than by users - confirm against the callers.
    """

    browser_id: Optional[BrowserId] = None
    profile_path: Optional[Path] = None
    cleaned_js_instrument_settings: Optional[List[Dict[str, Any]]] = None
|
|
|
|
|
|
@dataclass
class ManagerParamsInternal(ManagerParams):
    """``ManagerParams`` extended with four additional optional fields.

    Every extra field defaults to ``None``; the two path fields are
    (de)serialised through ``path_to_str`` / ``str_to_path``.
    NOTE(review): the name suggests these are populated by the platform
    rather than by users - confirm against the callers.
    """

    storage_controller_address: Optional[Tuple[str, int]] = None
    logger_address: Optional[Tuple[str, ...]] = None
    screenshot_path: Optional[Path] = field(
        default=None,
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    source_dump_path: Optional[Path] = field(
        default=None,
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
|
|
|
|
|
|
def validate_browser_params(browser_params: BrowserParams) -> None:
    """Check a ``BrowserParams`` instance for unsupported values.

    An instance equal to a default-constructed ``BrowserParams`` is
    accepted without further checks.

    Args:
        browser_params: The browser configuration to validate.

    Raises:
        ConfigError: If ``display_mode``, ``browser`` or ``tp_cookies`` is
            not one of the supported values, if ``callstack_instrument``
            is enabled, if ``save_content`` is neither ``bool`` nor ``str``
            or names unknown resource types, or if a value has a type that
            makes validation itself fail.
    """
    if BrowserParams() == browser_params:
        return

    try:
        if browser_params.display_mode.lower() not in DISPLAY_MODE_VALIDATION_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.display_mode,
                    value_list=DISPLAY_MODE_VALIDATION_LIST,
                    parameter_name="display_mode",
                )
            )

        if browser_params.browser.lower() not in SUPPORTED_BROWSER_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.browser,
                    value_list=SUPPORTED_BROWSER_LIST,
                    parameter_name="browser",
                )
            )

        if browser_params.tp_cookies.lower() not in TP_COOKIES_OPTIONALS_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.tp_cookies,
                    value_list=TP_COOKIES_OPTIONALS_LIST,
                    parameter_name="tp_cookies",
                )
            )

        # The callstack instrument is rejected unconditionally. A former
        # follow-up check requiring js_instrument alongside it
        # (https://github.com/openwpm/OpenWPM/issues/557) could never be
        # reached after this one and has been dropped.
        if browser_params.callstack_instrument:
            raise ConfigError(
                "The callstacks instrument currently doesn't work "
                "as it is requires intricate machinery that broke "
                "in one of the previous Firefox versions."
            )

        save_content = browser_params.save_content
        if not isinstance(save_content, (bool, str)):
            raise ConfigError(
                GENERAL_ERROR_STRING.format(
                    value=save_content,
                    parameter_name="save_content",
                    params_type="BrowserParams",
                )
            )

        # A non-empty string is a comma-separated list of resource types.
        if isinstance(save_content, str) and save_content:
            unknown_types = set(save_content.split(",")) - ALL_RESOURCE_TYPES
            if unknown_types:
                # One concatenated message; a stray comma used to turn this
                # into two separate exception arguments.
                raise ConfigError(
                    "Unrecognized resource types provided "
                    "in browser_params.save_content (%s)" % unknown_types
                )
    except ConfigError:
        # Let the specific validation messages above reach the caller
        # instead of replacing them with the generic one below.
        raise
    except Exception as err:
        # Typically AttributeError/TypeError from a value of the wrong type
        # (e.g. calling .lower() on a non-string).
        raise ConfigError(
            "Something went wrong while validating BrowserParams. "
            "Please check values provided for BrowserParams are of expected types"
        ) from err
|
|
|
|
|
|
def validate_manager_params(manager_params: ManagerParams) -> None:
    """Check a ``ManagerParams`` instance for unsupported values.

    An instance equal to a default-constructed ``ManagerParams`` is
    accepted without further checks.

    Args:
        manager_params: The manager configuration to validate.

    Raises:
        ConfigError: If ``log_path`` has an unsupported file extension, if
            ``log_path`` is not path-like enough to be inspected, or if
            ``failure_limit`` is not an ``int``.
    """
    if ManagerParams() == manager_params:
        return

    try:
        log_file_extension = manager_params.log_path.suffix
        if log_file_extension.lower() not in LOG_EXTENSION_TYPE_LIST:
            raise ConfigError(
                EXTENSION_ERROR_STRING.format(
                    extension=log_file_extension or "no",
                    value_list=LOG_EXTENSION_TYPE_LIST,
                    # Report the real field name (was "log_file").
                    parameter_name="log_path",
                )
            )
    except (TypeError, AttributeError) as err:
        # log_path lacks a usable ``suffix`` (e.g. it is not a Path).
        raise ConfigError(
            GENERAL_ERROR_STRING.format(
                value=manager_params.log_path,
                parameter_name="log_path",
                params_type="ManagerParams",
            )
        ) from err

    # This check is necessary to not cause any internal error
    if not isinstance(manager_params.failure_limit, int):
        raise ConfigError(
            GENERAL_ERROR_STRING.format(
                value=manager_params.failure_limit,
                parameter_name="failure_limit",
                params_type="ManagerParams",
            ).replace(
                # Swap the generic docs pointer for a more specific hint.
                "Please look at docs/Configuration.md for more information",
                "failure_limit must be of type `int` or `None`",
            )
        )
|
|
|
|
|
|
def validate_crawl_configs(
    manager_params: ManagerParams, browser_params: List[BrowserParams]
) -> None:
    """Validate a complete crawl configuration.

    Runs ``validate_manager_params`` on the manager configuration, then
    ``validate_browser_params`` on every browser configuration, and finally
    checks that the number of browser configurations equals
    ``manager_params.num_browsers``.

    Raises:
        ConfigError: If the count does not match; the individual
            validators may raise it as well.
    """
    validate_manager_params(manager_params)
    for single_browser_params in browser_params:
        validate_browser_params(single_browser_params)

    if len(browser_params) == manager_params.num_browsers:
        return

    raise ConfigError(
        "Number of BrowserParams instances is not the same "
        "as manager_params.num_browsers. Make sure you are assigning number of browsers "
        "to be used to manager_params.num_browsers in your entry file"
    )
|
|
|
|
|
|
class ConfigEncoder(JSONEncoder):
    """JSON encoder that serialises ``Path`` objects as resolved path strings."""

    def default(self, obj):
        """Encode ``Path`` as ``str(obj.resolve())``; defer everything else.

        Non-``Path`` objects are handed to ``JSONEncoder.default``.
        """
        if not isinstance(obj, Path):
            return JSONEncoder.default(self, obj)
        return str(obj.resolve())
|