# OpenWPM/openwpm/browser_manager.py
#
# 809 lines
# 32 KiB
# Python

import errno
import json
import logging
import os
import pickle
import shutil
import signal
import sys
import tempfile
import threading
import time
import traceback
from pathlib import Path
from queue import Empty as EmptyQueue
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union
import psutil
from multiprocess import Process, Queue
from selenium.common.exceptions import WebDriverException
from tblib import Traceback, pickling_support
from .command_sequence import CommandSequence
from .commands.browser_commands import FinalizeCommand
from .commands.profile_commands import dump_profile
from .commands.types import BaseCommand, ShutdownSignal
from .commands.utils.webdriver_utils import parse_neterror
from .config import BrowserParamsInternal, ManagerParamsInternal
from .deploy_browsers import deploy_firefox
from .errors import BrowserConfigError, BrowserCrashError, ProfileLoadError
from .socket_interface import ClientSocket
from .storage.storage_providers import TableName
from .types import BrowserId, VisitId
from .utilities.multiprocess_utils import (
kill_process_and_children,
parse_traceback_for_sentry,
)
# Register tblib's pickling support so that the ``sys.exc_info()`` tuples
# (which contain traceback objects) exchanged over the status queue in this
# module can be passed through ``pickle.dumps`` / ``pickle.loads``.
pickling_support.install()

if TYPE_CHECKING:
    # Only needed for type annotations; importing it at runtime would create
    # a cyclic import, so it is guarded and the annotation is quoted.
    from .task_manager import TaskManager
class BrowserManagerHandle:
    """The BrowserManagerHandle class is responsible for holding all of the
    configuration and status information on BrowserManager process
    it corresponds to. It also includes a set of methods for managing
    the BrowserManager process and its child processes/threads.

    :param manager_params: are the TaskManager configuration settings.
    :param browser_params: are per-browser parameter settings (e.g. whether
        this browser is headless, etc.)
    """

    def __init__(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: BrowserParamsInternal,
    ) -> None:
        # Constants
        self._SPAWN_TIMEOUT = 120  # seconds
        self._UNSUCCESSFUL_SPAWN_LIMIT = 4

        # manager parameters
        self.current_profile_path: Optional[Path] = None
        self.db_socket_address = manager_params.storage_controller_address
        assert browser_params.browser_id is not None
        self.browser_id: BrowserId = browser_params.browser_id
        self.curr_visit_id: Optional[VisitId] = None
        self.browser_params = browser_params
        self.manager_params = manager_params

        # Queues and process IDs for BrowserManager
        self.command_thread: Optional[threading.Thread] = None
        """thread to run commands issued from TaskManager"""
        self.command_queue: Optional[Queue] = None
        """queue for passing command objects to BrowserManager"""
        self.status_queue: Optional[Queue] = None
        """queue for receiving command execution status from BrowserManager"""
        self.geckodriver_pid: Optional[int] = None
        """pid for browser instance controlled by BrowserManager"""
        self.display_pid: Optional[int] = None
        """the pid of the display for the Xvfb display (if it exists)"""
        self.display_port: Optional[int] = None
        """the port of the display for the Xvfb display (if it exists)"""
        self.is_fresh: bool = True
        """indicates if the BrowserManager is new (to optimize restarts)"""
        self.restart_required: bool = False
        """indicates if the browser should be restarted"""
        self.current_timeout: Optional[int] = None
        """timeout of the current command"""
        self.browser_manager: Optional[Process] = None
        """process that controls browser"""

        self.logger = logging.getLogger("openwpm")

    def ready(self):
        """return if the browser is ready to accept a command

        The browser counts as ready when no command thread has been assigned
        or the assigned command thread is no longer alive.
        """
        return self.command_thread is None or not self.command_thread.is_alive()

    def set_visit_id(self, visit_id):
        """Record ``visit_id`` as the visit this browser is working on."""
        self.curr_visit_id = visit_id

    def launch_browser_manager(self) -> bool:
        """
        sets up the BrowserManager and gets the process id, browser pid and,
        if applicable, screen pid. loads associated user profile if necessary

        If a profile from a previous browser instance exists, it is archived
        to a temporary directory first and the new browser is told to load it
        (crash recovery).

        :return: ``True`` if a browser was spawned within
            ``_UNSUCCESSFUL_SPAWN_LIMIT`` attempts, ``False`` otherwise.
        :raises BaseException: whatever exception the BrowserManager process
            reported through a ``CRITICAL`` status message.
        """
        tempdir: Optional[str] = None
        try:
            crash_recovery = False
            # if this is restarting from a crash, update the tar location
            # to be a tar of the crashed browser's history
            if self.current_profile_path is not None:
                # tar contents of crashed profile to a temp dir
                tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_")
                tar_path = Path(tempdir) / "profile.tar"
                dump_profile(
                    browser_profile_path=self.current_profile_path,
                    tar_path=tar_path,
                    compress=False,
                    browser_params=self.browser_params,
                )
                # make sure browser loads crashed profile
                self.browser_params.recovery_tar = tar_path
                crash_recovery = True

            self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
            self.is_fresh = not crash_recovery
            return self._spawn_with_retries(crash_recovery)
        finally:
            # The archive is only needed while spawning. Remove it on every
            # exit path (including a re-raised critical exception) so the
            # temporary directory is never leaked.
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)

    def _spawn_with_retries(self, crash_recovery: bool) -> bool:
        """Try to spawn a BrowserManager process within the spawn limit.

        On success ``current_profile_path`` is switched to the new profile
        and the previous profile directory is deleted.

        :param crash_recovery: forwarded to the ``BrowserManager`` constructor.
        :return: whether one of the attempts succeeded.
        """
        unsuccessful_spawns = 0
        success = False
        browser_profile_path: Optional[Path] = None
        status_strings = [
            "Profile Created",
            "Profile Tar",
            "Display",
            "Launch Attempted",
            "Browser Launched",
            "Browser Ready",
        ]

        while not success and unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
            self.logger.debug(
                "BROWSER %i: Spawn attempt %i " % (self.browser_id, unsuccessful_spawns)
            )
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            self.browser_manager = BrowserManager(
                self.command_queue,
                self.status_queue,
                self.browser_params,
                self.manager_params,
                crash_recovery,
            )
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            launch_status: Dict[str, bool] = dict()
            try:
                # 1. Browser profile created
                browser_profile_path = self._check_launch_status(launch_status)
                # 2. Profile tar loaded (if necessary)
                self._check_launch_status(launch_status)
                # 3. Display launched (if necessary)
                self.display_pid, self.display_port = self._check_launch_status(
                    launch_status
                )
                # 4. Browser launch attempted
                self._check_launch_status(launch_status)
                # 5. Browser launched
                self.geckodriver_pid = self._check_launch_status(launch_status)
                ready = self._check_launch_status(launch_status)
            except (EmptyQueue, BrowserCrashError):
                unsuccessful_spawns += 1
                error_string = "".join(
                    " | %s: %s " % (string, launch_status.get(string, False))
                    for string in status_strings
                )
                self.logger.error(
                    "BROWSER %i: Spawn unsuccessful %s"
                    % (self.browser_id, error_string)
                )
                self.close_browser_manager()
                if "Profile Created" in launch_status:
                    shutil.rmtree(browser_profile_path, ignore_errors=True)
                continue

            if ready != "READY":
                self.logger.error(
                    "BROWSER %i: Mismatch of status queue return values, "
                    "trying again..." % self.browser_id
                )
                unsuccessful_spawns += 1
                # The process of this attempt is about to be replaced by a new
                # one; tear it down and drop its profile so neither is leaked.
                self.close_browser_manager()
                shutil.rmtree(browser_profile_path, ignore_errors=True)
                continue

            success = True

        # If the browser spawned successfully, we should update the
        # current profile path class variable and clean up the
        # previous profile path.
        if success:
            self.logger.debug("BROWSER %i: Browser spawn successful!" % self.browser_id)
            previous_profile_path = self.current_profile_path
            self.current_profile_path = browser_profile_path
            if previous_profile_path is not None:
                shutil.rmtree(previous_profile_path, ignore_errors=True)

        return success

    def _check_launch_status(self, launch_status: Dict[str, bool]) -> Any:
        """Read one launch message from the status queue and interpret it.

        :param launch_status: updated in place; the stage named by a
            ``STATUS`` message is marked ``True``.
        :return: the payload of a ``STATUS`` message, or ``None`` for a
            message of any other, unrecognized kind.
        :raises EmptyQueue: no message arrived within ``_SPAWN_TIMEOUT``.
        :raises BrowserCrashError: the message kind was ``FAILED``.
        :raises BaseException: the exception carried by a ``CRITICAL``
            message, re-raised with its transported traceback.
        """
        assert self.status_queue is not None
        result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
        kind = result[0]
        if kind == "STATUS":
            launch_status[result[1]] = True
            return result[2]
        if kind == "CRITICAL":
            # NOTE(security): pickle.loads is applied to data produced by the
            # BrowserManager process spawned by this handle, not to external
            # input.
            _, exc, tb = pickle.loads(result[1])
            raise exc.with_traceback(tb)
        if kind == "FAILED":
            raise BrowserCrashError("Browser spawn returned failure status")
        return None

    def restart_browser_manager(self, clear_profile=False):
        """
        kill and restart the two worker processes
        <clear_profile> marks whether we want to wipe the old profile

        :return: ``True`` if the browser is a fresh instance (nothing to do)
            or was relaunched successfully, ``False`` if relaunching failed.
        """
        self.logger.info(
            "BROWSER %i: BrowserManager restart initiated. "
            "Clear profile? %s" % (self.browser_id, clear_profile)
        )
        if self.is_fresh:  # Return success if browser is fresh
            self.logger.info(
                "BROWSER %i: Skipping restart since the browser "
                "is a fresh instance already" % self.browser_id
            )
            return True

        self.close_browser_manager()

        # if crawl should be stateless we can clear profile
        if clear_profile and self.current_profile_path is not None:
            shutil.rmtree(self.current_profile_path, ignore_errors=True)
            self.current_profile_path = None
            self.browser_params.recovery_tar = None

        return self.launch_browser_manager()

    def close_browser_manager(self, force: bool = False) -> None:
        """Attempt to close the webdriver and browser manager processes
        from this thread.
        If the browser manager process is unresponsive, the process is killed.

        :param force: skip the graceful shutdown and kill the processes
            right away.
        """
        self.logger.debug("BROWSER %i: Closing browser..." % self.browser_id)
        shutdown_complete = False
        try:
            if force:
                return

            # Join current command thread (if it exists)
            in_command_thread = threading.current_thread() == self.command_thread
            if not in_command_thread and self.command_thread is not None:
                self.logger.debug(
                    "BROWSER %i: Joining command thread" % self.browser_id
                )
                start_time = time.time()
                if self.current_timeout is not None:
                    self.command_thread.join(self.current_timeout + 10)
                else:
                    self.command_thread.join(60)

                # If command thread is still alive, process is locked
                if self.command_thread.is_alive():
                    self.logger.debug(
                        "BROWSER %i: command thread failed to join during close. "
                        "Assuming the browser process is locked..." % self.browser_id
                    )
                    return

                self.logger.debug(
                    "BROWSER %i: %f seconds to join command thread"
                    % (self.browser_id, time.time() - start_time)
                )

            # If the command queue or status queue doesn't exist,
            # it is likely that the browser failed to launch properly.
            # Let's kill any child processes that we can find.
            if self.command_queue is None or self.status_queue is None:
                self.logger.debug(
                    "BROWSER %i: Command queue or status queue not found while closing."
                    % self.browser_id
                )
                return

            # Send the shutdown command
            command = ShutdownSignal()
            self.command_queue.put(command)

            # Verify that webdriver has closed (30 second timeout)
            try:
                status = self.status_queue.get(True, 30)
            except EmptyQueue:
                self.logger.debug(
                    "BROWSER %i: Status queue timeout while closing browser."
                    % self.browser_id
                )
                return

            if status != "OK":
                self.logger.debug(
                    "BROWSER %i: Command failure while closing browser."
                    % self.browser_id
                )
                return

            # Verify that the browser process has closed (30 second timeout)
            if self.browser_manager is not None:
                self.browser_manager.join(30)
                if self.browser_manager.is_alive():
                    self.logger.debug(
                        "BROWSER %i: Browser manager process still alive 30 seconds "
                        "after executing shutdown command." % self.browser_id
                    )
                    return

            self.logger.debug(
                "BROWSER %i: Browser manager closed successfully." % self.browser_id
            )
            shutdown_complete = True
        finally:
            # Every early return above leaves shutdown_complete False, so
            # anything short of a confirmed clean shutdown ends in a kill.
            if not shutdown_complete:
                self.kill_browser_manager()

    def execute_command_sequence(
        self,
        # Quoting to break cyclic import, see https://stackoverflow.com/a/39757388
        task_manager: "TaskManager",
        command_sequence: CommandSequence,
    ) -> None:
        """
        Sends CommandSequence to the BrowserManager one command at a time

        For every command the reply on the status queue (or its absence
        within the command's timeout) is translated into a ``command_status``
        and stored in the ``crawl_history`` table. A critical reply or an
        exceeded failure limit sets ``task_manager.failure_status`` and
        returns immediately; any other non-ok result marks the browser for a
        restart and ends the sequence.

        :param task_manager: provides the storage socket, the failure
            counters and their lock, and the ``closing`` flag.
        :param command_sequence: the commands to run for the current visit.
        :raises ValueError: if the BrowserManager sends an unknown status.
        """
        assert self.browser_id is not None
        assert self.curr_visit_id is not None
        task_manager.sock.store_record(
            TableName("site_visits"),
            self.curr_visit_id,
            {
                "visit_id": self.curr_visit_id,
                "browser_id": self.browser_id,
                "site_url": command_sequence.url,
                "site_rank": command_sequence.site_rank,
            },
        )
        self.is_fresh = False

        reset = command_sequence.reset
        self.logger.info(
            "Starting to work on CommandSequence with "
            "visit_id %d on browser with id %d",
            self.curr_visit_id,
            self.browser_id,
        )
        assert self.command_queue is not None
        assert self.status_queue is not None

        for command_and_timeout in command_sequence.get_commands_with_timeout():
            command, timeout = command_and_timeout
            command.set_visit_browser_id(self.curr_visit_id, self.browser_id)
            command.set_start_time(time.time())
            self.current_timeout = timeout

            # Adding timer to track performance of commands
            t1 = time.time_ns()

            # passes off command and waits for a success (or failure signal)
            self.command_queue.put(command)

            # received reply from BrowserManager, either success or failure
            error_text = None
            tb = None
            status = None
            try:
                status = self.status_queue.get(True, self.current_timeout)
            except EmptyQueue:
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, %s, killing "
                    "browser manager" % (self.browser_id, repr(command))
                )

            if status is None:
                # No reply arrived in time; nothing to unpack.
                command_status = "timeout"
            elif status == "OK":
                command_status = "ok"
            elif status[0] == "CRITICAL":
                command_status = "critical"
                self.logger.critical(
                    "BROWSER %i: Received critical error from browser "
                    "process while executing command %s. Setting failure "
                    "status." % (self.browser_id, str(command))
                )
                task_manager.failure_status = {
                    "ErrorType": "CriticalChildException",
                    "CommandSequence": command_sequence,
                    "Exception": status[1],
                }
                error_text, tb = self._unpack_pickled_error(status[1])
            elif status[0] == "FAILED":
                command_status = "error"
                error_text, tb = self._unpack_pickled_error(status[1])
                self.logger.info(
                    "BROWSER %i: Received failure status while executing "
                    "command: %s" % (self.browser_id, repr(command))
                )
            elif status[0] == "NETERROR":
                command_status = "neterror"
                error_text, tb = self._unpack_pickled_error(status[1])
                error_text = parse_neterror(error_text)
                self.logger.info(
                    "BROWSER %i: Received neterror %s while executing "
                    "command: %s" % (self.browser_id, error_text, repr(command))
                )
            else:
                raise ValueError("Unknown browser status message %s" % status)

            task_manager.sock.store_record(
                TableName("crawl_history"),
                self.curr_visit_id,
                {
                    "browser_id": self.browser_id,
                    "visit_id": self.curr_visit_id,
                    "command": type(command).__name__,
                    "arguments": json.dumps(
                        command.__dict__, default=lambda x: repr(x)
                    ).encode("utf-8"),
                    "retry_number": command_sequence.retry_number,
                    "command_status": command_status,
                    "error": error_text,
                    "traceback": tb,
                    # nanoseconds -> milliseconds
                    "duration": int((time.time_ns() - t1) / 1000000),
                },
            )

            if command_status == "critical":
                task_manager.sock.finalize_visit_id(
                    success=False,
                    visit_id=self.curr_visit_id,
                )
                return

            if command_status != "ok":
                with task_manager.threadlock:
                    task_manager.failure_count += 1
                if task_manager.failure_count > task_manager.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure pushes failure "
                        "count above the allowable limit. Setting "
                        "failure_status." % self.browser_id
                    )
                    task_manager.failure_status = {
                        "ErrorType": "ExceedCommandFailureLimit",
                        "CommandSequence": command_sequence,
                    }
                    return
                self.restart_required = True
                self.logger.debug(
                    "BROWSER %i: Browser restart required" % self.browser_id
                )
            # Reset failure_count at the end of each successful command sequence
            elif type(command) is FinalizeCommand:
                with task_manager.threadlock:
                    task_manager.failure_count = 0

            if self.restart_required:
                task_manager.sock.finalize_visit_id(
                    success=False, visit_id=self.curr_visit_id
                )
                break

        self.logger.info(
            "Finished working on CommandSequence with "
            "visit_id %d on browser with id %d",
            self.curr_visit_id,
            self.browser_id,
        )
        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        if task_manager.closing:
            return

        if self.restart_required or reset:
            success = self.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable consecutive "
                    "browser launch failures. Setting failure_status." % self.browser_id
                )
                task_manager.failure_status = {
                    "ErrorType": "ExceedLaunchFailureLimit",
                    "CommandSequence": command_sequence,
                }
                return
            self.restart_required = False

    def _unpack_pickled_error(self, pickled_error: bytes) -> Tuple[str, str]:
        """Unpacks `pickled_error` into an error `message` and `tb` string.

        :param pickled_error: a pickled exception-info triple
            ``(type, value, traceback)`` as received on the status queue.
        :return: the last line of the formatted exception, and the traceback
            serialized to a JSON string.
        """
        # NOTE(security): the payload comes from the BrowserManager process
        # over the status queue, not from external input.
        exc = pickle.loads(pickled_error)
        message = traceback.format_exception(*exc)[-1]
        tb = json.dumps(Traceback(exc[2]).to_dict())
        return message, tb

    def kill_browser_manager(self):
        """Kill the BrowserManager process and all of its children"""
        if self.display_pid is not None:
            self.logger.debug(
                "BROWSER {browser_id}: Attempting to kill display "
                "with pid {display_pid}, port {display_port}".format(
                    browser_id=self.browser_id,
                    display_pid=self.display_pid,
                    display_port=self.display_port,
                )
            )

        if self.browser_manager is not None and self.browser_manager.pid is not None:
            self.logger.debug(
                "BROWSER %i: Attempting to kill BrowserManager with pid %i. "
                "Browser PID: %s"
                % (self.browser_id, self.browser_manager.pid, self.geckodriver_pid)
            )
            try:
                os.kill(self.browser_manager.pid, signal.SIGKILL)
            except OSError:
                self.logger.debug(
                    "BROWSER %i: Browser manager process does "
                    "not exist" % self.browser_id
                )

        if self.display_pid is not None:
            try:
                os.kill(self.display_pid, signal.SIGKILL)
            except OSError:
                self.logger.debug(
                    "BROWSER %i: Display process does not exist" % self.browser_id
                )
            except TypeError:
                self.logger.error(
                    "BROWSER %i: PID may not be the correct "
                    "type %s" % (self.browser_id, str(self.display_pid))
                )

        if self.display_port is not None:  # xvfb display lock
            lockfile = "/tmp/.X%s-lock" % self.display_port
            try:
                os.remove(lockfile)
            except OSError:
                self.logger.debug(
                    "BROWSER %i: Screen lockfile (%s) already "
                    "removed" % (self.browser_id, lockfile)
                )

        if self.geckodriver_pid is not None:
            # `geckodriver_pid` is the geckodriver process. We first kill
            # the child processes (i.e. firefox) and then kill the geckodriver
            # process.
            try:
                geckodriver_process = psutil.Process(pid=self.geckodriver_pid)
            except psutil.NoSuchProcess:
                self.logger.debug(
                    "BROWSER %i: geckodriver process with pid %i has already"
                    " exited" % (self.browser_id, self.geckodriver_pid)
                )
                return
            kill_process_and_children(geckodriver_process, self.logger)

    def shutdown_browser(self, during_init: bool, force: bool = False) -> None:
        """Runs the closing tasks for this Browser/BrowserManager

        :param during_init: when ``True`` the profile is not archived, even
            if ``browser_params.profile_archive_dir`` is set.
        :param force: forwarded to :meth:`close_browser_manager`.
        """
        # Close BrowserManager process and children
        self.logger.debug("BROWSER %i: Closing browser manager..." % self.browser_id)
        self.close_browser_manager(force=force)

        # Archive browser profile (if requested)
        self.logger.debug(
            "BROWSER %i: during_init=%s | profile_archive_dir=%s"
            % (
                self.browser_id,
                str(during_init),
                self.browser_params.profile_archive_dir,
            )
        )
        if not during_init and self.browser_params.profile_archive_dir is not None:
            self.logger.debug(
                "BROWSER %i: Archiving browser profile directory to %s"
                % (self.browser_id, self.browser_params.profile_archive_dir)
            )
            tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz"
            assert self.current_profile_path is not None
            dump_profile(
                browser_profile_path=self.current_profile_path,
                tar_path=tar_path,
                compress=True,
                browser_params=self.browser_params,
            )

        # Clean up temporary files
        if self.current_profile_path is not None:
            shutil.rmtree(self.current_profile_path, ignore_errors=True)
class BrowserManager(Process):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    :param command_queue: queue from which commands (or a
        ``ShutdownSignal``) are read.
    :param status_queue: queue on which launch progress and command results
        are reported.
    :param browser_params: per-browser parameter settings.
    :param manager_params: the TaskManager configuration settings.
    :param crash_recovery: forwarded to ``deploy_firefox``.
    """

    def __init__(
        self,
        command_queue: Queue,
        status_queue: Queue,
        browser_params: BrowserParamsInternal,
        manager_params: ManagerParamsInternal,
        crash_recovery: bool,
    ) -> None:
        super().__init__()
        self.logger = logging.getLogger("openwpm")
        self.command_queue = command_queue
        self.status_queue = status_queue
        self.browser_params = browser_params
        self.manager_params = manager_params
        self.crash_recovery = crash_recovery
        # Exceptions reported to the parent as "CRITICAL" instead of "FAILED".
        self.critical_exceptions: Tuple[Type[BaseException], ...] = (
            ProfileLoadError,
            BrowserConfigError,
        )
        if self.manager_params.testing:
            self.critical_exceptions += (AssertionError,)

    def _start_extension(self, browser_profile_path: Path) -> ClientSocket:
        """Start up the extension
        Blocks until the extension has fully started up

        Polls the profile directory for ``extension_port.txt``, connects to
        the port found in it, then waits for ``OPENWPM_STARTUP_SUCCESS.txt``.
        Both marker files are deleted once consumed. The two polling phases
        share one ``elapsed`` counter (limits: 5 for the port file, 10
        overall).

        :param browser_profile_path: directory in which the marker files
            are expected to appear.
        :return: the socket connected to the extension.
        :raises OSError: if the port file cannot be read.
        :raises ValueError: if the port file does not contain an integer.
        :raises BrowserConfigError: if the success marker never appears.
        """
        assert self.browser_params.browser_id is not None
        self.logger.debug(
            "BROWSER %i: Looking for extension port information "
            "in %s" % (self.browser_params.browser_id, browser_profile_path)
        )
        elapsed = 0.0
        port = None
        ep_filename = browser_profile_path / "extension_port.txt"
        while elapsed < 5:
            try:
                with open(ep_filename, "rt") as f:
                    port = int(f.read().strip())
                break
            except FileNotFoundError:
                # Port file not there yet; any other OSError propagates.
                pass
            time.sleep(0.1)
            elapsed += 0.1
        if port is None:
            # try one last time, allowing all exceptions to propagate
            with open(ep_filename, "rt") as f:
                port = int(f.read().strip())

        ep_filename.unlink()
        self.logger.debug(
            "BROWSER %i: Connecting to extension on port %i"
            % (self.browser_params.browser_id, port)
        )
        extension_socket = ClientSocket(serialization="json")
        extension_socket.connect("127.0.0.1", port)

        success_filename = browser_profile_path / "OPENWPM_STARTUP_SUCCESS.txt"
        startup_successful = False
        while elapsed < 10:
            if success_filename.exists():
                startup_successful = True
                break
            time.sleep(0.1)
            elapsed += 0.1

        if not startup_successful:
            self.logger.error(
                "BROWSER %i: Failed to complete extension startup in time",
                self.browser_params.browser_id,
            )
            raise BrowserConfigError("The extension did not boot up in time")

        success_filename.unlink()
        return extension_socket

    def run(self) -> None:
        """Deploy the browser, then execute commands until told to shut down.

        Results are reported on ``status_queue``: ``"OK"`` after a successful
        command or shutdown, and ``("NETERROR" | "FAILED" | "CRITICAL",
        pickled exc_info)`` on errors. A ``WebDriverException`` raised by a
        command keeps the loop running; any other exception (including one
        raised while deploying the browser) ends the method after it has
        been reported.
        """
        assert self.browser_params.browser_id is not None
        display = None
        try:
            # Start Xvfb (if necessary), webdriver, and browser
            driver, browser_profile_path, display = deploy_firefox.deploy_firefox(
                self.status_queue,
                self.browser_params,
                self.manager_params,
                self.crash_recovery,
            )

            extension_socket: Optional[ClientSocket] = None

            if self.browser_params.extension_enabled:
                extension_socket = self._start_extension(browser_profile_path)

            self.logger.debug(
                "BROWSER %i: BrowserManager ready." % self.browser_params.browser_id
            )

            # passes "READY" to the TaskManager to signal a successful startup
            self.status_queue.put(("STATUS", "Browser Ready", "READY"))
            self.browser_params.profile_path = browser_profile_path

            assert extension_socket is not None
            # starts accepting arguments until told to die
            while True:
                # no command for now -> sleep to avoid pegging CPU on blocking get
                if self.command_queue.empty():
                    time.sleep(0.001)
                    continue

                command: Union[ShutdownSignal, BaseCommand] = self.command_queue.get()

                if isinstance(command, ShutdownSignal):
                    driver.quit()
                    self.status_queue.put("OK")
                    return

                assert isinstance(command, BaseCommand)
                self.logger.info(
                    "BROWSER %i: EXECUTING COMMAND: %s"
                    % (self.browser_params.browser_id, str(command))
                )

                # attempts to perform an action and return an OK signal
                # if command fails for whatever reason, tell the TaskManager to
                # kill and restart its worker processes
                try:
                    command.execute(
                        driver,
                        self.browser_params,
                        self.manager_params,
                        extension_socket,
                    )
                    self.status_queue.put("OK")
                except WebDriverException:
                    # We handle WebDriverExceptions separately here because they
                    # are quite common, and we often still have a handle to the
                    # browser, allowing us to run the SHUTDOWN command.
                    tb = traceback.format_exception(*sys.exc_info())
                    if "about:neterror" in tb[-1]:
                        self.status_queue.put(
                            ("NETERROR", pickle.dumps(sys.exc_info()))
                        )
                        continue
                    extra = parse_traceback_for_sentry(tb)
                    extra["exception"] = tb[-1]
                    self.logger.error(
                        "BROWSER %i: WebDriverException while executing command"
                        % self.browser_params.browser_id,
                        exc_info=True,
                        extra=extra,
                    )
                    self.status_queue.put(("FAILED", pickle.dumps(sys.exc_info())))

        # The handlers below cover both browser initialization and any
        # non-WebDriver exception raised while executing a command.
        except self.critical_exceptions as e:
            self.logger.error(
                "BROWSER %i: %s thrown, informing parent and raising"
                % (self.browser_params.browser_id, e.__class__.__name__)
            )
            self.status_queue.put(("CRITICAL", pickle.dumps(sys.exc_info())))
        except Exception:
            tb = traceback.format_exception(*sys.exc_info())
            extra = parse_traceback_for_sentry(tb)
            extra["exception"] = tb[-1]
            self.logger.error(
                "BROWSER %i: Crash in driver, restarting browser manager"
                % self.browser_params.browser_id,
                exc_info=True,
                extra=extra,
            )
            self.status_queue.put(("FAILED", pickle.dumps(sys.exc_info())))
        finally:
            if display is not None:
                display.stop()
            # NOTE: returning from `finally` discards any exception that is
            # still propagating at this point.
            return