зеркало из https://github.com/openwpm/OpenWPM.git
419 строки
18 KiB
Python
419 строки
18 KiB
Python
from __future__ import absolute_import
|
|
from .Commands import command_executor
|
|
from .DeployBrowsers import deploy_browser
|
|
from .Commands import profile_commands
|
|
from .SocketInterface import clientsocket
|
|
from .MPLogger import loggingclient
|
|
from .Errors import ProfileLoadError, BrowserConfigError, BrowserCrashError
|
|
|
|
import errno
|
|
from multiprocess import Process, Queue
|
|
from six.moves.queue import Empty as EmptyQueue
|
|
from tblib import pickling_support
|
|
from six import reraise
|
|
import traceback
|
|
import tempfile
|
|
from six.moves import cPickle as pickle
|
|
import shutil
|
|
import signal
|
|
import psutil
|
|
import time
|
|
import sys
|
|
import os
|
|
pickling_support.install()
|
|
|
|
|
|
class Browser:
|
|
"""
|
|
The Browser class is responsbile for holding all of the
|
|
configuration and status information on BrowserManager process
|
|
it corresponds to. It also includes a set of methods for managing
|
|
the BrowserManager process and its child processes/threads.
|
|
<manager_params> are the TaskManager configuration settings.
|
|
<browser_params> are per-browser parameter settings (e.g. whether
|
|
this browser is headless, etc.)
|
|
"""
|
|
def __init__(self, manager_params, browser_params):
|
|
# Constants
|
|
self._SPAWN_TIMEOUT = 120 # seconds
|
|
self._UNSUCCESSFUL_SPAWN_LIMIT = 4
|
|
|
|
# manager parameters
|
|
self.current_profile_path = None
|
|
self.db_socket_address = manager_params['aggregator_address']
|
|
self.logger_address = manager_params['logger_address']
|
|
self.crawl_id = browser_params['crawl_id']
|
|
self.curr_visit_id = None
|
|
self.browser_params = browser_params
|
|
self.manager_params = manager_params
|
|
|
|
# Queues and process IDs for BrowserManager
|
|
|
|
# thread to run commands issues from TaskManager
|
|
self.command_thread = None
|
|
# queue for passing command tuples to BrowserManager
|
|
self.command_queue = None
|
|
# queue for receiving command execution status from BrowserManager
|
|
self.status_queue = None
|
|
# pid for browser instance controlled by BrowserManager
|
|
self.browser_pid = None
|
|
# the pid of the display for the headless browser (if it exists)
|
|
self.display_pid = None
|
|
# the port of the display for the headless browser (if it exists)
|
|
self.display_port = None
|
|
|
|
# boolean that says if the BrowserManager new (to optimize restarts)
|
|
self.is_fresh = True
|
|
# boolean indicating if the browser should be restarted
|
|
self.restart_required = False
|
|
|
|
self.current_timeout = None # timeout of the current command
|
|
# dict of additional browser profile settings (e.g. screen_res)
|
|
self.browser_settings = None
|
|
self.browser_manager = None # process that controls browser
|
|
self.logger = loggingclient(*self.logger_address)
|
|
|
|
def ready(self):
|
|
""" return if the browser is ready to accept a command """
|
|
return (self.command_thread is None or
|
|
not self.command_thread.is_alive())
|
|
|
|
def set_visit_id(self, visit_id):
|
|
self.curr_visit_id = visit_id
|
|
|
|
def launch_browser_manager(self):
|
|
"""
|
|
sets up the BrowserManager and gets the process id, browser pid and,
|
|
if applicable, screen pid. loads associated user profile if necessary
|
|
"""
|
|
# if this is restarting from a crash, update the tar location
|
|
# to be a tar of the crashed browser's history
|
|
if self.current_profile_path is not None:
|
|
# tar contents of crashed profile to a temp dir
|
|
tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
|
|
profile_commands.dump_profile(
|
|
self.current_profile_path,
|
|
self.manager_params,
|
|
self.browser_params,
|
|
tempdir,
|
|
close_webdriver=False,
|
|
browser_settings=self.browser_settings
|
|
)
|
|
# make sure browser loads crashed profile
|
|
self.browser_params['profile_tar'] = tempdir
|
|
# don't re-randomize attributes
|
|
self.browser_params['random_attributes'] = False
|
|
crash_recovery = True
|
|
else:
|
|
tempdir = None
|
|
crash_recovery = False
|
|
self.is_fresh = not crash_recovery
|
|
|
|
# Try to spawn the browser within the timelimit
|
|
unsuccessful_spawns = 0
|
|
success = False
|
|
|
|
def check_queue(launch_status):
|
|
result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
|
|
if result[0] == 'STATUS':
|
|
launch_status[result[1]] = True
|
|
return result[2]
|
|
elif result[0] == 'CRITICAL':
|
|
reraise(*pickle.loads(result[1]))
|
|
elif result[0] == 'FAILED':
|
|
raise BrowserCrashError(
|
|
'Browser spawn returned failure status')
|
|
|
|
while (not success and
|
|
unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT):
|
|
self.logger.debug("BROWSER %i: Spawn attempt %i " % (
|
|
self.crawl_id, unsuccessful_spawns))
|
|
# Resets the command/status queues
|
|
(self.command_queue, self.status_queue) = (Queue(), Queue())
|
|
|
|
# builds and launches the browser_manager
|
|
args = (self.command_queue, self.status_queue, self.browser_params,
|
|
self.manager_params, crash_recovery)
|
|
self.browser_manager = Process(target=BrowserManager, args=args)
|
|
self.browser_manager.daemon = True
|
|
self.browser_manager.start()
|
|
|
|
# Read success status of browser manager
|
|
launch_status = dict()
|
|
try:
|
|
# 1. Selenium profile created
|
|
spawned_profile_path = check_queue(launch_status)
|
|
# 2. Profile tar loaded (if necessary)
|
|
check_queue(launch_status)
|
|
# 3. Display launched
|
|
(self.display_pid, self.display_port) = check_queue(
|
|
launch_status)
|
|
# 4. Browser launch attempted
|
|
check_queue(launch_status)
|
|
# 5. Browser launched
|
|
(self.browser_pid, self.browser_settings) = check_queue(
|
|
launch_status)
|
|
|
|
(driver_profile_path, ready) = check_queue(launch_status)
|
|
if ready != 'READY':
|
|
self.logger.error(
|
|
"BROWSER %i: Mismatch of status queue return values, "
|
|
"trying again..." % self.crawl_id
|
|
)
|
|
unsuccessful_spawns += 1
|
|
continue
|
|
success = True
|
|
except (EmptyQueue, BrowserCrashError):
|
|
unsuccessful_spawns += 1
|
|
error_string = ''
|
|
status_strings = [
|
|
'Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
|
|
'Launch Attempted', 'Browser Launched', 'Browser Ready']
|
|
for string in status_strings:
|
|
error_string += " | %s: %s " % (
|
|
string, launch_status.get(string, False))
|
|
self.logger.error(
|
|
"BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id,
|
|
error_string))
|
|
self.kill_browser_manager()
|
|
if 'Profile Created' in launch_status:
|
|
shutil.rmtree(spawned_profile_path, ignore_errors=True)
|
|
|
|
# If the browser spawned successfully, we should update the
|
|
# current profile path class variable and clean up the tempdir
|
|
# and previous profile path.
|
|
if success:
|
|
self.logger.debug(
|
|
"BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
|
|
previous_profile_path = self.current_profile_path
|
|
self.current_profile_path = driver_profile_path
|
|
if driver_profile_path != spawned_profile_path:
|
|
shutil.rmtree(spawned_profile_path, ignore_errors=True)
|
|
if previous_profile_path is not None:
|
|
shutil.rmtree(previous_profile_path, ignore_errors=True)
|
|
if tempdir is not None:
|
|
shutil.rmtree(tempdir, ignore_errors=True)
|
|
|
|
return success
|
|
|
|
def restart_browser_manager(self, clear_profile=False):
|
|
"""
|
|
kill and restart the two worker processes
|
|
<clear_profile> marks whether we want to wipe the old profile
|
|
"""
|
|
self.logger.info("BROWSER %i: BrowserManager restart initiated. "
|
|
"Clear profile? %s" % (self.crawl_id, clear_profile))
|
|
if self.is_fresh: # Return success if browser is fresh
|
|
self.logger.info("BROWSER %i: Skipping restart since the browser "
|
|
"is a fresh instance already" % self.crawl_id)
|
|
return True
|
|
|
|
self.kill_browser_manager()
|
|
|
|
# if crawl should be stateless we can clear profile
|
|
if clear_profile and self.current_profile_path is not None:
|
|
shutil.rmtree(self.current_profile_path, ignore_errors=True)
|
|
self.current_profile_path = None
|
|
self.browser_params['profile_tar'] = None
|
|
|
|
return self.launch_browser_manager()
|
|
|
|
def kill_browser_manager(self):
|
|
"""Kill the BrowserManager process and all of its children"""
|
|
self.logger.debug(
|
|
"BROWSER %i: Attempting to kill BrowserManager with pid %i. "
|
|
"Display PID: %s | Display Port: %s | Browser PID: %s" % (
|
|
self.crawl_id, self.browser_manager.pid, self.display_pid,
|
|
self.display_port, self.browser_pid)
|
|
)
|
|
if (self.browser_manager is not None
|
|
and self.browser_manager.pid is not None):
|
|
try:
|
|
os.kill(self.browser_manager.pid, signal.SIGKILL)
|
|
except OSError:
|
|
self.logger.debug("BROWSER %i: Browser manager process does "
|
|
"not exist" % self.crawl_id)
|
|
pass
|
|
if self.display_pid is not None:
|
|
try:
|
|
os.kill(self.display_pid, signal.SIGKILL)
|
|
except OSError:
|
|
self.logger.debug("BROWSER %i: Display process does not "
|
|
"exit" % self.crawl_id)
|
|
pass
|
|
except TypeError:
|
|
self.logger.error("BROWSER %i: PID may not be the correct "
|
|
"type %s" % (self.crawl_id,
|
|
str(self.display_pid)))
|
|
if self.display_port is not None: # xvfb diplay lock
|
|
try:
|
|
os.remove("/tmp/.X"+str(self.display_port)+"-lock")
|
|
except OSError:
|
|
self.logger.debug("BROWSER %i: Screen lockfile already "
|
|
"removed" % self.crawl_id)
|
|
pass
|
|
if self.browser_pid is not None:
|
|
"""`browser_pid` is the geckodriver process. We first kill
|
|
the child processes (i.e. firefox) and then kill the geckodriver
|
|
process."""
|
|
try:
|
|
geckodriver = psutil.Process(pid=self.browser_pid)
|
|
for child in geckodriver.children():
|
|
try:
|
|
child.kill()
|
|
except psutil.NoSuchProcess:
|
|
self.logger.debug(
|
|
"BROWSER %i: Geckodriver child process already "
|
|
"killed (pid=%i)." % (self.crawl_id, child.pid))
|
|
pass
|
|
geckodriver.kill()
|
|
geckodriver.wait(timeout=20)
|
|
for child in geckodriver.children():
|
|
child.wait(timeout=20)
|
|
except psutil.NoSuchProcess:
|
|
self.logger.debug("BROWSER %i: Geckodriver process already "
|
|
"killed." % self.crawl_id)
|
|
pass
|
|
except psutil.TimeoutExpired:
|
|
self.logger.debug("BROWSER %i: Timeout while waiting for "
|
|
"geckodriver or browser process to close " %
|
|
self.crawl_id)
|
|
pass
|
|
|
|
def shutdown_browser(self, during_init):
|
|
""" Runs the closing tasks for this Browser/BrowserManager """
|
|
# Join command thread
|
|
if self.command_thread is not None:
|
|
self.logger.debug(
|
|
"BROWSER %i: Joining command thread" % self.crawl_id)
|
|
start_time = time.time()
|
|
if self.current_timeout is not None:
|
|
self.command_thread.join(self.current_timeout + 10)
|
|
else:
|
|
self.command_thread.join(60)
|
|
self.logger.debug(
|
|
"BROWSER %i: %f seconds to join command thread" % (
|
|
self.crawl_id, time.time() - start_time))
|
|
|
|
# Kill BrowserManager process and children
|
|
self.logger.debug(
|
|
"BROWSER %i: Killing browser manager..." % self.crawl_id)
|
|
self.kill_browser_manager()
|
|
|
|
# Archive browser profile (if requested)
|
|
self.logger.debug(
|
|
"BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
|
|
self.crawl_id, str(during_init),
|
|
self.browser_params['profile_archive_dir'])
|
|
)
|
|
if (not during_init and
|
|
self.browser_params['profile_archive_dir'] is not None):
|
|
self.logger.debug(
|
|
"BROWSER %i: Archiving browser profile directory to %s" % (
|
|
self.crawl_id, self.browser_params['profile_archive_dir']))
|
|
profile_commands.dump_profile(
|
|
self.current_profile_path,
|
|
self.manager_params,
|
|
self.browser_params,
|
|
self.browser_params['profile_archive_dir'],
|
|
close_webdriver=False,
|
|
browser_settings=self.browser_settings,
|
|
compress=True,
|
|
save_flash=self.browser_params['disable_flash'] is False
|
|
)
|
|
|
|
# Clean up temporary files
|
|
if self.current_profile_path is not None:
|
|
shutil.rmtree(self.current_profile_path, ignore_errors=True)
|
|
|
|
|
|
def BrowserManager(command_queue, status_queue, browser_params,
|
|
manager_params, crash_recovery):
|
|
"""
|
|
The BrowserManager function runs in each new browser process.
|
|
It is responsible for listening to command instructions from
|
|
the Task Manager and passing them to the command module to execute
|
|
and interface with Selenium. Command execution status is sent back
|
|
to the TaskManager.
|
|
"""
|
|
try:
|
|
logger = loggingclient(*manager_params['logger_address'])
|
|
|
|
# Start the virtualdisplay (if necessary), webdriver, and browser
|
|
driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
|
|
status_queue, browser_params, manager_params, crash_recovery)
|
|
if prof_folder[-1] != '/':
|
|
prof_folder += '/'
|
|
|
|
# Read the extension port -- if extension is enabled
|
|
# TODO: Initial communication from extension to TM should use sockets
|
|
if (browser_params['browser'] == 'firefox' and
|
|
browser_params['extension_enabled']):
|
|
logger.debug("BROWSER %i: Looking for extension port information "
|
|
"in %s" % (browser_params['crawl_id'], prof_folder))
|
|
elapsed = 0
|
|
port = None
|
|
ep_filename = os.path.join(prof_folder, 'extension_port.txt')
|
|
while elapsed < 5:
|
|
try:
|
|
with open(ep_filename, 'rt') as f:
|
|
port = int(f.read().strip())
|
|
break
|
|
except OSError as e:
|
|
if e.errno != errno.ENOENT:
|
|
raise
|
|
time.sleep(0.1)
|
|
elapsed += 0.1
|
|
if port is None:
|
|
# try one last time, allowing all exceptions to propagate
|
|
with open(ep_filename, 'rt') as f:
|
|
port = int(f.read().strip())
|
|
|
|
logger.debug("BROWSER %i: Connecting to extension on port %i" % (
|
|
browser_params['crawl_id'], port))
|
|
extension_socket = clientsocket(serialization='json')
|
|
extension_socket.connect('127.0.0.1', int(port))
|
|
else:
|
|
extension_socket = None
|
|
|
|
logger.debug(
|
|
"BROWSER %i: BrowserManager ready." % browser_params['crawl_id'])
|
|
|
|
# passes the profile folder, WebDriver pid and display pid back to the
|
|
# TaskManager to signal a successful startup
|
|
status_queue.put(('STATUS', 'Browser Ready', (prof_folder, 'READY')))
|
|
browser_params['profile_path'] = prof_folder
|
|
|
|
# starts accepting arguments until told to die
|
|
while True:
|
|
# no command for now -> sleep to avoid pegging CPU on blocking get
|
|
if command_queue.empty():
|
|
time.sleep(0.001)
|
|
continue
|
|
|
|
# reads in the command tuple of form:
|
|
# (command, arg0, arg1, arg2, ..., argN) where N is variable
|
|
command = command_queue.get()
|
|
logger.info("BROWSER %i: EXECUTING COMMAND: %s" % (
|
|
browser_params['crawl_id'], str(command)))
|
|
# attempts to perform an action and return an OK signal
|
|
# if command fails for whatever reason, tell the TaskManager to
|
|
# kill and restart its worker processes
|
|
command_executor.execute_command(
|
|
command, driver, browser_settings,
|
|
browser_params, manager_params, extension_socket)
|
|
status_queue.put("OK")
|
|
|
|
except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
|
|
logger.info("BROWSER %i: %s thrown, informing parent and raising" % (
|
|
browser_params['crawl_id'], e.__class__.__name__))
|
|
err_info = sys.exc_info()
|
|
status_queue.put(('CRITICAL', pickle.dumps(err_info)))
|
|
return
|
|
except Exception as e:
|
|
excp = traceback.format_exception(*sys.exc_info())
|
|
logger.info("BROWSER %i: Crash in driver, restarting browser manager "
|
|
"\n %s" % (browser_params['crawl_id'], ''.join(excp)))
|
|
status_queue.put(('FAILED', None))
|
|
return
|