Integrate new logging class with entire codebase

englehardt 2019-07-31 16:44:37 -07:00
Parent ad7d8a3264
Commit 292cc48715
9 changed files with 147 additions and 190 deletions
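The diff below follows one mechanical pattern: every stored loggingclient proxy (a `self.logger` attribute, a `logger` argument, or a per-function handle) is removed, and call sites invoke Python's standard `logging` module directly. This works because the reworked MPLogger attaches a socket handler to the root logger, so a record emitted anywhere in the process tree is forwarded to one central listener. A minimal stdlib-only sketch of that pattern (illustrative; `configure_client` is a hypothetical name, and OpenWPM's real listener and handler live in MPLogger.py):

import logging
import logging.handlers

def configure_client(host, port):
    # Attach a socket handler to the root logger; afterwards, plain
    # logging.debug()/info()/error() calls are forwarded to the listener.
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(logging.handlers.SocketHandler(host, port))

configure_client('127.0.0.1', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
logging.debug("BROWSER %i: Spawn attempt %i", 1, 0)  # no logger object passed around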

View file

@@ -1,6 +1,7 @@
from __future__ import absolute_import
import errno
import logging
import os
import shutil
import signal
@@ -18,7 +19,6 @@ from tblib import pickling_support
from .Commands import command_executor
from .DeployBrowsers import deploy_browser
from .Errors import BrowserConfigError, BrowserCrashError, ProfileLoadError
from .MPLogger import loggingclient
from .SocketInterface import clientsocket
pickling_support.install()
@@ -43,7 +43,6 @@ class Browser:
# manager parameters
self.current_profile_path = None
self.db_socket_address = manager_params['aggregator_address']
self.logger_address = manager_params['logger_address']
self.crawl_id = browser_params['crawl_id']
self.curr_visit_id = None
self.browser_params = browser_params
@@ -73,7 +72,6 @@
# dict of additional browser profile settings (e.g. screen_res)
self.browser_settings = None
self.browser_manager = None # process that controls browser
self.logger = loggingclient(*self.logger_address)
def ready(self):
""" return if the browser is ready to accept a command """
@@ -131,7 +129,7 @@
while not success and \
unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
self.logger.debug("BROWSER %i: Spawn attempt %i " % (
logging.debug("BROWSER %i: Spawn attempt %i " % (
self.crawl_id, unsuccessful_spawns))
# Resets the command/status queues
(self.command_queue, self.status_queue) = (Queue(), Queue())
@@ -161,7 +159,7 @@
(driver_profile_path, ready) = check_queue(launch_status)
if ready != 'READY':
self.logger.error(
logging.error(
"BROWSER %i: Mismatch of status queue return values, "
"trying again..." % self.crawl_id
)
@@ -177,7 +175,7 @@
for string in status_strings:
error_string += " | %s: %s " % (
string, launch_status.get(string, False))
self.logger.error(
logging.error(
"BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id,
error_string))
self.kill_browser_manager()
@@ -188,7 +186,7 @@
# current profile path class variable and clean up the tempdir
# and previous profile path.
if success:
self.logger.debug(
logging.debug(
"BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
previous_profile_path = self.current_profile_path
self.current_profile_path = driver_profile_path
@@ -206,11 +204,11 @@
kill and restart the two worker processes
<clear_profile> marks whether we want to wipe the old profile
"""
self.logger.info("BROWSER %i: BrowserManager restart initiated. "
"Clear profile? %s" % (self.crawl_id, clear_profile))
logging.info("BROWSER %i: BrowserManager restart initiated. "
"Clear profile? %s" % (self.crawl_id, clear_profile))
if self.is_fresh: # Return success if browser is fresh
self.logger.info("BROWSER %i: Skipping restart since the browser "
"is a fresh instance already" % self.crawl_id)
logging.info("BROWSER %i: Skipping restart since the browser "
"is a fresh instance already" % self.crawl_id)
return True
self.kill_browser_manager()
@@ -225,7 +223,7 @@
def kill_browser_manager(self):
"""Kill the BrowserManager process and all of its children"""
self.logger.debug(
logging.debug(
"BROWSER %i: Attempting to kill BrowserManager with pid %i. "
"Display PID: %s | Display Port: %s | Browser PID: %s" % (
self.crawl_id, self.browser_manager.pid, self.display_pid,
@@ -236,27 +234,27 @@
try:
os.kill(self.browser_manager.pid, signal.SIGKILL)
except OSError:
self.logger.debug("BROWSER %i: Browser manager process does "
"not exist" % self.crawl_id)
logging.debug("BROWSER %i: Browser manager process does "
"not exist" % self.crawl_id)
pass
if self.display_pid is not None:
try:
os.kill(self.display_pid, signal.SIGKILL)
except OSError:
self.logger.debug("BROWSER %i: Display process does not "
"exit" % self.crawl_id)
logging.debug("BROWSER %i: Display process does not "
"exit" % self.crawl_id)
pass
except TypeError:
self.logger.error("BROWSER %i: PID may not be the correct "
"type %s" % (self.crawl_id,
str(self.display_pid)))
logging.error("BROWSER %i: PID may not be the correct "
"type %s" % (self.crawl_id,
str(self.display_pid)))
if self.display_port is not None: # xvfb display lock
lockfile = "/tmp/.X%s-lock" % self.display_port
try:
os.remove(lockfile)
except OSError:
self.logger.debug("BROWSER %i: Screen lockfile (%s) already "
"removed" % (self.crawl_id, lockfile))
logging.debug("BROWSER %i: Screen lockfile (%s) already "
"removed" % (self.crawl_id, lockfile))
pass
if self.browser_pid is not None:
"""`browser_pid` is the geckodriver process. We first kill
@@ -268,7 +266,7 @@
try:
child.kill()
except psutil.NoSuchProcess:
self.logger.debug(
logging.debug(
"BROWSER %i: Geckodriver child process already "
"killed (pid=%i)." % (self.crawl_id, child.pid))
pass
@@ -277,53 +275,53 @@
for child in geckodriver.children():
child.wait(timeout=20)
except psutil.NoSuchProcess:
self.logger.debug("BROWSER %i: Geckodriver process already "
"killed." % self.crawl_id)
logging.debug("BROWSER %i: Geckodriver process already "
"killed." % self.crawl_id)
pass
except psutil.TimeoutExpired:
self.logger.debug("BROWSER %i: Timeout while waiting for "
"geckodriver or browser process to close " %
self.crawl_id)
logging.debug("BROWSER %i: Timeout while waiting for "
"geckodriver or browser process to close " %
self.crawl_id)
pass
def shutdown_browser(self, during_init):
""" Runs the closing tasks for this Browser/BrowserManager """
# Join command thread
if self.command_thread is not None:
self.logger.debug(
logging.debug(
"BROWSER %i: Joining command thread" % self.crawl_id)
start_time = time.time()
if self.current_timeout is not None:
self.command_thread.join(self.current_timeout + 10)
else:
self.command_thread.join(60)
self.logger.debug(
logging.debug(
"BROWSER %i: %f seconds to join command thread" % (
self.crawl_id, time.time() - start_time))
# Kill BrowserManager process and children
self.logger.debug(
logging.debug(
"BROWSER %i: Killing browser manager..." % self.crawl_id)
self.kill_browser_manager()
# Archive browser profile (if requested)
if not during_init and \
self.browser_params['profile_archive_dir'] is not None:
self.logger.warn(
logging.warn(
"BROWSER %i: Archiving the browser profile directory is "
"currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2" %
self.crawl_id
)
"""
self.logger.debug(
logging.debug(
"BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
self.crawl_id, str(during_init),
self.browser_params['profile_archive_dir'])
)
if (not during_init and
self.browser_params['profile_archive_dir'] is not None):
self.logger.debug(
logging.debug(
"BROWSER %i: Archiving browser profile directory to %s" % (
self.crawl_id, self.browser_params['profile_archive_dir']))
profile_commands.dump_profile(
@@ -353,8 +351,6 @@ def BrowserManager(command_queue, status_queue, browser_params,
to the TaskManager.
"""
try:
logger = loggingclient(*manager_params['logger_address'])
# Start the virtualdisplay (if necessary), webdriver, and browser
driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
status_queue, browser_params, manager_params, crash_recovery)
@@ -365,8 +361,8 @@ def BrowserManager(command_queue, status_queue, browser_params,
# TODO: Initial communication from extension to TM should use sockets
if browser_params['browser'] == 'firefox' and \
browser_params['extension_enabled']:
logger.debug("BROWSER %i: Looking for extension port information "
"in %s" % (browser_params['crawl_id'], prof_folder))
logging.debug("BROWSER %i: Looking for extension port information "
"in %s" % (browser_params['crawl_id'], prof_folder))
elapsed = 0
port = None
ep_filename = os.path.join(prof_folder, 'extension_port.txt')
@@ -385,14 +381,14 @@ def BrowserManager(command_queue, status_queue, browser_params,
with open(ep_filename, 'rt') as f:
port = int(f.read().strip())
logger.debug("BROWSER %i: Connecting to extension on port %i" % (
logging.debug("BROWSER %i: Connecting to extension on port %i" % (
browser_params['crawl_id'], port))
extension_socket = clientsocket(serialization='json')
extension_socket.connect('127.0.0.1', int(port))
else:
extension_socket = None
logger.debug(
logging.debug(
"BROWSER %i: BrowserManager ready." % browser_params['crawl_id'])
# passes the profile folder, WebDriver pid and display pid back to the
@@ -410,7 +406,7 @@ def BrowserManager(command_queue, status_queue, browser_params,
# reads in the command tuple of form:
# (command, arg0, arg1, arg2, ..., argN) where N is variable
command = command_queue.get()
logger.info("BROWSER %i: EXECUTING COMMAND: %s" % (
logging.info("BROWSER %i: EXECUTING COMMAND: %s" % (
browser_params['crawl_id'], str(command)))
# attempts to perform an action and return an OK signal
# if command fails for whatever reason, tell the TaskManager to
@@ -421,14 +417,16 @@ def BrowserManager(command_queue, status_queue, browser_params,
status_queue.put("OK")
except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
logger.error("BROWSER %i: %s thrown, informing parent and raising" % (
logging.error("BROWSER %i: %s thrown, informing parent and raising" % (
browser_params['crawl_id'], e.__class__.__name__))
err_info = sys.exc_info()
status_queue.put(('CRITICAL', pickle.dumps(err_info)))
return
except Exception:
excp = traceback.format_exception(*sys.exc_info())
logger.error("BROWSER %i: Crash in driver, restarting browser manager "
"\n %s" % (browser_params['crawl_id'], ''.join(excp)))
logging.error(
"BROWSER %i: Crash in driver, restarting browser manager "
"\n %s" % (browser_params['crawl_id'], ''.join(excp))
)
status_queue.put(('FAILED', None))
return
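Note that BrowserManager runs in a separate process, yet after this change it calls `logging.debug()` with no logger setup of its own. A sketch of why that works, assuming the default fork start method on Linux (with spawn, the child would need to re-attach a handler explicitly):

import logging
import multiprocessing

def worker():
    # Under 'fork' the child inherits the parent's root-logger handlers
    # (in OpenWPM, the socket handler installed by MPLogger).
    logging.info("logged from the child process")

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # stand-in for MPLogger setup
    p = multiprocessing.Process(target=worker)
    p.start()
    p.join()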

View file

@@ -2,6 +2,7 @@ from __future__ import absolute_import
import gzip
import json
import logging
import os
import random
import sys
@@ -18,7 +19,6 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from six.moves import range
from ..MPLogger import loggingclient
from ..SocketInterface import clientsocket
from .utils.lso import get_flash_cookies
from .utils.webdriver_extensions import (execute_in_all_frames,
@@ -152,9 +152,6 @@ def browse_website(url, num_links, sleep, visit_id, webdriver,
get_website(url, sleep, visit_id, webdriver,
browser_params, extension_socket)
# Connect to logger
logger = loggingclient(*manager_params['logger_address'])
# Then visit a few subpages
for _ in range(num_links):
links = [x for x in get_intra_links(webdriver, url)
@@ -162,7 +159,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver,
if not links:
break
r = int(random.random() * len(links))
logger.info("BROWSER %i: visiting internal link %s" % (
logging.info("BROWSER %i: visiting internal link %s" % (
browser_params['crawl_id'], links[r].get_attribute("href")))
try:
@@ -214,7 +211,7 @@ def save_screenshot(visit_id, crawl_id, driver, manager_params, suffix=''):
driver.save_screenshot(outname)
def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
def _stitch_screenshot_parts(visit_id, crawl_id, manager_params):
# Read image parts and compute dimensions of output image
total_height = -1
max_scroll = -1
@@ -260,7 +257,7 @@ def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
try:
output.save(outname)
except SystemError:
logger.error(
logging.error(
"BROWSER %i: SystemError while trying to save screenshot %s. \n"
"Slices of image %s \n Final size %s, %s." %
(crawl_id, outname, '\n'.join([str(x) for x in parts]),
@@ -271,7 +268,6 @@ def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
suffix=''):
logger = loggingclient(*manager_params['logger_address'])
outdir = os.path.join(manager_params['screenshot_path'], 'parts')
if not os.path.isdir(outdir):
@@ -299,7 +295,7 @@ def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
try:
driver.execute_script('window.scrollBy(0, window.innerHeight)')
except WebDriverException:
logger.info(
logging.info(
"BROWSER %i: WebDriverException while scrolling, "
"screenshot may be misaligned!" % crawl_id)
pass
@@ -314,12 +310,12 @@ def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
driver.save_screenshot(outname % (part, curr_scrollY))
except WebDriverException:
excp = traceback.format_exception(*sys.exc_info())
logger.error(
logging.error(
"BROWSER %i: Exception while taking full page screenshot \n %s" %
(crawl_id, ''.join(excp)))
return
_stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params)
_stitch_screenshot_parts(visit_id, crawl_id, manager_params)
def dump_page_source(visit_id, driver, manager_params, suffix=''):
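A second effect of the migration is visible in the signatures above: helpers like `_stitch_screenshot_parts` no longer thread a `logger` argument through every call. The sketch below shows the common stdlib alternative, a named module-level logger; this commit calls the root logger directly instead, but either form picks up the handlers MPLogger installs (`save_screenshot_part` is a hypothetical helper):

import logging

logger = logging.getLogger(__name__)  # named module logger

def save_screenshot_part(crawl_id, path):
    # No logger parameter; the record still reaches whatever handlers
    # are configured on the logging hierarchy.
    logger.debug("BROWSER %i: saved screenshot part to %s", crawl_id, path)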

View file

@@ -1,5 +1,6 @@
from __future__ import absolute_import
import logging
import os
import shutil
import tarfile
@@ -7,7 +8,6 @@ import tarfile
from six.moves import cPickle as pickle
from ..Errors import ProfileLoadError
from ..MPLogger import loggingclient
from .utils.file_utils import rmsubtree
from .utils.firefox_profile import sleep_until_sqlite_checkpoint
@@ -43,7 +43,7 @@ def load_browser_settings(location):
return browser_settings
def save_flash_files(logger, browser_params, dump_location, clear=False):
def save_flash_files(browser_params, dump_location, clear=False):
"""
save all files from the default flash storage locations
clear: sets whether to clear storage locations after backup
@@ -54,13 +54,13 @@ def save_flash_files(logger, browser_params, dump_location, clear=False):
# Copy all flash objects over to dump location
for location in FLASH_LOCS:
if not os.path.isdir(location):
logger.warning("BROWSER %i: %s not found when attempting to save "
"flash files, skipping..." %
(browser_params['crawl_id'], location))
logging.warning("BROWSER %i: %s not found when attempting to save "
"flash files, skipping..." %
(browser_params['crawl_id'], location))
continue
logger.debug("BROWSER %i: SAVING %s during flash file archive" %
(browser_params['crawl_id'], location))
logging.debug("BROWSER %i: SAVING %s during flash file archive" %
(browser_params['crawl_id'], location))
(head, tail) = os.path.split(location)
# Remove old backups if exist
@@ -71,23 +71,23 @@ def save_flash_files(logger, browser_params, dump_location, clear=False):
shutil.copytree(location, os.path.join(dump_location, tail))
if clear:
logger.debug("BROWSER %i: CLEARING %s during flash file archive" %
(browser_params['crawl_id'], location))
logging.debug("BROWSER %i: CLEARING %s during flash file archive" %
(browser_params['crawl_id'], location))
rmsubtree(location)
def load_flash_files(logger, browser_params, tar_location):
def load_flash_files(browser_params, tar_location):
""" clear old flash cookies and load ones from dump """
# Clear previous objects prior to loading
for location in FLASH_LOCS:
if not os.path.isdir(location):
logger.warning("BROWSER %i: %s not found when attempting to load "
"flash files, skipping..." %
(browser_params['crawl_id'], location))
logging.warning("BROWSER %i: %s not found when attempting to load "
"flash files, skipping..." %
(browser_params['crawl_id'], location))
continue
logger.debug("BROWSER %i: CLEARING %s before loading flash files" %
(browser_params['crawl_id'], location))
logging.debug("BROWSER %i: CLEARING %s before loading flash files" %
(browser_params['crawl_id'], location))
shutil.rmtree(location)
# Copy flash storage objects from tar_location
@@ -95,7 +95,7 @@ def load_flash_files(logger, browser_params, tar_location):
if os.path.exists(os.path.join(tar_location, tail)):
shutil.copytree(os.path.join(tar_location, tail), location)
else:
logger.warning(
logging.warning(
"BROWSER %i: %s not found while loading flash "
"files, skipping..." %
(browser_params['crawl_id'], os.path.join(tar_location, tail)))
@@ -111,12 +111,9 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
if <browser_settings> exists they are also saved
<save_flash> specifies whether to dump flash files
"""
# Connect to logger
logger = loggingclient(*manager_params['logger_address'])
logger.debug("BROWSER %i: Profile dumping is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2." %
browser_params['crawl_id'])
logging.debug("BROWSER %i: Profile dumping is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2." %
browser_params['crawl_id'])
return
# ensures that folder paths end with slashes
@@ -148,7 +145,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
tar = tarfile.open(tar_location + tar_name, 'w:gz', errorlevel=1)
else:
tar = tarfile.open(tar_location + tar_name, 'w', errorlevel=1)
logger.debug(
logging.debug(
"BROWSER %i: Backing up full profile from %s to %s" %
(browser_params['crawl_id'], browser_profile_folder,
tar_location + tar_name)
@@ -173,7 +170,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
if not os.path.isfile(full_path) and \
full_path[-3:] != 'shm' and \
full_path[-3:] != 'wal':
logger.critical(
logging.critical(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
(browser_params['crawl_id'], full_path))
elif not os.path.isfile(full_path) and \
@@ -183,7 +180,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
for item in storage_vector_dirs:
full_path = os.path.join(browser_profile_folder, item)
if not os.path.isdir(full_path):
logger.warning(
logging.warning(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
(browser_params['crawl_id'], full_path))
continue
@@ -192,7 +189,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
# save flash cookies
if save_flash:
save_flash_files(logger, browser_params, tar_location)
save_flash_files(browser_params, tar_location)
# save the browser settings
if browser_settings is not None:
@@ -207,9 +204,6 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
is in the folder, either full_profile.tar.gz or profile.tar.gz
"""
try:
# Connect to logger
logger = loggingclient(*manager_params['logger_address'])
# ensures that folder paths end with slashes
if browser_profile_folder[-1] != '/':
browser_profile_folder = browser_profile_folder + "/"
@@ -222,7 +216,7 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
tar_name = 'profile.tar'
# Copy and untar the loaded profile
logger.debug(
logging.debug(
"BROWSER %i: Copying profile tar from %s to %s" %
(browser_params['crawl_id'], tar_location + tar_name,
browser_profile_folder)
@@ -238,17 +232,17 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
f.extractall(browser_profile_folder)
f.close()
os.remove(browser_profile_folder + tar_name)
logger.debug(
logging.debug(
"BROWSER %i: Tarfile extracted" % browser_params['crawl_id'])
# clear and load flash cookies
if load_flash:
load_flash_files(logger, browser_params, tar_location)
load_flash_files(browser_params, tar_location)
# load the browser settings
browser_settings = load_browser_settings(tar_location)
except Exception as ex:
logger.critical(
logging.critical(
"BROWSER %i: Error: %s while attempting to load profile" %
(browser_params['crawl_id'], str(ex)))
raise ProfileLoadError('Profile Load not successful')

View file

@@ -1,10 +1,10 @@
import abc
import logging
import time
from multiprocess import Process, Queue
from six.moves import queue
from ..MPLogger import loggingclient
from ..SocketInterface import serversocket
RECORD_TYPE_CONTENT = 'page_content'
@@ -33,7 +33,6 @@ class BaseListener(object):
def __init__(self, status_queue, shutdown_queue, manager_params):
self.status_queue = status_queue
self.shutdown_queue = shutdown_queue
self.logger = loggingclient(*manager_params['logger_address'])
self._shutdown_flag = False
self._last_update = time.time() # last status update time
self.record_queue = None # Initialized on `startup`
@@ -71,7 +70,7 @@ class BaseListener(object):
"""Return `True` if the listener has received a shutdown signal"""
if not self.shutdown_queue.empty():
self.shutdown_queue.get()
self.logger.info("Received shutdown signal!")
logging.info("Received shutdown signal!")
return True
return False
@@ -81,7 +80,7 @@ class BaseListener(object):
return
qsize = self.record_queue.qsize()
self.status_queue.put(qsize)
self.logger.debug(
logging.debug(
"Status update; current record queue size: %d" % qsize)
self._last_update = time.time()
@@ -116,7 +115,6 @@ class BaseAggregator(object):
def __init__(self, manager_params, browser_params):
self.manager_params = manager_params
self.browser_params = browser_params
self.logger = loggingclient(*manager_params['logger_address'])
self.listener_address = None
self.listener_process = None
self.status_queue = Queue()
@@ -184,14 +182,14 @@ class BaseAggregator(object):
def shutdown(self):
""" Terminate the aggregator listener process"""
self.logger.debug(
logging.debug(
"Sending the shutdown signal to the %s listener process..." %
type(self).__name__
)
self.shutdown_queue.put(SHUTDOWN_SIGNAL)
start_time = time.time()
self.listener_process.join(300)
self.logger.debug(
logging.debug(
"%s took %s seconds to close." % (
type(self).__name__,
str(time.time() - start_time)

View file

@@ -1,6 +1,7 @@
from __future__ import absolute_import
import json
import logging
import os.path
import random
import sys
@@ -9,7 +10,6 @@ from pyvirtualdisplay import Display
from selenium import webdriver
from ..Commands.profile_commands import load_profile
from ..MPLogger import loggingclient
from ..utilities.platform_utils import (get_firefox_binary_path,
get_geckodriver_exec_path)
from . import configure_firefox
@@ -28,7 +28,6 @@ def deploy_firefox(status_queue, browser_params, manager_params,
geckodriver_executable_path = get_geckodriver_exec_path()
root_dir = os.path.dirname(__file__) # directory of this file
logger = loggingclient(*manager_params['logger_address'])
display_pid = None
display_port = None
@@ -43,9 +42,9 @@ def deploy_firefox(status_queue, browser_params, manager_params,
profile_settings = None # Imported browser settings
if browser_params['profile_tar'] and not crash_recovery:
logger.debug("BROWSER %i: Loading initial browser profile from: %s"
% (browser_params['crawl_id'],
browser_params['profile_tar']))
logging.debug("BROWSER %i: Loading initial browser profile from: %s"
% (browser_params['crawl_id'],
browser_params['profile_tar']))
load_flash = browser_params['disable_flash'] is False
profile_settings = load_profile(browser_profile_path,
manager_params,
@@ -53,9 +52,9 @@ def deploy_firefox(status_queue, browser_params, manager_params,
browser_params['profile_tar'],
load_flash=load_flash)
elif browser_params['profile_tar']:
logger.debug("BROWSER %i: Loading recovered browser profile from: %s"
% (browser_params['crawl_id'],
browser_params['profile_tar']))
logging.debug("BROWSER %i: Loading recovered browser profile from: %s"
% (browser_params['crawl_id'],
browser_params['profile_tar']))
profile_settings = load_profile(browser_profile_path,
manager_params,
browser_params,
@@ -63,8 +62,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
status_queue.put(('STATUS', 'Profile Tar', None))
if browser_params['random_attributes'] and profile_settings is None:
logger.debug("BROWSER %i: Loading random attributes for browser"
% browser_params['crawl_id'])
logging.debug("BROWSER %i: Loading random attributes for browser"
% browser_params['crawl_id'])
profile_settings = dict()
# choose a random screen-res from list
@@ -88,15 +87,15 @@ def deploy_firefox(status_queue, browser_params, manager_params,
profile_settings['ua_string'] = None
if profile_settings['ua_string'] is not None:
logger.debug("BROWSER %i: Overriding user agent string to '%s'"
% (browser_params['crawl_id'],
profile_settings['ua_string']))
logging.debug("BROWSER %i: Overriding user agent string to '%s'"
% (browser_params['crawl_id'],
profile_settings['ua_string']))
fo.set_preference("general.useragent.override",
profile_settings['ua_string'])
if browser_params['headless']:
if sys.platform == 'darwin':
logger.warn(
logging.warn(
"BROWSER %i: headless mode is not supported on MacOS. "
"Browser window will be visible." % browser_params['crawl_id']
)
@@ -122,8 +121,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
ext_config_file = browser_profile_path + 'browser_params.json'
with open(ext_config_file, 'w') as f:
json.dump(extension_config, f)
logger.debug("BROWSER %i: Saved extension config file to: %s" %
(browser_params['crawl_id'], ext_config_file))
logging.debug("BROWSER %i: Saved extension config file to: %s" %
(browser_params['crawl_id'], ext_config_file))
# TODO restore detailed logging
# fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")
@@ -146,13 +145,13 @@ def deploy_firefox(status_queue, browser_params, manager_params,
# main logger. This will also inform us where the real profile
# directory is hiding.
interceptor = FirefoxLogInterceptor(
browser_params['crawl_id'], logger, browser_profile_path)
browser_params['crawl_id'], browser_profile_path)
interceptor.start()
# Set custom prefs. These are set after all of the default prefs to allow
# our defaults to be overwritten.
for name, value in browser_params['prefs'].items():
logger.info(
logging.info(
"BROWSER %i: Setting custom preference: %s = %s" %
(browser_params['crawl_id'], name, value))
fo.set_preference(name, value)
@@ -171,8 +170,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
ext_loc = os.path.join(root_dir, '../Extension/firefox/openwpm.xpi')
ext_loc = os.path.normpath(ext_loc)
driver.install_addon(ext_loc, temporary=True)
logger.debug("BROWSER %i: OpenWPM Firefox extension loaded"
% browser_params['crawl_id'])
logging.debug("BROWSER %i: OpenWPM Firefox extension loaded"
% browser_params['crawl_id'])
# set window size
driver.set_window_size(*profile_settings['screen_res'])

View file

@@ -6,6 +6,7 @@ from __future__ import absolute_import
import errno
import json
import logging
import os
import sys
import tempfile
@@ -58,10 +59,9 @@ class FirefoxLogInterceptor(threading.Thread):
from geckodriver's log output (geckodriver copies the profile).
"""
def __init__(self, crawl_id, logger, profile_path):
def __init__(self, crawl_id, profile_path):
threading.Thread.__init__(self, name="log-interceptor-%i" % crawl_id)
self.crawl_id = crawl_id
self.logger = logger
self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_")
self.profile_path = profile_path
self.daemon = True
@@ -73,8 +73,8 @@
try:
with open(self.fifo, "rt") as f:
for line in f:
self.logger.debug("BROWSER %i: driver: %s"
% (self.crawl_id, line.strip()))
logging.debug("BROWSER %i: driver: %s" %
(self.crawl_id, line.strip()))
if "Using profile path" in line:
self.profile_path = \
line.partition("Using profile path")[-1].strip()
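The interceptor shows the same simplification inside a thread: with the `logger` constructor argument gone, the FIFO reader re-emits geckodriver's output through the shared `logging` pipeline. A condensed, self-contained sketch of the pattern (`LogForwarder` is a hypothetical stand-in; OpenWPM's `mktempfifo` helper and the profile-path parsing are elided):

import logging
import threading

class LogForwarder(threading.Thread):
    def __init__(self, crawl_id, fifo_path):
        threading.Thread.__init__(self, name="log-interceptor-%i" % crawl_id)
        self.daemon = True
        self.crawl_id = crawl_id
        self.fifo_path = fifo_path

    def run(self):
        # Tail the FIFO and forward each line as a DEBUG record.
        with open(self.fifo_path, "rt") as f:
            for line in f:
                logging.debug("BROWSER %i: driver: %s" %
                              (self.crawl_id, line.strip()))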

View file

@@ -77,7 +77,7 @@ class MPLogger(object):
)
self._listener.daemon = True
self._listener.start()
self._listener_address = self._status_queue.get()
self.logger_address = self._status_queue.get()
# Attach console handler to log to console
logger = logging.getLogger()
@@ -90,7 +90,7 @@ class MPLogger(object):
logger.addHandler(consoleHandler)
# Attach socket handler to logger to serialize writes to file
socketHandler = ClientSocketHandler(*self._listener_address)
socketHandler = ClientSocketHandler(*self.logger_address)
socketHandler.setLevel(logging.DEBUG)
logger.addHandler(socketHandler)
@@ -104,17 +104,10 @@ class MPLogger(object):
def _initialize_sentry(self):
"""If running a cloud crawl, we can pull the sentry endpoint
and related config variables from the environment"""
print("**********")
self._breadcrumb_handler = BreadcrumbHandler(level=logging.DEBUG)
self._event_handler = EventHandler(level=logging.ERROR)
# sentry_logging = LoggingIntegration(
# level=logging.DEBUG,
# event_level=logging.ERROR
# )
sentry_sdk.init(
dsn=self._sentry_dsn,
# integrations=[sentry_logging],
# before_send=self._sentry_before_send
)
@@ -177,7 +170,7 @@ class MPLogger(object):
self._breadcrumb_handler.handle(record)
if record.levelno >= self._event_handler.level:
self._event_handler.handle(record)
def close(self):
self._status_queue.put("SHUTDOWN")
self._listener.join()
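With `_listener_address` renamed to the public `logger_address`, consumers read the listener's (host, port) pair straight off the instance instead of polling a status queue. A sketch of the lifecycle as the TaskManager diff below uses it (the import path is an assumption):

from automation import MPLogger  # import path assumed

manager_params = {'log_file': '/tmp/openwpm.log'}
logging_server = MPLogger.MPLogger(manager_params['log_file'])
manager_params['logger_address'] = logging_server.logger_address  # (host, port)
# ... run the crawl; child processes log via the inherited socket handler ...
logging_server.close()  # puts "SHUTDOWN" on the status queue and joins the listener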

View file

@@ -2,10 +2,10 @@ from __future__ import absolute_import, division
import copy
import json
import logging
import os
import threading
import time
from queue import Queue
import psutil
from six import reraise
@@ -107,13 +107,11 @@ class TaskManager:
self.process_watchdog = process_watchdog
# sets up logging server + connect a client
self.logging_status_queue = None
self.loggingserver = self._launch_loggingserver()
# socket location: (address, port)
self.manager_params['logger_address'] = self.logging_status_queue.get()
self.logger = MPLogger.loggingclient(
*self.manager_params['logger_address'])
# Start logging server thread
self.logging_server = MPLogger.MPLogger(
self.manager_params['log_file'])
self.manager_params[
'logger_address'] = self.logging_server.logger_address
# Initialize the data aggregators
self._launch_aggregators()
@@ -130,7 +128,7 @@ class TaskManager:
# Save crawl config information to database
openwpm_v, browser_v = get_version()
self.data_aggregator.save_configuration(openwpm_v, browser_v)
self.logger.info(
logging.info(
get_configuration_string(
self.manager_params, browser_params, (openwpm_v, browser_v)
)
@@ -156,8 +154,8 @@ class TaskManager:
raise
if not success:
self.logger.critical("Browser spawn failure during "
"TaskManager initialization, exiting...")
logging.critical("Browser spawn failure during "
"TaskManager initialization, exiting...")
self.close()
break
@@ -171,8 +169,8 @@ class TaskManager:
kill browser processes started by Selenium 3 (with `subprocess`)
"""
if self.process_watchdog:
self.logger.error("BROWSER %i: Process watchdog is not currently "
"supported." % self.crawl_id)
logging.error("BROWSER %i: Process watchdog is not currently "
"supported." % self.crawl_id)
while not self.closing:
time.sleep(10)
@@ -182,10 +180,10 @@ class TaskManager:
process = psutil.Process(browser.browser_pid)
mem = process.memory_info()[0] / float(2 ** 20)
if mem > BROWSER_MEMORY_LIMIT:
self.logger.info("BROWSER %i: Memory usage: %iMB"
", exceeding limit of %iMB" % (
browser.crawl_id, int(mem),
BROWSER_MEMORY_LIMIT))
logging.info("BROWSER %i: Memory usage: %iMB"
", exceeding limit of %iMB" %
(browser.crawl_id, int(mem),
BROWSER_MEMORY_LIMIT))
browser.restart_required = True
except psutil.NoSuchProcess:
pass
@@ -208,11 +206,11 @@ class TaskManager:
process.pid not in browser_pids) or
(process.name() == 'Xvfb' and
process.pid not in display_pids))):
self.logger.debug("Process: %s (pid: %i) with start "
"time %s found running but not in "
"browser process list. Killing." % (
process.name(), process.pid,
process.create_time()))
logging.debug("Process: %s (pid: %i) with start "
"time %s found running but not in "
"browser process list. Killing." % (
process.name(), process.pid,
process.create_time()))
process.kill()
def _launch_aggregators(self):
@@ -234,26 +232,6 @@ class TaskManager:
self.sock = clientsocket(serialization='dill')
self.sock.connect(*self.manager_params['aggregator_address'])
def _kill_aggregators(self):
"""Shutdown any currently running data aggregators"""
self.data_aggregator.shutdown()
def _launch_loggingserver(self):
""" sets up logging server """
self.logging_status_queue = Queue()
loggingserver = threading.Thread(
target=MPLogger.loggingserver,
args=(self.manager_params['log_file'], self.logging_status_queue)
)
loggingserver.daemon = True
loggingserver.start()
return loggingserver
def _kill_loggingserver(self):
""" terminates logging server gracefully """
self.logging_status_queue.put("DIE")
self.loggingserver.join(300)
def _shutdown_manager(self, during_init=False):
"""
Wait for current commands to finish, close all child processes and
@@ -267,8 +245,8 @@ class TaskManager:
browser.shutdown_browser(during_init)
self.sock.close() # close socket to data aggregator
self._kill_aggregators()
self._kill_loggingserver()
self.data_aggregator.shutdown()
self.logging_server.close()
def _cleanup_before_fail(self, during_init=False):
"""
@@ -288,9 +266,9 @@ class TaskManager:
remote browser manager processes. If a failure status is found, the
appropriate steps are taken to gracefully close the infrastructure
"""
self.logger.debug("Checking command failure status indicator...")
logging.debug("Checking command failure status indicator...")
if self.failure_status:
self.logger.debug(
logging.debug(
"TaskManager failure status set, halting command execution.")
self._cleanup_before_fail()
if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
@@ -324,7 +302,7 @@ class TaskManager:
agg_queue_size = self.data_aggregator.get_most_recent_status()
if agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
while agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
self.logger.info(
logging.info(
"Blocking command submission until the DataAggregator "
"is below the max queue size of %d. Current queue "
"length %d. " % (AGGREGATOR_QUEUE_LIMIT, agg_queue_size)
@@ -384,7 +362,7 @@ class TaskManager:
break
time.sleep(SLEEP_CONS)
else:
self.logger.info(
logging.info(
"Command index type is not supported or out of range")
return
@@ -397,7 +375,7 @@ class TaskManager:
# Check status flags before starting thread
if self.closing:
self.logger.error(
logging.error(
"Attempted to execute command on a closed TaskManager")
return
self._check_failure_status()
@@ -430,7 +408,7 @@ class TaskManager:
reset = command_sequence.reset
if not reset:
self.logger.warn(
logging.warn(
"BROWSER %i: Browser will not reset after CommandSequence "
"executes. OpenWPM does not currently support stateful crawls "
"(see: https://github.com/mozilla/OpenWPM/projects/2). "
@@ -465,7 +443,7 @@ class TaskManager:
if status == "OK":
command_succeeded = 1
elif status[0] == "CRITICAL":
self.logger.critical(
logging.critical(
"BROWSER %i: Received critical error from browser "
"process while executing command %s. Setting failure "
"status." % (browser.crawl_id, str(command)))
@@ -477,12 +455,12 @@ class TaskManager:
return
else:
command_succeeded = 0
self.logger.info(
logging.info(
"BROWSER %i: Received failure status while executing "
"command: %s" % (browser.crawl_id, command[0]))
except EmptyQueue:
command_succeeded = -1
self.logger.info(
logging.info(
"BROWSER %i: Timeout while executing command, %s, killing "
"browser manager" % (browser.crawl_id, command[0]))
@@ -498,7 +476,7 @@ class TaskManager:
with self.threadlock:
self.failurecount += 1
if self.failurecount > self.failure_limit:
self.logger.critical(
logging.critical(
"BROWSER %i: Command execution failure pushes failure "
"count above the allowable limit. Setting "
"failure_status." % browser.crawl_id)
@@ -508,7 +486,7 @@ class TaskManager:
}
return
browser.restart_required = True
self.logger.debug("BROWSER %i: Browser restart required" % (
logging.debug("BROWSER %i: Browser restart required" % (
browser.crawl_id))
else:
with self.threadlock:
@@ -527,7 +505,7 @@ class TaskManager:
if browser.restart_required or reset:
success = browser.restart_browser_manager(clear_profile=reset)
if not success:
self.logger.critical(
logging.critical(
"BROWSER %i: Exceeded the maximum allowable consecutive "
"browser launch failures. Setting failure_status." % (
browser.crawl_id))
@@ -567,6 +545,6 @@ class TaskManager:
Execute shutdown procedure for TaskManager
"""
if self.closing:
self.logger.error("TaskManager already closed")
logging.error("TaskManager already closed")
return
self._shutdown_manager()
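The net effect on teardown is that two bespoke helpers (`_kill_aggregators`, `_kill_loggingserver`) disappear in favor of direct calls. A paraphrase of the resulting `_shutdown_manager`, condensed from the hunks above (the browser container name is assumed):

def _shutdown_manager(self, during_init=False):
    for browser in self.browsers:             # assumed iterable of Browser objects
        browser.shutdown_browser(during_init)
    self.sock.close()                         # close socket to data aggregator
    self.data_aggregator.shutdown()           # replaces _kill_aggregators()
    self.logging_server.close()               # replaces _kill_loggingserver()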

View file

@@ -1,5 +1,6 @@
from __future__ import absolute_import
import logging
import os
import time
@@ -83,20 +84,20 @@ if SENTRY_DSN:
# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host="redis")
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())
logging.info("Worker with sessionID: %s" % job_queue.sessionID())
logging.info("Initial queue state: empty=%s" % job_queue.empty())
# Crawl sites specified in job queue until empty
while not job_queue.empty():
job = job_queue.lease(lease_secs=120, block=True, timeout=5)
if job is None:
manager.logger.info("Waiting for work")
logging.info("Waiting for work")
time.sleep(5)
else:
site_rank, site = job.decode("utf-8").split(',')
if "://" not in site:
site = "http://" + site
manager.logger.info("Visiting %s..." % site)
logging.info("Visiting %s..." % site)
command_sequence = CommandSequence.CommandSequence(
site, reset=True
)
@@ -104,7 +105,7 @@ while not job_queue.empty():
manager.execute_command_sequence(command_sequence)
job_queue.complete(job)
manager.logger.info("Job queue finished, exiting.")
logging.info("Job queue finished, exiting.")
manager.close()
if SENTRY_DSN: