Mirror of https://github.com/openwpm/OpenWPM.git

Integrate new logging class with entire codebase

Parent: ad7d8a3264
Commit: 292cc48715
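This commit drops the custom socket-based logger client (MPLogger.loggingclient) in
favor of Python's standard logging module: MPLogger attaches its console and socket
handlers to the root logger once, so worker code can call logging.debug() and friends
directly. A minimal sketch of the pattern applied throughout the diff below — the
crawl_id and command values are placeholders, and basicConfig stands in for MPLogger's
handler setup:

    import logging

    logging.basicConfig(level=logging.INFO)  # stand-in for MPLogger's handler setup
    crawl_id = 1                             # placeholder browser id
    command = ('GET', 'http://example.com')  # placeholder command tuple

    # Before: logger = loggingclient(*manager_params['logger_address'])
    #         logger.info("BROWSER %i: ..." % crawl_id)
    # After: call the root logger through the stdlib module directly.
    logging.info("BROWSER %i: EXECUTING COMMAND: %s" % (crawl_id, str(command)))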
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 
 import errno
+import logging
 import os
 import shutil
 import signal

@@ -18,7 +19,6 @@ from tblib import pickling_support
 from .Commands import command_executor
 from .DeployBrowsers import deploy_browser
 from .Errors import BrowserConfigError, BrowserCrashError, ProfileLoadError
-from .MPLogger import loggingclient
 from .SocketInterface import clientsocket
 
 pickling_support.install()

@@ -43,7 +43,6 @@ class Browser:
         # manager parameters
         self.current_profile_path = None
         self.db_socket_address = manager_params['aggregator_address']
-        self.logger_address = manager_params['logger_address']
         self.crawl_id = browser_params['crawl_id']
         self.curr_visit_id = None
         self.browser_params = browser_params

@@ -73,7 +72,6 @@ class Browser:
         # dict of additional browser profile settings (e.g. screen_res)
         self.browser_settings = None
         self.browser_manager = None  # process that controls browser
-        self.logger = loggingclient(*self.logger_address)
 
     def ready(self):
         """ return if the browser is ready to accept a command """

@@ -131,7 +129,7 @@ class Browser:
 
         while not success and \
                 unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
-            self.logger.debug("BROWSER %i: Spawn attempt %i " % (
+            logging.debug("BROWSER %i: Spawn attempt %i " % (
                 self.crawl_id, unsuccessful_spawns))
             # Resets the command/status queues
             (self.command_queue, self.status_queue) = (Queue(), Queue())

@@ -161,7 +159,7 @@ class Browser:
 
             (driver_profile_path, ready) = check_queue(launch_status)
             if ready != 'READY':
-                self.logger.error(
+                logging.error(
                     "BROWSER %i: Mismatch of status queue return values, "
                     "trying again..." % self.crawl_id
                 )

@@ -177,7 +175,7 @@ class Browser:
             for string in status_strings:
                 error_string += " | %s: %s " % (
                     string, launch_status.get(string, False))
-            self.logger.error(
+            logging.error(
                 "BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id,
                                                        error_string))
             self.kill_browser_manager()

@@ -188,7 +186,7 @@ class Browser:
         # current profile path class variable and clean up the tempdir
         # and previous profile path.
         if success:
-            self.logger.debug(
+            logging.debug(
                 "BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
             previous_profile_path = self.current_profile_path
             self.current_profile_path = driver_profile_path

@@ -206,11 +204,11 @@ class Browser:
         kill and restart the two worker processes
         <clear_profile> marks whether we want to wipe the old profile
         """
-        self.logger.info("BROWSER %i: BrowserManager restart initiated. "
-                         "Clear profile? %s" % (self.crawl_id, clear_profile))
+        logging.info("BROWSER %i: BrowserManager restart initiated. "
+                     "Clear profile? %s" % (self.crawl_id, clear_profile))
         if self.is_fresh:  # Return success if browser is fresh
-            self.logger.info("BROWSER %i: Skipping restart since the browser "
-                             "is a fresh instance already" % self.crawl_id)
+            logging.info("BROWSER %i: Skipping restart since the browser "
+                         "is a fresh instance already" % self.crawl_id)
             return True
 
         self.kill_browser_manager()

@@ -225,7 +223,7 @@ class Browser:
 
     def kill_browser_manager(self):
         """Kill the BrowserManager process and all of its children"""
-        self.logger.debug(
+        logging.debug(
             "BROWSER %i: Attempting to kill BrowserManager with pid %i. "
             "Display PID: %s | Display Port: %s | Browser PID: %s" % (
                 self.crawl_id, self.browser_manager.pid, self.display_pid,

@@ -236,27 +234,27 @@ class Browser:
         try:
             os.kill(self.browser_manager.pid, signal.SIGKILL)
         except OSError:
-            self.logger.debug("BROWSER %i: Browser manager process does "
-                              "not exist" % self.crawl_id)
+            logging.debug("BROWSER %i: Browser manager process does "
+                          "not exist" % self.crawl_id)
             pass
         if self.display_pid is not None:
             try:
                 os.kill(self.display_pid, signal.SIGKILL)
             except OSError:
-                self.logger.debug("BROWSER %i: Display process does not "
-                                  "exit" % self.crawl_id)
+                logging.debug("BROWSER %i: Display process does not "
+                              "exit" % self.crawl_id)
                 pass
             except TypeError:
-                self.logger.error("BROWSER %i: PID may not be the correct "
-                                  "type %s" % (self.crawl_id,
-                                               str(self.display_pid)))
+                logging.error("BROWSER %i: PID may not be the correct "
+                              "type %s" % (self.crawl_id,
+                                           str(self.display_pid)))
         if self.display_port is not None:  # xvfb diplay lock
             lockfile = "/tmp/.X%s-lock" % self.display_port
             try:
                 os.remove(lockfile)
             except OSError:
-                self.logger.debug("BROWSER %i: Screen lockfile (%s) already "
-                                  "removed" % (self.crawl_id, lockfile))
+                logging.debug("BROWSER %i: Screen lockfile (%s) already "
+                              "removed" % (self.crawl_id, lockfile))
                 pass
         if self.browser_pid is not None:
             """`browser_pid` is the geckodriver process. We first kill

@@ -268,7 +266,7 @@ class Browser:
                 try:
                     child.kill()
                 except psutil.NoSuchProcess:
-                    self.logger.debug(
+                    logging.debug(
                         "BROWSER %i: Geckodriver child process already "
                         "killed (pid=%i)." % (self.crawl_id, child.pid))
                     pass

@@ -277,53 +275,53 @@ class Browser:
                 for child in geckodriver.children():
                     child.wait(timeout=20)
             except psutil.NoSuchProcess:
-                self.logger.debug("BROWSER %i: Geckodriver process already "
-                                  "killed." % self.crawl_id)
+                logging.debug("BROWSER %i: Geckodriver process already "
+                              "killed." % self.crawl_id)
                 pass
             except psutil.TimeoutExpired:
-                self.logger.debug("BROWSER %i: Timeout while waiting for "
-                                  "geckodriver or browser process to close " %
-                                  self.crawl_id)
+                logging.debug("BROWSER %i: Timeout while waiting for "
+                              "geckodriver or browser process to close " %
+                              self.crawl_id)
                 pass
 
     def shutdown_browser(self, during_init):
         """ Runs the closing tasks for this Browser/BrowserManager """
         # Join command thread
         if self.command_thread is not None:
-            self.logger.debug(
+            logging.debug(
                 "BROWSER %i: Joining command thread" % self.crawl_id)
             start_time = time.time()
             if self.current_timeout is not None:
                 self.command_thread.join(self.current_timeout + 10)
             else:
                 self.command_thread.join(60)
-            self.logger.debug(
+            logging.debug(
                 "BROWSER %i: %f seconds to join command thread" % (
                     self.crawl_id, time.time() - start_time))
 
         # Kill BrowserManager process and children
-        self.logger.debug(
+        logging.debug(
             "BROWSER %i: Killing browser manager..." % self.crawl_id)
         self.kill_browser_manager()
 
         # Archive browser profile (if requested)
         if not during_init and \
                 self.browser_params['profile_archive_dir'] is not None:
-            self.logger.warn(
+            logging.warn(
                 "BROWSER %i: Archiving the browser profile directory is "
                 "currently unsupported. "
                 "See: https://github.com/mozilla/OpenWPM/projects/2" %
                 self.crawl_id
             )
         """
-        self.logger.debug(
+        logging.debug(
             "BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
                 self.crawl_id, str(during_init),
                 self.browser_params['profile_archive_dir'])
         )
         if (not during_init and
                 self.browser_params['profile_archive_dir'] is not None):
-            self.logger.debug(
+            logging.debug(
                 "BROWSER %i: Archiving browser profile directory to %s" % (
                     self.crawl_id, self.browser_params['profile_archive_dir']))
             profile_commands.dump_profile(

@@ -353,8 +351,6 @@ def BrowserManager(command_queue, status_queue, browser_params,
     to the TaskManager.
     """
     try:
-        logger = loggingclient(*manager_params['logger_address'])
-
         # Start the virtualdisplay (if necessary), webdriver, and browser
         driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
             status_queue, browser_params, manager_params, crash_recovery)

@@ -365,8 +361,8 @@ def BrowserManager(command_queue, status_queue, browser_params,
         # TODO: Initial communication from extension to TM should use sockets
         if browser_params['browser'] == 'firefox' and \
                 browser_params['extension_enabled']:
-            logger.debug("BROWSER %i: Looking for extension port information "
-                         "in %s" % (browser_params['crawl_id'], prof_folder))
+            logging.debug("BROWSER %i: Looking for extension port information "
+                          "in %s" % (browser_params['crawl_id'], prof_folder))
             elapsed = 0
             port = None
             ep_filename = os.path.join(prof_folder, 'extension_port.txt')

@@ -385,14 +381,14 @@ def BrowserManager(command_queue, status_queue, browser_params,
             with open(ep_filename, 'rt') as f:
                 port = int(f.read().strip())
 
-            logger.debug("BROWSER %i: Connecting to extension on port %i" % (
+            logging.debug("BROWSER %i: Connecting to extension on port %i" % (
                 browser_params['crawl_id'], port))
             extension_socket = clientsocket(serialization='json')
             extension_socket.connect('127.0.0.1', int(port))
         else:
             extension_socket = None
 
-        logger.debug(
+        logging.debug(
             "BROWSER %i: BrowserManager ready." % browser_params['crawl_id'])
 
         # passes the profile folder, WebDriver pid and display pid back to the

@@ -410,7 +406,7 @@ def BrowserManager(command_queue, status_queue, browser_params,
             # reads in the command tuple of form:
             # (command, arg0, arg1, arg2, ..., argN) where N is variable
             command = command_queue.get()
-            logger.info("BROWSER %i: EXECUTING COMMAND: %s" % (
+            logging.info("BROWSER %i: EXECUTING COMMAND: %s" % (
                 browser_params['crawl_id'], str(command)))
             # attempts to perform an action and return an OK signal
             # if command fails for whatever reason, tell the TaskManager to

@@ -421,14 +417,16 @@ def BrowserManager(command_queue, status_queue, browser_params,
             status_queue.put("OK")
 
     except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
-        logger.error("BROWSER %i: %s thrown, informing parent and raising" % (
+        logging.error("BROWSER %i: %s thrown, informing parent and raising" % (
             browser_params['crawl_id'], e.__class__.__name__))
         err_info = sys.exc_info()
         status_queue.put(('CRITICAL', pickle.dumps(err_info)))
         return
     except Exception:
         excp = traceback.format_exception(*sys.exc_info())
-        logger.error("BROWSER %i: Crash in driver, restarting browser manager "
-                     "\n %s" % (browser_params['crawl_id'], ''.join(excp)))
+        logging.error(
+            "BROWSER %i: Crash in driver, restarting browser manager "
+            "\n %s" % (browser_params['crawl_id'], ''.join(excp))
+        )
         status_queue.put(('FAILED', None))
         return
@@ -2,6 +2,7 @@ from __future__ import absolute_import
 
 import gzip
 import json
+import logging
 import os
 import random
 import sys

@@ -18,7 +19,6 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 from six.moves import range
 
-from ..MPLogger import loggingclient
 from ..SocketInterface import clientsocket
 from .utils.lso import get_flash_cookies
 from .utils.webdriver_extensions import (execute_in_all_frames,

@@ -152,9 +152,6 @@ def browse_website(url, num_links, sleep, visit_id, webdriver,
     get_website(url, sleep, visit_id, webdriver,
                 browser_params, extension_socket)
 
-    # Connect to logger
-    logger = loggingclient(*manager_params['logger_address'])
-
     # Then visit a few subpages
     for _ in range(num_links):
         links = [x for x in get_intra_links(webdriver, url)

@@ -162,7 +159,7 @@ def browse_website(url, num_links, sleep, visit_id, webdriver,
         if not links:
             break
         r = int(random.random() * len(links))
-        logger.info("BROWSER %i: visiting internal link %s" % (
+        logging.info("BROWSER %i: visiting internal link %s" % (
             browser_params['crawl_id'], links[r].get_attribute("href")))
 
         try:

@@ -214,7 +211,7 @@ def save_screenshot(visit_id, crawl_id, driver, manager_params, suffix=''):
     driver.save_screenshot(outname)
 
 
-def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
+def _stitch_screenshot_parts(visit_id, crawl_id, manager_params):
     # Read image parts and compute dimensions of output image
     total_height = -1
     max_scroll = -1

@@ -260,7 +257,7 @@ def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
     try:
         output.save(outname)
     except SystemError:
-        logger.error(
+        logging.error(
             "BROWSER %i: SystemError while trying to save screenshot %s. \n"
             "Slices of image %s \n Final size %s, %s." %
             (crawl_id, outname, '\n'.join([str(x) for x in parts]),

@@ -271,7 +268,6 @@ def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
 
 def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
                          suffix=''):
-    logger = loggingclient(*manager_params['logger_address'])
 
     outdir = os.path.join(manager_params['screenshot_path'], 'parts')
     if not os.path.isdir(outdir):

@@ -299,7 +295,7 @@ def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
         try:
             driver.execute_script('window.scrollBy(0, window.innerHeight)')
         except WebDriverException:
-            logger.info(
+            logging.info(
                 "BROWSER %i: WebDriverException while scrolling, "
                 "screenshot may be misaligned!" % crawl_id)
             pass

@@ -314,12 +310,12 @@ def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
             driver.save_screenshot(outname % (part, curr_scrollY))
         except WebDriverException:
             excp = traceback.format_exception(*sys.exc_info())
-            logger.error(
+            logging.error(
                 "BROWSER %i: Exception while taking full page screenshot \n %s" %
                 (crawl_id, ''.join(excp)))
             return
 
-    _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params)
+    _stitch_screenshot_parts(visit_id, crawl_id, manager_params)
 
 
 def dump_page_source(visit_id, driver, manager_params, suffix=''):
@@ -1,5 +1,6 @@
 from __future__ import absolute_import
 
+import logging
 import os
 import shutil
 import tarfile

@@ -7,7 +8,6 @@ import tarfile
 from six.moves import cPickle as pickle
 
 from ..Errors import ProfileLoadError
-from ..MPLogger import loggingclient
 from .utils.file_utils import rmsubtree
 from .utils.firefox_profile import sleep_until_sqlite_checkpoint
 

@@ -43,7 +43,7 @@ def load_browser_settings(location):
     return browser_settings
 
 
-def save_flash_files(logger, browser_params, dump_location, clear=False):
+def save_flash_files(browser_params, dump_location, clear=False):
     """
     save all files from the default flash storage locations
     clear: sets whether to clear storage locations after backup

@@ -54,13 +54,13 @@ def save_flash_files(logger, browser_params, dump_location, clear=False):
     # Copy all flash objects over to dump location
     for location in FLASH_LOCS:
         if not os.path.isdir(location):
-            logger.warning("BROWSER %i: %s not found when attempting to save "
-                           "flash files, skipping..." %
-                           (browser_params['crawl_id'], location))
+            logging.warning("BROWSER %i: %s not found when attempting to save "
+                            "flash files, skipping..." %
+                            (browser_params['crawl_id'], location))
             continue
 
-        logger.debug("BROWSER %i: SAVING %s during flash file archive" %
-                     (browser_params['crawl_id'], location))
+        logging.debug("BROWSER %i: SAVING %s during flash file archive" %
+                      (browser_params['crawl_id'], location))
         (head, tail) = os.path.split(location)
 
         # Remove old backups if exist

@@ -71,23 +71,23 @@ def save_flash_files(logger, browser_params, dump_location, clear=False):
         shutil.copytree(location, os.path.join(dump_location, tail))
 
         if clear:
-            logger.debug("BROWSER %i: CLEARING %s during flash file archive" %
-                         (browser_params['crawl_id'], location))
+            logging.debug("BROWSER %i: CLEARING %s during flash file archive" %
+                          (browser_params['crawl_id'], location))
             rmsubtree(location)
 
 
-def load_flash_files(logger, browser_params, tar_location):
+def load_flash_files(browser_params, tar_location):
     """ clear old flash cookies and load ones from dump """
     # Clear previous objects prior to loading
     for location in FLASH_LOCS:
         if not os.path.isdir(location):
-            logger.warning("BROWSER %i: %s not found when attempting to load "
-                           "flash files, skipping..." %
-                           (browser_params['crawl_id'], location))
+            logging.warning("BROWSER %i: %s not found when attempting to load "
+                            "flash files, skipping..." %
+                            (browser_params['crawl_id'], location))
             continue
 
-        logger.debug("BROWSER %i: CLEARING %s before loading flash files" %
-                     (browser_params['crawl_id'], location))
+        logging.debug("BROWSER %i: CLEARING %s before loading flash files" %
+                      (browser_params['crawl_id'], location))
         shutil.rmtree(location)
 
     # Copy flash storage objects from tar_location

@@ -95,7 +95,7 @@ def load_flash_files(logger, browser_params, tar_location):
         if os.path.exists(os.path.join(tar_location, tail)):
             shutil.copytree(os.path.join(tar_location, tail), location)
         else:
-            logger.warning(
+            logging.warning(
                 "BROWSER %i: %s not found while loading flash "
                 "files, skipping..." %
                 (browser_params['crawl_id'], os.path.join(tar_location, tail)))

@@ -111,12 +111,9 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
     if <browser_settings> exists they are also saved
     <save_flash> specifies whether to dump flash files
     """
-    # Connect to logger
-    logger = loggingclient(*manager_params['logger_address'])
-
-    logger.debug("BROWSER %i: Profile dumping is currently unsupported. "
-                 "See: https://github.com/mozilla/OpenWPM/projects/2." %
-                 browser_params['crawl_id'])
+    logging.debug("BROWSER %i: Profile dumping is currently unsupported. "
+                  "See: https://github.com/mozilla/OpenWPM/projects/2." %
+                  browser_params['crawl_id'])
     return
 
     # ensures that folder paths end with slashes

@@ -148,7 +145,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
         tar = tarfile.open(tar_location + tar_name, 'w:gz', errorlevel=1)
     else:
         tar = tarfile.open(tar_location + tar_name, 'w', errorlevel=1)
-    logger.debug(
+    logging.debug(
         "BROWSER %i: Backing up full profile from %s to %s" %
         (browser_params['crawl_id'], browser_profile_folder,
         tar_location + tar_name)

@@ -173,7 +170,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
         if not os.path.isfile(full_path) and \
                 full_path[-3:] != 'shm' and \
                 full_path[-3:] != 'wal':
-            logger.critical(
+            logging.critical(
                 "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                 (browser_params['crawl_id'], full_path))
         elif not os.path.isfile(full_path) and \

@@ -183,7 +180,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
     for item in storage_vector_dirs:
         full_path = os.path.join(browser_profile_folder, item)
         if not os.path.isdir(full_path):
-            logger.warning(
+            logging.warning(
                 "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                 (browser_params['crawl_id'], full_path))
             continue

@@ -192,7 +189,7 @@ def dump_profile(browser_profile_folder, manager_params, browser_params,
 
     # save flash cookies
     if save_flash:
-        save_flash_files(logger, browser_params, tar_location)
+        save_flash_files(browser_params, tar_location)
 
     # save the browser settings
     if browser_settings is not None:

@@ -207,9 +204,6 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
     is in the folder, either full_profile.tar.gz or profile.tar.gz
     """
     try:
-        # Connect to logger
-        logger = loggingclient(*manager_params['logger_address'])
-
         # ensures that folder paths end with slashes
         if browser_profile_folder[-1] != '/':
             browser_profile_folder = browser_profile_folder + "/"

@@ -222,7 +216,7 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
             tar_name = 'profile.tar'
 
         # Copy and untar the loaded profile
-        logger.debug(
+        logging.debug(
            "BROWSER %i: Copying profile tar from %s to %s" %
            (browser_params['crawl_id'], tar_location + tar_name,
            browser_profile_folder)

@@ -238,17 +232,17 @@ def load_profile(browser_profile_folder, manager_params, browser_params,
         f.extractall(browser_profile_folder)
         f.close()
         os.remove(browser_profile_folder + tar_name)
-        logger.debug(
+        logging.debug(
             "BROWSER %i: Tarfile extracted" % browser_params['crawl_id'])
 
         # clear and load flash cookies
         if load_flash:
-            load_flash_files(logger, browser_params, tar_location)
+            load_flash_files(browser_params, tar_location)
 
         # load the browser settings
         browser_settings = load_browser_settings(tar_location)
     except Exception as ex:
-        logger.critical(
+        logging.critical(
             "BROWSER %i: Error: %s while attempting to load profile" %
             (browser_params['crawl_id'], str(ex)))
         raise ProfileLoadError('Profile Load not successful')
@@ -1,10 +1,10 @@
 import abc
+import logging
 import time
 
 from multiprocess import Process, Queue
 from six.moves import queue
 
-from ..MPLogger import loggingclient
 from ..SocketInterface import serversocket
 
 RECORD_TYPE_CONTENT = 'page_content'

@@ -33,7 +33,6 @@ class BaseListener(object):
     def __init__(self, status_queue, shutdown_queue, manager_params):
         self.status_queue = status_queue
         self.shutdown_queue = shutdown_queue
-        self.logger = loggingclient(*manager_params['logger_address'])
         self._shutdown_flag = False
         self._last_update = time.time()  # last status update time
         self.record_queue = None  # Initialized on `startup`

@@ -71,7 +70,7 @@ class BaseListener(object):
         """Return `True` if the listener has received a shutdown signal"""
         if not self.shutdown_queue.empty():
             self.shutdown_queue.get()
-            self.logger.info("Received shutdown signal!")
+            logging.info("Received shutdown signal!")
             return True
         return False
 

@@ -81,7 +80,7 @@ class BaseListener(object):
             return
         qsize = self.record_queue.qsize()
         self.status_queue.put(qsize)
-        self.logger.debug(
+        logging.debug(
             "Status update; current record queue size: %d" % qsize)
         self._last_update = time.time()
 

@@ -116,7 +115,6 @@ class BaseAggregator(object):
     def __init__(self, manager_params, browser_params):
         self.manager_params = manager_params
         self.browser_params = browser_params
-        self.logger = loggingclient(*manager_params['logger_address'])
         self.listener_address = None
         self.listener_process = None
         self.status_queue = Queue()

@@ -184,14 +182,14 @@ class BaseAggregator(object):
 
     def shutdown(self):
         """ Terminate the aggregator listener process"""
-        self.logger.debug(
+        logging.debug(
             "Sending the shutdown signal to the %s listener process..." %
             type(self).__name__
         )
         self.shutdown_queue.put(SHUTDOWN_SIGNAL)
         start_time = time.time()
         self.listener_process.join(300)
-        self.logger.debug(
+        logging.debug(
             "%s took %s seconds to close." % (
                 type(self).__name__,
                 str(time.time() - start_time)
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 
 import json
+import logging
 import os.path
 import random
 import sys

@@ -9,7 +10,6 @@ from pyvirtualdisplay import Display
 from selenium import webdriver
 
 from ..Commands.profile_commands import load_profile
-from ..MPLogger import loggingclient
 from ..utilities.platform_utils import (get_firefox_binary_path,
                                         get_geckodriver_exec_path)
 from . import configure_firefox

@@ -28,7 +28,6 @@ def deploy_firefox(status_queue, browser_params, manager_params,
     geckodriver_executable_path = get_geckodriver_exec_path()
 
     root_dir = os.path.dirname(__file__)  # directory of this file
-    logger = loggingclient(*manager_params['logger_address'])
 
     display_pid = None
     display_port = None

@@ -43,9 +42,9 @@ def deploy_firefox(status_queue, browser_params, manager_params,
 
     profile_settings = None  # Imported browser settings
     if browser_params['profile_tar'] and not crash_recovery:
-        logger.debug("BROWSER %i: Loading initial browser profile from: %s"
-                     % (browser_params['crawl_id'],
-                        browser_params['profile_tar']))
+        logging.debug("BROWSER %i: Loading initial browser profile from: %s"
+                      % (browser_params['crawl_id'],
+                         browser_params['profile_tar']))
         load_flash = browser_params['disable_flash'] is False
         profile_settings = load_profile(browser_profile_path,
                                         manager_params,

@@ -53,9 +52,9 @@ def deploy_firefox(status_queue, browser_params, manager_params,
                                         browser_params['profile_tar'],
                                         load_flash=load_flash)
     elif browser_params['profile_tar']:
-        logger.debug("BROWSER %i: Loading recovered browser profile from: %s"
-                     % (browser_params['crawl_id'],
-                        browser_params['profile_tar']))
+        logging.debug("BROWSER %i: Loading recovered browser profile from: %s"
+                      % (browser_params['crawl_id'],
+                         browser_params['profile_tar']))
         profile_settings = load_profile(browser_profile_path,
                                         manager_params,
                                         browser_params,

@@ -63,8 +62,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
     status_queue.put(('STATUS', 'Profile Tar', None))
 
     if browser_params['random_attributes'] and profile_settings is None:
-        logger.debug("BROWSER %i: Loading random attributes for browser"
-                     % browser_params['crawl_id'])
+        logging.debug("BROWSER %i: Loading random attributes for browser"
+                      % browser_params['crawl_id'])
         profile_settings = dict()
 
         # choose a random screen-res from list

@@ -88,15 +87,15 @@ def deploy_firefox(status_queue, browser_params, manager_params,
             profile_settings['ua_string'] = None
 
     if profile_settings['ua_string'] is not None:
-        logger.debug("BROWSER %i: Overriding user agent string to '%s'"
-                     % (browser_params['crawl_id'],
-                        profile_settings['ua_string']))
+        logging.debug("BROWSER %i: Overriding user agent string to '%s'"
+                      % (browser_params['crawl_id'],
+                         profile_settings['ua_string']))
         fo.set_preference("general.useragent.override",
                           profile_settings['ua_string'])
 
     if browser_params['headless']:
         if sys.platform == 'darwin':
-            logger.warn(
+            logging.warn(
                 "BROWSER %i: headless mode is not supported on MacOS. "
                 "Browser window will be visible." % browser_params['crawl_id']
             )

@@ -122,8 +121,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
         ext_config_file = browser_profile_path + 'browser_params.json'
         with open(ext_config_file, 'w') as f:
             json.dump(extension_config, f)
-        logger.debug("BROWSER %i: Saved extension config file to: %s" %
-                     (browser_params['crawl_id'], ext_config_file))
+        logging.debug("BROWSER %i: Saved extension config file to: %s" %
+                      (browser_params['crawl_id'], ext_config_file))
 
         # TODO restore detailed logging
         # fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all")

@@ -146,13 +145,13 @@ def deploy_firefox(status_queue, browser_params, manager_params,
     # main logger. This will also inform us where the real profile
     # directory is hiding.
     interceptor = FirefoxLogInterceptor(
-        browser_params['crawl_id'], logger, browser_profile_path)
+        browser_params['crawl_id'], browser_profile_path)
     interceptor.start()
 
     # Set custom prefs. These are set after all of the default prefs to allow
     # our defaults to be overwritten.
     for name, value in browser_params['prefs'].items():
-        logger.info(
+        logging.info(
             "BROWSER %i: Setting custom preference: %s = %s" %
             (browser_params['crawl_id'], name, value))
         fo.set_preference(name, value)

@@ -171,8 +170,8 @@ def deploy_firefox(status_queue, browser_params, manager_params,
         ext_loc = os.path.join(root_dir, '../Extension/firefox/openwpm.xpi')
         ext_loc = os.path.normpath(ext_loc)
         driver.install_addon(ext_loc, temporary=True)
-        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded"
-                     % browser_params['crawl_id'])
+        logging.debug("BROWSER %i: OpenWPM Firefox extension loaded"
+                      % browser_params['crawl_id'])
 
         # set window size
         driver.set_window_size(*profile_settings['screen_res'])
@@ -6,6 +6,7 @@ from __future__ import absolute_import
 
 import errno
 import json
+import logging
 import os
 import sys
 import tempfile

@@ -58,10 +59,9 @@ class FirefoxLogInterceptor(threading.Thread):
     from geckodriver's log output (geckodriver copies the profile).
     """
 
-    def __init__(self, crawl_id, logger, profile_path):
+    def __init__(self, crawl_id, profile_path):
         threading.Thread.__init__(self, name="log-interceptor-%i" % crawl_id)
         self.crawl_id = crawl_id
-        self.logger = logger
         self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_")
         self.profile_path = profile_path
         self.daemon = True

@@ -73,8 +73,8 @@ class FirefoxLogInterceptor(threading.Thread):
         try:
             with open(self.fifo, "rt") as f:
                 for line in f:
-                    self.logger.debug("BROWSER %i: driver: %s"
-                                      % (self.crawl_id, line.strip()))
+                    logging.debug("BROWSER %i: driver: %s" %
+                                  (self.crawl_id, line.strip()))
                     if "Using profile path" in line:
                         self.profile_path = \
                             line.partition("Using profile path")[-1].strip()
@@ -77,7 +77,7 @@ class MPLogger(object):
         )
         self._listener.daemon = True
         self._listener.start()
-        self._listener_address = self._status_queue.get()
+        self.logger_address = self._status_queue.get()
 
         # Attach console handler to log to console
         logger = logging.getLogger()

@@ -90,7 +90,7 @@ class MPLogger(object):
         logger.addHandler(consoleHandler)
 
         # Attach socket handler to logger to serialize writes to file
-        socketHandler = ClientSocketHandler(*self._listener_address)
+        socketHandler = ClientSocketHandler(*self.logger_address)
         socketHandler.setLevel(logging.DEBUG)
         logger.addHandler(socketHandler)
 

@@ -104,17 +104,10 @@ class MPLogger(object):
     def _initialize_sentry(self):
        """If running a cloud crawl, we can pull the sentry endpoint
        and related config varibles from the environment"""
-        print("**********")
        self._breadcrumb_handler = BreadcrumbHandler(level=logging.DEBUG)
        self._event_handler = EventHandler(level=logging.ERROR)
 
-        # sentry_logging = LoggingIntegration(
-        #     level=logging.DEBUG,
-        #     event_level=logging.ERROR
-        # )
        sentry_sdk.init(
            dsn=self._sentry_dsn,
-            # integrations=[sentry_logging],
-            # before_send=self._sentry_before_send
        )
 

@@ -177,7 +170,7 @@ class MPLogger(object):
             self._breadcrumb_handler.handle(record)
         if record.levelno >= self._event_handler.level:
             self._event_handler.handle(record)
 
     def close(self):
         self._status_queue.put("SHUTDOWN")
         self._listener.join()
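The MPLogger hunks above show the wiring that makes the plain logging.* calls work:
handlers are attached to the root logger obtained from logging.getLogger(). A rough
stdlib-only approximation — logging.handlers.SocketHandler stands in for OpenWPM's
ClientSocketHandler, and ('localhost', 9020) is a made-up listener address:

    import logging
    import logging.handlers

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Console handler, as MPLogger attaches for terminal output
    console = logging.StreamHandler()
    root.addHandler(console)

    # Socket handler that serializes records to a central listener process;
    # the real address comes off MPLogger's status queue (self.logger_address
    # in the diff above), not a hard-coded port.
    socket_handler = logging.handlers.SocketHandler('localhost', 9020)
    socket_handler.setLevel(logging.DEBUG)
    root.addHandler(socket_handler)

    logging.debug("this record now flows to every attached handler")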
@@ -2,10 +2,10 @@ from __future__ import absolute_import, division
 
 import copy
 import json
+import logging
 import os
 import threading
 import time
-from queue import Queue
 
 import psutil
 from six import reraise

@@ -107,13 +107,11 @@ class TaskManager:
 
         self.process_watchdog = process_watchdog
 
-        # sets up logging server + connect a client
-        self.logging_status_queue = None
-        self.loggingserver = self._launch_loggingserver()
-        # socket location: (address, port)
-        self.manager_params['logger_address'] = self.logging_status_queue.get()
-        self.logger = MPLogger.loggingclient(
-            *self.manager_params['logger_address'])
+        # Start logging server thread
+        self.logging_server = MPLogger.MPLogger(
+            self.manager_params['log_file'])
+        self.manager_params[
+            'logger_address'] = self.logging_server.logger_address
 
         # Initialize the data aggregators
         self._launch_aggregators()

@@ -130,7 +128,7 @@ class TaskManager:
         # Save crawl config information to database
         openwpm_v, browser_v = get_version()
         self.data_aggregator.save_configuration(openwpm_v, browser_v)
-        self.logger.info(
+        logging.info(
             get_configuration_string(
                 self.manager_params, browser_params, (openwpm_v, browser_v)
             )

@@ -156,8 +154,8 @@ class TaskManager:
                 raise
 
             if not success:
-                self.logger.critical("Browser spawn failure during "
-                                     "TaskManager initialization, exiting...")
+                logging.critical("Browser spawn failure during "
+                                 "TaskManager initialization, exiting...")
                 self.close()
                 break
 

@@ -171,8 +169,8 @@ class TaskManager:
         kill browser processes started by Selenium 3 (with `subprocess`)
         """
         if self.process_watchdog:
-            self.logger.error("BROWSER %i: Process watchdog is not currently "
-                              "supported." % self.crawl_id)
+            logging.error("BROWSER %i: Process watchdog is not currently "
+                          "supported." % self.crawl_id)
         while not self.closing:
             time.sleep(10)
 

@@ -182,10 +180,10 @@ class TaskManager:
                     process = psutil.Process(browser.browser_pid)
                     mem = process.memory_info()[0] / float(2 ** 20)
                     if mem > BROWSER_MEMORY_LIMIT:
-                        self.logger.info("BROWSER %i: Memory usage: %iMB"
-                                         ", exceeding limit of %iMB" % (
-                                             browser.crawl_id, int(mem),
-                                             BROWSER_MEMORY_LIMIT))
+                        logging.info("BROWSER %i: Memory usage: %iMB"
+                                     ", exceeding limit of %iMB" %
+                                     (browser.crawl_id, int(mem),
+                                      BROWSER_MEMORY_LIMIT))
                         browser.restart_required = True
                 except psutil.NoSuchProcess:
                     pass

@@ -208,11 +206,11 @@ class TaskManager:
                          process.pid not in browser_pids) or
                         (process.name() == 'Xvfb' and
                          process.pid not in display_pids))):
-                    self.logger.debug("Process: %s (pid: %i) with start "
-                                      "time %s found running but not in "
-                                      "browser process list. Killing." % (
-                                          process.name(), process.pid,
-                                          process.create_time()))
+                    logging.debug("Process: %s (pid: %i) with start "
+                                  "time %s found running but not in "
+                                  "browser process list. Killing." % (
+                                      process.name(), process.pid,
+                                      process.create_time()))
                     process.kill()
 
     def _launch_aggregators(self):

@@ -234,26 +232,6 @@ class TaskManager:
         self.sock = clientsocket(serialization='dill')
         self.sock.connect(*self.manager_params['aggregator_address'])
 
-    def _kill_aggregators(self):
-        """Shutdown any currently running data aggregators"""
-        self.data_aggregator.shutdown()
-
-    def _launch_loggingserver(self):
-        """ sets up logging server """
-        self.logging_status_queue = Queue()
-        loggingserver = threading.Thread(
-            target=MPLogger.loggingserver,
-            args=(self.manager_params['log_file'], self.logging_status_queue)
-        )
-        loggingserver.daemon = True
-        loggingserver.start()
-        return loggingserver
-
-    def _kill_loggingserver(self):
-        """ terminates logging server gracefully """
-        self.logging_status_queue.put("DIE")
-        self.loggingserver.join(300)
-
     def _shutdown_manager(self, during_init=False):
         """
         Wait for current commands to finish, close all child processes and

@@ -267,8 +245,8 @@ class TaskManager:
             browser.shutdown_browser(during_init)
 
         self.sock.close()  # close socket to data aggregator
-        self._kill_aggregators()
-        self._kill_loggingserver()
+        self.data_aggregator.shutdown()
+        self.logging_server.close()
 
     def _cleanup_before_fail(self, during_init=False):
         """

@@ -288,9 +266,9 @@ class TaskManager:
         remote browser manager processes. If a failure status is found, the
         appropriate steps are taken to gracefully close the infrastructure
         """
-        self.logger.debug("Checking command failure status indicator...")
+        logging.debug("Checking command failure status indicator...")
         if self.failure_status:
-            self.logger.debug(
+            logging.debug(
                 "TaskManager failure status set, halting command execution.")
             self._cleanup_before_fail()
             if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':

@@ -324,7 +302,7 @@ class TaskManager:
         agg_queue_size = self.data_aggregator.get_most_recent_status()
         if agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
             while agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
-                self.logger.info(
+                logging.info(
                     "Blocking command submission until the DataAggregator "
                     "is below the max queue size of %d. Current queue "
                     "length %d. " % (AGGREGATOR_QUEUE_LIMIT, agg_queue_size)

@@ -384,7 +362,7 @@ class TaskManager:
                 break
             time.sleep(SLEEP_CONS)
         else:
-            self.logger.info(
+            logging.info(
                 "Command index type is not supported or out of range")
             return
 

@@ -397,7 +375,7 @@ class TaskManager:
 
         # Check status flags before starting thread
         if self.closing:
-            self.logger.error(
+            logging.error(
                 "Attempted to execute command on a closed TaskManager")
             return
         self._check_failure_status()

@@ -430,7 +408,7 @@ class TaskManager:
 
         reset = command_sequence.reset
         if not reset:
-            self.logger.warn(
+            logging.warn(
                 "BROWSER %i: Browser will not reset after CommandSequence "
                 "executes. OpenWPM does not currently support stateful crawls "
                 "(see: https://github.com/mozilla/OpenWPM/projects/2). "

@@ -465,7 +443,7 @@ class TaskManager:
             if status == "OK":
                 command_succeeded = 1
             elif status[0] == "CRITICAL":
-                self.logger.critical(
+                logging.critical(
                     "BROWSER %i: Received critical error from browser "
                     "process while executing command %s. Setting failure "
                     "status." % (browser.crawl_id, str(command)))

@@ -477,12 +455,12 @@ class TaskManager:
                 return
             else:
                 command_succeeded = 0
-                self.logger.info(
+                logging.info(
                     "BROWSER %i: Received failure status while executing "
                     "command: %s" % (browser.crawl_id, command[0]))
         except EmptyQueue:
             command_succeeded = -1
-            self.logger.info(
+            logging.info(
                 "BROWSER %i: Timeout while executing command, %s, killing "
                 "browser manager" % (browser.crawl_id, command[0]))
 

@@ -498,7 +476,7 @@ class TaskManager:
             with self.threadlock:
                 self.failurecount += 1
             if self.failurecount > self.failure_limit:
-                self.logger.critical(
+                logging.critical(
                     "BROWSER %i: Command execution failure pushes failure "
                     "count above the allowable limit. Setting "
                     "failure_status." % browser.crawl_id)

@@ -508,7 +486,7 @@ class TaskManager:
                 }
                 return
             browser.restart_required = True
-            self.logger.debug("BROWSER %i: Browser restart required" % (
+            logging.debug("BROWSER %i: Browser restart required" % (
                 browser.crawl_id))
         else:
             with self.threadlock:

@@ -527,7 +505,7 @@ class TaskManager:
         if browser.restart_required or reset:
             success = browser.restart_browser_manager(clear_profile=reset)
             if not success:
-                self.logger.critical(
+                logging.critical(
                     "BROWSER %i: Exceeded the maximum allowable consecutive "
                     "browser launch failures. Setting failure_status." % (
                         browser.crawl_id))

@@ -567,6 +545,6 @@ class TaskManager:
         Execute shutdown procedure for TaskManager
         """
         if self.closing:
-            self.logger.error("TaskManager already closed")
+            logging.error("TaskManager already closed")
             return
         self._shutdown_manager()
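With _launch_loggingserver() and _kill_loggingserver() deleted above, the TaskManager's
logging lifecycle reduces to constructing and closing one MPLogger object. A sketch of
the new flow, assuming only the interface visible in this diff (the constructor takes
the log file path and the object exposes logger_address and close()); the import path
and the params dict are illustrative, not taken from the commit:

    from automation import MPLogger  # module path assumed from context

    manager_params = {'log_file': '/tmp/openwpm.log'}  # placeholder params

    # Startup: replaces _launch_loggingserver() and the loggingclient
    logging_server = MPLogger.MPLogger(manager_params['log_file'])
    manager_params['logger_address'] = logging_server.logger_address

    # ... crawl runs; components call logging.info/debug/error directly ...

    # Shutdown: replaces _kill_loggingserver()
    logging_server.close()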
crawler.py (11 changed lines)
@@ -1,5 +1,6 @@
 from __future__ import absolute_import
 
+import logging
 import os
 import time
 

@@ -83,20 +84,20 @@ if SENTRY_DSN:
 
 # Connect to job queue
 job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host="redis")
-manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
-manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())
+logging.info("Worker with sessionID: %s" % job_queue.sessionID())
+logging.info("Initial queue state: empty=%s" % job_queue.empty())
 
 # Crawl sites specified in job queue until empty
 while not job_queue.empty():
     job = job_queue.lease(lease_secs=120, block=True, timeout=5)
     if job is None:
-        manager.logger.info("Waiting for work")
+        logging.info("Waiting for work")
         time.sleep(5)
     else:
         site_rank, site = job.decode("utf-8").split(',')
         if "://" not in site:
             site = "http://" + site
-        manager.logger.info("Visiting %s..." % site)
+        logging.info("Visiting %s..." % site)
         command_sequence = CommandSequence.CommandSequence(
             site, reset=True
         )

@@ -104,7 +105,7 @@ while not job_queue.empty():
         manager.execute_command_sequence(command_sequence)
         job_queue.complete(job)
 
-manager.logger.info("Job queue finished, exiting.")
+logging.info("Job queue finished, exiting.")
 manager.close()
 
 if SENTRY_DSN: