Improvements to profile dumping and loading

1. Added success logging for profile untarring
2. Made compression optional when dumping/loading a profile (see the sketch below)
3. Removed the "slim profile" option, since the full profile isn't much bigger
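
For context on item 2: whether a dump is compressed comes down to the tarfile write mode ('w' for a plain profile.tar, 'w:gz' for profile.tar.gz). A minimal, standalone sketch of the trade-off, using a hypothetical throwaway directory rather than a real browser profile:

    import os
    import tarfile
    import tempfile
    import time

    # Hypothetical stand-in for a browser profile: one large, compressible file.
    profile_dir = tempfile.mkdtemp()
    with open(os.path.join(profile_dir, 'cookies.sqlite'), 'w') as f:
        f.write('A' * (5 * 1024 * 1024))

    for mode, name in [('w', 'profile.tar'), ('w:gz', 'profile.tar.gz')]:
        out_path = os.path.join(tempfile.gettempdir(), name)
        start = time.time()
        with tarfile.open(out_path, mode) as tar:
            tar.add(profile_dir, arcname='profile')
        print("%s: %d bytes in %.3fs"
              % (name, os.path.getsize(out_path), time.time() - start))

Gzipping a mostly-SQLite profile can shrink the archive substantially, but it costs CPU time on every dump, which is why the commit leaves the choice to the caller.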
englehardt 2015-09-21 16:01:56 +00:00
Parent c0cea836f9
Commit f4575044d6
5 changed files with 55 additions and 54 deletions

View file

@@ -72,8 +72,7 @@ class Browser:
self.browser_params,
tempdir,
close_webdriver=False,
browser_settings=self.browser_settings,
full_profile=True)
browser_settings=self.browser_settings)
self.browser_params['profile_tar'] = tempdir # make sure browser loads crashed profile
self.browser_params['random_attributes'] = False # don't re-randomize attributes
crash_recovery = True
@@ -98,10 +97,12 @@ class Browser:
self.browser_manager.start()
# Read success status of browser manager
prof_done = disp_done = browser_done = ready_done = launch_attempted = False
prof_done = prof_tar_done = disp_done = browser_done = ready_done = launch_attempted = False
try:
self.current_profile_path = self.status_queue.get(True, spawn_timeout)
prof_done = True
useless = self.status_queue.get(True, spawn_timeout)
prof_tar_done = True
(self.display_pid, self.display_port) = self.status_queue.get(True, spawn_timeout)
disp_done = True
useless = self.status_queue.get(True, spawn_timeout)
@@ -115,8 +116,8 @@ class Browser:
success = True
except EmptyQueue:
unsuccessful_spawns += 1
self.logger.error("BROWSER %i: Spawn unsuccessful | Profile: %s | Display: %s | Launch attempted: %s | Browser: %s" %
(self.crawl_id, str(prof_done), str(disp_done), str(launch_attempted), str(browser_done)))
self.logger.error("BROWSER %i: Spawn unsuccessful | Profile Created: %s | Profile Tar: %s | Display: %s | Launch attempted: %s | Browser: %s" %
(self.crawl_id, str(prof_done), str(prof_tar_done), str(disp_done), str(launch_attempted), str(browser_done)))
self.kill_browser_manager()
if self.current_profile_path is not None:
shutil.rmtree(self.current_profile_path, ignore_errors=True)
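
The extra prof_tar_done stage above pairs with the status_queue.put('profile_success') added in the deploy_firefox hunk further down: the parent now learns separately that the profile folder was created and that the saved profile tar was untarred into it. A simplified, self-contained sketch of that handshake pattern (plain multiprocessing stand-ins with hypothetical names, not the real BrowserManager):

    import multiprocessing
    import time

    try:                                # Queue.Empty moved modules between Py2 and Py3
        from Queue import Empty
    except ImportError:
        from queue import Empty

    def fake_browser_manager(status_queue):
        # Hypothetical stand-in for the browser manager process.
        status_queue.put('/tmp/fake-profile-dir')   # stage 1: profile folder created
        time.sleep(0.1)                             # pretend to untar the saved profile
        status_queue.put('profile_success')         # stage 2: profile tar loaded

    if __name__ == '__main__':
        spawn_timeout = 5
        status_queue = multiprocessing.Queue()
        proc = multiprocessing.Process(target=fake_browser_manager, args=(status_queue,))
        proc.start()

        prof_done = prof_tar_done = False
        try:
            current_profile_path = status_queue.get(True, spawn_timeout)
            prof_done = True
            status_queue.get(True, spawn_timeout)    # the 'profile_success' marker
            prof_tar_done = True
            print("spawn ok: %s" % current_profile_path)
        except Empty:
            print("Spawn unsuccessful | Profile Created: %s | Profile Tar: %s"
                  % (prof_done, prof_tar_done))
        proc.join()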

View file

@@ -23,8 +23,8 @@ def execute_command(command, webdriver, proxy_queue, browser_settings, browser_p
if command[0] == 'DUMP_PROF':
profile_commands.dump_profile(browser_params['profile_path'], manager_params,
browser_params, command[1], command[2], webdriver,
browser_settings,
browser_params, command[1], command[2], webdriver,
browser_settings, compress=command[3],
save_flash=browser_params['disable_flash'] is False)
if command[0] == 'EXTRACT_LINKS':
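
The DUMP_PROF command tuple therefore grows a fourth element carrying the compress flag, which the executor forwards as a keyword argument. A toy dispatcher (simplified names, not the real command_executor) that makes the tuple layout explicit:

    def handle_command(command):
        # Toy version of the DUMP_PROF branch; mirrors the
        # ('DUMP_PROF', dump_folder, close_webdriver, compress) layout.
        if command[0] == 'DUMP_PROF':
            dump_folder, close_webdriver, compress = command[1:4]
            print("dump profile -> %s (close_webdriver=%r, compress=%r)"
                  % (dump_folder, close_webdriver, compress))

    handle_command(('DUMP_PROF', '/tmp/profile_dump/', False, True))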

View file

@@ -84,12 +84,11 @@ def load_flash_files(logger, browser_params, tar_location):
def dump_profile(browser_profile_folder, manager_params, browser_params, tar_location,
close_webdriver, webdriver=None, browser_settings=None, save_flash=False,
full_profile=True):
compress=False):
"""
dumps a browser profile currently stored in <browser_profile_folder> to
<tar_location> in which both folders are absolute paths.
if <browser_settings> exists they are also saved
<full_profile> specifies to save the entire profile directory (not just cookies)
<save_flash> specifies whether to dump flash files
"""
# Connect to logger
@@ -103,10 +102,10 @@ def dump_profile(browser_profile_folder, manager_params, browser_params, tar_loc
if not os.path.exists(tar_location):
os.makedirs(tar_location)
if full_profile:
tar_name = 'full_profile.tar.gz'
else:
if compress:
tar_name = 'profile.tar.gz'
else:
tar_name = 'profile.tar'
# see if this file exists first
# if it does, delete it before we try to save the current session
@@ -119,39 +118,33 @@ def dump_profile(browser_profile_folder, manager_params, browser_params, tar_loc
sleep_until_sqlite_checkpoint(browser_profile_folder)
# backup and tar profile
tar = tarfile.open(tar_location + tar_name, 'w:gz')
if full_profile: # backup all storage vectors
logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
storage_vector_files = [
'cookies.sqlite', 'cookies.sqlite-shm', 'cookies.sqlite-wal', # cookies
'places.sqlite', 'places.sqlite-shm', 'places.sqlite-wal', # history
'webappsstore.sqlite', 'webappsstore.sqlite-shm', 'webappsstore.sqlite-wal', # localStorage
]
storage_vector_dirs = [
'webapps', # related to localStorage?
'storage' # directory for IndexedDB
]
for item in storage_vector_files:
full_path = os.path.join(browser_profile_folder, item)
if not os.path.isfile(full_path) and full_path[-3:] != 'shm' and full_path[-3:] != 'wal':
logger.critical("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
elif not os.path.isfile(full_path) and (full_path[-3:] == 'shm' or full_path[-3:] == 'wal'):
continue # These are just checkpoint files
tar.add(full_path, arcname=item)
for item in storage_vector_dirs:
full_path = os.path.join(browser_profile_folder, item)
if not os.path.isdir(full_path):
logger.warning("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
continue
tar.add(full_path, arcname=item)
else: # only backup cookies and history
logger.debug("BROWSER %i: Backing up limited profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
for db in ["cookies.sqlite", "cookies.sqlite-shm", "cookies.sqlite-wal",
"places.sqlite", "places.sqlite-shm", "places.sqlite-wal"]:
if os.path.isfile(browser_profile_folder + db):
logger.debug("BROWSER %i: Adding %s from profile folder to archive." % (browser_params['crawl_id'], full_path))
tar.add(browser_profile_folder + db, arcname=db)
if compress:
tar = tarfile.open(tar_location + tar_name, 'w:gz', errorlevel=1)
else:
tar = tarfile.open(tar_location + tar_name, 'w', errorlevel=1)
logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
storage_vector_files = [
'cookies.sqlite', 'cookies.sqlite-shm', 'cookies.sqlite-wal', # cookies
'places.sqlite', 'places.sqlite-shm', 'places.sqlite-wal', # history
'webappsstore.sqlite', 'webappsstore.sqlite-shm', 'webappsstore.sqlite-wal', # localStorage
]
storage_vector_dirs = [
'webapps', # related to localStorage?
'storage' # directory for IndexedDB
]
for item in storage_vector_files:
full_path = os.path.join(browser_profile_folder, item)
if not os.path.isfile(full_path) and full_path[-3:] != 'shm' and full_path[-3:] != 'wal':
logger.critical("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
elif not os.path.isfile(full_path) and (full_path[-3:] == 'shm' or full_path[-3:] == 'wal'):
continue # These are just checkpoint files
tar.add(full_path, arcname=item)
for item in storage_vector_dirs:
full_path = os.path.join(browser_profile_folder, item)
if not os.path.isdir(full_path):
logger.warning("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
continue
tar.add(full_path, arcname=item)
tar.close()
# save flash cookies
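
Since the archive name and the tarfile mode must stay in step ('w:gz' with profile.tar.gz, 'w' with profile.tar), the pairing above can be read as one small decision. A hypothetical helper, not part of the commit, that captures it:

    import os
    import tarfile

    def open_profile_tar_for_writing(tar_location, compress):
        # Hypothetical helper mirroring the dump_profile logic above: choose the
        # archive name and tarfile mode together so they always agree.
        if compress:
            tar_name, mode = 'profile.tar.gz', 'w:gz'
        else:
            tar_name, mode = 'profile.tar', 'w'
        # errorlevel=1 matches the commit: fatal tar errors raise exceptions
        # rather than being relegated to debug output.
        return tar_name, tarfile.open(os.path.join(tar_location, tar_name), mode,
                                      errorlevel=1)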
@@ -178,17 +171,23 @@ def load_profile(browser_profile_folder, manager_params, browser_params, tar_loc
else browser_profile_folder + "/"
tar_location = tar_location if tar_location.endswith("/") else tar_location + "/"
if os.path.isfile(tar_location + 'full_profile.tar.gz'):
tar_name = 'full_profile.tar.gz'
else:
if os.path.isfile(tar_location + 'profile.tar.gz'):
tar_name = 'profile.tar.gz'
else:
tar_name = 'profile.tar'
# Copy and untar the loaded profile
logger.debug("BROWSER %i: Copying profile tar from %s to %s" % (browser_params['crawl_id'], tar_location+tar_name, browser_profile_folder))
shutil.copy(tar_location + tar_name, browser_profile_folder)
with tarfile.open(browser_profile_folder + tar_name, 'r:gz') as f:
f.extractall(browser_profile_folder)
if tar_name == 'profile.tar.gz':
f = tarfile.open(browser_profile_folder + tar_name, 'r:gz', errorlevel=1)
else:
f = tarfile.open(browser_profile_folder + tar_name, 'r', errorlevel=1)
f.extractall(browser_profile_folder)
f.close()
os.remove(browser_profile_folder + tar_name)
logger.debug("BROWSER %i: Tarfile extracted" % browser_params['crawl_id'])
# clear and load flash cookies
if load_flash:
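
The load path above mirrors the dump path: probe for profile.tar.gz first, fall back to profile.tar, and open with 'r:gz' or 'r' to match. A small sketch of an equivalent reader; note that tarfile can also auto-detect compression with mode 'r:*', which would collapse the branch (shown here as an alternative, not what the commit does):

    import os
    import tarfile

    def extract_profile_tar(tar_location, dest_folder):
        # Sketch of the load path: prefer the compressed archive if it exists.
        for tar_name in ('profile.tar.gz', 'profile.tar'):
            tar_path = os.path.join(tar_location, tar_name)
            if os.path.isfile(tar_path):
                # 'r:*' lets tarfile detect gzip (or no) compression on its own.
                with tarfile.open(tar_path, 'r:*', errorlevel=1) as tar:
                    tar.extractall(dest_folder)
                return tar_name
        raise IOError("no profile tar found in %s" % tar_location)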

View file

@@ -28,7 +28,7 @@ def deploy_firefox(status_queue, browser_params, manager_params, crash_recovery)
# Enable logging
#LOGGER.setLevel(logging.WARNING)
#fp.set_preference("webdriver.log.file", os.path.expanduser('~/selenium_logging'))
profile_settings = None # Imported browser settings
if browser_params['profile_tar'] and not crash_recovery:
logger.debug("BROWSER %i: Loading initial browser profile from: %s" % (browser_params['crawl_id'], browser_params['profile_tar']))
@@ -39,7 +39,8 @@ def deploy_firefox(status_queue, browser_params, manager_params, crash_recovery)
logger.debug("BROWSER %i: Loading recovered browser profile from: %s" % (browser_params['crawl_id'], browser_params['profile_tar']))
profile_settings = load_profile(browser_profile_path, manager_params, browser_params,
browser_params['profile_tar'])
status_queue.put('profile_success')
if browser_params['random_attributes'] and profile_settings is None:
logger.debug("BROWSER %i: Loading random attributes for browser" % browser_params['crawl_id'])
profile_settings = dict()

View file

@@ -418,9 +418,9 @@ class TaskManager:
""" dumps the local storage vectors (flash, localStorage, cookies) to db """
self._distribute_command(('DUMP_STORAGE_VECTORS', url, start_time), index, timeout)
def dump_profile(self, dump_folder, close_webdriver=False, index=None, timeout=120):
def dump_profile(self, dump_folder, close_webdriver=False, compress=True, index=None, timeout=120):
""" dumps from the profile path to a given file (absolute path) """
self._distribute_command(('DUMP_PROF', dump_folder, close_webdriver), index, timeout)
self._distribute_command(('DUMP_PROF', dump_folder, close_webdriver, compress), index, timeout)
def extract_links(self, index=None, timeout=30):
self._distribute_command(('EXTRACT_LINKS',), index, timeout)
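
At the TaskManager layer compress defaults to True, while the lower-level profile_commands.dump_profile defaults to False; because the flag always travels explicitly inside the command tuple, the TaskManager default is the one crawl scripts actually see. A toy builder (hypothetical name, standing in for the _distribute_command plumbing) showing the tuples that result:

    def build_dump_profile_command(dump_folder, close_webdriver=False, compress=True):
        # Hypothetical stand-in for TaskManager.dump_profile: it only builds the
        # command tuple; the real method hands it to _distribute_command.
        return ('DUMP_PROF', dump_folder, close_webdriver, compress)

    print(build_dump_profile_command('/tmp/profile_dump/'))                  # compressed -> profile.tar.gz
    print(build_dump_profile_command('/tmp/profile_dump/', compress=False))  # plain -> profile.tar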