зеркало из https://github.com/openwpm/OpenWPM.git
Improvements to profile dumping and loading
1. Added success logging for profile untarring. 2. Made compression optional when dumping/loading a profile. 3. Removed the "slim profile" option, since the full profile isn't much bigger.
This commit is contained in:
Родитель
c0cea836f9
Коммит
f4575044d6
|
@ -72,8 +72,7 @@ class Browser:
|
|||
self.browser_params,
|
||||
tempdir,
|
||||
close_webdriver=False,
|
||||
browser_settings=self.browser_settings,
|
||||
full_profile=True)
|
||||
browser_settings=self.browser_settings)
|
||||
self.browser_params['profile_tar'] = tempdir # make sure browser loads crashed profile
|
||||
self.browser_params['random_attributes'] = False # don't re-randomize attributes
|
||||
crash_recovery = True
|
||||
|
@ -98,10 +97,12 @@ class Browser:
|
|||
self.browser_manager.start()
|
||||
|
||||
# Read success status of browser manager
|
||||
prof_done = disp_done = browser_done = ready_done = launch_attempted = False
|
||||
prof_done = prof_tar_done = disp_done = browser_done = ready_done = launch_attempted = False
|
||||
try:
|
||||
self.current_profile_path = self.status_queue.get(True, spawn_timeout)
|
||||
prof_done = True
|
||||
useless = self.status_queue.get(True, spawn_timeout)
|
||||
prof_tar_done = True
|
||||
(self.display_pid, self.display_port) = self.status_queue.get(True, spawn_timeout)
|
||||
disp_done = True
|
||||
useless = self.status_queue.get(True, spawn_timeout)
|
||||
|
@ -115,8 +116,8 @@ class Browser:
|
|||
success = True
|
||||
except EmptyQueue:
|
||||
unsuccessful_spawns += 1
|
||||
self.logger.error("BROWSER %i: Spawn unsuccessful | Profile: %s | Display: %s | Launch attempted: %s | Browser: %s" %
|
||||
(self.crawl_id, str(prof_done), str(disp_done), str(launch_attempted), str(browser_done)))
|
||||
self.logger.error("BROWSER %i: Spawn unsuccessful | Profile Created: %s | Profile Tar: %s | Display: %s | Launch attempted: %s | Browser: %s" %
|
||||
(self.crawl_id, str(prof_done), str(prof_tar_done), str(disp_done), str(launch_attempted), str(browser_done)))
|
||||
self.kill_browser_manager()
|
||||
if self.current_profile_path is not None:
|
||||
shutil.rmtree(self.current_profile_path, ignore_errors=True)
|
||||
|
|
|
@ -23,8 +23,8 @@ def execute_command(command, webdriver, proxy_queue, browser_settings, browser_p
|
|||
|
||||
if command[0] == 'DUMP_PROF':
|
||||
profile_commands.dump_profile(browser_params['profile_path'], manager_params,
|
||||
browser_params command[1], command[2], webdriver,
|
||||
browser_settings,
|
||||
browser_params, command[1], command[2], webdriver,
|
||||
browser_settings, compress=command[3],
|
||||
save_flash=browser_params['disable_flash'] is False)
|
||||
|
||||
if command[0] == 'EXTRACT_LINKS':
|
||||
|
|
|
@ -84,12 +84,11 @@ def load_flash_files(logger, browser_params, tar_location):
|
|||
|
||||
def dump_profile(browser_profile_folder, manager_params, browser_params, tar_location,
|
||||
close_webdriver, webdriver=None, browser_settings=None, save_flash=False,
|
||||
full_profile=True):
|
||||
compress=False):
|
||||
"""
|
||||
dumps a browser profile currently stored in <browser_profile_folder> to
|
||||
<tar_location> in which both folders are absolute paths.
|
||||
if <browser_settings> exists they are also saved
|
||||
<full_profile> specifies to save the entire profile directory (not just cookies)
|
||||
<save_flash> specifies whether to dump flash files
|
||||
"""
|
||||
# Connect to logger
|
||||
|
@ -103,10 +102,10 @@ def dump_profile(browser_profile_folder, manager_params, browser_params, tar_loc
|
|||
if not os.path.exists(tar_location):
|
||||
os.makedirs(tar_location)
|
||||
|
||||
if full_profile:
|
||||
tar_name = 'full_profile.tar.gz'
|
||||
else:
|
||||
if compress:
|
||||
tar_name = 'profile.tar.gz'
|
||||
else:
|
||||
tar_name = 'profile.tar'
|
||||
|
||||
# see if this file exists first
|
||||
# if it does, delete it before we try to save the current session
|
||||
|
@ -119,39 +118,33 @@ def dump_profile(browser_profile_folder, manager_params, browser_params, tar_loc
|
|||
sleep_until_sqlite_checkpoint(browser_profile_folder)
|
||||
|
||||
# backup and tar profile
|
||||
tar = tarfile.open(tar_location + tar_name, 'w:gz')
|
||||
if full_profile: # backup all storage vectors
|
||||
logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
|
||||
storage_vector_files = [
|
||||
'cookies.sqlite', 'cookies.sqlite-shm', 'cookies.sqlite-wal', # cookies
|
||||
'places.sqlite', 'places.sqlite-shm', 'places.sqlite-wal', # history
|
||||
'webappsstore.sqlite', 'webappsstore.sqlite-shm', 'webappsstore.sqlite-wal', # localStorage
|
||||
]
|
||||
storage_vector_dirs = [
|
||||
'webapps', # related to localStorage?
|
||||
'storage' # directory for IndexedDB
|
||||
]
|
||||
for item in storage_vector_files:
|
||||
full_path = os.path.join(browser_profile_folder, item)
|
||||
if not os.path.isfile(full_path) and full_path[-3:] != 'shm' and full_path[-3:] != 'wal':
|
||||
logger.critical("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
|
||||
elif not os.path.isfile(full_path) and (full_path[-3:] == 'shm' or full_path[-3:] == 'wal'):
|
||||
continue # These are just checkpoint files
|
||||
tar.add(full_path, arcname=item)
|
||||
for item in storage_vector_dirs:
|
||||
full_path = os.path.join(browser_profile_folder, item)
|
||||
if not os.path.isdir(full_path):
|
||||
logger.warning("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
|
||||
continue
|
||||
tar.add(full_path, arcname=item)
|
||||
|
||||
else: # only backup cookies and history
|
||||
logger.debug("BROWSER %i: Backing up limited profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
|
||||
for db in ["cookies.sqlite", "cookies.sqlite-shm", "cookies.sqlite-wal",
|
||||
"places.sqlite", "places.sqlite-shm", "places.sqlite-wal"]:
|
||||
if os.path.isfile(browser_profile_folder + db):
|
||||
logger.debug("BROWSER %i: Adding %s from profile folder to archive." % (browser_params['crawl_id'], full_path))
|
||||
tar.add(browser_profile_folder + db, arcname=db)
|
||||
if compress:
|
||||
tar = tarfile.open(tar_location + tar_name, 'w:gz', errorlevel=1)
|
||||
else:
|
||||
tar = tarfile.open(tar_location + tar_name, 'w', errorlevel=1)
|
||||
logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (browser_params['crawl_id'], browser_profile_folder, tar_location + tar_name))
|
||||
storage_vector_files = [
|
||||
'cookies.sqlite', 'cookies.sqlite-shm', 'cookies.sqlite-wal', # cookies
|
||||
'places.sqlite', 'places.sqlite-shm', 'places.sqlite-wal', # history
|
||||
'webappsstore.sqlite', 'webappsstore.sqlite-shm', 'webappsstore.sqlite-wal', # localStorage
|
||||
]
|
||||
storage_vector_dirs = [
|
||||
'webapps', # related to localStorage?
|
||||
'storage' # directory for IndexedDB
|
||||
]
|
||||
for item in storage_vector_files:
|
||||
full_path = os.path.join(browser_profile_folder, item)
|
||||
if not os.path.isfile(full_path) and full_path[-3:] != 'shm' and full_path[-3:] != 'wal':
|
||||
logger.critical("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
|
||||
elif not os.path.isfile(full_path) and (full_path[-3:] == 'shm' or full_path[-3:] == 'wal'):
|
||||
continue # These are just checkpoint files
|
||||
tar.add(full_path, arcname=item)
|
||||
for item in storage_vector_dirs:
|
||||
full_path = os.path.join(browser_profile_folder, item)
|
||||
if not os.path.isdir(full_path):
|
||||
logger.warning("BROWSER %i: %s NOT FOUND IN profile folder, skipping." % (browser_params['crawl_id'], full_path))
|
||||
continue
|
||||
tar.add(full_path, arcname=item)
|
||||
tar.close()
|
||||
|
||||
# save flash cookies
|
||||
|
@ -178,17 +171,23 @@ def load_profile(browser_profile_folder, manager_params, browser_params, tar_loc
|
|||
else browser_profile_folder + "/"
|
||||
tar_location = tar_location if tar_location.endswith("/") else tar_location + "/"
|
||||
|
||||
if os.path.isfile(tar_location + 'full_profile.tar.gz'):
|
||||
tar_name = 'full_profile.tar.gz'
|
||||
else:
|
||||
if os.path.isfile(tar_location + 'profile.tar.gz'):
|
||||
tar_name = 'profile.tar.gz'
|
||||
else:
|
||||
tar_name = 'profile.tar'
|
||||
|
||||
# Copy and untar the loaded profile
|
||||
logger.debug("BROWSER %i: Copying profile tar from %s to %s" % (browser_params['crawl_id'], tar_location+tar_name, browser_profile_folder))
|
||||
shutil.copy(tar_location + tar_name, browser_profile_folder)
|
||||
with tarfile.open(browser_profile_folder + tar_name, 'r:gz') as f:
|
||||
f.extractall(browser_profile_folder)
|
||||
|
||||
if tar_name == 'profile.tar.gz':
|
||||
f = tarfile.open(browser_profile_folder + tar_name, 'r:gz', errorlevel=1)
|
||||
else:
|
||||
f = tarfile.open(browser_profile_folder + tar_name, 'r', errorlevel=1)
|
||||
f.extractall(browser_profile_folder)
|
||||
f.close()
|
||||
os.remove(browser_profile_folder + tar_name)
|
||||
logger.debug("BROWSER %i: Tarfile extracted" % browser_params['crawl_id'])
|
||||
|
||||
# clear and load flash cookies
|
||||
if load_flash:
|
||||
|
|
|
@ -28,7 +28,7 @@ def deploy_firefox(status_queue, browser_params, manager_params, crash_recovery)
|
|||
# Enable logging
|
||||
#LOGGER.setLevel(logging.WARNING)
|
||||
#fp.set_preference("webdriver.log.file", os.path.expanduser('~/selenium_logging'))
|
||||
|
||||
|
||||
profile_settings = None # Imported browser settings
|
||||
if browser_params['profile_tar'] and not crash_recovery:
|
||||
logger.debug("BROWSER %i: Loading initial browser profile from: %s" % (browser_params['crawl_id'], browser_params['profile_tar']))
|
||||
|
@ -39,7 +39,8 @@ def deploy_firefox(status_queue, browser_params, manager_params, crash_recovery)
|
|||
logger.debug("BROWSER %i: Loading recovered browser profile from: %s" % (browser_params['crawl_id'], browser_params['profile_tar']))
|
||||
profile_settings = load_profile(browser_profile_path, manager_params, browser_params,
|
||||
browser_params['profile_tar'])
|
||||
|
||||
status_queue.put('profile_success')
|
||||
|
||||
if browser_params['random_attributes'] and profile_settings is None:
|
||||
logger.debug("BROWSER %i: Loading random attributes for browser" % browser_params['crawl_id'])
|
||||
profile_settings = dict()
|
||||
|
|
|
@ -418,9 +418,9 @@ class TaskManager:
|
|||
""" dumps the local storage vectors (flash, localStorage, cookies) to db """
|
||||
self._distribute_command(('DUMP_STORAGE_VECTORS', url, start_time), index, timeout)
|
||||
|
||||
def dump_profile(self, dump_folder, close_webdriver=False, index=None, timeout=120):
|
||||
def dump_profile(self, dump_folder, close_webdriver=False, compress=True, index=None, timeout=120):
|
||||
""" dumps from the profile path to a given file (absolute path) """
|
||||
self._distribute_command(('DUMP_PROF', dump_folder, close_webdriver), index, timeout)
|
||||
self._distribute_command(('DUMP_PROF', dump_folder, close_webdriver, compress), index, timeout)
|
||||
|
||||
def extract_links(self, index=None, timeout=30):
|
||||
self._distribute_command(('EXTRACT_LINKS',), index, timeout)
|
||||
|
|
Загрузка…
Ссылка в новой задаче