OpenWPM/crawler.py

240 строки
8.1 KiB
Python
Исходник Обычный вид История

import json
import logging
2019-07-13 01:27:49 +03:00
import os
import signal
2020-04-16 17:50:08 +03:00
import sys
2019-07-13 01:27:49 +03:00
import time
import typing
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
from pathlib import Path
2020-03-06 19:19:28 +03:00
from threading import Lock
from types import FrameType
from typing import Any, Callable, List, Literal, Optional
2020-03-06 19:19:28 +03:00
import sentry_sdk
2019-07-13 01:27:49 +03:00
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
from openwpm import mp_logger
from openwpm.command_sequence import CommandSequence
Refactoring browser and manager params into dataclasses (#807) * initial file commit * add new dependency for dataclasses * implemeted basic BrowserParams dataclass * dependencies update * file reformat * implemented basic ManagerParams dataclass * Update environment dependencies * Added new error class to validate browser and manager params * file reformat * Update scripts/environment-unpinned.yaml Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * added validations for BrowserParams dataclass * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Removed unnecessary checks Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Changed error string formatting Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting) * Revert "Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting)" This reverts commit e550c3bd604f415272bd05ee3d9c76397ad98006. * Revert "Merge branch 'master' into turn_browser_and_manager_params_into_dataclasses" This reverts commit aff5a384e737477746d6a38d3b2be6244f8dfd11, reversing changes made to 6ecaf5d0a94d376126692c3785692ba10626d88a. * Revert "Update environment dependencies" This reverts commit 385825b10aee4610a6e304122bec4ab2b7219a5b. * Revert "Merge branch 'turn_browser_and_manager_params_into_dataclasses' of https://github.com/ankushduacodes/OpenWPM into turn_browser_and_manager_params_into_dataclasses" This reverts commit 6ecaf5d0a94d376126692c3785692ba10626d88a, reversing changes made to e550c3bd604f415272bd05ee3d9c76397ad98006. 
* file reformat * finalized validate_browser_params function * fixed typo in error string * added validations for manager_params * Explanation for using list for supported browser * Revert "Revert "Merge branch 'master' into turn_browser_and_manager_params_into_dataclasses"" This reverts commit 6c3e98e57bd9c42acd029c74649742dcc81de86c. * Revert "Revert "Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting)"" This reverts commit fc8f48f1878ea7c43b342989ce581dc3d6eab929. * import name change from .Error to .error * moved call_instrument check to config.py * fixed accidental use of dict syntax in a class * moved save_content check from deploy_firefox.py * deleting redundent file * deleted more redundent files * removed redundant imports * added new save_content check * property name changevariables can not have '-' * added new attribute to ManagerParams * adapted files to validate manager & broswer params - also added logic to convert the objects(BrowserParams and ManagerParams) to dictionaries to not break the functionality - also updated demo.py to work with new file names on this branch * removed obsolete documentaion * Dependency Update * Revert "Dependency Update" This reverts commit 8ee3a02b1764883a1f5922e0b52e9f17f8e098db. * Dependencies Update * unset memory and process watchdogs * add new output_format and failure_limit checks * inheriting dataclasses and added type hints to fn * added todo * fixed inheritance of dataclasses acc. 
to plan * refactor use of dict to use dataclasses(pending) * more refactoring use of dict to dataclasses - Also changed some type hints related to new refactoring * fixed screenshot directory issue - because of which some of the tests were failing * added try-except clause for unexpected errors * added tests to cover dataclasses * added some new and edited some old docs * refactor use of __dict__ to dataclass.to_dict() * Revert "refactor use of __dict__ to dataclass.to_dict()" This reverts commit a4f35513fa26d23a073c16af9fb332045826dcb2. * fixed some tests * refactor use of __dict__ in favor of dataclass.to_dict() method * removed some TODOS * fixed dataclases validation tests * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/task_manager.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * minor fixed wrt polishing the PR * added new check and test for crawl configs Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de>
2020-12-02 12:10:45 +03:00
from openwpm.config import BrowserParams, ManagerParams
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
from openwpm.storage.cloud_storage.gcp_storage import (
GcsStructuredProvider,
GcsUnstructuredProvider,
)
Command refactoring (#750) * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * Ported SaveScreenshotCommand It now uses the new command.execute(...) syntax * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * Ported SaveScreenshotFullPage #763 * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * Ported DumpPageSource and RecursiveDumpPageSource (#767) * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * remove custom function command and format code * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed 
everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * Ported SaveScreenshotCommand It now uses the new command.execute(...) syntax * Ported SaveScreenshotFullPage #763 * Ported DumpPageSource and RecursiveDumpPageSource (#767) * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * remove duplicate append_command * Refactored GetCommand, BrowseCommand to have execute method * Fixed type name format issues in __issue_command * Fixed everything I broke * Changed import style so tests can run * Added BrowseCommad to imports * Added some more self * Added logging to explain failing test * Added one more self * Ported SaveScreenshotCommand It now uses the new command.execute(...) 
syntax * Ported SaveScreenshotFullPage #763 * Ported DumpPageSource and RecursiveDumpPageSource (#767) * Command refactoring (#770) * attempt at refactoring save_screenshot * fixed indentation, attempt at refactoring save_screenshot * refactored SaveScreenshot command to have execute method * reformatted code using black * refactored savefullscreenshot command to follow command sequence * formatted files with black * removed extraneous commands * refactored dump page source and formatted code with black * reformatted recursive dump page source command and formatted code w black * formatted files using isort * formatted all files with isort * refactor finalize command * refactored initalize command and formatted with black and isort * missed a conflict * Ran isort * Added append_command * generate new xpi * Fixing tests * Fixing tests * Fixing up more tests * Removed type annotations * Fixing tests * Fixing tests * Removed command_executor * Moved Commands to commands * Fixing imports * Fixed skipped test * Removed duplicate append_command * docs: update adding command in usingOpenWPM * Forgot to save * Removed datadir * Cleaning up imports * Implemented simple command * Added documentation to simple_command.py * Renamed to custom_command.py * Moved docs around * Referencing BaseCommand.execute * Update docs/Using_OpenWPM.md Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Cyrus <cyruskarsan@gmail.com> Co-authored-by: cyruskarsan <55566678+cyruskarsan@users.noreply.github.com> Co-authored-by: Steven Englehardt <senglehardt@mozilla.com>
2021-01-09 13:15:01 +03:00
from openwpm.task_manager import TaskManager
from openwpm.utilities import rediswq
# Configuration via environment variables
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# Crawler specific config
2020-09-11 16:14:09 +03:00
# Redis host and queue name, each overridable through the environment.
REDIS_HOST = os.environ.get("REDIS_HOST", "redis-box")
REDIS_QUEUE_NAME = os.environ.get("REDIS_QUEUE_NAME", "crawl-queue")
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# Integer settings; the environment value (or the string default) is parsed
# with int(), so a non-numeric value raises ValueError at import time.
MAX_JOB_RETRIES = int(os.environ.get("MAX_JOB_RETRIES", "2"))
DWELL_TIME = int(os.environ.get("DWELL_TIME", "10"))
TIMEOUT = int(os.environ.get("TIMEOUT", "60"))
# Storage Provider Params
2020-09-11 16:14:09 +03:00
# Directory name for crawl output; defaults to "crawl-data" when unset.
CRAWL_DIRECTORY = os.environ.get("CRAWL_DIRECTORY", "crawl-data")
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# Google Cloud settings. Note that AUTH_TOKEN is read from the
# GCP_AUTH_TOKEN variable, not from a variable named AUTH_TOKEN.
GCS_BUCKET = os.environ.get("GCS_BUCKET", "openwpm-crawls")
GCP_PROJECT = os.environ.get("GCP_PROJECT", "")
AUTH_TOKEN = os.environ.get("GCP_AUTH_TOKEN", "cloud")
# Browser Params
2020-09-11 16:14:09 +03:00
# Display mode for the browser; only "headless", "xvfb" and "native" are
# accepted, with "headless" used when the variable is unset.
DISPLAY_MODE = os.getenv("DISPLAY_MODE", "headless")
if DISPLAY_MODE not in ("headless", "xvfb", "native"):
    # Raise explicitly instead of using `assert`: assertions are stripped
    # under `python -O`, which would let an invalid mode through silently.
    raise ValueError(
        f"DISPLAY_MODE must be one of 'headless', 'xvfb' or 'native', "
        f"got {DISPLAY_MODE!r}"
    )
# Narrow the type for static checkers now that the value has been validated.
DISPLAY_MODE = typing.cast(Literal["headless", "xvfb", "native"], DISPLAY_MODE)
2020-09-11 16:14:09 +03:00
# Instrumentation switches. Each flag is True when its variable is unset or
# set to exactly "1", and False for any other value.
HTTP_INSTRUMENT = os.environ.get("HTTP_INSTRUMENT", "1") == "1"
COOKIE_INSTRUMENT = os.environ.get("COOKIE_INSTRUMENT", "1") == "1"
NAVIGATION_INSTRUMENT = os.environ.get("NAVIGATION_INSTRUMENT", "1") == "1"
JS_INSTRUMENT = os.environ.get("JS_INSTRUMENT", "1") == "1"
CALLSTACK_INSTRUMENT = os.environ.get("CALLSTACK_INSTRUMENT", "1") == "1"
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# JSON-encoded settings for the JS instrument. When the variable is unset the
# default string decodes to the list ["collection_fingerprinting"]; invalid
# JSON raises json.JSONDecodeError at import time.
JS_INSTRUMENT_SETTINGS = json.loads(
    os.getenv("JS_INSTRUMENT_SETTINGS", '["collection_fingerprinting"]')
)
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
2020-09-11 16:14:09 +03:00
# SAVE_CONTENT falls back to the empty string; PREFS is None when unset.
SAVE_CONTENT = os.environ.get("SAVE_CONTENT", "")
PREFS = os.environ.get("PREFS")
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# SENTRY_DSN is None when the variable is unset. Logger settings are not read
# here directly; they come from mp_logger.parse_config_from_env().
SENTRY_DSN = os.environ.get("SENTRY_DSN")
LOGGER_SETTINGS = mp_logger.parse_config_from_env()
2020-04-16 17:26:04 +03:00
if CALLSTACK_INSTRUMENT:
    # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work
    JS_INSTRUMENT = True
# Twice the sum of TIMEOUT, DWELL_TIME and a constant 30.
# NOTE(review): presumably all in seconds -- confirm against the lease code.
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)
2020-04-16 17:26:04 +03:00
2019-07-14 00:48:58 +03:00
# Loads the default manager params
2019-08-23 00:21:59 +03:00
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
2021-12-20 18:47:32 +03:00
# https://github.com/openwpm/OpenWPM/issues/470
2019-08-23 00:21:59 +03:00
NUM_BROWSERS = 1
Refactoring browser and manager params into dataclasses (#807) * initial file commit * add new dependency for dataclasses * implemeted basic BrowserParams dataclass * dependencies update * file reformat * implemented basic ManagerParams dataclass * Update environment dependencies * Added new error class to validate browser and manager params * file reformat * Update scripts/environment-unpinned.yaml Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * added validations for BrowserParams dataclass * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Removed unnecessary checks Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Changed error string formatting Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting) * Revert "Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting)" This reverts commit e550c3bd604f415272bd05ee3d9c76397ad98006. * Revert "Merge branch 'master' into turn_browser_and_manager_params_into_dataclasses" This reverts commit aff5a384e737477746d6a38d3b2be6244f8dfd11, reversing changes made to 6ecaf5d0a94d376126692c3785692ba10626d88a. * Revert "Update environment dependencies" This reverts commit 385825b10aee4610a6e304122bec4ab2b7219a5b. * Revert "Merge branch 'turn_browser_and_manager_params_into_dataclasses' of https://github.com/ankushduacodes/OpenWPM into turn_browser_and_manager_params_into_dataclasses" This reverts commit 6ecaf5d0a94d376126692c3785692ba10626d88a, reversing changes made to e550c3bd604f415272bd05ee3d9c76397ad98006. 
* file reformat * finalized validate_browser_params function * fixed typo in error string * added validations for manager_params * Explanation for using list for supported browser * Revert "Revert "Merge branch 'master' into turn_browser_and_manager_params_into_dataclasses"" This reverts commit 6c3e98e57bd9c42acd029c74649742dcc81de86c. * Revert "Revert "Changed filenamea and necessary imports to resolve conflicts with new master branch(refering to PEP-8 reformatting)"" This reverts commit fc8f48f1878ea7c43b342989ce581dc3d6eab929. * import name change from .Error to .error * moved call_instrument check to config.py * fixed accidental use of dict syntax in a class * moved save_content check from deploy_firefox.py * deleting redundent file * deleted more redundent files * removed redundant imports * added new save_content check * property name changevariables can not have '-' * added new attribute to ManagerParams * adapted files to validate manager & broswer params - also added logic to convert the objects(BrowserParams and ManagerParams) to dictionaries to not break the functionality - also updated demo.py to work with new file names on this branch * removed obsolete documentaion * Dependency Update * Revert "Dependency Update" This reverts commit 8ee3a02b1764883a1f5922e0b52e9f17f8e098db. * Dependencies Update * unset memory and process watchdogs * add new output_format and failure_limit checks * inheriting dataclasses and added type hints to fn * added todo * fixed inheritance of dataclasses acc. 
to plan * refactor use of dict to use dataclasses(pending) * more refactoring use of dict to dataclasses - Also changed some type hints related to new refactoring * fixed screenshot directory issue - because of which some of the tests were failing * added try-except clause for unexpected errors * added tests to cover dataclasses * added some new and edited some old docs * refactor use of __dict__ to dataclass.to_dict() * Revert "refactor use of __dict__ to dataclass.to_dict()" This reverts commit a4f35513fa26d23a073c16af9fb332045826dcb2. * fixed some tests * refactor use of __dict__ in favor of dataclass.to_dict() method * removed some TODOS * fixed dataclases validation tests * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update docs/Configuration.md Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/config.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * Update openwpm/task_manager.py Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de> * minor fixed wrt polishing the PR * added new check and test for crawl configs Co-authored-by: Stefan Zabka <zabkaste@informatik.hu-berlin.de>
2020-12-02 12:10:45 +03:00
# Start from the library defaults: one ManagerParams for the whole crawl and
# an independent BrowserParams instance for each of the NUM_BROWSERS browsers.
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]

# Browser configuration
# Every browser is given the same module-level settings.
for i in range(NUM_BROWSERS):
    browser_params[i].display_mode = DISPLAY_MODE
    browser_params[i].http_instrument = HTTP_INSTRUMENT
    browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS

    # SAVE_CONTENT is compared against the strings "1" and "0", which map to
    # True and False; any other value is handed through unchanged.
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
    else:
        browser_params[i].save_content = SAVE_CONTENT

    # PREFS is only applied when it is truthy. It is decoded inside the loop,
    # so each browser receives its own freshly parsed object rather than a
    # shared one.
    if PREFS:
        browser_params[i].prefs = json.loads(PREFS)
2019-07-13 01:27:49 +03:00
# Manager configuration
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# The data directory and the log file share one per-crawl folder, so the
# folder path is built once and reused for both settings.
# NOTE(review): pathlib does not expand "~" on construction; presumably the
# consumer of these params calls expanduser() -- confirm.
_crawl_root = Path("~/Desktop/") / CRAWL_DIRECTORY
manager_params.data_directory = _crawl_root
manager_params.log_path = _crawl_root / "openwpm.log"
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# Both storage providers target the same GCP project and bucket and share one
# auth token; only their base paths differ.
structured = GcsStructuredProvider(
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    project=GCP_PROJECT,
    token=AUTH_TOKEN,
)
# The unstructured provider is rooted at the "data" subfolder of the crawl
# directory.
unstructured = GcsUnstructuredProvider(
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    project=GCP_PROJECT,
    token=AUTH_TOKEN,
)
# Instantiates the measurement platform
# Commands time out by default after 60 seconds
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
# The manager receives both parameter sets, the two storage providers
# created above and the logger settings.
manager = TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
    logger_kwargs=LOGGER_SETTINGS,
)
2019-07-13 01:27:49 +03:00
# Sentry is expected to have been initialised by this point
if SENTRY_DSN:
    # Attach crawler.py-specific details to the Sentry scope
    with sentry_sdk.configure_scope() as scope:
        # Tags drive Sentry's breakdown charts and search filters
        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        scope.set_tag("GCS_BUCKET", GCS_BUCKET)
        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
        scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
        scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
        scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
        scope.set_tag("JS_INSTRUMENT", JS_INSTRUMENT)
        scope.set_tag("JS_INSTRUMENT_SETTINGS", JS_INSTRUMENT_SETTINGS)
        scope.set_tag("CALLSTACK_INSTRUMENT", CALLSTACK_INSTRUMENT)
        scope.set_tag("SAVE_CONTENT", SAVE_CONTENT)
        scope.set_tag("DWELL_TIME", DWELL_TIME)
        scope.set_tag("TIMEOUT", TIMEOUT)
        scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES)
        # Combined "<bucket>/<directory>" reference for this crawl
        scope.set_tag("CRAWL_REFERENCE", f"{GCS_BUCKET}/{CRAWL_DIRECTORY}")
        # Contexts carry additional information that may be of interest
        if PREFS:
            # PREFS is a JSON document; it is decoded before being attached
            scope.set_context("PREFS", json.loads(PREFS))
        scope.set_context(
            "crawl_config", {"REDIS_QUEUE_NAME": REDIS_QUEUE_NAME}
        )
    # Emit a Sentry message on startup (temporary: makes it easy to compare
    # error frequencies against the number of crawl worker instances)
    sentry_sdk.capture_message("Crawl worker started")
2019-07-13 01:27:49 +03:00
# Connect to job queue
job_queue = rediswq.RedisWQ(
    name=REDIS_QUEUE_NAME,
    host=REDIS_HOST,
    max_retries=MAX_JOB_RETRIES,
)
# Values are handed to the logger as arguments instead of being
# pre-formatted with "%", so interpolation is left to the logging framework
# (the same calling style the shutdown handler's log call uses).
manager.logger.info("Worker with sessionID: %s", job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s", job_queue.empty())
2019-07-13 01:27:49 +03:00
# Shared bookkeeping: a list of raw job payloads, the lock that callbacks and
# the shutdown handler take before touching shared state, and the flag the
# shutdown handler sets to True.
unsaved_jobs: List[bytes] = []
unsaved_jobs_lock = Lock()
shutting_down = False
2020-09-11 16:14:09 +03:00
def on_shutdown(
    manager: TaskManager, unsaved_jobs_lock: Lock
) -> Callable[[int, Optional[FrameType]], None]:
    """Build a signal handler that shuts the crawl worker down.

    The returned handler logs the received signal number, sets the
    module-level ``shutting_down`` flag while holding ``unsaved_jobs_lock``,
    closes ``manager`` with ``relaxed=False`` and finally calls
    ``sys.exit(1)``.
    """

    def handle_signal(signum: int, _frame: Optional[FrameType]) -> None:
        global shutting_down
        manager.logger.error("Got interupted by %r, shutting down", signum)
        # Flip the flag under the lock so it is not changed while another
        # holder of the lock is running.
        with unsaved_jobs_lock:
            shutting_down = True
        manager.close(relaxed=False)
        sys.exit(1)

    return handle_signal
# Route SIGTERM and SIGINT to a freshly built shutdown handler
for sig in (signal.SIGTERM, signal.SIGINT):
    signal.signal(sig, on_shutdown(manager, unsaved_jobs_lock))
2020-09-11 16:14:09 +03:00
def get_job_completion_callback(
logger: logging.Logger,
unsaved_jobs_lock: Lock,
job_queue: rediswq.RedisWQ,
job: bytes,
) -> Callable[[bool], None]:
Data Aggregator Rewrite (#753) * First steps in the rewrite * Fixed import paths * One giant refactor * Fixing tests * Adding mypy * Removed mypy from pre-commit workflow * First draft on DataAggregator * Wrote a DataAggregator that starts and shuts down * Created tests and added more empty types * Got demo.py working * Created sql_provider * Cleaned up imports in TaskManager * Added async * Fixed minor bugs * First steps at porting arrow * Introduced TableName and different Task handling * Added more failing tests * First first completes others don't * It works * Started working on arrow_provider * Implemented ArrowProvider * Added logger fixture * Fixed test_storage_controller * Fixing OpenWPMTest.visit() * Moved test/storage_providers to test/storage * Fixing up tests * Moved automation to openwpm * Readded datadir to .gitignore * Ran repin.sh * Fixed formatting * Let's see if this works * Fixed imports * Got arrow_memory_provider working * Starting to rewrite tests * Setting up fixtures * Attempting to fix all the tests * Still fixing tests * Broken content saving * Added node * Fixed screenshot tests * Fixing more tests * Fixed tests * Implemented local_storage.py * Cleaned up flush_cache * Fixing more tests * Wrote test for LocalArrowProvider * Introduced tests for local_storage_provider.py * Asserting test dir is empty * Creating subfolder for different aggregators * New depencies and init() * Everything is terribly broken * Figured out finalize_visit_id * Running two event loops kinda works??? 
* Rearming the event * Introduced mypy * Downgraded black in pre-commit * Modifying the database directly * Fixed formatting * Made mypy a lil stricter * Fixing docs and config printing * Realising I've been using the wrong with * Trying to figure arrow_storage * Moving lock initialization in in_memory_storage * Fixing tests * Fixing up tests and adding more typechecking * Fixed num_browsers in test_cache_hits_recorded * Parametrized unstructured * String fix * Added failing test * New test * Review changes with Steven * Fixed repin.sh and test_arrow_cache * Minor change * Fixed prune-environment.py * Removing references to DataAggregator * Fixed test_seed_persistance * More paths * Fixed test display shutdown * Made cache test more robust * Update crawler.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Slimming down ManagerParams * Fixing more tests * Update test/storage/test_storage_controller.py Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Purging references to DataAggregator * Reverted changes to .travis.yml * Demo.py saves locally again * Readjusting test paths * Expanded comment on initialize to reference #846 * Made token optional in finalize_visit_id * Simplified test paramtetrization * Fixed callback semantics change * Removed test_parse_http_stack_trace_str * Added DataSocket * WIP need to fix path encoding * Fixed path encoding * Added task and crawl to schema * Fixed paths in GitHub actions * Refactored completion handling * Fix tests * Trying to fix tests on CI * Removed redundant setting of tag * Removing references to S3 * Purging more DataAggregator references * Craking up logging to figure out test failure * Moved test_values into a fixture * Fixing GcpUnstructuredProvider * Fixed paths for future crawls * Renamed sqllite to official sqlite * Restored demo.py * Update openwpm/commands/profile_commands.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Restored previous behaviour of DumpProfileCommand 
Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed leftovers * Cleaned up comments * Expanded lock check * Fixed more stuff * More comment updates * Update openwpm/socket_interface.py Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com> * Removed outdated comment * Using config_encoder * Renamed tar_location to tar_path * Removed references to database_name in docs * Cleanup * Moved screenshot_path and source_dump_path to ManagerParamsInternal * Fixed imports * Fixing up comments * Fixing up comments * More docs * updated dependencies * Fixed test_task_manager * Reupgraded to python 3.9.1 * Restoring crawl_reference in mp_logger * Removed unused imports * Apply suggestions from code review Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> * Cleaned up socket handling * Fixed TaskManager.__exit__ * Moved validation code into config.py * Removed comment * Removed comment * Removed comment Co-authored-by: Steven Englehardt <senglehardt@mozilla.com> Co-authored-by: Georgia Kokkinou <geor5ko@gmail.com>
2021-02-22 19:51:32 +03:00
def callback(success: bool) -> None:
    """Record the outcome of ``job`` and stop tracking it as unsaved.

    Args:
        success: True when the job finished, False when it got interrupted.
    """
    # Hold the lock for the whole update so the queue bookkeeping and the
    # unsaved_jobs list are changed together.
    with unsaved_jobs_lock:
        if not success:
            logger.warning("Job %r got interrupted", job)
        else:
            logger.info("Job %r is done", job)
            job_queue.complete(job)
        # The job leaves the unsaved list in both the success and failure case.
        unsaved_jobs.remove(job)
2020-09-11 16:14:09 +03:00
return callback
2020-04-16 17:26:04 +03:00
# NOTE(review): not read anywhere in the visible part of this script; kept in
# case code outside this section relies on it.
no_job_since = None

# Crawl sites specified in job queue until empty
while not job_queue.empty():
    job_queue.check_expired_leases()

    # Renew the lease of every job that was handed to the manager but has not
    # been reported back through its completion callback yet.
    with unsaved_jobs_lock:
        manager.logger.debug("Currently unfinished jobs are: %s", unsaved_jobs)
        for unsaved_job in unsaved_jobs:
            if not job_queue.renew_lease(unsaved_job, EXTENDED_LEASE_TIME):
                manager.logger.error("Unsaved job: %s timed out", unsaved_job)

    job = job_queue.lease(lease_secs=TIMEOUT + DWELL_TIME + 30, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    # The completion callback removes entries from unsaved_jobs while holding
    # this lock, so the list is mutated under the same lock here.
    with unsaved_jobs_lock:
        unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)

    # A job is "<rank>,<site>". Split only on the first comma so that sites
    # whose URL itself contains a comma do not break the unpacking.
    site_rank, site = job.decode("utf-8").split(",", 1)
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s...", site)

    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job
    )
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
else:
    # The loop contains no ``break``, so this runs once the queue reports empty.
    manager.logger.info("Job queue finished, exiting.")

manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")