2021-02-22 19:51:32 +03:00
|
|
|
from pathlib import Path
|
|
|
|
|
2021-01-09 13:15:01 +03:00
|
|
|
from custom_command import LinkCountingCommand
|
2020-11-25 22:33:56 +03:00
|
|
|
from openwpm.command_sequence import CommandSequence
|
2021-01-09 13:15:01 +03:00
|
|
|
from openwpm.commands.browser_commands import GetCommand
|
2020-12-02 12:10:45 +03:00
|
|
|
from openwpm.config import BrowserParams, ManagerParams
|
2021-02-22 19:51:32 +03:00
|
|
|
from openwpm.storage.sql_provider import SQLiteStorageProvider
|
2020-12-02 12:10:45 +03:00
|
|
|
from openwpm.task_manager import TaskManager
|
2018-08-01 09:48:06 +03:00
|
|
|
|
2014-07-01 20:37:17 +04:00
|
|
|
# Number of browser instances requested from the manager
NUM_BROWSERS = 1

# The list of sites that we wish to crawl
sites = [
    "http://www.example.com",
    "http://www.princeton.edu",
    "http://citp.princeton.edu/",
]

# Start from the default ManagerParams, told how many browsers to expect ...
manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
# ... and build NUM_BROWSERS separate headless BrowserParams, one per browser
browser_params = [
    BrowserParams(display_mode="headless") for _unused in range(NUM_BROWSERS)
]
|
2014-07-01 20:37:17 +04:00
|
|
|
|
2015-09-14 18:05:50 +03:00
|
|
|
# Update browser configuration (use this for per-browser settings).
# Iterate the parameter objects directly instead of indexing by position;
# every entry in browser_params gets the same instrumentation switched on.
for browser_param in browser_params:
    # Record HTTP Requests and Responses
    browser_param.http_instrument = True
    # Record cookie changes
    browser_param.cookie_instrument = True
    # Record Navigations
    browser_param.navigation_instrument = True
    # Record JS Web API calls
    browser_param.js_instrument = True
    # Record the callstack of all WebRequests made
    browser_param.callstack_instrument = True
    # Record DNS resolution
    browser_param.dns_instrument = True
|
2014-07-01 20:37:17 +04:00
|
|
|
|
2015-09-14 18:05:50 +03:00
|
|
|
# Update TaskManager configuration (use this for crawl-wide settings).
# Crawl data and logs are both written beneath the same directory.
output_directory = Path("./datadir/")
manager_params.data_directory = output_directory
manager_params.log_directory = output_directory

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True
|
2014-07-01 20:37:17 +04:00
|
|
|
|
|
|
|
|
2021-02-22 19:51:32 +03:00
|
|
|
# Commands time out by default after 60 seconds
storage_provider = SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite"))
with TaskManager(manager_params, browser_params, storage_provider, None) as manager:
    # Visits the sites
    for index, site in enumerate(sites):

        def callback(success: bool, val: str = site) -> None:
            """Print whether the CommandSequence for ``val`` succeeded.

            ``val`` is bound to the current site through its default value, so
            each callback reports its own URL instead of whatever ``site``
            happens to be when the callback is finally invoked.
            """
            outcome = "successfully" if success else "unsuccessfully"
            print(f"CommandSequence for {val} ran {outcome}")

        # One CommandSequence per site; site_rank is the site's position in
        # the list and callback is passed along with it.
        command_sequence = CommandSequence(site, site_rank=index, callback=callback)

        # Start by visiting the page
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        # Have a look at custom_command.py to see how to implement your own command
        command_sequence.append_command(LinkCountingCommand())

        # Hand the sequence to the TaskManager, which was configured with
        # NUM_BROWSERS browser(s)
        manager.execute_command_sequence(command_sequence)
|