import json
import logging
import os
import signal
import sys
import time
import typing
from pathlib import Path
from threading import Lock
from types import FrameType
from typing import Any, Callable, List, Literal, Optional

import sentry_sdk

from openwpm import mp_logger
from openwpm.command_sequence import CommandSequence
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.cloud_storage.gcp_storage import (
    GcsStructuredProvider,
    GcsUnstructuredProvider,
)
from openwpm.task_manager import TaskManager
from openwpm.utilities import rediswq

# Configuration via environment variables
# Crawler specific config
REDIS_HOST = os.getenv("REDIS_HOST", "redis-box")
REDIS_QUEUE_NAME = os.getenv("REDIS_QUEUE_NAME", "crawl-queue")
MAX_JOB_RETRIES = int(os.getenv("MAX_JOB_RETRIES", "2"))
DWELL_TIME = int(os.getenv("DWELL_TIME", "10"))
TIMEOUT = int(os.getenv("TIMEOUT", "60"))

# Storage Provider Params
CRAWL_DIRECTORY = os.getenv("CRAWL_DIRECTORY", "crawl-data")
GCS_BUCKET = os.getenv("GCS_BUCKET", "openwpm-crawls")
GCP_PROJECT = os.getenv("GCP_PROJECT", "")
AUTH_TOKEN = os.getenv("GCP_AUTH_TOKEN", "cloud")

# Browser Params
DISPLAY_MODE = os.getenv("DISPLAY_MODE", "headless")
assert DISPLAY_MODE in ["headless", "xvfb", "native"]
DISPLAY_MODE = typing.cast(Literal["headless", "xvfb", "native"], DISPLAY_MODE)
HTTP_INSTRUMENT = os.getenv("HTTP_INSTRUMENT", "1") == "1"
COOKIE_INSTRUMENT = os.getenv("COOKIE_INSTRUMENT", "1") == "1"
NAVIGATION_INSTRUMENT = os.getenv("NAVIGATION_INSTRUMENT", "1") == "1"
JS_INSTRUMENT = os.getenv("JS_INSTRUMENT", "1") == "1"
CALLSTACK_INSTRUMENT = os.getenv("CALLSTACK_INSTRUMENT", "1") == "1"
JS_INSTRUMENT_SETTINGS = json.loads(
    os.getenv("JS_INSTRUMENT_SETTINGS", '["collection_fingerprinting"]')
)
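# Because JS_INSTRUMENT_SETTINGS is read as JSON, alternative instrumentation
# settings can be injected via the environment without code changes. A
# hypothetical override (the list contents are illustrative, not prescribed):
#   export JS_INSTRUMENT_SETTINGS='["collection_fingerprinting"]'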

SAVE_CONTENT = os.getenv("SAVE_CONTENT", "")
PREFS = os.getenv("PREFS", None)
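# PREFS, when set, is expected to be a JSON object of Firefox preferences
# (it is json.loads()-ed into browser_params[i].prefs below). A hypothetical
# example:
#   export PREFS='{"network.dns.disableIPv6": true}'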

SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()

if CALLSTACK_INSTRUMENT is True:
    # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work
    JS_INSTRUMENT = True

EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)
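# That is, twice the worst case for a single visit: the page-load TIMEOUT,
# plus DWELL_TIME on the page, plus 30 seconds of slack; this mirrors the
# lease_secs taken out in the main loop below.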

# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/openwpm/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]

# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i].display_mode = DISPLAY_MODE
    browser_params[i].http_instrument = HTTP_INSTRUMENT
    browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
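    # SAVE_CONTENT accepts "1"/"0" as booleans; any other non-empty value is
    # handed to save_content as-is (OpenWPM can take a comma-separated list
    # of resource types to restrict what gets saved).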
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
    else:
        browser_params[i].save_content = SAVE_CONTENT
    if PREFS:
        browser_params[i].prefs = json.loads(PREFS)

# Manager configuration
manager_params.data_directory = Path("~/Desktop/") / CRAWL_DIRECTORY
manager_params.log_path = Path("~/Desktop/") / CRAWL_DIRECTORY / "openwpm.log"

structured = GcsStructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    token=AUTH_TOKEN,
)
unstructured = GcsUnstructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    token=AUTH_TOKEN,
)

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
    logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # Tags generate breakdown charts and search filters
        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        scope.set_tag("GCS_BUCKET", GCS_BUCKET)
        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
        scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
        scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
        scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
        scope.set_tag("JS_INSTRUMENT", JS_INSTRUMENT)
        scope.set_tag("JS_INSTRUMENT_SETTINGS", JS_INSTRUMENT_SETTINGS)
        scope.set_tag("CALLSTACK_INSTRUMENT", CALLSTACK_INSTRUMENT)
        scope.set_tag("SAVE_CONTENT", SAVE_CONTENT)
        scope.set_tag("DWELL_TIME", DWELL_TIME)
        scope.set_tag("TIMEOUT", TIMEOUT)
        scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES)
        scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (GCS_BUCKET, CRAWL_DIRECTORY))
        # Context adds additional information that may be of interest
        if PREFS:
            scope.set_context("PREFS", json.loads(PREFS))
        scope.set_context(
            "crawl_config",
            {
                "REDIS_QUEUE_NAME": REDIS_QUEUE_NAME,
            },
        )
    # Send a Sentry message (temporarily - to easily be able
    # to compare error frequencies to crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(
    name=REDIS_QUEUE_NAME, host=REDIS_HOST, max_retries=MAX_JOB_RETRIES
)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

unsaved_jobs: List[bytes] = list()
unsaved_jobs_lock = Lock()

shutting_down = False


def on_shutdown(
    manager: TaskManager, unsaved_jobs_lock: Lock
) -> Callable[[int, Optional[FrameType]], None]:
    def actual_callback(s: int, _: Optional[FrameType]) -> None:
        global shutting_down
        manager.logger.error("Got interrupted by %r, shutting down", s)
        with unsaved_jobs_lock:
            shutting_down = True
        manager.close(relaxed=False)
        sys.exit(1)

    return actual_callback


# Register signal listeners for shutdown
for sig in [signal.SIGTERM, signal.SIGINT]:
    signal.signal(sig, on_shutdown(manager, unsaved_jobs_lock))
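# Note: Python delivers signals to the main thread, so on SIGTERM/SIGINT the
# handler above closes the manager and then raises SystemExit (via sys.exit)
# inside the crawl loop below.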


def get_job_completion_callback(
    logger: logging.Logger,
    unsaved_jobs_lock: Lock,
    job_queue: rediswq.RedisWQ,
    job: bytes,
) -> Callable[[bool], None]:
    def callback(success: bool) -> None:
        with unsaved_jobs_lock:
            if success:
                logger.info("Job %r is done", job)
                job_queue.complete(job)
            else:
                logger.warning("Job %r got interrupted", job)
            unsaved_jobs.remove(job)

    return callback


no_job_since = None
# Crawl sites specified in job queue until empty
while not job_queue.empty():
    job_queue.check_expired_leases()
    with unsaved_jobs_lock:
        manager.logger.debug("Currently unfinished jobs are: %s", unsaved_jobs)
        for unsaved_job in unsaved_jobs:
            if not job_queue.renew_lease(unsaved_job, EXTENDED_LEASE_TIME):
                manager.logger.error("Unsaved job: %s timed out", unsaved_job)

    job = job_queue.lease(lease_secs=TIMEOUT + DWELL_TIME + 30, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue
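
    # From here on this worker holds a lease on the job: it must either be
    # completed via the callback below or have its lease renewed at the top
    # of the loop, otherwise the queue hands it to another worker once the
    # lease expires.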
    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(",")
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job
    )
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
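    # With blocking=True, execute_command_sequence does not return until the
    # visit has finished (or timed out), which keeps the single-browser job
    # accounting above sound.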
else:
    manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")