# OpenWPM/crawler.py

from __future__ import absolute_import

import logging
import os
import time

import boto3
import sentry_sdk
from six.moves import range

from automation import CommandSequence, TaskManager
from automation.utilities import rediswq
from test.utilities import LocalS3Session, local_s3_bucket

# Configuration via environment variables
NUM_BROWSERS = int(os.getenv('NUM_BROWSERS', '1'))
REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_QUEUE_NAME = os.getenv('REDIS_QUEUE_NAME', 'crawl-queue')
CRAWL_DIRECTORY = os.getenv('CRAWL_DIRECTORY', 'crawl-data')
S3_BUCKET = os.getenv('S3_BUCKET', 'openwpm-crawls')
HTTP_INSTRUMENT = os.getenv('HTTP_INSTRUMENT', '1') == '1'
COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1'
NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1'
JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1'
SAVE_JAVASCRIPT = os.getenv('SAVE_JAVASCRIPT', '0') == '1'
DWELL_TIME = int(os.getenv('DWELL_TIME', '10'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
SENTRY_DSN = os.getenv('SENTRY_DSN', None)
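# The instrument flags above are booleans encoded as strings: only the
# exact value '1' enables a feature. The first four instruments default
# to on; SAVE_JAVASCRIPT defaults to off.
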
# Loads the default manager params
# and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
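# Each instrument flag toggles one of OpenWPM's collection modules:
# HTTP request/response logging, cookie change tracking, top-level
# navigation events, JavaScript API call logging, and (optionally)
# saving the JavaScript files themselves.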
# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i]['http_instrument'] = HTTP_INSTRUMENT
    browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT
    browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT
    browser_params[i]['js_instrument'] = JS_INSTRUMENT
    browser_params[i]['save_javascript'] = SAVE_JAVASCRIPT
    browser_params[i]['headless'] = True

# Manager configuration
manager_params['data_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['log_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY
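# With output_format set to 's3', crawl records are written to the
# configured bucket under s3_directory (as Parquet, in this version of
# OpenWPM) rather than to a local SQLite database; the local
# directories above still hold logs and scratch data.
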
# Allow the use of localstack's mock s3 service
S3_ENDPOINT = os.getenv('S3_ENDPOINT')
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params['s3_bucket'] = local_s3_bucket(
        boto3.resource('s3'), name=S3_BUCKET)
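# LocalS3Session points boto3 at the mock endpoint; local_s3_bucket
# (both helpers from this repo's test utilities) is presumably meant to
# create the bucket on the mock service if needed and return its name.
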
# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)
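# The TaskManager spawns one (headless) browser process per entry in
# browser_params and distributes command sequences across them.
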
# At this point, Sentry should already be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # Tags generate breakdown charts and search filters
        scope.set_tag('NUM_BROWSERS', NUM_BROWSERS)
        scope.set_tag('CRAWL_DIRECTORY', CRAWL_DIRECTORY)
        scope.set_tag('S3_BUCKET', S3_BUCKET)
        scope.set_tag('HTTP_INSTRUMENT', HTTP_INSTRUMENT)
        scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT)
        scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT)
        scope.set_tag('SAVE_JAVASCRIPT', SAVE_JAVASCRIPT)
        scope.set_tag('DWELL_TIME', DWELL_TIME)
        scope.set_tag('TIMEOUT', TIMEOUT)
        scope.set_tag('CRAWL_REFERENCE', '%s/%s' %
                      (S3_BUCKET, CRAWL_DIRECTORY))
        # Context adds additional information that may be of interest
        scope.set_context("crawl_config", {
            'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME,
        })
    # Send a Sentry message at startup (temporary, to make it easy to
    # compare error frequencies against the crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
logging.info("Worker with sessionID: %s" % job_queue.sessionID())
logging.info("Initial queue state: empty=%s" % job_queue.empty())
# Crawl sites specified in job queue until empty
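# lease() takes a job off the main queue and holds it for lease_secs;
# the job is only removed permanently once complete() is called, so a
# job whose lease expires can be detected and re-queued. block=True
# with timeout=5 waits up to five seconds before returning None.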
while not job_queue.empty():
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        logging.info("Waiting for work")
        time.sleep(5)
    else:
        site_rank, site = job.decode("utf-8").split(',')
        if "://" not in site:
            site = "http://" + site
        logging.info("Visiting %s..." % site)
        # reset=True gives each site a fresh browser profile
        # (a "stateless" visit)
        command_sequence = CommandSequence.CommandSequence(
            site, reset=True
        )
        # Navigate to the site, then dwell before the next command
        command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
        manager.execute_command_sequence(command_sequence)
        job_queue.complete(job)

logging.info("Job queue finished, exiting.")
manager.close()
if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")