from __future__ import absolute_import

import logging
import os
import time

import boto3
import sentry_sdk
from six.moves import range

from automation import CommandSequence, TaskManager
from automation.utilities import rediswq
from test.utilities import LocalS3Session, local_s3_bucket

# Configuration via environment variables
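# Boolean options are parsed by comparison against the string '1', e.g.:
#   NUM_BROWSERS=2 SAVE_JAVASCRIPT=1 TIMEOUT=90 python crawler.py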
NUM_BROWSERS = int(os.getenv('NUM_BROWSERS', '1'))
REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_QUEUE_NAME = os.getenv('REDIS_QUEUE_NAME', 'crawl-queue')
CRAWL_DIRECTORY = os.getenv('CRAWL_DIRECTORY', 'crawl-data')
S3_BUCKET = os.getenv('S3_BUCKET', 'openwpm-crawls')
HTTP_INSTRUMENT = os.getenv('HTTP_INSTRUMENT', '1') == '1'
COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1'
NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1'
JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1'
SAVE_JAVASCRIPT = os.getenv('SAVE_JAVASCRIPT', '0') == '1'
DWELL_TIME = int(os.getenv('DWELL_TIME', '10'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
SENTRY_DSN = os.getenv('SENTRY_DSN', None)

# Loads the default manager params
# and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
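# (manager_params is a dict of platform-wide settings; browser_params
# is a list of NUM_BROWSERS per-browser settings dicts)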

# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i]['http_instrument'] = HTTP_INSTRUMENT
    browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT
    browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT
    browser_params[i]['js_instrument'] = JS_INSTRUMENT
    browser_params[i]['save_javascript'] = SAVE_JAVASCRIPT
    browser_params[i]['headless'] = True

# Manager configuration
manager_params['data_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['log_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY
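# With output_format 's3', crawl records are uploaded to the S3 bucket;
# the local data/log directories above are mainly used for logs and
# temporary state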

# Allow the use of localstack's mock s3 service
S3_ENDPOINT = os.getenv('S3_ENDPOINT')
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params['s3_bucket'] = local_s3_bucket(
        boto3.resource('s3'), name=S3_BUCKET)
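# (S3_ENDPOINT would point at the localstack container's S3 port,
# e.g. http://localhost:4572 with localstack's classic port layout)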

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
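# (the TaskManager spawns NUM_BROWSERS browser processes and
# dispatches command sequences to them)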
manager = TaskManager.TaskManager(manager_params, browser_params)

# At this point, Sentry should already be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # Tags generate breakdown charts and search filters
        scope.set_tag('NUM_BROWSERS', NUM_BROWSERS)
        scope.set_tag('CRAWL_DIRECTORY', CRAWL_DIRECTORY)
        scope.set_tag('S3_BUCKET', S3_BUCKET)
        scope.set_tag('HTTP_INSTRUMENT', HTTP_INSTRUMENT)
        scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT)
        scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT)
        scope.set_tag('SAVE_JAVASCRIPT', SAVE_JAVASCRIPT)
        scope.set_tag('DWELL_TIME', DWELL_TIME)
        scope.set_tag('TIMEOUT', TIMEOUT)
        scope.set_tag('CRAWL_REFERENCE', '%s/%s' %
                      (S3_BUCKET, CRAWL_DIRECTORY))
        # Context adds additional information that may be of interest
        scope.set_context("crawl_config", {
            'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME,
        })
    # Send a Sentry message at startup (temporary - to make it easy to
    # compare error frequencies to the crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
logging.info("Worker with sessionID: %s", job_queue.sessionID())
logging.info("Initial queue state: empty=%s", job_queue.empty())

# Crawl sites specified in job queue until empty
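# (lease() reserves an item for this worker for lease_secs seconds;
# complete() marks it as done and removes it from the queue)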
while not job_queue.empty():
    # Take a lease on the next job, blocking for up to `timeout`
    # seconds if the queue is momentarily empty
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        logging.info("Waiting for work")
        time.sleep(5)
    else:
        # Jobs are "<site rank>,<site>" strings
        site_rank, site = job.decode("utf-8").split(',')
        if "://" not in site:
            site = "http://" + site
        logging.info("Visiting %s...", site)
        # reset=True gives each site visit a fresh browser profile
        command_sequence = CommandSequence.CommandSequence(
            site, reset=True
        )
        # Load the page and dwell for DWELL_TIME seconds; the command
        # as a whole times out after TIMEOUT seconds
        command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
        manager.execute_command_sequence(command_sequence)
        job_queue.complete(job)

logging.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")