diff --git a/release/docker/beet-mover/Dockerfile b/release/docker/beet-mover/Dockerfile new file mode 100644 index 000000000000..f109c0e247da --- /dev/null +++ b/release/docker/beet-mover/Dockerfile @@ -0,0 +1,19 @@ +FROM ubuntu:vivid + +RUN apt-get -q update \ + && apt-get install --yes -q \ + mercurial \ + python-dev \ + python-pip \ + python-virtualenv \ + libffi-dev \ + libssl-dev \ + libyaml-dev \ + libmysqlclient-dev \ + clamav \ + clamav-freshclam \ + curl \ + wget \ + && apt-get clean + +RUN freshclam --verbose diff --git a/testing/mozharness/external_tools/extract_and_run_command.py b/testing/mozharness/external_tools/extract_and_run_command.py new file mode 100644 index 000000000000..ab48ee1dff6e --- /dev/null +++ b/testing/mozharness/external_tools/extract_and_run_command.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python +"""\ +Usage: extract_and_run_command.py [-j N] [command to run] -- [files and/or directories] + -j is the number of workers to start, defaulting to 1. + [command to run] must be a command that can accept one or many files + to process as arguments. + +WARNING: This script does NOT respond to SIGINT. You must use SIGQUIT or SIGKILL to + terminate it early. 
+ """ + +### The canonical location for this file is +### https://hg.mozilla.org/build/tools/file/default/stage/extract_and_run_command.py +### +### Please update the copy in puppet to deploy new changes to +### stage.mozilla.org, see +# https://wiki.mozilla.org/ReleaseEngineering/How_To/Modify_scripts_on_stage + +import logging +import os +from os import path +import sys +from Queue import Queue +import shutil +import subprocess +import tempfile +from threading import Thread +import time + +logging.basicConfig( + stream=sys.stdout, level=logging.INFO, format="%(message)s") +log = logging.getLogger(__name__) + +try: + # the future - https://github.com/mozilla/build-mar via a venv + from mardor.marfile import BZ2MarFile +except: + # the past - http://hg.mozilla.org/build/tools/file/default/buildfarm/utils/mar.py + sys.path.append( + path.join(path.dirname(path.realpath(__file__)), "../buildfarm/utils")) + from mar import BZ2MarFile + +SEVENZIP = "7za" + + +def extractMar(filename, tempdir): + m = BZ2MarFile(filename) + m.extractall(path=tempdir) + + +def extractExe(filename, tempdir): + try: + # We don't actually care about output, but we redirect to a tempfile + # to avoid deadlocking in wait() when stdout=PIPE + fd = tempfile.TemporaryFile() + proc = subprocess.Popen([SEVENZIP, 'x', '-o%s' % tempdir, filename], + stdout=fd, stderr=subprocess.STDOUT) + proc.wait() + except subprocess.CalledProcessError: + # Not all EXEs are 7-zip files, so we have to ignore extraction errors + pass + +# The keys here are matched against the filename extension from path.splitext(). +# The values are callables that accept two string arguments. 
+EXTRACTORS = { + '.mar': extractMar, + '.exe': extractExe, +} + + +def find_files(d): + """yields all of the files in `d'""" + for root, dirs, files in os.walk(d): + for f in files: + yield path.abspath(path.join(root, f)) + + +def rchmod(d, mode=0755): + """chmods everything in `d' to `mode', including `d' itself""" + os.chmod(d, mode) + for root, dirs, files in os.walk(d): + for item in dirs: + os.chmod(path.join(root, item), mode) + for item in files: + os.chmod(path.join(root, item), mode) + + +def maybe_extract(filename): + """If an extractor is found for `filename', extracts it to a temporary + directory and chmods it. The consumer is responsible for removing + the extracted files, if desired.""" + ext = path.splitext(filename)[1] + if ext not in EXTRACTORS.keys(): + return None + # Append the full filepath to the tempdir + tempdir_root = tempfile.mkdtemp() + tempdir = path.join(tempdir_root, filename.lstrip('/')) + os.makedirs(tempdir) + EXTRACTORS[ext](filename, tempdir) + rchmod(tempdir_root) + return tempdir_root + + +def process(item, command): + def format_time(t): + return time.strftime("%H:%M:%S", time.localtime(t)) + # Buffer output to avoid interleaving of multiple workers' output + logs = [] + args = [item] + proc = None + start = time.time() + logs.append("START %s: %s" % (format_time(start), item)) + # If the file was extracted, we need to process all of its files, too. + tempdir = maybe_extract(item) + if tempdir: + for f in find_files(tempdir): + args.append(f) + + try: + fd = tempfile.TemporaryFile() + proc = subprocess.Popen(command + args, stdout=fd) + proc.wait() + if proc.returncode != 0: + raise Exception("returned %s" % proc.returncode) + finally: + if tempdir: + shutil.rmtree(tempdir) + fd.seek(0) + # rstrip() here to avoid an unnecessary newline, if it exists. 
+ logs.append(fd.read().rstrip()) + end = time.time() + elapsed = end - start + logs.append("END %s (%d seconds elapsed): %s\n" % ( + format_time(end), elapsed, item)) + # Now that we've got all of our output, print it. It's important that + # the logging module is used for this, because "print" is not + # thread-safe. + log.info("\n".join(logs)) + + +def worker(command, errors): + item = q.get() + while item != None: + try: + process(item, command) + except: + errors.put(item) + item = q.get() + +if __name__ == '__main__': + # getopt is used in favour of optparse to enable "--" as a separator + # between the command and list of files. optparse doesn't allow that. + from getopt import getopt + options, args = getopt(sys.argv[1:], 'j:h', ['help']) + + concurrency = 1 + for o, a in options: + if o == '-j': + concurrency = int(a) + elif o in ('-h', '--help'): + log.info(__doc__) + sys.exit(0) + + if len(args) < 3 or '--' not in args: + log.error(__doc__) + sys.exit(1) + + command = [] + while args[0] != "--": + command.append(args.pop(0)) + args.pop(0) + + q = Queue() + errors = Queue() + threads = [] + for i in range(concurrency): + t = Thread(target=worker, args=(command, errors)) + t.start() + threads.append(t) + + # find_files is a generator, so work will begin prior to it finding + # all of the files + for arg in args: + if path.isfile(arg): + q.put(arg) + else: + for f in find_files(arg): + q.put(f) + # Because the workers are started before we start populating the q + # they can't use .empty() to determine whether or not they're done. + # We also can't use q.join() or q.task_done(), because we need to + # support Python 2.4. We know that find_files won't yield None, + # so we can detect doneness by having workers die when they get None + # as an item. 
+ for i in range(concurrency): + q.put(None) + + for t in threads: + t.join() + + if not errors.empty(): + log.error("Command failed for the following files:") + while not errors.empty(): + log.error(" %s" % errors.get()) + sys.exit(1) diff --git a/testing/mozharness/scripts/release/beet_mover.py b/testing/mozharness/scripts/release/beet_mover.py index 0d6ebf037c97..4a0634acd142 100755 --- a/testing/mozharness/scripts/release/beet_mover.py +++ b/testing/mozharness/scripts/release/beet_mover.py @@ -6,17 +6,21 @@ # ***** END LICENSE BLOCK ***** """beet_mover.py. -downloads artifacts and uploads them to s3 +downloads artifacts, scans them and uploads them to s3 """ import hashlib import sys import os import pprint +import re +from os import listdir +from os.path import isfile, join sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0]))) from mozharness.base.log import FATAL from mozharness.base.python import VirtualenvMixin from mozharness.base.script import BaseScript +import mozharness def get_hash(content, hash_type="md5"): @@ -85,8 +89,39 @@ CONFIG_OPTIONS = [ "default": False, "help": "taskcluster task id to download artifacts from", }], + [["--exclude"], { + "dest": "excludes", + "action": "append", + "help": "List of filename patterns to exclude. 
See script source for default", + }], + [["-s", "--scan-parallelization"], { + "dest": "scan_parallelization", + "default": 4, + "type": "int", + "help": "Number of concurrent file scans", + }], ] +DEFAULT_EXCLUDES = [ + r"^.*tests.*$", + r"^.*crashreporter.*$", + r"^.*\.zip(\.asc)?$", + r"^.*\.log$", + r"^.*\.txt$", + r"^.*\.asc$", + r"^.*/partner-repacks.*$", + r"^.*.checksums(\.asc)?$", + r"^.*/logs/.*$", + r"^.*/jsshell.*$", + r"^.*json$", + r"^.*/host.*$", + r"^.*/mar-tools/.*$", + r"^.*gecko-unsigned-unaligned.apk$", + r"^.*robocop.apk$", + r"^.*contrib.*" +] +CACHE_DIR = 'cache' + class BeetMover(BaseScript, VirtualenvMixin, object): def __init__(self, aws_creds): @@ -98,6 +133,8 @@ class BeetMover(BaseScript, VirtualenvMixin, object): 'activate-virtualenv', 'generate-candidates-manifest', 'verify-bits', # beets + 'download-bits', # beets + 'scan-bits', # beets 'upload-bits', # beets ], 'require_config_file': False, @@ -111,6 +148,8 @@ class BeetMover(BaseScript, VirtualenvMixin, object): "boto", "PyYAML", "Jinja2", + "redo", + "mar", ], "virtualenv_path": "venv", 'buckets': { @@ -120,6 +159,7 @@ class BeetMover(BaseScript, VirtualenvMixin, object): 'product': 'firefox', }, } + #todo do excludes need to be configured via command line for specific builds? 
super(BeetMover, self).__init__(**beetmover_kwargs) c = self.config @@ -128,6 +168,10 @@ class BeetMover(BaseScript, VirtualenvMixin, object): self.virtualenv_imports = None self.bucket = c['buckets']['production'] if c['production'] else c['buckets']['development'] self.aws_key_id, self.aws_secret_key = aws_creds + # if excludes is set from command line, use it otherwise use defaults + self.excludes = self.config.get('excludes', DEFAULT_EXCLUDES) + dirs = self.query_abs_dirs() + self.dest_dir = os.path.join(dirs['abs_work_dir'], CACHE_DIR) def activate_virtualenv(self): """ @@ -172,7 +216,7 @@ class BeetMover(BaseScript, VirtualenvMixin, object): # mirror current release folder structure "s3_prefix": 'pub/{}/candidates'.format(self.config['product']), "artifact_base_url": self.config['artifact_base_url'].format( - taskid=self.config['taskid'], subdir=self.config['artifact_sudbir'] + taskid=self.config['taskid'], subdir=self.config['artifact_subdir'] ) } self.manifest = yaml.safe_load(template.render(**template_vars)) @@ -187,37 +231,60 @@ class BeetMover(BaseScript, VirtualenvMixin, object): # TODO self.log('skipping verification. unimplemented...') + def download_bits(self): + """ + downloads list of artifacts to self.dest_dir dir based on a given manifest + """ + self.log('downloading and uploading artifacts to self_dest_dir...') + + # TODO - do we want to mirror/upload to more than one region? 
+ dirs = self.query_abs_dirs() + + for locale in self.manifest['mapping']: + for deliverable in self.manifest['mapping'][locale]: + self.log("downloading '{}' deliverable for '{}' locale".format(deliverable, locale)) + # download locally to working dir + source=self.manifest['mapping'][locale][deliverable]['artifact'] + file_name = self.retry(self.download_file, + args=[source], + kwargs={'parent_dir': dirs['abs_work_dir']}, + error_level=FATAL) + self.log('Success!') + def upload_bits(self): """ - downloads and uploads list of artifacts to s3 candidates dir based on a given manifest + uploads list of artifacts to s3 candidates dir based on a given manifest """ - self.log('downloading and uploading artifacts to s3...') + self.log('uploading artifacts to s3...') + dirs = self.query_abs_dirs() # connect to s3 boto = self.virtualenv_imports['boto'] conn = boto.connect_s3(self.aws_key_id, self.aws_secret_key) bucket = conn.get_bucket(self.bucket) + #todo change so this is not every entry in manifest - should exclude those that don't pass virus sign + #not sure how to determine this for locale in self.manifest['mapping']: for deliverable in self.manifest['mapping'][locale]: self.log("uploading '{}' deliverable for '{}' locale".format(deliverable, locale)) + #we have already downloaded the files locally so we can use that version + source = self.manifest['mapping'][locale][deliverable]['artifact'] + downloaded_file = os.path.join(dirs['abs_work_dir'], self.get_filename_from_url(source)) self.upload_bit( - source=self.manifest['mapping'][locale][deliverable]['artifact'], + source=downloaded_file, s3_key=self.manifest['mapping'][locale][deliverable]['s3_key'], bucket=bucket, ) self.log('Success!') + def upload_bit(self, source, s3_key, bucket): # TODO - do we want to mirror/upload to more than one region? 
dirs = self.query_abs_dirs() boto = self.virtualenv_imports['boto'] - # download locally - file_name = self.retry(self.download_file, - args=[source], - kwargs={'parent_dir': dirs['abs_work_dir']}, - error_level=FATAL) + #todo need to copy from dir to s3 self.info('uploading to s3 with key: {}'.format(s3_key)) key = boto.s3.key.Key(bucket) # create new key @@ -230,20 +297,46 @@ class BeetMover(BaseScript, VirtualenvMixin, object): key = bucket.new_key(s3_key) # set key value - self.retry(key.set_contents_from_filename, args=[file_name], error_level=FATAL), + self.retry(key.set_contents_from_filename, args=[source], error_level=FATAL), # key.make_public() may lead to race conditions, because # it doesn't pass version_id, so it may not set permissions bucket.set_canned_acl(acl_str='public-read', key_name=s3_key, version_id=key.version_id) else: - if not get_hash(key.get_contents_as_string()) == get_hash(open(file_name).read()): + if not get_hash(key.get_contents_as_string()) == get_hash(open(source).read()): # for now, let's halt. If necessary, we can revisit this and allow for overwrites # to the same buildnum release with different bits self.fatal("`{}` already exists with different checksum.".format(s3_key)) self.log("`{}` has the same MD5 checksum, not uploading".format(s3_key)) + def scan_bits(self): + dirs = self.query_abs_dirs() + + filenames = [f for f in listdir(dirs['abs_work_dir']) if isfile(join(dirs['abs_work_dir'], f))] + self.mkdir_p(self.dest_dir) + for file_name in filenames: + if self._matches_exclude(file_name): + self.info("Excluding {} from virus scan".format(file_name)) + else: + self.info('Copying {} to {}'.format(file_name,self.dest_dir)) + self.copyfile(os.path.join(dirs['abs_work_dir'], file_name), os.path.join(self.dest_dir,file_name)) + self._scan_files() + self.info('Emptying {}'.format(self.dest_dir)) + self.rmtree(self.dest_dir) + + def _scan_files(self): + """Scan the files we've collected. 
The scan is delegated to extract_and_run_command.py, which runs + clamscan with scan_parallelization concurrent workers. Uses the venv python.""" + external_tools_path = os.path.join( + os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))), 'external_tools') + self.run_command([self.query_python_path(), os.path.join(external_tools_path,'extract_and_run_command.py'), + '-j{}'.format(self.config['scan_parallelization']), + 'clamscan', '--no-summary', '--', self.dest_dir]) + + def _matches_exclude(self, keyname): + return any(re.search(exclude, keyname) for exclude in self.excludes) if __name__ == '__main__': beet_mover = BeetMover(get_aws_auth())