Bug 1533330 - Update tooltool.py with optional support of taskcluster auth tokens r=rail

With this change tooltool.py also supports Taskcluster credentials passed
(in JSON format) via the --authentication-file option. RelengAPI tokens keep
working with this patch; Taskcluster support is purely additive.
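
For example (illustrative values, not real credentials), the authentication
file may now contain either a plain RelengAPI bearer token or a Taskcluster
credentials object:

    $ cat relengapi.tok           # RelengAPI bearer token, as before
    <opaque bearer token>

    $ cat taskcluster.json        # Taskcluster credentials, new in this patch
    {"clientId": "example/client-id", "accessToken": "example-access-token"}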

Differential Revision: https://phabricator.services.mozilla.com/D27881

--HG--
extra : moz-landing-system : lando
Rok Garbas 2019-04-24 13:41:46 +00:00
Parent d16102663e
Commit c3b5a7c6d6
2 changed files with 533 additions and 15 deletions

View file

@@ -22,12 +22,20 @@
# in which the manifest file resides and it should be called
# 'manifest.tt'
from __future__ import print_function
import base64
import calendar
import hashlib
import hmac
import httplib
import json
import logging
import math
import optparse
import os
import pprint
import re
import shutil
import sys
import tarfile
@@ -43,8 +51,20 @@ from subprocess import Popen
__version__ = '1'
# Allowed request header characters:
# !#$%&'()*+,-./:;<=>?@[]^_`{|}~ and space, a-z, A-Z, 0-9, \, "
REQUEST_HEADER_ATTRIBUTE_CHARS = re.compile(
r"^[ a-zA-Z0-9_\!#\$%&'\(\)\*\+,\-\./\:;<\=>\?@\[\]\^`\{\|\}~]*$")
DEFAULT_MANIFEST_NAME = 'manifest.tt'
TOOLTOOL_PACKAGE_SUFFIX = '.TOOLTOOL-PACKAGE'
HAWK_VER = 1
PY3 = sys.version_info[0] == 3
if PY3:
# TODO: py3 coverage
six_binary_type = bytes # pragma: no cover
else:
six_binary_type = str
log = logging.getLogger(__name__)
@@ -77,10 +97,208 @@ class MissingFileException(ExceptionWithFilename):
pass
class InvalidCredentials(Exception):
pass
class BadHeaderValue(Exception):
pass
def parse_url(url):
url_parts = urlparse.urlparse(url)
url_dict = {
'scheme': url_parts.scheme,
'hostname': url_parts.hostname,
'port': url_parts.port,
'path': url_parts.path,
'resource': url_parts.path,
'query': url_parts.query,
}
if len(url_dict['query']) > 0:
url_dict['resource'] = '%s?%s' % (url_dict['resource'], # pragma: no cover
url_dict['query'])
if url_parts.port is None:
if url_parts.scheme == 'http':
url_dict['port'] = 80
elif url_parts.scheme == 'https': # pragma: no cover
url_dict['port'] = 443
return url_dict
def utc_now(offset_in_seconds=0.0):
return int(math.floor(calendar.timegm(time.gmtime()) + float(offset_in_seconds)))
def random_string(length):
return base64.urlsafe_b64encode(os.urandom(length))[:length]
def prepare_header_val(val):
if isinstance(val, six_binary_type):
val = val.decode('utf-8')
if not REQUEST_HEADER_ATTRIBUTE_CHARS.match(val):
raise BadHeaderValue( # pragma: no cover
'header value={val} contained an illegal character'.format(val=repr(val)))
return val
def parse_content_type(content_type): # pragma: no cover
if content_type:
return content_type.split(';')[0].strip().lower()
else:
return ''
def calculate_payload_hash(algorithm, payload, content_type): # pragma: no cover
parts = [
part if isinstance(part, six_binary_type) else part.encode('utf8')
for part in ['hawk.' + str(HAWK_VER) + '.payload\n',
parse_content_type(content_type) + '\n',
payload or '',
'\n',
]
]
p_hash = hashlib.new(algorithm)
p_hash.update(b''.join(parts))
log.debug('calculating payload hash from:\n{parts}'.format(parts=pprint.pformat(parts)))
return base64.b64encode(p_hash.digest())
def validate_taskcluster_credentials(credentials):
if not hasattr(credentials, '__getitem__'):
raise InvalidCredentials('credentials must be a dict-like object') # pragma: no cover
try:
credentials['clientId']
credentials['accessToken']
except KeyError: # pragma: no cover
etype, val, tb = sys.exc_info()
raise InvalidCredentials('{etype}: {val}'.format(etype=etype, val=val))
def normalize_header_attr(val):
if isinstance(val, six_binary_type):
return val.decode('utf-8')
return val # pragma: no cover
def normalize_string(mac_type,
timestamp,
nonce,
method,
name,
host,
port,
content_hash,
):
return '\n'.join([
normalize_header_attr(header)
# The blank lines are important. They follow what the Node Hawk lib does.
for header in ['hawk.' + str(HAWK_VER) + '.' + mac_type,
timestamp,
nonce,
method or '',
name or '',
host,
port,
content_hash or '',
'', # for ext which is empty in this case
'', # Add trailing new line.
]
])
def calculate_mac(mac_type,
access_token,
algorithm,
timestamp,
nonce,
method,
name,
host,
port,
content_hash,
):
normalized = normalize_string(mac_type,
timestamp,
nonce,
method,
name,
host,
port,
content_hash)
log.debug(u'normalized resource for mac calc: {norm}'.format(norm=normalized))
digestmod = getattr(hashlib, algorithm)
# Make sure we are about to hash binary strings.
if not isinstance(normalized, six_binary_type):
normalized = normalized.encode('utf8')
if not isinstance(access_token, six_binary_type):
access_token = access_token.encode('ascii')
result = hmac.new(access_token, normalized, digestmod)
return base64.b64encode(result.digest())
def make_taskcluster_header(credentials, req):
validate_taskcluster_credentials(credentials)
url = req.get_full_url()
method = req.get_method()
algorithm = 'sha256'
timestamp = str(utc_now())
nonce = random_string(6)
url_parts = parse_url(url)
content_hash = None
if req.has_data():
content_hash = calculate_payload_hash( # pragma: no cover
algorithm,
req.get_data(),
req.get_method(),
)
mac = calculate_mac('header',
credentials['accessToken'],
algorithm,
timestamp,
nonce,
method,
url_parts['resource'],
url_parts['hostname'],
str(url_parts['port']),
content_hash,
)
header = u'Hawk mac="{}"'.format(prepare_header_val(mac))
if content_hash: # pragma: no cover
header = u'{}, hash="{}"'.format(header, prepare_header_val(content_hash))
header = u'{header}, id="{id}", ts="{ts}", nonce="{nonce}"'.format(
header=header,
id=prepare_header_val(credentials['clientId']),
ts=prepare_header_val(timestamp),
nonce=prepare_header_val(nonce),
)
log.debug('Hawk header for URL={} method={}: {}'.format(url, method, header))
return header
class FileRecord(object):
def __init__(self, filename, size, digest, algorithm, unpack=False,
version=None, visibility=None):
version=None, visibility=None, setup=None):
object.__init__(self)
if '/' in filename or '\\' in filename:
log.error(
@@ -93,6 +311,7 @@ class FileRecord(object):
self.unpack = unpack
self.version = version
self.visibility = visibility
self.setup = setup
def __eq__(self, other):
if self is other:
@@ -184,6 +403,8 @@ class FileRecordJSONEncoder(json.JSONEncoder):
rv['version'] = obj.version
if obj.visibility is not None:
rv['visibility'] = obj.visibility
if obj.setup:
rv['setup'] = obj.setup
return rv
def default(self, f):
@@ -229,9 +450,10 @@ class FileRecordJSONDecoder(json.JSONDecoder):
unpack = obj.get('unpack', False)
version = obj.get('version', None)
visibility = obj.get('visibility', None)
setup = obj.get('setup')
rv = FileRecord(
obj['filename'], obj['size'], obj['digest'], obj['algorithm'],
unpack, version, visibility)
unpack, version, visibility, setup)
log.debug("materialized %s" % rv)
return rv
return obj
@@ -334,7 +556,7 @@ def digest_file(f, a):
def execute(cmd):
"""Execute CMD, logging its stdout at the info level"""
process = Popen(cmd, shell=True, stdout=PIPE)
process = Popen(cmd, shell=True, stdout=PIPE, bufsize=0)
while True:
line = process.stdout.readline()
if not line:
@@ -535,7 +757,7 @@ def _compute_cache_checksum(filename):
return digest_file(f, "sha256")
def unpack_file(filename):
def unpack_file(filename, setup=None):
"""Untar `filename`, assuming it is uncompressed or compressed with bzip2,
xz, gzip, or unzip a zip file. The file is assumed to contain a single
directory with a name matching the base of the given filename.
@@ -579,6 +801,9 @@ def unpack_file(filename):
log.error("Unknown archive extension for filename '%s'" % filename)
return False
if setup and not execute(os.path.join(base_file, setup)):
return False
with open(base_file + CHECKSUM_SUFFIX, "wb") as f:
f.write(checksum)
@@ -609,6 +834,9 @@ def fetch_files(manifest_file, base_urls, filenames=[], cache_folder=None,
# Files that we want to unpack.
unpack_files = []
# Setup for unpacked files.
setup_files = {}
# Lets go through the manifest and fetch the files that we want
for f in manifest.file_records:
# case 1: files are already present
@@ -668,6 +896,13 @@ def fetch_files(manifest_file, base_urls, filenames=[], cache_folder=None,
else:
log.debug("skipping %s" % f.filename)
if f.setup:
if f.unpack:
setup_files[f.filename] = f.setup
else:
log.error("'setup' requires 'unpack' being set for %s" % f.filename)
failed_files.append(f.filename)
# lets ensure that fetched files match what the manifest specified
for localfile, temp_file_name in fetched_files:
# since I downloaded to a temp file, I need to perform all validations on the temp file
@@ -710,7 +945,7 @@ def fetch_files(manifest_file, base_urls, filenames=[], cache_folder=None,
# Unpack files that need to be unpacked.
for filename in unpack_files:
if not unpack_file(filename):
if not unpack_file(filename, setup_files.get(filename)):
failed_files.append(filename)
# If we failed to fetch or validate a file, we need to fail
@@ -778,10 +1013,25 @@ def _log_api_error(e):
def _authorize(req, auth_file):
if auth_file:
log.debug("using bearer token in %s" % auth_file)
req.add_unredirected_header('Authorization',
'Bearer %s' % (open(auth_file, "rb").read().strip()))
if not auth_file:
return
is_taskcluster_auth = False
with open(auth_file) as f:
auth_file_content = f.read().strip()
try:
auth_file_content = json.loads(auth_file_content)
is_taskcluster_auth = True
except ValueError:
pass
if is_taskcluster_auth:
taskcluster_header = make_taskcluster_header(auth_file_content, req)
log.debug("Using taskcluster credentials in %s" % auth_file)
req.add_unredirected_header('Authorization', taskcluster_header)
else:
log.debug("Using Bearer token in %s" % auth_file)
req.add_unredirected_header('Authorization', 'Bearer %s' % auth_file_content)
def _send_batch(base_url, auth_file, batch, region):
@@ -808,7 +1058,7 @@ def _s3_upload(filename, file):
try:
req_path = "%s?%s" % (url.path, url.query) if url.query else url.path
conn.request('PUT', req_path, open(filename, "rb"),
{'Content-type': 'application/octet-stream'})
{'Content-Type': 'application/octet-stream'})
resp = conn.getresponse()
resp_body = resp.read()
conn.close()
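
For reference, a minimal sketch of the new code path (illustrative URL and
file names; it assumes the py2 urllib2 stack this script already targets):

    import json
    import urllib2

    req = urllib2.Request('https://example-tooltool-server/sha512/<digest>')
    with open('taskcluster.json') as f:
        creds = json.load(f)  # {"clientId": ..., "accessToken": ...}
    # make_taskcluster_header() computes the Hawk MAC over method, resource,
    # host, port, timestamp and nonce, and returns a header shaped like:
    #   Hawk mac="...", id="example/client-id", ts="1556113306", nonce="..."
    req.add_unredirected_header('Authorization',
                                make_taskcluster_header(creds, req))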

View file

@@ -22,12 +22,20 @@
# in which the manifest file resides and it should be called
# 'manifest.tt'
from __future__ import print_function
import base64
import calendar
import hashlib
import hmac
import httplib
import json
import logging
import math
import optparse
import os
import pprint
import re
import shutil
import sys
import tarfile
@@ -43,8 +51,20 @@ from subprocess import Popen
__version__ = '1'
# Allowed request header characters:
# !#$%&'()*+,-./:;<=>?@[]^_`{|}~ and space, a-z, A-Z, 0-9, \, "
REQUEST_HEADER_ATTRIBUTE_CHARS = re.compile(
r"^[ a-zA-Z0-9_\!#\$%&'\(\)\*\+,\-\./\:;<\=>\?@\[\]\^`\{\|\}~]*$")
DEFAULT_MANIFEST_NAME = 'manifest.tt'
TOOLTOOL_PACKAGE_SUFFIX = '.TOOLTOOL-PACKAGE'
HAWK_VER = 1
PY3 = sys.version_info[0] == 3
if PY3:
# TODO: py3 coverage
six_binary_type = bytes # pragma: no cover
else:
six_binary_type = str
log = logging.getLogger(__name__)
@@ -77,6 +97,204 @@ class MissingFileException(ExceptionWithFilename):
pass
class InvalidCredentials(Exception):
pass
class BadHeaderValue(Exception):
pass
def parse_url(url):
url_parts = urlparse.urlparse(url)
url_dict = {
'scheme': url_parts.scheme,
'hostname': url_parts.hostname,
'port': url_parts.port,
'path': url_parts.path,
'resource': url_parts.path,
'query': url_parts.query,
}
if len(url_dict['query']) > 0:
url_dict['resource'] = '%s?%s' % (url_dict['resource'], # pragma: no cover
url_dict['query'])
if url_parts.port is None:
if url_parts.scheme == 'http':
url_dict['port'] = 80
elif url_parts.scheme == 'https': # pragma: no cover
url_dict['port'] = 443
return url_dict
def utc_now(offset_in_seconds=0.0):
return int(math.floor(calendar.timegm(time.gmtime()) + float(offset_in_seconds)))
def random_string(length):
return base64.urlsafe_b64encode(os.urandom(length))[:length]
def prepare_header_val(val):
if isinstance(val, six_binary_type):
val = val.decode('utf-8')
if not REQUEST_HEADER_ATTRIBUTE_CHARS.match(val):
raise BadHeaderValue( # pragma: no cover
'header value={val} contained an illegal character'.format(val=repr(val)))
return val
def parse_content_type(content_type): # pragma: no cover
if content_type:
return content_type.split(';')[0].strip().lower()
else:
return ''
def calculate_payload_hash(algorithm, payload, content_type): # pragma: no cover
parts = [
part if isinstance(part, six_binary_type) else part.encode('utf8')
for part in ['hawk.' + str(HAWK_VER) + '.payload\n',
parse_content_type(content_type) + '\n',
payload or '',
'\n',
]
]
p_hash = hashlib.new(algorithm)
p_hash.update(b''.join(parts))
log.debug('calculating payload hash from:\n{parts}'.format(parts=pprint.pformat(parts)))
return base64.b64encode(p_hash.digest())
def validate_taskcluster_credentials(credentials):
if not hasattr(credentials, '__getitem__'):
raise InvalidCredentials('credentials must be a dict-like object') # pragma: no cover
try:
credentials['clientId']
credentials['accessToken']
except KeyError: # pragma: no cover
etype, val, tb = sys.exc_info()
raise InvalidCredentials('{etype}: {val}'.format(etype=etype, val=val))
def normalize_header_attr(val):
if isinstance(val, six_binary_type):
return val.decode('utf-8')
return val # pragma: no cover
def normalize_string(mac_type,
timestamp,
nonce,
method,
name,
host,
port,
content_hash,
):
return '\n'.join([
normalize_header_attr(header)
# The blank lines are important. They follow what the Node Hawk lib does.
for header in ['hawk.' + str(HAWK_VER) + '.' + mac_type,
timestamp,
nonce,
method or '',
name or '',
host,
port,
content_hash or '',
'', # for ext which is empty in this case
'', # Add trailing new line.
]
])
def calculate_mac(mac_type,
access_token,
algorithm,
timestamp,
nonce,
method,
name,
host,
port,
content_hash,
):
normalized = normalize_string(mac_type,
timestamp,
nonce,
method,
name,
host,
port,
content_hash)
log.debug(u'normalized resource for mac calc: {norm}'.format(norm=normalized))
digestmod = getattr(hashlib, algorithm)
# Make sure we are about to hash binary strings.
if not isinstance(normalized, six_binary_type):
normalized = normalized.encode('utf8')
if not isinstance(access_token, six_binary_type):
access_token = access_token.encode('ascii')
result = hmac.new(access_token, normalized, digestmod)
return base64.b64encode(result.digest())
def make_taskcluster_header(credentials, req):
validate_taskcluster_credentials(credentials)
url = req.get_full_url()
method = req.get_method()
algorithm = 'sha256'
timestamp = str(utc_now())
nonce = random_string(6)
url_parts = parse_url(url)
content_hash = None
if req.has_data():
content_hash = calculate_payload_hash( # pragma: no cover
algorithm,
req.get_data(),
req.get_method(),
)
mac = calculate_mac('header',
credentials['accessToken'],
algorithm,
timestamp,
nonce,
method,
url_parts['resource'],
url_parts['hostname'],
str(url_parts['port']),
content_hash,
)
header = u'Hawk mac="{}"'.format(prepare_header_val(mac))
if content_hash: # pragma: no cover
header = u'{}, hash="{}"'.format(header, prepare_header_val(content_hash))
header = u'{header}, id="{id}", ts="{ts}", nonce="{nonce}"'.format(
header=header,
id=prepare_header_val(credentials['clientId']),
ts=prepare_header_val(timestamp),
nonce=prepare_header_val(nonce),
)
log.debug('Hawk header for URL={} method={}: {}'.format(url, method, header))
return header
class FileRecord(object):
def __init__(self, filename, size, digest, algorithm, unpack=False,
@@ -483,6 +701,8 @@ def fetch_file(base_urls, file_record, grabchunk=1024 * 4, auth_file=None, regio
k = True
size = 0
while k:
# TODO: print statistics as file transfers happen both for info and to stop
# buildbot timeouts
indata = f.read(grabchunk)
out.write(indata)
size += len(indata)
@@ -517,14 +737,39 @@ def clean_path(dirname):
shutil.rmtree(dirname)
CHECKSUM_SUFFIX = ".checksum"
def _cache_checksum_matches(base_file, checksum):
try:
with open(base_file + CHECKSUM_SUFFIX, "rb") as f:
prev_checksum = f.read().strip()
if prev_checksum == checksum:
log.info("Cache matches, avoiding extracting in '%s'" % base_file)
return True
return False
except IOError:
return False
def _compute_cache_checksum(filename):
with open(filename, "rb") as f:
return digest_file(f, "sha256")
def unpack_file(filename, setup=None):
"""Untar `filename`, assuming it is uncompressed or compressed with bzip2,
xz, gzip, or unzip a zip file. The file is assumed to contain a single
directory with a name matching the base of the given filename.
Xz support is handled by shelling out to 'tar'."""
checksum = _compute_cache_checksum(filename)
if tarfile.is_tarfile(filename):
tar_file, zip_ext = os.path.splitext(filename)
base_file, tar_ext = os.path.splitext(tar_file)
if _cache_checksum_matches(base_file, checksum):
return True
clean_path(base_file)
log.info('untarring "%s"' % filename)
tar = tarfile.open(filename)
@@ -532,12 +777,16 @@ def unpack_file(filename, setup=None):
tar.close()
elif filename.endswith('.tar.xz'):
base_file = filename.replace('.tar.xz', '')
if _cache_checksum_matches(base_file, checksum):
return True
clean_path(base_file)
log.info('untarring "%s"' % filename)
if not execute('tar -Jxf %s 2>&1' % filename):
return False
elif zipfile.is_zipfile(filename):
base_file = filename.replace('.zip', '')
if _cache_checksum_matches(base_file, checksum):
return True
clean_path(base_file)
log.info('unzipping "%s"' % filename)
z = zipfile.ZipFile(filename)
@@ -549,6 +798,10 @@ def unpack_file(filename, setup=None):
if setup and not execute(os.path.join(base_file, setup)):
return False
with open(base_file + CHECKSUM_SUFFIX, "wb") as f:
f.write(checksum)
return True
@@ -755,10 +1008,25 @@ def _log_api_error(e):
def _authorize(req, auth_file):
if auth_file:
log.debug("using bearer token in %s" % auth_file)
req.add_unredirected_header('Authorization',
'Bearer %s' % (open(auth_file, "rb").read().strip()))
if not auth_file:
return
is_taskcluster_auth = False
with open(auth_file) as f:
auth_file_content = f.read().strip()
try:
auth_file_content = json.loads(auth_file_content)
is_taskcluster_auth = True
except ValueError:
pass
if is_taskcluster_auth:
taskcluster_header = make_taskcluster_header(auth_file_content, req)
log.debug("Using taskcluster credentials in %s" % auth_file)
req.add_unredirected_header('Authorization', taskcluster_header)
else:
log.debug("Using Bearer token in %s" % auth_file)
req.add_unredirected_header('Authorization', 'Bearer %s' % auth_file_content)
def _send_batch(base_url, auth_file, batch, region):
@@ -785,7 +1053,7 @@ def _s3_upload(filename, file):
try:
req_path = "%s?%s" % (url.path, url.query) if url.query else url.path
conn.request('PUT', req_path, open(filename, "rb"),
{'Content-type': 'application/octet-stream'})
{'Content-Type': 'application/octet-stream'})
resp = conn.getresponse()
resp_body = resp.read()
conn.close()
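
The new 'setup' manifest field above only takes effect together with
'unpack'. A manifest entry using both might look like this (illustrative
filename, size, and digest):

    [
      {
        "filename": "toolchain.tar.xz",
        "size": 12345678,
        "algorithm": "sha512",
        "digest": "<sha512 hex digest>",
        "unpack": true,
        "setup": "setup.sh"
      }
    ]

After unpacking toolchain.tar.xz, unpack_file() runs toolchain/setup.sh via
execute() and fails the fetch if the script exits with a non-zero status.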