2018-05-10 03:26:40 +03:00
|
|
|
#!/usr/bin/python3 -u
|
2016-07-30 06:53:21 +03:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
|
|
|
"""Run a task after performing common actions.
|
|
|
|
|
|
|
|
This script is meant to be the "driver" for TaskCluster based tasks.
|
|
|
|
It receives some common arguments to control the run-time environment.
|
|
|
|
|
|
|
|
It performs actions as requested from the arguments. Then it executes
|
|
|
|
the requested process and prints its output, prefixing it with the
|
|
|
|
current time to improve log usefulness.
|
|
|
|
"""
|
|
|
|
|
2018-05-11 20:19:53 +03:00
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
# Refuse to run on interpreters older than the minimum supported release.
# Comparing the whole version tuple against (3, 5) orders the same way as
# comparing only its major/minor prefix.
if sys.version_info < (3, 5):
    print('run-task requires Python 3.5+')
    sys.exit(1)
|
|
|
|
|
2016-07-30 06:53:21 +03:00
|
|
|
|
|
|
|
import argparse
|
|
|
|
import datetime
|
|
|
|
import errno
|
2018-05-16 21:06:36 +03:00
|
|
|
import io
|
2016-09-02 01:38:30 +03:00
|
|
|
import json
|
2016-07-30 06:53:21 +03:00
|
|
|
import os
|
2019-05-16 00:04:57 +03:00
|
|
|
import platform
|
2019-04-11 20:19:36 +03:00
|
|
|
import random
|
2016-08-09 23:14:05 +03:00
|
|
|
import re
|
2018-06-04 23:36:28 +03:00
|
|
|
import shutil
|
2016-11-09 19:26:58 +03:00
|
|
|
import socket
|
2016-09-14 22:26:15 +03:00
|
|
|
import stat
|
2016-07-30 06:53:21 +03:00
|
|
|
import subprocess
|
2018-05-16 23:57:08 +03:00
|
|
|
|
2018-05-11 20:19:53 +03:00
|
|
|
import urllib.error
|
|
|
|
import urllib.request
|
2016-09-02 01:38:30 +03:00
|
|
|
|
|
|
|
FINGERPRINT_URL = 'http://taskcluster/secrets/v1/secret/project/taskcluster/gecko/hgfingerprint'
|
2016-10-17 17:45:02 +03:00
|
|
|
FALLBACK_FINGERPRINT = {
|
|
|
|
'fingerprints':
|
2018-11-01 00:15:30 +03:00
|
|
|
"sha256:17:38:aa:92:0b:84:3e:aa:8e:52:52:e9:4c:2f:98:a9:0e:bf:6c:3e:e9"
|
|
|
|
":15:ff:0a:29:80:f7:06:02:5b:e8:48"}
|
2016-07-30 06:53:21 +03:00
|
|
|
|
2019-04-11 20:19:36 +03:00
|
|
|
HGMOINTERNAL_CONFIG_URL = 'http://taskcluster/secrets/v1/secret/project/taskcluster/gecko/hgmointernal'
|
2016-07-30 06:53:21 +03:00
|
|
|
|
Bug 1391476 - Add UID and GID to cache parameters; r=dustin
The UID and GID that a task executes under is dynamic. As a result,
caches need to be aware of the UID and GID that owns files otherwise
subsequent tasks could run into permission denied errors. This is
why `run-task --chown-recursive` exists. By recursively changing
ownership of persisted files, we ensure the current task is able
to read and write all existing files.
When you take a step back, you realize that chowning of cached
files is an expensive workaround. Yes, this results in cache hits.
But the cost is you potentially have to perform hundreds of thousands
of I/O system calls to mass chown. The ideal situation is that
UID/GID is consistent across tasks on any given cache and
potentially expensive permissions setting can be avoided. So, that's
what this commit does.
We add the task's UID and GID to run-task's requirements. When we
first see a cache, we record a UID and GID with it and chown the
empty cache directory to that UID and GID. Subsequent tasks using
this cache *must* use the same UID and GID or else run-task will
fail.
Since run-task now guarantees that all cache consumers use the same
UID and GID, we can avoid a potentially expensive recursive chown.
But there is an exception. In untrusted environments (namely Try),
we recursively chown existing caches if there is a uid/gid mismatch.
We do this because Try is a sandbox and any random task could
experiment with a non-standard uid/gid. That populated cache would
"poison" the cache for the next caller. Or vice-versa. It would be
annoying if caches were randomly poisoned due to Try pushes that
didn't realize there was a UID/GID mismatch. We could outlaw "bad"
UID and GIDs. But that makes the barrier to testing things on Try
harder. So, we go with the flow and recursively chown caches in
this scenario.
This change will shine light on all tasks using inconsistent UID
and GID values on the same cache. Bustage is anticipated.
Unfortunately, we can't easily know what will break. So it will be
one of those things where we will have to fix problems as they arise.
Fortunately, because caches are now tied to the content of run-task,
we only need to back out this change and tasks should revert to caches
without UID and GID pinning requirements and everything will work
again.
MozReview-Commit-ID: 2ka4rOnnXIp
--HG--
extra : rebase_source : ccb2b0a9230694f989775b26d5276fd3ac928af3
extra : source : 083d2e1cc8fe44b04e44f74bda3dd8bc75ba826c
2017-08-23 02:49:26 +03:00
|
|
|
CACHE_UID_GID_MISMATCH = '''
|
|
|
|
There is a UID/GID mismatch on the cache. This likely means:
|
|
|
|
|
|
|
|
a) different tasks are running as a different user/group
|
|
|
|
b) different Docker images have different UID/GID for the same user/group
|
|
|
|
|
|
|
|
Our cache policy is that the UID/GID for ALL tasks must be consistent
|
|
|
|
for the lifetime of the cache. This eliminates permissions problems due
|
|
|
|
to file/directory user/group ownership.
|
|
|
|
|
|
|
|
To make this error go away, ensure that all Docker images are use
|
|
|
|
a consistent UID/GID and that all tasks using this cache are running as
|
|
|
|
the same user/group.
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
2017-08-23 20:47:37 +03:00
|
|
|
NON_EMPTY_VOLUME = '''
|
|
|
|
error: volume %s is not empty
|
|
|
|
|
|
|
|
Our Docker image policy requires volumes to be empty.
|
|
|
|
|
|
|
|
The volume was likely populated as part of building the Docker image.
|
|
|
|
Change the Dockerfile and anything run from it to not create files in
|
|
|
|
any VOLUME.
|
|
|
|
|
|
|
|
A lesser possibility is that you stumbled upon a TaskCluster platform bug
|
|
|
|
where it fails to use new volumes for tasks.
|
|
|
|
'''
|
|
|
|
|
2018-07-26 20:13:39 +03:00
|
|
|
|
|
|
|
FETCH_CONTENT_NOT_FOUND = '''
|
|
|
|
error: fetch-content script not found
|
|
|
|
|
|
|
|
The script at `taskcluster/scripts/misc/fetch-content` could not be
|
|
|
|
detected in the current environment.
|
|
|
|
|
|
|
|
If this task clones gecko, make sure the GECKO_PATH environment variable
|
|
|
|
is set to proper location. Otherwise, the script may need to be mounted
|
|
|
|
or added to the task's docker image then added to the PATH.
|
|
|
|
'''
|
|
|
|
|
2018-01-16 08:00:50 +03:00
|
|
|
# The exit code to use when caches should be purged and the task retried.
|
|
|
|
# This is EX_OSFILE (from sysexits.h):
|
|
|
|
# Some system file does not exist, cannot be opened, or has some
|
|
|
|
# sort of error (e.g., syntax error).
|
|
|
|
EXIT_PURGE_CACHE = 72
|
|
|
|
|
2017-08-23 20:47:37 +03:00
|
|
|
|
2018-12-21 16:43:04 +03:00
|
|
|
IS_MACOSX = sys.platform == 'darwin'
|
2018-05-05 03:11:53 +03:00
|
|
|
IS_POSIX = os.name == 'posix'
|
2018-05-05 03:33:19 +03:00
|
|
|
IS_WINDOWS = os.name == 'nt'
|
2018-05-05 03:11:53 +03:00
|
|
|
|
|
|
|
|
2016-07-30 06:53:21 +03:00
|
|
|
def print_line(prefix, m):
    """Write ``m`` to stdout tagged with ``prefix`` and the current UTC time.

    ``prefix`` and ``m`` are bytes. ``m`` is written verbatim, so the caller
    supplies any trailing newline. The emitted record has the form
    ``[<prefix> <ISO-8601 time>Z] <m>`` and stdout is flushed right away.
    """
    # datetime.utcnow() is deprecated as of Python 3.12. Take an aware UTC
    # time and drop the tzinfo so isoformat() keeps producing the same naive
    # representation (no "+00:00" suffix) that the log format relies on.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    now = now.isoformat().encode('utf-8')
    # slice microseconds to 3 decimals. isoformat() omits the fractional
    # part entirely when it is zero, hence the check for the '.' separator.
    now = now[:-3] if now[-7:-6] == b'.' else now
    sys.stdout.buffer.write(b'[%s %sZ] %s' % (prefix, now, m))
    sys.stdout.buffer.flush()
|
2016-07-30 06:53:21 +03:00
|
|
|
|
|
|
|
|
2019-07-27 09:21:35 +03:00
|
|
|
def run_and_prefix_output(prefix, args, *, extra_env=None, cwd=None):
    """Runs a process and prefixes its output with the time.

    ``prefix`` is the bytes tag handed to ``print_line()`` for every line of
    output. ``args`` is the argument list given to ``subprocess.Popen``.
    ``extra_env`` optionally maps environment variables that are layered on
    top of a copy of ``os.environ``. ``cwd`` optionally names the working
    directory of the child process.

    Returns the process exit code.
    """
    # Keep a separating space so the message reads "executing [...] in <cwd>"
    # instead of gluing "in" onto the closing bracket of the argument list.
    location = b" in %s" % (cwd.encode("utf-8"),) if cwd else b""
    print_line(prefix, b"executing %r%s\n" % (args, location))

    env = dict(os.environ)
    env.update(extra_env or {})

    # Note: TaskCluster's stdin is a TTY. This attribute is lost
    # when we pass sys.stdin to the invoked process. If we cared
    # to preserve stdin as a TTY, we could make this work. But until
    # someone needs it, don't bother.

    # We want stdout to be bytes on Python 3. That means we can't use
    # universal_newlines=True (because it implies text mode). But
    # p.stdout.readline() won't work for bytes text streams. So, on Python 3,
    # we manually install a latin1 stream wrapper. This allows us to readline()
    # and preserves bytes, without losing any data.

    p = subprocess.Popen(args,
                         # Disable buffering because we want to receive output
                         # as it is generated so timestamps in logs are
                         # accurate.
                         bufsize=0,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         stdin=sys.stdin.fileno(),
                         env=env,
                         cwd=cwd)

    # Closing the wrapper also closes the underlying pipe, so the file
    # descriptor is released deterministically once the child hits EOF.
    with io.TextIOWrapper(p.stdout, encoding='latin1') as stdout:
        while True:
            data = stdout.readline().encode('latin1')

            # An empty read means EOF: every write end of the pipe is closed.
            if data == b'':
                break

            print_line(prefix, data)

    return p.wait()
|
2016-07-30 06:53:21 +03:00
|
|
|
|
2018-01-16 08:00:50 +03:00
|
|
|
|
2018-05-05 03:11:53 +03:00
|
|
|
def get_posix_user_group(user, group):
    """Resolve ``user`` and ``group`` names to their POSIX database records.

    Returns a ``(user_record, group_record, gids)`` tuple: the ``pwd`` entry
    for ``user``, the ``grp`` entry for ``group`` and the list of gids of
    every group that lists ``user`` among its members.

    Prints a message and exits with status 1 when either name is unknown, or
    when a user/group named ``worker`` does not have the numeric ID 1000.
    """
    # Imported lazily: these modules only exist on POSIX platforms.
    import grp
    import pwd

    try:
        user_record = pwd.getpwnam(user)
    except KeyError:
        print('could not find user %s; specify a valid user with --user' % user)
        sys.exit(1)

    try:
        group_record = grp.getgrnam(group)
    except KeyError:
        print('could not find group %s; specify a valid group with --group' %
              group)
        sys.exit(1)

    # Most tasks use worker:worker. We require they have a specific numeric ID
    # because otherwise it is too easy for files written to caches to have
    # mismatched numeric IDs, which results in permissions errors.
    if user_record.pw_name == 'worker' and user_record.pw_uid != 1000:
        print('user `worker` must have uid=1000; got %d' % user_record.pw_uid)
        sys.exit(1)

    if group_record.gr_name == 'worker' and group_record.gr_gid != 1000:
        print('group `worker` must have gid=1000; got %d' % group_record.gr_gid)
        sys.exit(1)

    # Find all groups to which this user is a member. gr_mem holds member
    # *user* names, so membership has to be tested with the user name; testing
    # the group name only happened to work when both names were identical.
    gids = [g.gr_gid for g in grp.getgrall() if user in g.gr_mem]

    return user_record, group_record, gids
|
|
|
|
|
|
|
|
|
2018-05-05 03:41:45 +03:00
|
|
|
def write_audit_entry(path, msg):
    """Append one timestamped record to the audit log at ``path``.

    ``msg`` is bytes. The record has the form
    ``[<ISO-8601 UTC time>Z <TASK_ID>] <msg>`` followed by a newline, where
    the task ID comes from the ``TASK_ID`` environment variable and falls
    back to ``UNKNOWN`` when it is not set.
    """
    # datetime.utcnow() is deprecated as of Python 3.12. Dropping the tzinfo
    # from an aware UTC time keeps the serialized timestamp unchanged.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    stamp = now.isoformat().encode('utf-8')
    with open(path, 'ab') as fh:
        fh.write(b'[%sZ %s] %s\n' % (
            stamp, os.environb.get(b'TASK_ID', b'UNKNOWN'), msg))
|
2018-05-05 03:41:45 +03:00
|
|
|
|
|
|
|
|
2017-08-18 03:06:16 +03:00
|
|
|
# Owner read/write/execute: the minimum a directory needs so that its owner
# can list, enter and modify (including delete from) it.
WANTED_DIR_MODE = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR


def set_dir_permissions(path, uid, gid):
    """Give ``path`` the requested owner/group and owner rwx permissions.

    The entry is inspected without following symlinks. Ownership and mode are
    only touched when they differ from what is wanted.
    """
    info = os.lstat(path)

    owner_matches = (info.st_uid, info.st_gid) == (uid, gid)
    if not owner_matches:
        os.chown(path, uid, gid)

    # Also make sure dirs are writable in case we need to delete
    # them.
    missing_bits = WANTED_DIR_MODE & ~info.st_mode
    if missing_bits:
        os.chmod(path, info.st_mode | WANTED_DIR_MODE)
|
|
|
|
|
|
|
|
|
|
|
|
def chown_recursive(path, user, group, uid, gid):
    """Hand ``path`` and everything beneath it to ``uid``:``gid``.

    ``user`` and ``group`` are the textual names, used only for the log
    message. Directories go through ``set_dir_permissions()`` (ownership plus
    owner rwx bits); other entries only have their ownership changed.
    """
    message = b'recursively changing ownership of %s to %s:%s\n' % (
        path.encode('utf-8'), user.encode('utf-8'), group.encode('utf-8'))
    print_line(b'chown', message)

    # os.walk() does not report the top directory as a child, so handle it
    # explicitly before descending.
    set_dir_permissions(path, uid, gid)

    for parent, subdirs, filenames in os.walk(path):
        for name in subdirs:
            set_dir_permissions(os.path.join(parent, name), uid, gid)

        for name in filenames:
            # File may be a symlink that points to nowhere. In which case
            # os.chown() would fail because it attempts to follow the
            # symlink. We only care about directory entries, not what
            # they point to. So setting the owner of the symlink should
            # be sufficient.
            os.lchown(os.path.join(parent, name), uid, gid)
|
|
|
|
|
|
|
|
|
2018-05-05 03:54:07 +03:00
|
|
|
def configure_cache_posix(cache, user, group,
                          untrusted_caches, running_as_root):
    """Configure a cache path on POSIX platforms.

    For each cache, we write out a special file denoting attributes and
    capabilities of run-task and the task being executed. These attributes
    are used by subsequent run-task invocations to validate that use of
    the cache is acceptable.

    We /could/ blow away the cache data on requirements mismatch.
    While this would be convenient, this could result in "competing" tasks
    effectively undoing the other's work. This would slow down task
    execution in aggregate. Without monitoring for this, people may not notice
    the problem and tasks would be slower than they could be. We follow the
    principle of "fail fast" to ensure optimal task execution.

    We also write an audit log of who used the caches. This log is printed
    during failures to help aid debugging.

    Arguments:
      cache: path of the cache directory (str).
      user: record exposing ``pw_uid`` and ``pw_name`` for the task user.
      group: record exposing ``gr_gid`` and ``gr_name`` for the task group.
      untrusted_caches: truthy when a uid/gid mismatch may be repaired by a
        recursive chown instead of being treated as an error.
      running_as_root: truthy when the process runs as root; required for
        the chown repair path.

    Returns ``True`` when the cache cannot be used (requirements mismatch or
    a populated cache without a ``.cacherequires`` file) and ``None``
    otherwise.
    """

    our_requirements = {
        # Include a version string that we can bump whenever to trigger
        # fresh caches. The actual value is not relevant and doesn't need
        # to follow any explicit order. Since taskgraph bakes this file's
        # hash into cache names, any change to this file/version is sufficient
        # to force the use of a new cache.
        b'version=1',
        # Include the UID and GID the task will run as to ensure that tasks
        # with different UID and GID don't share the same cache.
        b'uid=%d' % user.pw_uid,
        b'gid=%d' % group.gr_gid,
    }

    # Both bookkeeping files live inside the cache directory itself.
    requires_path = os.path.join(cache, '.cacherequires')
    audit_path = os.path.join(cache, '.cachelog')

    # The cache is empty. Configure it.
    if not os.listdir(cache):
        print_line(b'cache', b'cache %s is empty; writing requirements: '
                   b'%s\n' % (
                       cache.encode('utf-8'), b' '.join(sorted(our_requirements))))

        # We write a requirements file so future invocations know what the
        # requirements are.
        with open(requires_path, 'wb') as fh:
            fh.write(b'\n'.join(sorted(our_requirements)))

        # And make it read-only as a precaution against deletion.
        os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

        write_audit_entry(audit_path,
                          b'created; requirements: %s' %
                          b', '.join(sorted(our_requirements)))

        set_dir_permissions(cache, user.pw_uid, group.gr_gid)
        return

    # The cache has content and we have a requirements file. Validate
    # requirements alignment.
    if os.path.exists(requires_path):
        with open(requires_path, 'rb') as fh:
            wanted_requirements = set(fh.read().splitlines())

        print_line(b'cache', b'cache %s exists; requirements: %s\n' % (
            cache.encode('utf-8'), b' '.join(sorted(wanted_requirements))))

        # Requirements recorded in the cache that this task does not satisfy.
        missing = wanted_requirements - our_requirements

        # Allow requirements mismatch for uid/gid if and only if caches
        # are untrusted. This allows cache behavior on Try to be
        # reasonable. Otherwise, random tasks could "poison" cache
        # usability by introducing uid/gid mismatches. For untrusted
        # environments like Try, this is a perfectly reasonable thing to
        # allow.
        if missing and untrusted_caches and running_as_root and \
                all(s.startswith((b'uid=', b'gid=')) for s in missing):
            print_line(b'cache',
                       b'cache %s uid/gid mismatch; this is acceptable '
                       b'because caches for this task are untrusted; '
                       b'changing ownership to facilitate cache use\n' %
                       cache.encode('utf-8'))
            chown_recursive(cache, user.pw_name, group.gr_name, user.pw_uid,
                            group.gr_gid)

            # And write out the updated reality.
            with open(requires_path, 'wb') as fh:
                fh.write(b'\n'.join(sorted(our_requirements)))

            write_audit_entry(audit_path,
                              b'chown; requirements: %s' %
                              b', '.join(sorted(our_requirements)))

        elif missing:
            # Any other mismatch is fatal: report both requirement sets, log
            # the failure and dump the audit trail to help debugging.
            print('error: requirements for populated cache %s differ from '
                  'this task' % cache)
            print('cache requirements: %s' % ' '.join(sorted(
                s.decode('utf-8') for s in wanted_requirements)))
            print('our requirements:   %s' % ' '.join(sorted(
                s.decode('utf-8') for s in our_requirements)))
            if any(s.startswith((b'uid=', b'gid=')) for s in missing):
                print(CACHE_UID_GID_MISMATCH)

            write_audit_entry(audit_path,
                              b'requirements mismatch; wanted: %s' %
                              b', '.join(sorted(our_requirements)))

            print('')
            print('audit log:')
            with open(audit_path, 'r') as fh:
                print(fh.read())

            return True
        else:
            write_audit_entry(audit_path, b'used')

        # We don't need to adjust permissions here because the cache is
        # associated with a uid/gid and the first task should have set
        # a proper owner/group.

        return

    # The cache has content and no requirements file. This shouldn't
    # happen because run-task should be the first thing that touches a
    # cache.
    print('error: cache %s is not empty and is missing a '
          '.cacherequires file; the cache names for this task are '
          'likely mis-configured or TASKCLUSTER_CACHES is not set '
          'properly' % cache)

    write_audit_entry(audit_path, b'missing .cacherequires')
    return True
|
|
|
|
|
|
|
|
|
2018-05-05 04:00:44 +03:00
|
|
|
def configure_volume_posix(volume, user, group, running_as_root):
    """Validate that ``volume`` is empty and, as root, hand it to the task user.

    Exits with status 1 when the volume already contains entries.
    """
    # The only time we should see files in the volume is if the Docker
    # image build put files there.
    #
    # For the sake of simplicity, our policy is that volumes should be
    # empty. This also has the advantage that an empty volume looks
    # a lot like an empty cache. Tasks can rely on caches being
    # swapped in and out on any volume without any noticeable change
    # of behavior.
    entries = os.listdir(volume)
    if entries:
        print(NON_EMPTY_VOLUME % volume)
        listing = ' '.join(sorted(entries))
        print('entries in root directory: %s' % listing)
        sys.exit(1)

    # Without root we cannot change ownership, so there is nothing left to do.
    if not running_as_root:
        return

    # The volume is almost certainly owned by root:root. Chown it so it
    # is writable.
    notice = b'changing ownership of volume %s ' b'to %d:%d\n' % (
        volume.encode('utf-8'), user.pw_uid, group.gr_gid)
    print_line(b'volume', notice)
    set_dir_permissions(volume, user.pw_uid, group.gr_gid)
|
|
|
|
|
|
|
|
|
2016-10-20 22:56:43 +03:00
|
|
|
def vcs_checkout(source_repo, dest, store_path,
                 base_repo=None, revision=None, branch=None,
                 fetch_hgfingerprint=False, sparse_profile=None):
    """Check out ``source_repo`` into ``dest`` via ``hg robustcheckout``.

    Arguments:
      source_repo: URL of the repository to pull the revision from.
      dest: directory in which the working copy is created/updated.
      store_path: directory passed to Mercurial as ``--sharebase``.
      base_repo: optional URL passed as ``--upstream``.
      revision: revision to check out; takes precedence over ``branch``.
      branch: symbolic revision used when ``revision`` is not given.
      fetch_hgfingerprint: when true, pin the hg.mozilla.org certificate
        fingerprint obtained from the secrets service, or the built-in
        fallback when the service cannot be reached.
      sparse_profile: optional value passed as ``--sparseprofile``.

    Returns the 40 character hexadecimal node of the checked out revision.

    Exits the process when neither ``revision`` nor ``branch`` is given, when
    the Mercurial executable is missing on Windows, when the fingerprint
    secret is not valid JSON, or when the checkout command fails (using that
    command's exit code).
    """
    # Specify method to checkout a revision. This defaults to revisions as
    # SHA-1 strings, but also supports symbolic revisions like `tip` via the
    # branch flag.
    if revision:
        revision_flag = '--revision'
        revision_value = revision
    elif branch:
        revision_flag = '--branch'
        revision_value = branch
    else:
        print('revision is not specified for checkout')
        sys.exit(1)

    if IS_MACOSX:
        release = platform.mac_ver()
        versionNums = release[0].split('.')[:2]
        os_version = "%s.%s" % (versionNums[0], versionNums[1])
        hg_bin = '/tools/python27-mercurial/bin/hg'
        if os_version == "10.14":
            hg_bin = 'hg'
    elif IS_POSIX:
        hg_bin = 'hg'
    elif IS_WINDOWS:
        # This is where OCC installs it in the AMIs.
        hg_bin = r'C:\Program Files\Mercurial\hg.exe'
        if not os.path.exists(hg_bin):
            print('could not find Mercurial executable: %s' % hg_bin)
            sys.exit(1)

    store_path = os.path.abspath(store_path)
    args = [
        hg_bin,
        'robustcheckout',
        '--sharebase', store_path,
        '--purge',
    ]

    # Obtain certificate fingerprints.  Without this, the checkout will use the fingerprint
    # on the system, which is managed some other way (such as puppet)
    if fetch_hgfingerprint:
        try:
            print_line(b'vcs', b'fetching hg.mozilla.org fingerprint from %s\n' %
                       FINGERPRINT_URL.encode('utf-8'))
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(FINGERPRINT_URL, timeout=10) as res:
                secret = res.read()
            try:
                secret = json.loads(secret.decode('utf-8'))
            except ValueError:
                # print_line() writes the message verbatim, so it needs its
                # own newline to avoid running into the next log record.
                print_line(b'vcs', b'invalid JSON in hg fingerprint secret\n')
                sys.exit(1)
        except (urllib.error.URLError, socket.timeout):
            print_line(b'vcs', b'Unable to retrieve current hg.mozilla.org fingerprint '
                               b'using the secret service, using fallback instead.\n')
            # XXX This fingerprint will not be accurate if running on an old
            # revision after the server fingerprint has changed.
            secret = {'secret': FALLBACK_FINGERPRINT}

        hgmo_fingerprint = secret['secret']['fingerprints']
        args.extend([
            '--config', 'hostsecurity.hg.mozilla.org:fingerprints=%s' % hgmo_fingerprint,
        ])

    if base_repo:
        args.extend(['--upstream', base_repo])
    if sparse_profile:
        args.extend(['--sparseprofile', sparse_profile])

    dest = os.path.abspath(dest)
    args.extend([
        revision_flag, revision_value,
        source_repo, dest,
    ])

    res = run_and_prefix_output(b'vcs', args,
                                extra_env={'PYTHONUNBUFFERED': '1'})
    if res:
        # Propagate the checkout command's failure code as our own.
        sys.exit(res)

    # Update the current revision hash and ensure that it is well formed.
    revision = subprocess.check_output(
        [hg_bin, 'log',
         '--rev', '.',
         '--template', '{node}'],
        cwd=dest,
        # Triggers text mode on Python 3.
        universal_newlines=True)

    assert re.match('^[a-f0-9]{40}$', revision)

    msg = ("TinderboxPrint:<a href={source_repo}/rev/{revision} "
           "title='Built from {repo_name} revision {revision}'>"
           "{revision}</a>\n".format(revision=revision,
                                     source_repo=source_repo,
                                     repo_name=source_repo.split('/')[-1]))

    print_line(b'vcs', msg.encode('utf-8'))

    return revision
|
2016-08-09 23:14:05 +03:00
|
|
|
|
2016-07-30 06:53:21 +03:00
|
|
|
|
2018-08-17 19:37:21 +03:00
|
|
|
def fetch_artifacts():
    """Locate the ``fetch-content`` script and run it to fetch task artifacts.

    The script is looked up on ``PATH`` first, then under ``GECKO_PATH`` and
    finally next to this file. Exits with status 1 when no candidate is a
    regular file. A failing ``fetch-content`` run raises
    ``subprocess.CalledProcessError``.
    """
    print_line(b'fetches', b'fetching artifacts\n')

    script = shutil.which('fetch-content')
    gecko_path = os.environ.get('GECKO_PATH')

    if not script and gecko_path:
        script = os.path.join(gecko_path, 'taskcluster', 'scripts', 'misc',
                              'fetch-content')

    # Last resort: a copy of the script living alongside run-task itself.
    if not (script and os.path.isfile(script)):
        here = os.path.dirname(__file__)
        script = os.path.join(here, 'fetch-content')

    if not os.path.isfile(script):
        print(FETCH_CONTENT_NOT_FOUND)
        sys.exit(1)

    # -u keeps the child's output unbuffered.
    cmd = [sys.executable, '-u', script, 'task-artifacts']
    print_line(b'fetches', b'executing %r\n' % cmd)
    subprocess.run(cmd, check=True, env=os.environ)
    print_line(b'fetches', b'finished fetching artifacts\n')
|
|
|
|
|
|
|
|
|
2018-12-31 21:52:30 +03:00
|
|
|
def add_vcs_arguments(parser, project, name):
    """Register the VCS related command line options of one project.

    ``project`` is the option prefix (``--<project>-checkout`` and
    ``--<project>-sparse-profile``); ``name`` is the human readable project
    name interpolated into the help texts.
    """
    option_table = (
        ('checkout', 'Directory where %s checkout should be created'),
        ('sparse-profile', 'Path to sparse profile for %s checkout'),
    )
    for suffix, help_template in option_table:
        flag = '--%s-%s' % (project, suffix)
        parser.add_argument(flag, help=help_template % name)
|
|
|
|
|
|
|
|
|
2019-04-11 20:19:36 +03:00
|
|
|
def resolve_checkout_url(base_repo, head_repo):
    """Choose the Mercurial URLs a checkout should be performed against.

    Either the public hg.mozilla.org service or a CI-only regional mirror is
    selected, based on a config fetched from the Taskcluster secrets service.
    Returns a ``(base_repo, head_repo)`` tuple; the inputs are returned
    unchanged whenever the mirror is not used or the config is unusable.

    The config will be of the form:
        {
            "us-west-2": {  # value of `TASKCLUSTER_WORKER_GROUP`
                "rate": 0.5,
                "domain": "us-west-2.hgmointernal.net"
            },
            "us-east-1": {...}
        }
    """
    def to_mirror(repo, domain):
        # Only URLs on the public service are redirected to the mirror.
        if repo and repo.startswith('https://hg.mozilla.org'):
            return repo.replace('hg.mozilla.org', domain, 1)
        return repo

    region = os.getenv('TASKCLUSTER_WORKER_GROUP')

    try:
        print_line(b'vcs', b'fetching hgmointernal config from %s\n' %
                   HGMOINTERNAL_CONFIG_URL.encode('utf-8'))

        # Get the hgmointernal config Taskcluster secret
        response = urllib.request.urlopen(HGMOINTERNAL_CONFIG_URL, timeout=10)
        payload = json.loads(response.read().decode('utf-8'))
        mirrors = payload['secret']

        # Use public hg service if region not yet supported
        if region not in mirrors:
            if not region:
                print_line(b'vcs', b'unspecified region; using public '
                                   b'hg.mozilla.org service\n')
            else:
                print_line(b'vcs', b'region %s not yet supported; using public '
                                   b'hg.mozilla.org service\n' % region.encode('utf-8'))

            return base_repo, head_repo

        region_config = mirrors[region]

        # Only send a percentage of traffic to the internal mirror
        rate = float(region_config['rate'])

        if random.random() > rate:
            print_line(b'vcs', b'hgmointernal rate miss; using '
                               b'public hg.mozilla.org service\n')
            return base_repo, head_repo

        print_line(b'vcs', b'hgmointernal rate hit; cloning from '
                           b'private hgweb mirror\n')

        mirror_domain = region_config['domain']

        base_repo = to_mirror(base_repo, mirror_domain)
        head_repo = to_mirror(head_repo, mirror_domain)

        return base_repo, head_repo

    except (KeyError, ValueError):
        print_line(b'vcs', b'invalid JSON in hgmointernal config; '
                           b'falling back to public hg.mozilla.org service\n')

    except (urllib.error.URLError, socket.timeout):
        print_line(b'vcs', b'Unable to retrieve hgmointernal config using '
                           b'the secret service; falling back to public hg.mozilla.org '
                           b'service\n')

    # Reached only after one of the handled failures above.
    return base_repo, head_repo
|
|
|
|
|
|
|
|
|
2018-12-31 21:52:30 +03:00
|
|
|
def collect_vcs_options(args, project):
    """Gather the checkout settings of ``project`` into a dict.

    Paths come from the parsed command line (``args``); repository URLs and
    revisions come from ``<PROJECT>_*`` environment variables and the store
    path from ``HG_STORE_PATH``.
    """
    env_prefix = project.upper()

    def from_env(suffix):
        return os.environ.get('%s_%s' % (env_prefix, suffix))

    checkout = getattr(args, '%s_checkout' % project)
    sparse_profile = getattr(args, '%s_sparse_profile' % project)

    base_repo = from_env('BASE_REPOSITORY')
    head_repo = from_env('HEAD_REPOSITORY')
    revision = from_env('HEAD_REV')
    branch = from_env('HEAD_REF')

    store_path = os.environ.get('HG_STORE_PATH')

    # Expand ~ in some paths.
    checkout = os.path.expanduser(checkout) if checkout else checkout
    store_path = os.path.expanduser(store_path) if store_path else store_path

    # Some callers set the base repository to mozilla-central for historical
    # reasons. Switch to mozilla-unified because robustcheckout works best
    # with it.
    if base_repo == 'https://hg.mozilla.org/mozilla-central':
        base_repo = 'https://hg.mozilla.org/mozilla-unified'

    # No need to check the hgmointernal config if we aren't performing
    # a checkout.
    if checkout:
        base_repo, head_repo = resolve_checkout_url(base_repo, head_repo)

    options = {}
    options['store-path'] = store_path
    options['project'] = project
    options['env-prefix'] = env_prefix
    options['checkout'] = checkout
    options['sparse-profile'] = sparse_profile
    options['base-repo'] = base_repo
    options['head-repo'] = head_repo
    options['revision'] = revision
    options['branch'] = branch
    return options
|
|
|
|
|
|
|
|
|
2018-12-31 21:50:34 +03:00
|
|
|
def vcs_checkout_from_args(args, project):
    """Perform the checkout for ``project`` if one was requested.

    Options are resolved through ``collect_vcs_options()``.

    When no checkout path was given nothing is checked out; the only work
    done is a sanity check that exits the process with status 1 if a
    symbolic ref (``<PROJECT>_HEAD_REF``) is set without a concrete revision
    (``<PROJECT>_HEAD_REV``).

    Otherwise ``vcs_checkout()`` is invoked and its return value is stored
    back into the ``<PROJECT>_HEAD_REV`` environment variable.
    """
    options = collect_vcs_options(args, project)

    if not options['checkout']:
        if options['branch'] and not options['revision']:
            print('task should be defined in terms of non-symbolic revision')
            sys.exit(1)
        return

    os.environ['%s_HEAD_REV' % options['env-prefix']] = vcs_checkout(
        options['head-repo'],
        options['checkout'],
        options['store-path'],
        base_repo=options['base-repo'],
        revision=options['revision'],
        fetch_hgfingerprint=args.fetch_hgfingerprint,
        branch=options['branch'],
        sparse_profile=options['sparse-profile'])
|
|
|
|
|
|
|
|
|
2016-07-30 06:53:21 +03:00
|
|
|
def main(args):
    """Entry point: prepare the environment, then run the requested task.

    ``args`` is the command-line argument list (without the program name).
    Everything before the first ``--`` is parsed as run-task's own options;
    everything after it is the task command passed to
    ``run_and_prefix_output()``.

    Steps, in order:

    1. Parse options and, when running as root on POSIX, resolve the target
       user/group.
    2. Validate caches listed in ``TASKCLUSTER_CACHES`` and configure
       volumes listed in ``TASKCLUSTER_VOLUMES`` (both variables are removed
       from the environment so child processes do not see them).
    3. Prepare the checkout directory and the shared hg store.
    4. Drop root privileges (POSIX, when running as root).
    5. Perform the requested VCS checkouts, fetch artifacts if
       ``MOZ_FETCHES`` is set, and run the task.

    Returns 1 on a validation error, ``EXIT_PURGE_CACHE`` when
    ``configure_cache_posix()`` asks for a purge, and otherwise whatever
    ``run_and_prefix_output()`` returns. Nested helpers may also terminate
    the process through ``sys.exit(1)``.
    """
    print_line(b'setup', b'run-task started in %s\n' % os.getcwd().encode('utf-8'))
    running_as_root = IS_POSIX and os.getuid() == 0

    # Arguments up to '--' are ours. After are for the main task
    # to be executed.
    try:
        i = args.index('--')
        our_args = args[0:i]
        task_args = args[i + 1:]
    except ValueError:
        our_args = args
        task_args = []

    parser = argparse.ArgumentParser()
    parser.add_argument('--user', default='worker', help='user to run as')
    parser.add_argument('--group', default='worker', help='group to run as')
    parser.add_argument('--task-cwd', help='directory to run the provided command in')

    add_vcs_arguments(parser, 'gecko', 'Firefox')
    add_vcs_arguments(parser, 'comm', 'Comm')

    parser.add_argument('--fetch-hgfingerprint', action='store_true',
                        help='Fetch the latest hgfingerprint from the secrets store, '
                        'using the taskclusterProxy')

    args = parser.parse_args(our_args)

    # user/group must be bound even when we are not root: they are passed
    # to the cache and volume helpers below regardless of privilege level.
    # NOTE(review): assumes configure_cache_posix()/configure_volume_posix()
    # accept None for user/group when not running as root -- confirm.
    user = group = None
    uid = gid = gids = None
    if IS_POSIX and running_as_root:
        user, group, gids = get_posix_user_group(args.user, args.group)
        uid = user.pw_uid
        gid = group.gr_gid

    if running_as_root and os.path.exists("/dev/kvm"):
        # Ensure kvm permissions for worker, required for Android x86
        st = os.stat("/dev/kvm")
        os.chmod("/dev/kvm", st.st_mode | 0o666)

    # Validate caches.
    #
    # Taskgraph should pass in a list of paths that are caches via an
    # environment variable (which we don't want to pass down to child
    # processes).

    if 'TASKCLUSTER_CACHES' in os.environ:
        caches = os.environ['TASKCLUSTER_CACHES'].split(';')
        del os.environ['TASKCLUSTER_CACHES']
    else:
        caches = []

    # Only the presence of this variable matters, not its value.
    if 'TASKCLUSTER_UNTRUSTED_CACHES' in os.environ:
        untrusted_caches = True
        del os.environ['TASKCLUSTER_UNTRUSTED_CACHES']
    else:
        untrusted_caches = False

    for cache in caches:
        if not os.path.isdir(cache):
            print('error: cache %s is not a directory; this should never '
                  'happen' % cache)
            return 1

        purge = configure_cache_posix(cache, user, group, untrusted_caches,
                                      running_as_root)

        if purge:
            return EXIT_PURGE_CACHE

    if 'TASKCLUSTER_VOLUMES' in os.environ:
        volumes = os.environ['TASKCLUSTER_VOLUMES'].split(';')
        del os.environ['TASKCLUSTER_VOLUMES']
    else:
        volumes = []

    if volumes and not IS_POSIX:
        print('assertion failed: volumes not expected on Windows')
        return 1

    # Sanitize volumes.
    for volume in volumes:
        # If a volume is a cache, it was dealt with above.
        if volume in caches:
            print_line(b'volume', b'volume %s is a cache\n' %
                       volume.encode('utf-8'))
            continue

        configure_volume_posix(volume, user, group, running_as_root)

    all_caches_and_volumes = set(map(os.path.normpath, caches))
    all_caches_and_volumes |= set(map(os.path.normpath, volumes))

    def path_in_cache_or_volume(path):
        """Return True if ``path`` or any of its ancestors is a cache/volume."""
        path = os.path.normpath(path)

        while path:
            if path in all_caches_and_volumes:
                return True

            # Walk one level up; an empty tail means we reached the root.
            path, child = os.path.split(path)
            if not child:
                break

        return False

    def prepare_checkout_dir(checkout):
        """Create and chown the parent directory of ``checkout`` (if set)."""
        if not checkout:
            return

        # The checkout path becomes the working directory. Since there are
        # special cache files in the cache's root directory and working
        # directory purging could blow them away, disallow this scenario.
        if os.path.exists(os.path.join(checkout, '.cacherequires')):
            print('error: cannot perform vcs checkout into cache root: %s' %
                  checkout)
            sys.exit(1)

        # TODO given the performance implications, consider making this a fatal
        # error.
        if not path_in_cache_or_volume(checkout):
            print_line(b'vcs', b'WARNING: vcs checkout path (%s) not in cache '
                               b'or volume; performance will likely suffer\n' %
                               checkout.encode('utf-8'))

        # Ensure the directory for the source checkout exists.
        try:
            os.makedirs(os.path.dirname(checkout))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        # And that it is owned by the appropriate user/group.
        if running_as_root:
            os.chown(os.path.dirname(checkout), uid, gid)

    def prepare_hg_store_path():
        """Create and chown the directory named by ``HG_STORE_PATH``."""
        # And ensure the shared store path exists and has proper permissions.
        if 'HG_STORE_PATH' not in os.environ:
            print('error: HG_STORE_PATH environment variable not set')
            sys.exit(1)

        store_path = os.environ['HG_STORE_PATH']

        if not path_in_cache_or_volume(store_path):
            print_line(b'vcs', b'WARNING: HG_STORE_PATH (%s) not in cache or '
                               b'volume; performance will likely suffer\n' %
                               store_path.encode('utf-8'))

        try:
            os.makedirs(store_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        if running_as_root:
            os.chown(store_path, uid, gid)

    prepare_checkout_dir(args.gecko_checkout)
    if args.gecko_checkout or args.comm_checkout:
        prepare_hg_store_path()

    if IS_POSIX and running_as_root:
        # Drop permissions to requested user.
        # This code is modeled after what `sudo` was observed to do in a Docker
        # container. We do not bother calling setrlimit() because containers have
        # their own limits.
        print_line(b'setup', b'running as %s:%s\n' % (
            args.user.encode('utf-8'), args.group.encode('utf-8')))

        os.setgroups(gids)
        os.umask(0o22)
        os.setresgid(gid, gid, gid)
        os.setresuid(uid, uid, uid)

    vcs_checkout_from_args(args, 'gecko')
    vcs_checkout_from_args(args, 'comm')

    try:
        # Make these paths absolute before the task runs.
        for k in ('GECKO_PATH', 'MOZ_FETCHES_DIR', 'UPLOAD_DIR'):
            if k in os.environ:
                os.environ[k] = os.path.abspath(os.environ[k])
                print_line(b'setup', b'%s is %s\n' % (
                    k.encode('utf-8'),
                    os.environ[k].encode('utf-8')))

        if 'MOZ_FETCHES' in os.environ:
            fetch_artifacts()

        return run_and_prefix_output(b'task', task_args, cwd=args.task_cwd)
    finally:
        # Always remove the fetches directory, even if the task failed.
        fetches_dir = os.environ.get('MOZ_FETCHES_DIR')
        if fetches_dir and os.path.isdir(fetches_dir):
            print_line(b'fetches', b'removing %s\n' % fetches_dir.encode('utf-8'))
            shutil.rmtree(fetches_dir)
            print_line(b'fetches', b'finished\n')
|
2016-07-30 06:53:21 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run main() on the command-line arguments (program name
# stripped) and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
|