From 02d1cf283bd2ec8379c851109e20d59cf9e28262 Mon Sep 17 00:00:00 2001
From: Gregory Szorc
Date: Fri, 18 Aug 2017 14:07:03 -0700
Subject: [PATCH] Bug 1391789 - Improve cache coherence via run-task
 integration; r=dustin

Today, cache names are mostly static and are brittle as a result.

In theory, when a backwards-incompatible change is performed on
something that touches a cache, the cache name needs to be changed to
ensure tasks running the old code don't see cached data from the new
task. (Alternatively, all code is forward compatible, but that is hard
to implement in practice.)

For many things, this process works as planned. However, not everyone
knows that cache names need to be changed. And it isn't always obvious
that some things require fresh caches. When mistakes are made, tasks
break intermittently due to cache wonkiness.

One area where we get into trouble is UID and GID mismatch. Task A will
use a Docker image where our standard "worker" user/group is UID/GID
1000:1000. Then Task B will use UID/GID 500:500. (This is common when
mixing Debian- and RedHat-based distros.) If they use the same cache,
Task B needs to chown/chmod all files in the cache or there could be a
permissions problem. This is exactly why run-task recursively chowns
certain paths before dropping root privileges.

Setting permissions in run-task solves permissions problems. But it
doesn't solve content incompatibility problems. For that, you need to
change cache names, not use caches, or blow away content when
incompatibilities are detected.

This commit starts the process of adding a little more coherence to our
caching story. There are two main features in this commit:

1) Cache names tied to run-task content
2) Cache validation in run-task

Taskgraph now detects when a task is using caches with run-task. When
caches and run-task are both in use, the cache name is adjusted to
contain a hash of run-task's content. When run-task changes, the cache
name changes. So, changing run-task ensures that all caches from that
point forward are "clean." This frees run-task and any functionality
related to run-task (such as maintaining version control checkouts)
from having to maintain backwards or forwards compatibility with any
other version of run-task. This does mean that any change to run-task
effectively wipes out caches. But changes to run-task tend to be
infrequent, so this should be acceptable.

The second part of this change is code in run-task to record per-cache
properties and validate whether a populated cache is appropriate for
use. To enable this, taskgraph passes a list of cache paths via an
environment variable. For each cache path, run-task looks for a
well-defined file containing a list of "requirements." Right now, that
list is simply a version string, but other features will be worked into
it. If the cache is empty, we simply write out a new requirements file
and are done. If the file exists, we compare requirements and fail fast
on mismatch. If the cache has content but not this special file, we
abort (because this should never happen).

The "requirements" validation isn't very useful yet because the only
entry comes from run-task's source code, and modifying run-task will
change the hash and cause a new cache to be used anyway. The
implementation at this point demonstrates the concept more than it does
anything terribly useful with it.
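To make the naming scheme concrete, here is a minimal sketch, assuming
hash_path() is a SHA-256 digest of file contents (the real helper lives
in taskcluster/taskgraph/util/hash.py); the level-3 cache name is
hypothetical:

    # Sketch only: derive a cache-name suffix from run-task's content.
    import hashlib

    def hash_path(path):
        # Stand-in for taskgraph's helper: SHA-256 of the file's bytes.
        h = hashlib.sha256()
        with open(path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1048576), b''):
                h.update(chunk)
        return h.hexdigest()

    suffix = hash_path('taskcluster/docker/recipes/run-task')[0:20]
    name = 'level-3-checkouts-%s' % suffix
    # Any edit to run-task changes the digest, so every cache name
    # derived this way rolls over to a fresh, empty cache.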
MozReview-Commit-ID: HtpXIc7OD1k

--HG--
extra : rebase_source : 2424696b1fde59f20152617a6ebb2afe14b94678
---
 taskcluster/docker/recipes/run-task      | 72 +++++++++++++++++++++++
 taskcluster/docs/caches.rst              | 73 +++++++++++++++++++-----
 taskcluster/taskgraph/transforms/task.py | 38 +++++++++++-
 3 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/taskcluster/docker/recipes/run-task b/taskcluster/docker/recipes/run-task
index 83b015e0c886..ddaecbb50884 100755
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -256,6 +256,78 @@ def main(args):
     else:
         uid = gid = gids = None
 
+    # Validate caches.
+    #
+    # Taskgraph should pass in a list of paths that are caches via an
+    # environment variable (which we don't want to pass down to child
+    # processes). For each cache, we write out a special file denoting
+    # attributes and capabilities of run-task and the task being
+    # executed. These attributes are used by subsequent run-task
+    # invocations to validate that use of the cache is acceptable.
+    #
+    # We /could/ blow away the cache data on requirements mismatch.
+    # While this would be convenient, it could result in "competing"
+    # tasks effectively undoing each other's work, slowing down task
+    # execution in aggregate. Without monitoring for this, people may
+    # not notice the problem, and tasks would be slower than they could
+    # be. We follow the principle of "fail fast" to ensure optimal task
+    # execution.
+
+    if 'TASKCLUSTER_CACHES' in os.environ:
+        caches = os.environ['TASKCLUSTER_CACHES'].split(';')
+        del os.environ['TASKCLUSTER_CACHES']
+    else:
+        caches = []
+
+    our_requirements = {
+        # Include a version string that we can bump whenever we need to
+        # trigger fresh caches. The actual value is not relevant and
+        # doesn't need to follow any explicit order. Since taskgraph
+        # bakes this file's hash into cache names, any change to this
+        # file/version is sufficient to force the use of a new cache.
+        b'version=1',
+    }
+
+    for cache in caches:
+        if not os.path.isdir(cache):
+            print('cache %s is not a directory; this should never happen'
+                  % cache)
+            return 1
+
+        requires_path = os.path.join(cache, '.cacherequires')
+
+        # The cache is empty. No validation necessary. Just set up our
+        # requirements file.
+        if not os.listdir(cache):
+            with open(requires_path, 'wb') as fh:
+                fh.write(b'\n'.join(sorted(our_requirements)))
+
+            # And make it read-only as a precaution against deletion.
+            os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
+
+        # The cache has content and we have a requirements file.
+        # Validate that the requirements align.
+        elif os.path.exists(requires_path):
+            with open(requires_path, 'rb') as fh:
+                wanted_requirements = set(fh.read().splitlines())
+
+            missing = wanted_requirements - our_requirements
+            if missing:
+                print('requirements for populated cache %s differ from '
+                      'this task' % cache)
+                print('cache requirements: %s' % ' '.join(sorted(
+                    wanted_requirements)))
+                print('our requirements: %s' % ' '.join(sorted(
+                    our_requirements)))
+                return 1
+
+        # The cache has content and no requirements file. This
+        # shouldn't happen because run-task should be the first thing
+        # that touches a cache.
+        else:
+            print('cache %s is not empty and is missing a .cacherequires '
+                  'file; the cache names for this task are likely '
+                  'mis-configured' % cache)
+            return 1
+
     # Change ownership of requested paths.
     # FUTURE: parse argument values for user/group if we don't want to
     # use --user/--group.
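For concreteness, a hypothetical walkthrough of the validation above;
the mount point, env value, and the uid requirement are invented for
illustration:

    # Sketch of the .cacherequires handshake (values hypothetical).
    #
    # taskgraph hands run-task the cache mount points:
    #   TASKCLUSTER_CACHES=/builds/worker/checkouts
    #
    # First use (cache empty): run-task writes .cacherequires with:
    #   version=1
    #
    # A later task whose run-task computes a smaller requirements set
    # fails fast instead of mutating the cache:
    wanted_requirements = {b'version=1', b'uid=1000'}  # read from the file
    our_requirements = {b'version=1'}                  # this invocation
    missing = wanted_requirements - our_requirements
    assert missing == {b'uid=1000'}  # non-empty -> run-task exits 1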
diff --git a/taskcluster/docs/caches.rst b/taskcluster/docs/caches.rst
index 79d3620ac707..4bc491053995 100644
--- a/taskcluster/docs/caches.rst
+++ b/taskcluster/docs/caches.rst
@@ -1,16 +1,61 @@
 .. taskcluster_caches:
 
-=============
-Common Caches
-=============
+Caches
+======
 
 There are various caches used by the in-tree tasks. This page attempts to
 document them and their appropriate use.
 
-Version Control Caches
-======================
+Caches are essentially isolated filesystems that are persisted between
+tasks. For example, if 2 tasks run on a worker - one after the other -
+and both tasks request the same cache, the second task will be able to
+see files in the cache that were created by the first task.
+
+It's also worth noting that TaskCluster workers ensure a cache can only
+be used by 1 task at a time. If a worker is simultaneously running
+multiple tasks requesting the same named cache, the worker will have
+multiple caches of the same name on the worker.
 
-``level-{{level}}-checkouts-{{version}}``
+Caches and ``run-task``
+=======================
+
+``run-task`` is our generic task wrapper script. It does common
+activities like ensuring a version control checkout is present.
+
+One of the roles of ``run-task`` is to verify and sanitize caches. It
+does this by storing state in a cache on its first use. If the recorded
+*capabilities* of an existing cache don't match expectations for the
+current task, ``run-task`` bails. This ensures that caches are only
+reused by tasks with similar execution *profiles*, preventing
+accidental cache use across incompatible tasks. It also allows
+``run-task`` to make assumptions about the state of caches, which can
+help avoid costly operations.
+
+In addition, the hash of ``run-task`` is used to derive the cache name.
+So any time ``run-task`` changes, a new set of caches is used. This
+ensures that any backwards-incompatible changes or bug fixes to
+``run-task`` result in fresh caches.
+
+Some caches are reserved for use with ``run-task``. That property is
+denoted below.
+
+Common Caches
+=============
+
+Version Control Caches
+----------------------
+
+``level-{{level}}-checkouts-{{hash}}``
+   This cache holds version control checkouts, each in a subdirectory
+   named after the repo (e.g., ``gecko``).
+
+   Checkouts should be read-only. If a task needs to create new files
+   from content of a checkout, this content should be written in a
+   separate directory/cache (like a workspace).
+
+   This cache name pattern is managed by ``run-task`` and must only be
+   used with ``run-task``.
+
+``level-{{level}}-checkouts-{{version}}`` (deprecated)
    This cache holds version control checkouts, each in a subdirectory named
    after the repo (e.g., ``gecko``).
@@ -31,9 +76,8 @@ Version Control Caches
    This cache is used internally by ``tc-vcs``. This tool is deprecated and
    should be replaced with ``hg robustcheckout``.
 
-
 Workspace Caches
-================
+----------------
 
 ``level-{{level}}-*-workspace``
    These caches (of various names typically ending with ``workspace``)
    hold state whose contents are dependent on the task.
 
 Other
-=====
+-----
 
-``level-{{level}}-tooltool-cache
-  Tooltool invocations should use this cache. Tooltool will store files here
-  indexed by their hash.
+``level-{{level}}-tooltool-cache-{{hash}}``
+   Tooltool invocations should use this cache. Tooltool will store files
+   here indexed by their hash.
+
+   This cache name pattern is reserved for use with ``run-task`` and
+   must only be used by ``run-task``.
 
 ``tooltool-cache`` (deprecated)
-  Legacy location for tooltool files. Use the per-level one instead.
+   Legacy location for tooltool files. Use the per-level one instead.
diff --git a/taskcluster/taskgraph/transforms/task.py b/taskcluster/taskgraph/transforms/task.py
index fc0f89926691..97f9b8109964 100644
--- a/taskcluster/taskgraph/transforms/task.py
+++ b/taskcluster/taskgraph/transforms/task.py
@@ -15,7 +15,9 @@ import os
 import time
 from copy import deepcopy
 
+from mozbuild.util import memoize
 from taskgraph.util.attributes import TRUNK_PROJECTS
+from taskgraph.util.hash import hash_path
 from taskgraph.util.treeherder import split_symbol
 from taskgraph.transforms.base import TransformSequence
 from taskgraph.util.schema import validate_schema, Schema
@@ -26,6 +28,15 @@ from taskgraph import GECKO
 
 from .gecko_v2_whitelist import JOB_NAME_WHITELIST, JOB_NAME_WHITELIST_ERROR
 
+RUN_TASK = os.path.join(GECKO, 'taskcluster', 'docker', 'recipes', 'run-task')
+
+
+@memoize
+def _run_task_suffix():
+    """String to append to cache names under control of run-task."""
+    return hash_path(RUN_TASK)[0:20]
+
+
 # shortcut for a string where task references are allowed
 taskref_or_string = Any(
     basestring,
@@ -656,9 +667,32 @@ def build_docker_worker_payload(config, task, task_def):
 
     if 'caches' in worker:
         caches = {}
+
+        # run-task knows how to validate caches.
+        #
+        # To help ensure new run-task features and bug fixes don't
+        # interfere with existing caches, we seed the hash of run-task
+        # into cache names. So, any time run-task changes, we should get
+        # a fresh set of caches. This means run-task can make changes to
+        # cache interaction at any time without regard to backwards or
+        # future compatibility.
+
+        run_task = payload.get('command', [''])[0].endswith('run-task')
+
+        if run_task:
+            suffix = '-%s' % _run_task_suffix()
+        else:
+            suffix = ''
+
         for cache in worker['caches']:
-            caches[cache['name']] = cache['mount-point']
-            task_def['scopes'].append('docker-worker:cache:' + cache['name'])
+            name = '%s%s' % (cache['name'], suffix)
+            caches[name] = cache['mount-point']
+            task_def['scopes'].append('docker-worker:cache:%s' % name)
+
+        # Assertion: only run-task is interested in this.
+        if run_task:
+            payload['env']['TASKCLUSTER_CACHES'] = ';'.join(sorted(
+                caches.values()))
+
         payload['cache'] = caches
 
     if features:
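To see how the two halves fit together, here is a rough, self-contained
sketch of the payload rewriting above; the cache name, mount point, and
20-character suffix are hypothetical:

    # End-to-end illustration of the transform (values invented).
    worker_caches = [
        {'name': 'level-3-checkouts',
         'mount-point': '/builds/worker/checkouts'},
    ]
    payload = {'command': ['/builds/worker/bin/run-task', '--', 'true'],
               'env': {}}
    scopes = []

    suffix = '-4d17958dc9c21fbbc24d'  # stand-in for '-' + _run_task_suffix()

    # Rewrite each cache name to carry the run-task hash suffix, and
    # grant the matching docker-worker scope.
    caches = {}
    for cache in worker_caches:
        name = '%s%s' % (cache['name'], suffix)
        caches[name] = cache['mount-point']
        scopes.append('docker-worker:cache:%s' % name)
    payload['cache'] = caches
    payload['env']['TASKCLUSTER_CACHES'] = ';'.join(sorted(caches.values()))

    # payload['cache'] ==
    #   {'level-3-checkouts-4d17958dc9c21fbbc24d': '/builds/worker/checkouts'}
    # run-task then validates /builds/worker/checkouts via .cacherequires.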