Bug 1333255: implement optimizations as named functions; r=jonasfj

MozReview-Commit-ID: 9xkHny7IYfA --HG-- extra : rebase_source : 201cf73414aee4bab407c67cb95e08fd2b794ba5
2017-03-10 18:17:43 +00:00 · 2017-03-10 18:17:43 +00:00 · bf33dfcabd
--- a/taskcluster/docs/index.rst
+++ b/taskcluster/docs/index.rst
@ -24,6 +24,7 @@ check out the :doc:`how-to section <how-tos>`.
    taskgraph
    loading
    transforms
+    optimization
    yaml-templates
    docker-images
    cron
--- a/taskcluster/docs/optimization.rst
+++ b/taskcluster/docs/optimization.rst
@ -0,0 +1,44 @@
+Optimization
+============
+
+The objective of optimization is to remove as many tasks from the graph as
+possible, as efficiently as possible, thereby delivering useful results as
+quickly as possible.  For example, ideally if only a test script is modified in
+a push, then the resulting graph contains only the corresponding test suite
+task.
+
+A task is said to be "optimized" when it is either replaced with an equivalent,
+already-existing task, or dropped from the graph entirely.
+
+Optimization Functions
+----------------------
+
+During the optimization phase of task-graph generation, each task is optimized
+in post-order, meaning that each task's dependencies will be optimized before
+the task itself is optimized.
+
+Each task has a ``task.optimizations`` property describing the optimization
+methods that apply.  Each is specified as a list containing the method name
+followed by its arguments.  For example::
+
+    task.optimizations = [
+        ['seta'],
+        ['files-changed', ['js/**', 'tests/**']],
+    ]
+
+These methods are defined in ``taskcluster/taskgraph/optimize.py``.  They are
+applied in order, and the first to return a success value causes the task to
+be optimized.
+
+Each method can return either a taskId (indicating that the given task can be
+replaced) or indicate that the task can be optimized away. If a task on which
+others depend is optimized away, task-graph generation will fail.
+
+Optimizing Target Tasks
+-----------------------
+
+In some cases, such as try pushes, tasks in the target task set have been
+explicitly requested and are thus excluded from optimization. In other cases,
+the target task set is almost the entire task graph, so targeted tasks are
+considered for optimization.  This behavior is controlled with the
+``optimize_target_tasks`` parameter.
--- a/taskcluster/docs/taskgraph.rst
+++ b/taskcluster/docs/taskgraph.rst
@ -96,9 +96,9 @@ Graph generation, as run via ``mach taskgraph decision``, proceeds as follows:
 #. Based on the full task graph, calculate the transitive closure of the target
   task set.  That is, the target tasks and all requirements of those tasks.
   The result is the "target task graph".
-#. Optimize the target task graph based on kind-specific optimization methods.
+#. Optimize the target task graph using task-specific optimization methods.
   The result is the "optimized task graph" with fewer nodes than the target
-   task graph.
+   task graph.  See :doc:`optimization`.
 #. Create tasks for all tasks in the optimized task graph.

 Transitive Closure
@ -123,32 +123,6 @@ complete.
 And as you can see, the graph we've built now includes everything we wanted
 (the test jobs) plus everything required to do that (docker images, builds).

-Optimization
------------
-
-The objective of optimization to remove as many tasks from the graph as
-possible, as efficiently as possible, thereby delivering useful results as
-quickly as possible.  For example, ideally if only a test script is modified in
-a push, then the resulting graph contains only the corresponding test suite
-task.
-
-A task is said to be "optimized" when it is either replaced with an equivalent,
-already-existing task, or dropped from the graph entirely.
-
-A task can be optimized if all of its dependencies can be optimized and none of
-its inputs have changed.  For a task on which no other tasks depend (a "leaf
-task"), the optimizer can determine what has changed by looking at the
-version-control history of the push: if the relevant files are not modified in
-the push, then it considers the inputs unchanged.  For tasks on which other
-tasks depend ("non-leaf tasks"), the optimizer must replace the task with
-another, equivalent task, so it generates a hash of all of the inputs and uses
-that to search for a matching, existing task.
-
-In some cases, such as try pushes, tasks in the target task set have been
-explicitly requested and are thus excluded from optimization. In other cases,
-the target task set is almost the entire task graph, so targetted tasks are
-considered for optimization.  This behavior is controlled with the
-``optimize_target_tasks`` parameter.

 Action Tasks
 ------------
--- a/taskcluster/taskgraph/optimize.py
+++ b/taskcluster/taskgraph/optimize.py
@ -3,16 +3,24 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 from __future__ import absolute_import, print_function, unicode_literals
+
 import logging
 import re
+import os
+import requests

 from .graph import Graph
+from . import files_changed
 from .taskgraph import TaskGraph
+from .util.seta import is_low_value_task
+from .util.taskcluster import find_task_id
 from slugid import nice as slugid

 logger = logging.getLogger(__name__)
 TASK_REFERENCE_PATTERN = re.compile('<([^>]+)>')

+_optimizations = {}
+

 def optimize_task_graph(target_task_graph, params, do_not_optimize, existing_tasks=None):
    """
@ -60,6 +68,21 @@ def resolve_task_references(label, task_def, taskid_for_edge_name):
    return recurse(task_def)


+def optimize_task(task, params):
+    """
+    Optimize a single task by running its optimizations in order until one
+    succeeds.
+    """
+    for opt in task.optimizations:
+        opt_type, args = opt[0], opt[1:]
+        opt_fn = _optimizations[opt_type]
+        optimized, task_id = opt_fn(task, params, *args)
+        if optimized or task_id:
+            return optimized, task_id
+
+    return False, None
+
+
 def annotate_task_graph(target_task_graph, params, do_not_optimize,
                        named_links_dict, label_to_taskid, existing_tasks):
    """
@ -95,7 +118,7 @@ def annotate_task_graph(target_task_graph, params, do_not_optimize,
            replacement_task_id = existing_tasks[label]
        # otherwise, examine the task itself (which may be an expensive operation)
        else:
-            optimized, replacement_task_id = task.optimize(params)
+            optimized, replacement_task_id = optimize_task(task, params)

        task.optimized = optimized
        task.task_id = replacement_task_id
@ -154,3 +177,64 @@ def get_subgraph(annotated_task_graph, named_links_dict, label_to_taskid):
    return TaskGraph(
        tasks_by_taskid,
        Graph(set(tasks_by_taskid), edges_by_taskid))
+
+
+def optimization(name):
+    def wrap(func):
+        if name in _optimizations:
+            raise Exception("multiple optimizations with name {}".format(name))
+        _optimizations[name] = func
+        return func
+    return wrap
+
+
+@optimization('index-search')
+def opt_index_search(task, params, index_path):
+    try:
+        task_id = find_task_id(
+            index_path,
+            use_proxy=bool(os.environ.get('TASK_ID')))
+
+        return True, task_id
+    except requests.exceptions.HTTPError:
+        pass
+
+    return False, None
+
+
+@optimization('seta')
+def opt_seta(task, params):
+    bbb_task = False
+
+    # no need to call SETA for build jobs
+    if task.task.get('extra', {}).get('treeherder', {}).get('jobKind', '') == 'build':
+        return False, None
+
+    # for bbb tasks we need to send in the buildbot buildername
+    if task.task.get('provisionerId', '') == 'buildbot-bridge':
+        label = task.task.get('payload').get('buildername')
+        bbb_task = True
+    else:
+        label = task.label
+
+    # we would like to return 'False, None' while it's high_value_task
+    # and we wouldn't optimize it. Otherwise, it will return 'True, None'
+    if is_low_value_task(label,
+                         params.get('project'),
+                         params.get('pushlog_id'),
+                         params.get('pushdate'),
+                         bbb_task):
+        # Always optimize away low-value tasks
+        return True, None
+    else:
+        return False, None
+
+
+@optimization('files-changed')
+def opt_files_changed(task, params, file_patterns):
+    changed = files_changed.check(params, file_patterns)
+    if not changed:
+        logger.debug('no files found matching a pattern in `when.files-changed` for ' +
+                     task.label)
+        return True, None
+    return False, None
--- a/taskcluster/taskgraph/task/base.py
+++ b/taskcluster/taskgraph/task/base.py
@ -5,9 +5,6 @@
 from __future__ import absolute_import, print_function, unicode_literals

 import abc
-import os
-import requests
-from taskgraph.util.taskcluster import find_task_id


 class Task(object):
@ -18,7 +15,7 @@ class Task(object):
    - label; the label for this task
    - attributes: a dictionary of attributes for this task (used for filtering)
    - task: the task definition (JSON-able dictionary)
-    - index_paths: index paths where equivalent tasks might be found for optimization
+    - optimizations: optimizations to apply to the task (see taskgraph.optimize)
    - dependencies: tasks this one depends on, in the form {name: label}, for example
      {'build': 'build-linux64/opt', 'docker-image': 'build-docker-image-desktop-test'}

@ -35,7 +32,7 @@ class Task(object):
    __metaclass__ = abc.ABCMeta

    def __init__(self, kind, label, attributes, task,
-                 index_paths=None, dependencies=None):
+                 optimizations=None, dependencies=None):
        self.kind = kind
        self.label = label
        self.attributes = attributes
@ -46,7 +43,7 @@ class Task(object):

        self.attributes['kind'] = kind

-        self.index_paths = index_paths or ()
+        self.optimizations = optimizations or []
        self.dependencies = dependencies or {}

    def __eq__(self, other):
@ -55,35 +52,9 @@ class Task(object):
            self.attributes == other.attributes and \
            self.task == other.task and \
            self.task_id == other.task_id and \
-            self.index_paths == other.index_paths and \
+            self.optimizations == other.optimizations and \
            self.dependencies == other.dependencies

-    def optimize(self, params):
-        """
-        Determine whether this task can be optimized, and if it can, what taskId
-        it should be replaced with.
-
-        The return value is a tuple `(optimized, taskId)`.  If `optimized` is
-        true, then the task will be optimized (in other words, not included in
-        the task graph).  If the second argument is a taskid, then any
-        dependencies on this task will isntead depend on that taskId.  It is an
-        error to return no taskId for a task on which other tasks depend.
-
-        The default optimizes when a taskId can be found for one of the index
-        paths attached to the task.
-        """
-        for index_path in self.index_paths:
-            try:
-                task_id = find_task_id(
-                    index_path,
-                    use_proxy=bool(os.environ.get('TASK_ID')))
-
-                return True, task_id
-            except requests.exceptions.HTTPError:
-                pass
-
-        return False, None
-
    @classmethod
    def from_json(cls, task_dict):
        """
--- a/taskcluster/taskgraph/task/docker_image.py
+++ b/taskcluster/taskgraph/task/docker_image.py
@ -5,13 +5,10 @@
 from __future__ import absolute_import, print_function, unicode_literals

 import logging
-import os
-import urllib2

 from . import transform
 from taskgraph.util.docker import INDEX_PREFIX
 from taskgraph.transforms.base import TransformSequence, TransformConfig
-from taskgraph.util.taskcluster import get_artifact_url
 from taskgraph.util.python_path import find_object

 logger = logging.getLogger(__name__)
@ -41,25 +38,6 @@ def load_tasks(kind, path, config, params, loaded_tasks):

 class DockerImageTask(transform.TransformTask):

-    def optimize(self, params):
-        optimized, taskId = super(DockerImageTask, self).optimize(params)
-        if optimized and taskId:
-            try:
-                # Only return the task ID if the artifact exists for the indexed
-                # task.
-                request = urllib2.Request(get_artifact_url(
-                    taskId, 'public/image.tar.zst',
-                    use_proxy=bool(os.environ.get('TASK_ID'))))
-                request.get_method = lambda: 'HEAD'
-                urllib2.urlopen(request)
-
-                # HEAD success on the artifact is enough
-                return True, taskId
-            except urllib2.HTTPError:
-                pass
-
-        return False, None
-
    @classmethod
    def from_json(cls, task_dict):
        # Generating index_paths for optimization
--- a/taskcluster/taskgraph/task/transform.py
+++ b/taskcluster/taskgraph/task/transform.py
@ -8,11 +8,9 @@ import logging
 import itertools

 from . import base
-from .. import files_changed
 from ..util.python_path import find_object
 from ..util.templates import merge
 from ..util.yaml import load_yaml
-from ..util.seta import is_low_value_task

 from ..transforms.base import TransformSequence, TransformConfig

@ -84,51 +82,11 @@ class TransformTask(base.Task):
    """

    def __init__(self, kind, task):
-        self.when = task.get('when', {})
        super(TransformTask, self).__init__(kind, task['label'],
                                            task['attributes'], task['task'],
-                                            index_paths=task.get('index-paths'),
+                                            optimizations=task.get('optimizations'),
                                            dependencies=task.get('dependencies'))

-    def optimize(self, params):
-        bbb_task = False
-
-        if self.index_paths:
-            optimized, taskId = super(TransformTask, self).optimize(params)
-            if optimized:
-                return optimized, taskId
-
-        elif 'files-changed' in self.when:
-            changed = files_changed.check(
-                params, self.when['files-changed'])
-            if not changed:
-                logger.debug('no files found matching a pattern in `when.files-changed` for ' +
-                             self.label)
-                return True, None
-
-        # no need to call SETA for build jobs
-        if self.task.get('extra', {}).get('treeherder', {}).get('jobKind', '') == 'build':
-            return False, None
-
-        # for bbb tasks we need to send in the buildbot buildername
-        if self.task.get('provisionerId', '') == 'buildbot-bridge':
-            self.label = self.task.get('payload').get('buildername')
-            bbb_task = True
-
-        # we would like to return 'False, None' while it's high_value_task
-        # and we wouldn't optimize it. Otherwise, it will return 'True, None'
-        if is_low_value_task(self.label,
-                             params.get('project'),
-                             params.get('pushlog_id'),
-                             params.get('pushdate'),
-                             bbb_task):
-            # Always optimize away low-value tasks
-            return True, None
-        else:
-            return False, None
-
    @classmethod
    def from_json(cls, task_dict):
-        # when reading back from JSON, we lose the "when" information
-        task_dict['when'] = {}
        return cls(task_dict['attributes']['kind'], task_dict)
--- a/taskcluster/taskgraph/test/test_optimize.py
+++ b/taskcluster/taskgraph/test/test_optimize.py
@ -6,7 +6,7 @@ from __future__ import absolute_import, print_function, unicode_literals

 import unittest

-from ..optimize import optimize_task_graph, resolve_task_references
+from ..optimize import optimize_task_graph, resolve_task_references, optimization
 from ..optimize import annotate_task_graph, get_subgraph
 from ..taskgraph import TaskGraph
 from .. import graph
@ -53,26 +53,32 @@ class TestResolveTaskReferences(unittest.TestCase):
        )


-class OptimizingTask(TestTask):
-    # the `optimize` method on this class is overridden direclty in the tests
-    # below.
-    pass
-
-
 class TestOptimize(unittest.TestCase):

    kind = None

-    def make_task(self, label, task_def=None, optimized=None, task_id=None):
+    @classmethod
+    def setUpClass(cls):
+        # set up some simple optimization functions
+        optimization('no-optimize')(lambda self, params: (False, None))
+        optimization('optimize-away')(lambda self, params: (True, None))
+        optimization('optimize-to-task')(lambda self, params, task: (True, task))
+        optimization('false-with-taskid')(lambda self, params: (False, 'some-taskid'))
+
+    def make_task(self, label, optimization=None, task_def=None, optimized=None, task_id=None):
        task_def = task_def or {'sample': 'task-def'}
-        task = OptimizingTask(label=label, task=task_def)
+        task = TestTask(label=label, task=task_def)
        task.optimized = optimized
+        if optimization:
+            task.optimizations = [optimization]
+        else:
+            task.optimizations = []
        task.task_id = task_id
        return task

    def make_graph(self, *tasks_and_edges):
-        tasks = {t.label: t for t in tasks_and_edges if isinstance(t, OptimizingTask)}
-        edges = {e for e in tasks_and_edges if not isinstance(e, OptimizingTask)}
+        tasks = {t.label: t for t in tasks_and_edges if isinstance(t, TestTask)}
+        edges = {e for e in tasks_and_edges if not isinstance(e, TestTask)}
        return TaskGraph(tasks, graph.Graph(set(tasks), edges))

    def assert_annotations(self, graph, **annotations):
@ -85,11 +91,10 @@ class TestOptimize(unittest.TestCase):

    def test_annotate_task_graph_no_optimize(self):
        "annotating marks everything as un-optimized if the kind returns that"
-        OptimizingTask.optimize = lambda self, params: (False, None)
        graph = self.make_graph(
-            self.make_task('task1'),
-            self.make_task('task2'),
-            self.make_task('task3'),
+            self.make_task('task1', ['no-optimize']),
+            self.make_task('task2', ['no-optimize']),
+            self.make_task('task3', ['no-optimize']),
            ('task2', 'task1', 'build'),
            ('task2', 'task3', 'image'),
        )
@ -103,8 +108,7 @@ class TestOptimize(unittest.TestCase):

    def test_annotate_task_graph_taskid_without_optimize(self):
        "raises exception if kind returns a taskid without optimizing"
-        OptimizingTask.optimize = lambda self, params: (False, 'some-taskid')
-        graph = self.make_graph(self.make_task('task1'))
+        graph = self.make_graph(self.make_task('task1', ['false-with-taskid']))
        self.assertRaises(
            Exception,
            lambda: annotate_task_graph(graph, {}, set(), graph.graph.named_links_dict(), {}, None)
@ -112,11 +116,9 @@ class TestOptimize(unittest.TestCase):

    def test_annotate_task_graph_optimize_away_dependency(self):
        "raises exception if kind optimizes away a task on which another depends"
-        OptimizingTask.optimize = \
-            lambda self, params: (True, None) if self.label == 'task1' else (False, None)
        graph = self.make_graph(
-            self.make_task('task1'),
-            self.make_task('task2'),
+            self.make_task('task1', ['optimize-away']),
+            self.make_task('task2', ['no-optimize']),
            ('task2', 'task1', 'build'),
        )
        self.assertRaises(
@ -126,10 +128,9 @@ class TestOptimize(unittest.TestCase):

    def test_annotate_task_graph_do_not_optimize(self):
        "annotating marks everything as un-optimized if in do_not_optimize"
-        OptimizingTask.optimize = lambda self, params: (True, 'taskid')
        graph = self.make_graph(
-            self.make_task('task1'),
-            self.make_task('task2'),
+            self.make_task('task1', ['optimize-away']),
+            self.make_task('task2', ['optimize-away']),
            ('task2', 'task1', 'build'),
        )
        label_to_taskid = {}
@ -144,12 +145,10 @@ class TestOptimize(unittest.TestCase):

    def test_annotate_task_graph_nos_do_not_propagate(self):
        "a task with a non-optimized dependency can be optimized"
-        OptimizingTask.optimize = \
-            lambda self, params: (False, None) if self.label == 'task1' else (True, 'taskid')
        graph = self.make_graph(
-            self.make_task('task1'),
-            self.make_task('task2'),
-            self.make_task('task3'),
+            self.make_task('task1', ['no-optimize']),
+            self.make_task('task2', ['optimize-to-task', 'taskid']),
+            self.make_task('task3', ['optimize-to-task', 'taskid']),
            ('task2', 'task1', 'build'),
            ('task2', 'task3', 'image'),
        )
@ -241,12 +240,10 @@ class TestOptimize(unittest.TestCase):

    def test_optimize(self):
        "optimize_task_graph annotates and extracts the subgraph from a simple graph"
-        OptimizingTask.optimize = \
-            lambda self, params: (True, 'dep1') if self.label == 'task1' else (False, None)
        input = self.make_graph(
-            self.make_task('task1'),
-            self.make_task('task2'),
-            self.make_task('task3'),
+            self.make_task('task1', ['optimize-to-task', 'dep1']),
+            self.make_task('task2', ['no-optimize']),
+            self.make_task('task3', ['no-optimize']),
            ('task2', 'task1', 'build'),
            ('task2', 'task3', 'image'),
        )
--- a/taskcluster/taskgraph/transforms/job/init.py
+++ b/taskcluster/taskgraph/transforms/job/init.py
@ -55,6 +55,7 @@ job_description_schema = Schema({
    Optional('index'): task_description_schema['index'],
    Optional('run-on-projects'): task_description_schema['run-on-projects'],
    Optional('coalesce-name'): task_description_schema['coalesce-name'],
+    Optional('optimizations'): task_description_schema['optimizations'],
    Optional('needs-sccache'): task_description_schema['needs-sccache'],
    Optional('when'): task_description_schema['when'],

--- a/taskcluster/taskgraph/transforms/task.py
+++ b/taskcluster/taskgraph/transforms/task.py
@ -132,6 +132,18 @@ task_description_schema = Schema({
    # tasks are never coalesced
    Optional('coalesce-name'): basestring,

+    # Optimizations to perform on this task during the optimization phase,
+    # specified in order.  These optimizations are defined in
+    # taskcluster/taskgraph/optimize.py.
+    Optional('optimizations'): [Any(
+        # search the index for the given index namespace, and replace this task if found
+        ['index-search', basestring],
+        # consult SETA and skip this task if it is low-value
+        ['seta'],
+        # skip this task if none of the given file patterns match
+        ['files-changed', [basestring]],
+    )],
+
    # the provisioner-id/worker-type for the task.  The following parameters will
    # be substituted in this string:
    #  {level} -- the scm level of this push
@ -339,8 +351,8 @@ task_description_schema = Schema({
    }),

    # The "when" section contains descriptions of the circumstances
-    # under which this task can be "optimized", that is, left out of the
-    # task graph because it is unnecessary.
+    # under which this task should be included in the task graph.  This
+    # will be converted into an element in the `optimizations` list.
    Optional('when'): Any({
        # This task only needs to be run if a file matching one of the given
        # patterns has changed in the push.  The patterns use the mozpack
@ -783,6 +795,17 @@ def add_files_changed(config, tasks):
        yield task


+@transforms.add
+def setup_optimizations(config, tasks):
+    for task in tasks:
+        optimizations = task.setdefault('optimizations', [])
+        optimizations.extend([['index-search', idx] for idx in task.get('index-paths', [])])
+        optimizations.append(['seta'])
+        if 'when' in task and 'files-changed' in task['when']:
+            optimizations.append(['files-changed', task['when']['files-changed']])
+        yield task
+
+
@transforms.add
 def build_task(config, tasks):
    for task in tasks:
@ -876,8 +899,7 @@ def build_task(config, tasks):
            'task': task_def,
            'dependencies': task.get('dependencies', {}),
            'attributes': attributes,
-            'index-paths': task.get('index-paths'),
-            'when': task.get('when', {}),
+            'optimizations': task['optimizations'],
        }