Bug 1583353 - [taskgraph] Implement test chunking in transforms r=gbrown

We've long handled chunks by defining the total number of chunks in our CI configuration, and then passing that value down into the test harnesses at task runtime (via the '--this-chunk' and '--total-chunks' parameters). The test harness then runs an algorithm to determine which tests should be run in "this" chunk. There are several problems with this approach, but by far the biggest is that we can't use test information in our scheduling algorithms. The information simply isn't available yet. This patch switches things around such that we determine which tests go in which tasks during taskgraph generation. This means we have perfect information about which tasks are running which tests, and if, e.g., a ccov or machine learning algorithm deems a particular test important, we can make sure to *only* schedule the tasks that contain that test. I'm planning to enable this a couple of suites at a time so we don't accidentally stop running tests. This patch only enables this mode for 'mochitest-media', 'mochitest-browser-chrome' and 'mochitest-devtools-chrome'. I chose these suites because they are the ones that are already using the 'chunk_by_runtime' algorithm. Differential Revision: https://phabricator.services.mozilla.com/D52729 --HG-- extra : moz-landing-system : lando
2019-12-11 21:10:48 +00:00 · 2019-12-11 21:10:48 +00:00 · 26e9db86f8
--- a/build/sparse-profiles/taskgraph
+++ b/build/sparse-profiles/taskgraph
@ -46,6 +46,12 @@ glob:**/*.configure
 # are all somewhere in "tooltool-manifests" directories.
 glob:**/tooltool-manifests/**

+# For test chunking
+glob:**/*.ini
+glob:**/*.list
+path:testing/mozbase/manifestparser
+path:testing/runtimes
+
 # For scheduling android-gradle-dependencies.
 path:mobile/android/config/
 glob:**/*.gradle
--- a/taskcluster/docs/attributes.rst
+++ b/taskcluster/docs/attributes.rst
@ -133,8 +133,12 @@ some kinds, ``-j`` also matches against ``build_platform``.
 test_chunk
 ==========

-This is the chunk number of a chunked test suite (talos or unittest).  Note
-that this is a string!
+This is the chunk number of a chunked test suite. Note that this is a string!
+
+test_manifests
+==============
+
+A list of the test manifests that run in this task.

 e10s
 ====
--- a/taskcluster/taskgraph/transforms/job/mozharness_test.py
+++ b/taskcluster/taskgraph/transforms/job/mozharness_test.py
@ -4,7 +4,11 @@

 from __future__ import absolute_import, print_function, unicode_literals

+import json
+import os
+
 from voluptuous import Required
+
 from taskgraph.util.taskcluster import get_artifact_url
 from taskgraph.transforms.job import (
    configure_taskdesc_for_run,
@ -19,8 +23,6 @@ from taskgraph.transforms.tests import (
 from taskgraph.transforms.job.common import (
    support_vcs_checkout,
 )
-import json
-import os

 VARIANTS = [
    'nightly',
@ -182,8 +184,11 @@ def mozharness_test_on_docker(config, job, taskdesc):
    ]
    command.extend(mozharness.get('extra-options', []))

+    if test.get('test-manifests'):
+        env['MOZHARNESS_TEST_PATHS'] = json.dumps({test['suite']: test['test-manifests']})
+
    # TODO: remove the need for run['chunked']
-    if mozharness.get('chunked') or test['chunks'] > 1:
+    elif mozharness.get('chunked') or test['chunks'] > 1:
        command.append('--total-chunk={}'.format(test['chunks']))
        command.append('--this-chunk={}'.format(test['this-chunk']))

@ -362,8 +367,11 @@ def mozharness_test_on_generic_worker(config, job, taskdesc):
    if mozharness.get('include-blob-upload-branch'):
        mh_command.append('--blob-upload-branch=' + config.params['project'])

+    if test.get('test-manifests'):
+        env['MOZHARNESS_TEST_PATHS'] = json.dumps({test['suite']: test['test-manifests']})
+
    # TODO: remove the need for run['chunked']
-    if mozharness.get('chunked') or test['chunks'] > 1:
+    elif mozharness.get('chunked') or test['chunks'] > 1:
        mh_command.append('--total-chunk={}'.format(test['chunks']))
        mh_command.append('--this-chunk={}'.format(test['this-chunk']))

@ -480,8 +488,11 @@ def mozharness_test_on_script_engine_autophone(config, job, taskdesc):
        command.append('--blob-upload-branch=' + config.params['project'])
    command.extend(mozharness.get('extra-options', []))

+    if test.get('test-manifests'):
+        env['MOZHARNESS_TEST_PATHS'] = json.dumps({test['suite']: test['test-manifests']})
+
    # TODO: remove the need for run['chunked']
-    if mozharness.get('chunked') or test['chunks'] > 1:
+    elif mozharness.get('chunked') or test['chunks'] > 1:
        command.append('--total-chunk={}'.format(test['chunks']))
        command.append('--this-chunk={}'.format(test['this-chunk']))

--- a/taskcluster/taskgraph/transforms/tests.py
+++ b/taskcluster/taskgraph/transforms/tests.py
@ -20,9 +20,22 @@ for example - use `all_tests.py` instead.
 from __future__ import absolute_import, print_function, unicode_literals

 import copy
+import json
 import logging
 import os

+from manifestparser.filters import chunk_by_runtime
+from mozbuild.schedules import INCLUSIVE_COMPONENTS
+from mozbuild.util import memoize
+from moztest.resolve import TestResolver, TestManifestLoader, TEST_SUITES
+from voluptuous import (
+    Any,
+    Optional,
+    Required,
+    Exclusive,
+)
+
+from taskgraph import GECKO
 from taskgraph.transforms.base import TransformSequence
 from taskgraph.util.attributes import match_run_on_projects, keymatch
 from taskgraph.util.keyed_by import evaluate_keyed_by
@ -38,17 +51,8 @@ from taskgraph.util.taskcluster import (
    get_artifact_path,
    get_index_url,
 )
-from mozbuild.schedules import INCLUSIVE_COMPONENTS
-
 from taskgraph.util.perfile import perfile_number_of_chunks

-from voluptuous import (
-    Any,
-    Optional,
-    Required,
-    Exclusive,
-)
-
 here = os.path.abspath(os.path.dirname(__file__))

 # default worker types keyed by instance-size
@ -420,7 +424,10 @@ test_description_schema = Schema({
            bool),
    },

-    # The current chunk; this is filled in by `all_kinds.py`
+    # The set of test manifests to run.
+    Optional('test-manifests'): [basestring],
+
+    # The current chunk (if chunking is enabled).
    Optional('this-chunk'): int,

    # os user groups for test task workers; required scopes, will be
@ -1261,11 +1268,86 @@ def split_e10s(config, tests):
            yield test


+CHUNK_SUITES_BLACKLIST = (
+    'awsy',
+    'cppunittest',
+    'crashtest',
+    'firefox-ui-functional-local',
+    'firefox-ui-functional-remote',
+    'geckoview-junit',
+    'gtest',
+    'jittest',
+    'jsreftest',
+    'marionette',
+    'mochitest-a11y',
+    'mochitest-browser-chrome',
+    'mochitest-browser-chrome-screenshots',
+    'mochitest-chrome',
+    'mochitest-devtools-chrome',
+    'mochitest-devtools-webreplay',
+    'mochitest-media',
+    'mochitest-plain',
+    'mochitest-plain-gpu',
+    'mochitest-remote',
+    'mochitest-valgrind-plain',
+    'mochitest-webgl1-core',
+    'mochitest-webgl1-ext',
+    'mochitest-webgl2-core',
+    'mochitest-webgl2-ext',
+    'raptor',
+    'reftest',
+    'reftest-gpu',
+    'reftest-no-accel',
+    'talos',
+    'telemetry-tests-client',
+    'test-coverage',
+    'test-coverage-wpt',
+    'test-verify',
+    'test-verify-gpu',
+    'test-verify-wpt',
+    'web-platform-tests',
+    'web-platform-tests-reftests',
+    'web-platform-tests-wdspec',
+    'xpcshell',
+)
+"""These suites will be chunked at test runtime rather than here in the taskgraph."""
+
+
@transforms.add
 def split_chunks(config, tests):
    """Based on the 'chunks' key, split tests up into chunks by duplicating
    them and assigning 'this-chunk' appropriately and updating the treeherder
    symbol."""
+    resolver = TestResolver.from_environment(cwd=here, loader_cls=TestManifestLoader)
+
+    @memoize
+    def get_runtimes(platform):
+        base = os.path.join(GECKO, 'testing', 'runtimes', 'manifest-runtimes-{}.json')
+        for key in ('android', 'windows'):
+            if key in platform:
+                path = base.format(key)
+                break
+        else:
+            path = base.format('unix')
+
+        with open(path, 'r') as fh:
+            return json.load(fh)
+
+    @memoize
+    def get_tests(flavor, subsuite):
+        return list(resolver.resolve_tests(flavor=flavor, subsuite=subsuite))
+
+    @memoize
+    def get_chunked_manifests(flavor, subsuite, platform, chunks):
+        tests = get_tests(flavor, subsuite)
+        return [
+            c[1] for c in chunk_by_runtime(
+                None,
+                chunks,
+                get_runtimes(platform)
+            ).get_chunked_manifests(tests)
+        ]
+
    for test in tests:
        if test['suite'].startswith('test-verify') or \
           test['suite'].startswith('test-coverage'):
@ -1285,19 +1367,35 @@ def split_chunks(config, tests):
            if test['chunks'] > maximum_number_verify_chunks:
                test['chunks'] = maximum_number_verify_chunks

-        if test['chunks'] <= 1:
-            test['this-chunk'] = 1
-            yield test
-            continue
+        chunked_manifests = None
+        if test['suite'] not in CHUNK_SUITES_BLACKLIST:
+            suite_definition = TEST_SUITES.get(test['suite'], {})
+            chunked_manifests = get_chunked_manifests(
+                suite_definition['build_flavor'],
+                suite_definition.get('kwargs', {}).get('subsuite', 'undefined'),
+                test['test-platform'],
+                test['chunks'],
+            )
+
+        for i in range(test['chunks']):
+            this_chunk = i + 1

-        for this_chunk in range(1, test['chunks'] + 1):
            # copy the test and update with the chunk number
            chunked = copy.deepcopy(test)
            chunked['this-chunk'] = this_chunk

-            # add the chunk number to the TH symbol
-            chunked['treeherder-symbol'] = add_suffix(
-                chunked['treeherder-symbol'], this_chunk)
+            if chunked_manifests is not None:
+                manifests = sorted(chunked_manifests[i])
+                if not manifests:
+                    raise Exception(
+                        'Chunking algorithm yielded no manifests for chunk {} of {} on {}'.format(
+                            this_chunk, test['test-name'], test['test-platform']))
+                chunked['test-manifests'] = manifests
+
+            if test['chunks'] > 1:
+                # add the chunk number to the TH symbol
+                chunked['treeherder-symbol'] = add_suffix(
+                    chunked['treeherder-symbol'], this_chunk)

            yield chunked

@ -1497,6 +1595,7 @@ def make_job_description(config, tests):
            'build_type': attr_build_type,
            'test_platform': test['test-platform'],
            'test_chunk': str(test['this-chunk']),
+            'test_manifests': test.get('test-manifests'),
            attr_try_name: try_name,
        })

--- a/testing/mozbase/manifestparser/manifestparser/filters.py
+++ b/testing/mozbase/manifestparser/manifestparser/filters.py
@ -336,9 +336,7 @@ class chunk_by_runtime(InstanceFilter):
            manifest = normsep(test['manifest_relpath'])
        return manifest

-    def __call__(self, tests, values):
-        tests = list(tests)
-
+    def get_chunked_manifests(self, tests):
        # Find runtimes for all relevant manifests.
        manifests = set(self.get_manifest(t) for t in tests)
        runtimes = [(self.runtimes[m], m) for m in manifests if m in self.runtimes]
@ -365,6 +363,11 @@ class chunk_by_runtime(InstanceFilter):
        # Sort one last time so we typically get chunks ordered from fastest to
        # slowest.
        chunks.sort(key=lambda x: (x[0], len(x[1])))
+        return chunks
+
+    def __call__(self, tests, values):
+        tests = list(tests)
+        chunks = self.get_chunked_manifests(tests)
        runtime, this_manifests = chunks[self.this_chunk - 1]
        log("Cumulative test runtime is around {} minutes (average is {} minutes)".format(
            round(runtime / 60), round(sum([c[0] for c in chunks]) / (60 * len(chunks)))))