Support generating a test scheduling history DB for group granularity instead of just label granularity

Also introduce a new task that generates a group-level test scheduling history DB.

Part of #1125
This commit is contained in:
Marco Castelluccio 2020-02-19 13:15:03 +01:00
Parent 7c81a5ece9
Commit ba6c358ba7
5 changed files with 189 additions and 92 deletions

View file

@@ -70,7 +70,7 @@ class TestFailureModel(CommitModel):
         assert len(commit_map) > 0

         done = set()
-        for test_data in test_scheduling.get_test_scheduling_history():
+        for test_data in test_scheduling.get_test_scheduling_history("label"):
             revs = test_data["revs"]

             if revs[0] in done:
@@ -93,7 +93,7 @@ class TestFailureModel(CommitModel):
     def get_labels(self):
         classes = {}

-        for test_data in test_scheduling.get_test_scheduling_history():
+        for test_data in test_scheduling.get_test_scheduling_history("label"):
             rev = test_data["revs"][0]

             if test_data["is_likely_regression"] or test_data["is_possible_regression"]:

View file

@@ -79,7 +79,7 @@ class TestSelectModel(Model):
         assert len(commit_map) > 0

-        for test_data in test_scheduling.get_test_scheduling_history():
+        for test_data in test_scheduling.get_test_scheduling_history("label"):
             revs = test_data["revs"]
             name = test_data["name"]
@@ -101,7 +101,7 @@ class TestSelectModel(Model):
     def get_labels(self):
         classes = {}

-        for test_data in test_scheduling.get_test_scheduling_history():
+        for test_data in test_scheduling.get_test_scheduling_history("label"):
             rev = test_data["revs"][0]
             name = test_data["name"]
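Both models now request the label granularity explicitly. For context, a minimal sketch of the consuming pattern (assuming the data files registered below have been downloaded locally; the loop body is illustrative, not the models' exact logic):

    from bugbug import test_scheduling

    # Each entry describes one (revision, runnable) pair together with its
    # historical failure counts and regression flags.
    for test_data in test_scheduling.get_test_scheduling_history("label"):
        rev = test_data["revs"][0]
        name = test_data["name"]
        if test_data["is_likely_regression"] or test_data["is_possible_regression"]:
            print(rev, name)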

View file

@@ -3,38 +3,62 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

+import os
 import pickle
 import shelve

 from bugbug import db
 from bugbug.utils import ExpQueue, LMDBDict

-TEST_SCHEDULING_DB = "data/test_scheduling_history.pickle"
-PAST_FAILURES_DB = "past_failures.lmdb.tar.zst"
+TEST_LABEL_SCHEDULING_DB = "data/test_label_scheduling_history.pickle"
+PAST_FAILURES_LABEL_DB = "past_failures_label.lmdb.tar.zst"
 db.register(
-    TEST_SCHEDULING_DB,
-    "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history.latest/artifacts/public/test_scheduling_history.pickle.zst",
+    TEST_LABEL_SCHEDULING_DB,
+    "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_label_scheduling_history.latest/artifacts/public/test_label_scheduling_history.pickle.zst",
     7,
-    [PAST_FAILURES_DB],
+    [PAST_FAILURES_LABEL_DB],
 )

+TEST_GROUP_SCHEDULING_DB = "data/test_group_scheduling_history.pickle"
+PAST_FAILURES_GROUP_DB = "past_failures_group.lmdb.tar.zst"
+db.register(
+    TEST_GROUP_SCHEDULING_DB,
+    "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_group_scheduling_history.latest/artifacts/public/test_group_scheduling_history.pickle.zst",
+    7,
+    [PAST_FAILURES_GROUP_DB],
+)
+
 HISTORICAL_TIMESPAN = 56

-def get_test_scheduling_history():
-    return db.read(TEST_SCHEDULING_DB)
+def get_test_scheduling_history(granularity):
+    if granularity == "label":
+        test_scheduling_db = TEST_LABEL_SCHEDULING_DB
+    elif granularity == "group":
+        test_scheduling_db = TEST_GROUP_SCHEDULING_DB
+    else:
+        raise Exception(f"{granularity} granularity unsupported")
+
+    return db.read(test_scheduling_db)

-def get_past_failures():
+def get_past_failures(granularity):
+    if granularity == "label":
+        past_failures_db = os.path.join("data", PAST_FAILURES_LABEL_DB)
+    elif granularity == "group":
+        past_failures_db = os.path.join("data", PAST_FAILURES_GROUP_DB)
+    else:
+        raise Exception(f"{granularity} granularity unsupported")
+
     return shelve.Shelf(
-        LMDBDict("data/past_failures.lmdb"),
+        LMDBDict(past_failures_db[: -len(".tar.zst")]),
         protocol=pickle.DEFAULT_PROTOCOL,
         writeback=True,
     )

 def _read_and_update_past_failures(
-    past_failures, type_, task, items, push_num, is_regression
+    past_failures, type_, runnable, items, push_num, is_regression
 ):
     values_total = []
     values_prev_7 = []
@@ -42,7 +66,7 @@ def _read_and_update_past_failures(
     values_prev_28 = []
     values_prev_56 = []

-    key = f"{type_}${task}$"
+    key = f"{type_}${runnable}$"

    for item in items:
        full_key = key + item
@@ -75,10 +99,12 @@
 def generate_data(
-    past_failures, commit, push_num, tasks, possible_regressions, likely_regressions
+    past_failures, commit, push_num, runnables, possible_regressions, likely_regressions
 ):
-    for task in tasks:
-        is_regression = task in possible_regressions or task in likely_regressions
+    for runnable in runnables:
+        is_regression = (
+            runnable in possible_regressions or runnable in likely_regressions
+        )

         (
             total_failures,
@@ -87,7 +113,7 @@ def generate_data(
             past_28_pushes_failures,
             past_56_pushes_failures,
         ) = _read_and_update_past_failures(
-            past_failures, "all", task, ["all"], push_num, is_regression
+            past_failures, "all", runnable, ["all"], push_num, is_regression
         )

         (
@@ -97,7 +123,7 @@ def generate_data(
             past_28_pushes_types_failures,
             past_56_pushes_types_failures,
         ) = _read_and_update_past_failures(
-            past_failures, "type", task, commit["types"], push_num, is_regression,
+            past_failures, "type", runnable, commit["types"], push_num, is_regression,
         )

         (
@@ -107,7 +133,7 @@ def generate_data(
             past_28_pushes_files_failures,
             past_56_pushes_files_failures,
         ) = _read_and_update_past_failures(
-            past_failures, "file", task, commit["files"], push_num, is_regression,
+            past_failures, "file", runnable, commit["files"], push_num, is_regression,
         )

         (
@@ -119,7 +145,7 @@ def generate_data(
         ) = _read_and_update_past_failures(
             past_failures,
             "directory",
-            task,
+            runnable,
             commit["directories"],
             push_num,
             is_regression,
@@ -134,14 +160,14 @@ def generate_data(
         ) = _read_and_update_past_failures(
             past_failures,
             "component",
-            task,
+            runnable,
             commit["components"],
             push_num,
             is_regression,
         )

         yield {
-            "name": task,
+            "name": runnable,
             "failures": total_failures,
             "failures_past_7_pushes": past_7_pushes_failures,
             "failures_past_14_pushes": past_14_pushes_failures,
@@ -167,6 +193,6 @@ def generate_data(
             "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
             "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
             "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
-            "is_possible_regression": task in possible_regressions,
-            "is_likely_regression": task in likely_regressions,
+            "is_possible_regression": runnable in possible_regressions,
+            "is_likely_regression": runnable in likely_regressions,
         }
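The past-failures store returned by get_past_failures is a shelve backed by an LMDBDict, with counters keyed by the f"{type_}${runnable}${item}" scheme built in _read_and_update_past_failures. A hedged sketch of reading one counter back (it assumes the past_failures_group.lmdb archive has already been downloaded and extracted under data/; the group name is hypothetical):

    from bugbug import test_scheduling

    # Open the group-granularity store; writeback=True means mutated
    # entries are flushed back on sync()/close().
    past_failures = test_scheduling.get_past_failures("group")

    # Keys follow the "{type_}${runnable}${item}" scheme from above.
    key = "all$dom/indexedDB$all"  # hypothetical group name
    if key in past_failures:
        print(past_failures[key])

    past_failures.close()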

View file

@@ -239,7 +239,7 @@ tasks:
       owner: release-mgmt-analysis@mozilla.com
       source: ${repository}/raw/master/data-pipeline.yml

-  - ID: test-scheduling-history-generator
+  - ID: test-label-scheduling-history-generator
     created: {$fromNow: ''}
     deadline: {$fromNow: '4 days'}
     expires: {$fromNow: '1 month'}
@@ -255,16 +255,17 @@ tasks:
       command:
         - bugbug-data-test-scheduling-history
         - generate
+        - --granularity=label

       artifacts:
-        public/test_scheduling_history.pickle.zst:
-          path: /data/test_scheduling_history.pickle.zst
+        public/test_label_scheduling_history.pickle.zst:
+          path: /data/test_label_scheduling_history.pickle.zst
           type: file
-        public/test_scheduling_history.pickle.version:
-          path: /data/test_scheduling_history.pickle.version
+        public/test_label_scheduling_history.pickle.version:
+          path: /data/test_label_scheduling_history.pickle.version
           type: file
-        public/past_failures.lmdb.tar.zst:
-          path: /data/past_failures.lmdb.tar.zst
+        public/past_failures_label.lmdb.tar.zst:
+          path: /data/past_failures_label.lmdb.tar.zst
           type: file

       features:
@@ -275,11 +276,56 @@ tasks:
     routes:
       - notify.email.release-mgmt-analysis@mozilla.com.on-failed
       - notify.irc-channel.#bugbug.on-failed
-      - index.project.relman.bugbug.data_test_scheduling_history.${version}
-      - index.project.relman.bugbug.data_test_scheduling_history.latest
+      - index.project.relman.bugbug.data_test_label_scheduling_history.${version}
+      - index.project.relman.bugbug.data_test_label_scheduling_history.latest
     metadata:
-      name: bugbug test scheduling history retrieval
-      description: bugbug test scheduling history retrieval
+      name: bugbug test label scheduling history retrieval
+      description: bugbug test label scheduling history retrieval
       owner: release-mgmt-analysis@mozilla.com
       source: ${repository}/raw/master/data-pipeline.yml

+  - ID: test-group-scheduling-history-generator
+    created: {$fromNow: ''}
+    deadline: {$fromNow: '4 days'}
+    expires: {$fromNow: '1 month'}
+    provisionerId: proj-relman
+    workerType: compute-small
+    dependencies:
+      - test-scheduling-history-push_data-retrieval
+    payload:
+      env:
+        TC_SECRET_ID: project/relman/bugbug/production
+      maxRunTime: 86400
+      image: mozilla/bugbug-base:${version}
+      command:
+        - bugbug-data-test-scheduling-history
+        - generate
+        - --granularity=group
+      artifacts:
+        public/test_group_scheduling_history.pickle.zst:
+          path: /data/test_group_scheduling_history.pickle.zst
+          type: file
+        public/test_group_scheduling_history.pickle.version:
+          path: /data/test_group_scheduling_history.pickle.version
+          type: file
+        public/past_failures_group.lmdb.tar.zst:
+          path: /data/past_failures_group.lmdb.tar.zst
+          type: file
+      features:
+        taskclusterProxy:
+          true
+      scopes:
+        - "secrets:get:project/relman/bugbug/production"
+      routes:
+        - notify.email.release-mgmt-analysis@mozilla.com.on-failed
+        - notify.irc-channel.#bugbug.on-failed
+        - index.project.relman.bugbug.data_test_group_scheduling_history.${version}
+        - index.project.relman.bugbug.data_test_group_scheduling_history.latest
+    metadata:
+      name: bugbug test group scheduling history retrieval
+      description: bugbug test group scheduling history retrieval
+      owner: release-mgmt-analysis@mozilla.com
+      source: ${repository}/raw/master/data-pipeline.yml
@@ -896,7 +942,7 @@ tasks:
     workerType: compute-super-large
     dependencies:
       - commit-retrieval
-      - test-scheduling-history-generator
+      - test-label-scheduling-history-generator
     payload:
       maxRunTime: 25200
       image: mozilla/bugbug-base:${version}
@@ -935,7 +981,7 @@ tasks:
     workerType: compute-large
     dependencies:
       - commit-retrieval
-      - test-scheduling-history-generator
+      - test-label-scheduling-history-generator
     payload:
       maxRunTime: 25200
       image: mozilla/bugbug-base:${version}
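Each generator task's index routes are what make its artifacts addressable at the stable URLs that bugbug/test_scheduling.py passes to db.register. A small illustrative helper (not part of the patch) reconstructing those URLs from the granularity:

    # Illustrative only: mirrors the URL pattern used by the db.register
    # calls and the index routes above.
    def scheduling_history_url(granularity):
        index = f"project.relman.bugbug.data_test_{granularity}_scheduling_history.latest"
        artifact = f"public/test_{granularity}_scheduling_history.pickle.zst"
        return (
            "https://community-tc.services.mozilla.com/api/index/v1/task/"
            f"{index}/artifacts/{artifact}"
        )

    assert scheduling_history_url("group").endswith(
        "test_group_scheduling_history.pickle.zst"
    )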

View file

@@ -32,19 +32,23 @@ db.register(
     "https://s3-us-west-2.amazonaws.com/communitytc-bugbug/data/adr_cache.tar.zst",
     3,
 )

-PUSH_DATA_LABEL_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history_push_data.latest/artifacts/public/push_data_label.json.zst"
-PUSH_DATA_GROUP_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history_push_data.latest/artifacts/public/push_data_group.json.zst"
+PUSH_DATA_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history_push_data.latest/artifacts/public/push_data_{granularity}.json.zst"

 TRAINING_MONTHS = 6

-def filter_tasks(tasks, all_tasks):
+def filter_runnables(runnables, all_runnables, granularity):
     return tuple(
-        task
-        for task in tasks
-        if task in all_tasks
-        and any(task.startswith(j) for j in JOBS_TO_CONSIDER)
-        and not any(task.startswith(j) for j in JOBS_TO_IGNORE)
+        runnable
+        for runnable in runnables
+        if runnable in all_runnables
+        and (
+            granularity == "group"
+            or (
+                any(runnable.startswith(j) for j in JOBS_TO_CONSIDER)
+                and not any(runnable.startswith(j) for j in JOBS_TO_IGNORE)
+            )
+        )
     )
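Note that group granularity bypasses the JOBS_TO_CONSIDER/JOBS_TO_IGNORE prefix filters, which only make sense for task labels. A self-contained illustration (the prefix lists and runnable names below are hypothetical):

    JOBS_TO_CONSIDER = ("test-linux", "test-windows")  # hypothetical prefixes
    JOBS_TO_IGNORE = ("test-linux64-ccov",)  # hypothetical prefixes

    def filter_runnables(runnables, all_runnables, granularity):
        # Same logic as the patch: the prefix filters apply to labels only.
        return tuple(
            runnable
            for runnable in runnables
            if runnable in all_runnables
            and (
                granularity == "group"
                or (
                    any(runnable.startswith(j) for j in JOBS_TO_CONSIDER)
                    and not any(runnable.startswith(j) for j in JOBS_TO_IGNORE)
                )
            )
        )

    # A manifest-style group passes regardless of the label prefix lists:
    assert filter_runnables(("dom/indexedDB",), {"dom/indexedDB"}, "group")
    # A label must still match JOBS_TO_CONSIDER:
    assert not filter_runnables(("build-linux",), {"build-linux"}, "label")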
@@ -124,27 +128,37 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
         zstd_compress("push_data_label.json")
         zstd_compress("push_data_group.json")

-    def generate_test_scheduling_history(self):
-        updated = download_check_etag(PUSH_DATA_LABEL_URL)
+    def generate_test_scheduling_history(self, granularity):
+        push_data_path = f"push_data_{granularity}.json"
+        updated = download_check_etag(PUSH_DATA_URL.format(granularity=granularity))
         if updated:
-            zstd_decompress("push_data_label.json")
-
-        assert os.path.exists(
-            "push_data_label.json"
-        ), "Decompressed push data file exists"
+            zstd_decompress(push_data_path)
+
+        assert os.path.exists(push_data_path), "Decompressed push data file exists"

         # Get the commits DB.
         assert db.download(repository.COMMITS_DB)

         HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

-        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)
+        if granularity == "label":
+            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
+            past_failures_db = os.path.join(
+                "data", test_scheduling.PAST_FAILURES_LABEL_DB
+            )
+        elif granularity == "group":
+            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
+            past_failures_db = os.path.join(
+                "data", test_scheduling.PAST_FAILURES_GROUP_DB
+            )
+
+        db.download(test_scheduling_db, support_files_too=True)

         last_node = None
-        for test_data in test_scheduling.get_test_scheduling_history():
+        for test_data in test_scheduling.get_test_scheduling_history(granularity):
             last_node = test_data["revs"][0]

         def generate_all_data():
-            past_failures = test_scheduling.get_past_failures()
+            past_failures = test_scheduling.get_past_failures(granularity)

             push_num = past_failures["push_num"] if "push_num" in past_failures else 0
@@ -161,39 +175,42 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
                 commit_map[commit_data["node"]] = commit_data

-            with open("push_data_label.json", "r") as f:
+            with open(push_data_path, "r") as f:
                 push_data = json.load(f)[1:]

             logger.info(f"push data nodes: {len(push_data)}")

-            push_data = [
-                (
-                    revisions,
-                    rename_tasks(push_tasks),
-                    rename_tasks(possible_regressions),
-                    rename_tasks(likely_regressions),
-                )
-                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
-            ]
+            if granularity == "label":
+                push_data = [
+                    (
+                        revisions,
+                        rename_tasks(push_tasks),
+                        rename_tasks(possible_regressions),
+                        rename_tasks(likely_regressions),
+                    )
+                    for revisions, push_tasks, possible_regressions, likely_regressions in push_data
+                ]

-            # In the last 28 pushes, we definitely run all possible tasks.
-            all_tasks_set = set(
-                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])
+            # In the last 28 pushes, we definitely run all possible runnables.
+            all_runnables_set = set(
+                sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
             )
-            # Filter tasks we don't need.
-            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
-            all_tasks_set = set(all_tasks)
-            logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")
+            # Filter runnables we don't need.
+            all_runnables = filter_runnables(
+                list(all_runnables_set), all_runnables_set, granularity
+            )
+            all_runnables_set = set(all_runnables)
+            logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

-            # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
-            past_failures["all_tasks"] = all_tasks
-            # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
+            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
+            past_failures["all_runnables"] = all_runnables
+            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
             # same as the current ones?

             saved_nodes = set()
             skipped_no_commits = 0
             skipped_too_big_commits = 0
-            skipped_no_tasks = 0
+            skipped_no_runnables = 0

             # We can start once we get to the last revision we added in the previous run.
             can_start = True if last_node is None else False
@@ -201,7 +218,7 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
             for i in tqdm(range(len(push_data))):
                 (
                     revisions,
-                    push_tasks,
+                    push_runnables,
                     possible_regressions,
                     likely_regressions,
                 ) = push_data.pop(0)
@@ -235,16 +252,18 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
                     skipped_too_big_commits += 1
                     continue

-                # If we considered all_tasks, we'd generate a huge amount of data.
-                # So we consider only the tasks which run in this push, and the possible and likely regressions
+                # If we considered all_runnables, we'd generate a huge amount of data.
+                # So we consider only the runnables which run in this push, and the possible and likely regressions
                 # from this push.
-                tasks_to_consider = list(
-                    set(push_tasks + possible_regressions + likely_regressions)
+                runnables_to_consider = list(
+                    set(push_runnables + possible_regressions + likely_regressions)
                 )
-                tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)
+                runnables_to_consider = filter_runnables(
+                    runnables_to_consider, all_runnables_set, granularity
+                )

-                if len(tasks_to_consider) == 0:
-                    skipped_no_tasks += 1
+                if len(runnables_to_consider) == 0:
+                    skipped_no_runnables += 1
                     continue

                 # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
@@ -257,7 +276,7 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
                     past_failures,
                     merged_commits,
                     push_num,
-                    tasks_to_consider,
+                    runnables_to_consider,
                     possible_regressions,
                     likely_regressions,
                 ):
@@ -269,17 +288,17 @@ file = {{ driver = "file", path = "{os.path.abspath(self.cache_path)}" }}
             logger.info(f"saved push data nodes: {len(saved_nodes)}")
             logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
             logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
-            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")
+            logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

             past_failures["push_num"] = push_num
             past_failures.close()

-        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())
+        db.append(test_scheduling_db, generate_all_data())

-        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)
+        zstd_compress(test_scheduling_db)

-        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
-            tar.add("data/past_failures.lmdb")
+        with open_tar_zst(past_failures_db) as tar:
+            tar.add(past_failures_db[: -len(".tar.zst")])

 def main():
@@ -289,6 +308,11 @@ def main():
     parser.add_argument(
         "op", help="Which operation to perform.", choices=["retrieve", "generate"]
     )
+    parser.add_argument(
+        "--granularity",
+        help="Which test granularity to use.",
+        choices=["label", "group"],
+    )

     args = parser.parse_args()
@@ -296,7 +320,8 @@ def main():
     if args.op == "retrieve":
         retriever.retrieve_push_data()
     elif args.op == "generate":
-        retriever.generate_test_scheduling_history()
+        assert args.granularity is not None
+        retriever.generate_test_scheduling_history(args.granularity)

 if __name__ == "__main__":
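With the new --granularity flag the generate step must now be invoked once per granularity, mirroring the two pipeline tasks above. A hedged sketch of driving both runs locally (command name taken from the pipeline YAML):

    import subprocess

    # Run the generator once per granularity, as the two Taskcluster
    # tasks in data-pipeline.yml do.
    for granularity in ("label", "group"):
        subprocess.run(
            [
                "bugbug-data-test-scheduling-history",
                "generate",
                f"--granularity={granularity}",
            ],
            check=True,
        )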