Store regressor finder results in bugbug DBs and make it run only on commits which haven't been analyzed yet

Marco Castelluccio 2019-07-23 02:01:26 +02:00
Parent d54472f027
Commit fbaef0661d
2 changed files with 139 additions and 53 deletions

View file

@@ -61,11 +61,11 @@ tasks:
       public/commits_to_ignore.csv:
         path: /commits_to_ignore.csv
         type: file
-      public/fix_commits.csv:
-        path: /fix_commits.csv
+      public/bug_fixing_commits.json.zst:
+        path: /bug_fixing_commits.json.zst
         type: file
-      public/bug_introducing_commits.csv:
-        path: /bug_introducing_commits.csv
+      public/bug_introducing_commits.json.zst:
+        path: /bug_introducing_commits.json.zst
         type: file
       cache:
         bugbug-mercurial-repository: /cache
@@ -77,6 +77,7 @@ tasks:
       routes:
         - notify.email.release-mgmt-analysis@mozilla.com.on-failed
         - notify.irc-channel.#bugbug.on-failed
+        - index.project.relman.bugbug_annotate.regressor_finder.latest
       metadata:
         name: bugbug regressor finder
         description: bugbug regressor finder
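
The added index route is what makes this task's latest artifacts addressable at a stable URL; the `db.register` calls in the script below point at exactly these locations. A minimal sketch of the route-to-URL mapping, using Taskcluster's convention that the `index.` prefix is stripped when the route is indexed (the helper itself is hypothetical, not part of the commit):

```python
INDEX_ROOT = "https://index.taskcluster.net/v1/task"
ROUTE = "index.project.relman.bugbug_annotate.regressor_finder.latest"


def artifact_url(artifact_name):
    # The route minus its "index." prefix identifies the indexed task;
    # artifacts hang off /artifacts/<name>.
    task_path = ROUTE[len("index."):]
    return f"{INDEX_ROOT}/{task_path}/artifacts/{artifact_name}"


print(artifact_url("public/bug_fixing_commits.json.zst"))
```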

View file

@@ -33,11 +33,34 @@ logger = getLogger(__name__)
 MAX_MODIFICATION_NUMBER = 50

 # TODO: Set to 2 years and 6 months. If it takes too long, make the task work incrementally like microannotate-generate.
 RELATIVE_START_DATE = relativedelta(days=49)
+# Only needed because mercurial<->git mapping could be behind.
+RELATIVE_END_DATE = relativedelta(days=3)
+
+BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
+db.register(
+    BUG_FIXING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
+    1,
+)
+
+BUG_INTRODUCING_COMMITS_DB = "data/bug_introducing_commits.json"
+db.register(
+    BUG_INTRODUCING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_introducing_commits.json.zst",
+    1,
+)

 BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_name}model.zst"

+
+def compress_file(path):
+    cctx = zstandard.ZstdCompressor()
+    with open(path, "rb") as input_f:
+        with open(f"{path}.zst", "wb") as output_f:
+            cctx.copy_stream(input_f, output_f)
+
+
 def download_model(model_name):
     if not os.path.exists(f"{model_name}model"):
         url = BASE_URL.format(model_name=model_name)
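
`compress_file` produces the `.zst` artifacts the task uploads; decompression on the download side is left to bugbug's `db` module. For illustration, the symmetric operation with the same `zstandard` streaming API would look like this (hypothetical helper, not part of the commit):

```python
import zstandard


def decompress_file(path):
    # Inverse of compress_file: stream <path>.zst back into <path>.
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as input_f:
        with open(path, "wb") as output_f:
            dctx.copy_stream(input_f, output_f)
```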
@@ -111,7 +134,7 @@ def get_commits_to_ignore(repo_dir):
     return commits_to_ignore


-def find_fix_commits():
+def find_bug_fixing_commits():
     logger.info("Downloading commits database...")
     db.download_version(repository.COMMITS_DB)
     if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
@@ -124,6 +147,21 @@ def find_fix_commits():
     if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
         db.download(bugzilla.BUGS_DB, force=True)

+    logger.info("Download previous classifications...")
+    db.download_version(BUG_FIXING_COMMITS_DB)
+    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
+        BUG_FIXING_COMMITS_DB
+    ):
+        db.download(BUG_FIXING_COMMITS_DB, force=True)
+
+    logger.info("Get previously classified commits...")
+    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
+    prev_bug_fixing_commits_nodes = set(
+        bug_fixing_commit["mercurial_rev"]
+        for bug_fixing_commit in prev_bug_fixing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")
+
     # TODO: Switch to the pure Defect model, as it's better in this case.
     logger.info("Downloading defect/enhancement/task model...")
     download_model("defectenhancementtask")
@@ -134,15 +172,24 @@ def find_fix_commits():
     regression_model = RegressionModel.load("regressionmodel")

     start_date = datetime.now() - RELATIVE_START_DATE
-    logger.info(f"Gathering bug IDs associated to commits (since {start_date})...")
+    end_date = datetime.now() - RELATIVE_END_DATE
+    logger.info(
+        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
+    )

     commit_map = defaultdict(list)
     for commit in repository.get_commits():
-        if dateutil.parser.parse(commit["pushdate"]) < start_date:
+        if commit["node"] in prev_bug_fixing_commits_nodes:
             continue
+
+        commit_date = dateutil.parser.parse(commit["pushdate"])
+        if commit_date < start_date or commit_date > end_date:
+            continue
+
         commit_map[commit["bug_id"]].append(commit)

-    logger.info(f"{len(commit_map)} commits found")
+    logger.info(
+        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
+    )
     assert len(commit_map) > 0

     def get_relevant_bugs():
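
A quick illustration of the push-date window introduced here: `RELATIVE_END_DATE` trims the most recent 3 days because, as the comment above notes, the mercurial<->git mapping used by `vcs_map` can lag behind. With the commit's own date standing in as an example "now":

```python
from datetime import datetime

from dateutil.relativedelta import relativedelta

# Illustrative "now"; in the script this is datetime.now().
now = datetime(2019, 7, 23)
start_date = now - relativedelta(days=49)  # 2019-06-04: don't look too far back.
end_date = now - relativedelta(days=3)     # 2019-07-20: leave room for vcs_map lag.

# A commit is analyzed only if start_date <= pushdate <= end_date
# and its node was not already classified in a previous run.
assert start_date < end_date
```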
@@ -156,11 +203,11 @@ def find_fix_commits():
     known_defect_labels = defect_model.get_labels()
     known_regression_labels = regression_model.get_labels()

-    fix_commits = []
+    bug_fixing_commits = []

-    def append_fix_commits(bug_id, type_):
+    def append_bug_fixing_commits(bug_id, type_):
         for commit in commit_map[bug_id]:
-            fix_commits.append(
+            bug_fixing_commits.append(
                 {
                     "mercurial_rev": commit["node"],
                     "git_rev": vcs_map.mercurial_to_git(commit["node"]),
@@ -178,26 +225,33 @@ def find_fix_commits():
             bug["id"] in known_regression_labels
             and known_regression_labels[bug["id"]] == 1
         ):
-            append_fix_commits(bug["id"], "r")
+            append_bug_fixing_commits(bug["id"], "r")
             continue

         if bug["id"] in known_defect_labels:
             if known_defect_labels[bug["id"]] == "defect":
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+            else:
+                append_bug_fixing_commits(bug["id"], "e")
             continue

         if defect_model.classify(bug)[0] == "defect":
             if regression_model.classify(bug)[0] == 1:
-                append_fix_commits(bug["id"], "r")
+                append_bug_fixing_commits(bug["id"], "r")
             else:
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+        else:
+            append_bug_fixing_commits(bug["id"], "e")

-    with open("fix_commits.csv", "w") as f:
-        writer = csv.DictWriter(f, fieldnames=["mercurial_rev", "git_rev", "type"])
-        writer.writeheader()
-        writer.writerows(fix_commits)
+    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
+    compress_file(BUG_FIXING_COMMITS_DB)

-    return fix_commits
+    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
+    return [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["type"] in ["r", "d"]
+    ]
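
Note the asymmetry the new return value introduces: enhancement bugs are now recorded with type `"e"` so the node-skipping check above won't re-classify them on the next run, but only `"r"` (regression) and `"d"` (defect) fixes are handed to the bug-introducing analysis. A hypothetical consumer of the new DB would apply the same filter (a sketch, assuming the script's `BUG_FIXING_COMMITS_DB`):

```python
from bugbug import db


def read_defect_fixing_commits():
    # "r" = fix for a (known or predicted) regression, "d" = fix for a
    # defect; "e" entries exist only so the next run skips those commits.
    return [
        bug_fixing_commit
        for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
        if bug_fixing_commit["type"] in ("r", "d")
    ]
```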
@@ -212,7 +266,21 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
logger.info(f"Cloning git repository to {git_repo_dir}...")
clone_gecko_dev(git_repo_dir)
fix_commits = find_fix_commits()
logger.info("Download previously found bug-introducing commits...")
db.download_version(BUG_INTRODUCING_COMMITS_DB)
if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
BUG_INTRODUCING_COMMITS_DB
):
db.download(BUG_INTRODUCING_COMMITS_DB, force=True)
logger.info("Get previously found bug-introducing commits...")
prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
prev_bug_introducing_commits_nodes = set(
bug_introducing_commit["bug_fixing_mercurial_rev"]
for bug_introducing_commit in prev_bug_introducing_commits
)
logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")
commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)
git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)
@@ -220,26 +288,38 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
     with open("git_hashes_to_ignore", "w") as f:
         f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

-    total_fix_commits_num = len(fix_commits)
-    fix_commits = [
-        fix_commit
-        for fix_commit in fix_commits
-        if fix_commit["git_rev"] not in git_hashes_to_ignore
+    bug_fixing_commits = find_bug_fixing_commits()
+
+    logger.info(f"{len(bug_fixing_commits)} commits to analyze")
+
+    # Skip already found bug-introducing commits.
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
     ]
+
     logger.info(
-        f"Skipped {total_fix_commits_num - len(fix_commits)} commits as they were in the ignore list"
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
     )

-    logger.info(f"{len(fix_commits)} commits to analyze")
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
+    ]
+    logger.info(
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
+    )

     def _init(git_repo_dir):
         global GIT_REPO
         GIT_REPO = GitRepository(git_repo_dir)

-    def find_bic(fix_commit):
-        logger.info("Analyzing {}...".format(fix_commit["git_rev"]))
+    def find_bic(bug_fixing_commit):
+        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

-        commit = GIT_REPO.get_commit(fix_commit["git_rev"])
+        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

         # Skip huge changes, we'll likely be wrong with them.
         if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
@@ -249,48 +329,53 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
             commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
         )

         logger.info(bug_introducing_modifications)

         bug_introducing_commits = []
         for bug_introducing_hashes in bug_introducing_modifications.values():
             for bug_introducing_hash in bug_introducing_hashes:
-                bug_introducing_commit = fix_commit.copy()
-                bug_introducing_commit.update(
+                bug_introducing_commits.append(
                     {
+                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                         "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                             bug_introducing_hash
                         ),
                         "bug_introducing_git_rev": bug_introducing_hash,
                     }
                 )
-                bug_introducing_commits.append(bug_introducing_commit)
+
+        # Add an empty result, just so that we don't reanalyze this again.
+        if len(bug_introducing_commits) == 0:
+            bug_introducing_commits.append(
+                {
+                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
+                    "bug_introducing_mercurial_rev": "",
+                    "bug_introducing_git_rev": "",
+                }
+            )

         return bug_introducing_commits
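
The empty sentinel row is what completes the incremental skip: a bug-fixing commit with no SZZ result still gets a row keyed by `bug_fixing_mercurial_rev`, so `prev_bug_introducing_commits_nodes` will contain it on the next run. Consumers of the DB must therefore drop sentinels; a hypothetical reader (a sketch, assuming the script's `BUG_INTRODUCING_COMMITS_DB`):

```python
from bugbug import db


def read_bug_introducing_commits():
    # Rows with empty bug_introducing_* fields only mark a bug-fixing
    # commit as "already analyzed" and carry no real result.
    return [
        bug_introducing_commit
        for bug_introducing_commit in db.read(BUG_INTRODUCING_COMMITS_DB)
        if bug_introducing_commit["bug_introducing_mercurial_rev"] != ""
    ]
```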
     with concurrent.futures.ThreadPoolExecutor(
         initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
     ) as executor:
-        results = executor.map(find_bic, fix_commits)
-        results = tqdm(results, total=len(fix_commits))
-        results = list(itertools.chain.from_iterable(results))
+        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
+        bug_introducing_commits = tqdm(
+            bug_introducing_commits, total=len(bug_fixing_commits)
+        )
+        bug_introducing_commits = list(
+            itertools.chain.from_iterable(bug_introducing_commits)
+        )

-    total_results_num = len(results)
-    results = list(filter(None, results))
+    total_results_num = len(bug_introducing_commits)
+    bug_introducing_commits = list(filter(None, bug_introducing_commits))
     logger.info(
-        f"Skipped {total_results_num - len(results)} commits as they were too big"
+        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
     )

-    with open("bug_introducing_commits.csv", "w") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "mercurial_rev",
-                "git_rev",
-                "type",
-                "bug_introducing_mercurial_rev",
-                "bug_introducing_git_rev",
-            ],
-        )
-        writer.writeheader()
-        writer.writerows(results)
+    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
+    compress_file(BUG_INTRODUCING_COMMITS_DB)
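
Taken together with the matching `db.append`/`compress_file` pair in `find_bug_fixing_commits`, this closes the incremental loop: each run appends only its delta, re-compresses, and the indexed Taskcluster route exposes the result for the next run to download. A condensed sketch of one cycle (names from the diff; the orchestration shown is an assumption about how the pieces compose, not code from the commit):

```python
import os

from bugbug import db  # plus the script's compress_file and DB constants


def incremental_cycle(analyze):
    # 1. Pull the previous run's results via the indexed artifact.
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    # 2. Analyze only commits no previous run has covered.
    done = set(
        row["bug_fixing_mercurial_rev"] for row in db.read(BUG_INTRODUCING_COMMITS_DB)
    )
    new_rows = analyze(skip=done)

    # 3. Append the delta and re-compress for the next upload.
    db.append(BUG_INTRODUCING_COMMITS_DB, new_rows)
    compress_file(BUG_INTRODUCING_COMMITS_DB)
```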
def main():