This commit is contained in:
Avram Lubkin 2023-05-24 18:43:16 -04:00 committed by Avram Lubkin
Parent 825f6966b2
Commit 89384a334a
1 changed file with 129 additions and 125 deletions


@@ -1,9 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Functions for monitoring downstream repos for missing commits
"""
import logging
import approxidate
from git import GitCommandError
import git
from comma.database.driver import DatabaseDriver
from comma.database.model import (
@@ -15,7 +19,7 @@ from comma.database.model import (
from comma.downstream.matcher import patch_matches
from comma.upstream.parser import process_commits
from comma.util import config
from comma.util.tracking import get_linux_repo, get_tracked_paths, GitProgressPrinter
from comma.util.tracking import GitProgressPrinter, get_linux_repo, get_tracked_paths
def update_revisions_for_distro(distro_id, revs):
@@ -59,37 +63,27 @@ def update_tracked_revisions(distro_id, repo):
repo: the git repo object of whatever repo to check revisions in
"""
# TODO This is WRONG it's sorting by alphabetization,
# which happens to be correct currently but needs to be addressed
# git tag can sort by date, but can't specify remote. ls-remote can't naturally sort
# until git 2.18; 2.17 is the latest version Ubuntu 18.04 can get
# TODO This is WRONG. It's sorting alphabetically, which happens to be correct currently but
# needs to be addressed. git tag can sort by date, but can't specify remote.
# ls-remote can naturally sort by date, but requires the object to be local
if distro_id.startswith("Ubuntu"):
latest_two_kernels = []
tag_lines = repo.git.ls_remote("--t", "--refs", "--sort=v:refname", distro_id).split("\n")
tag_names = [tag_line.split("/", 1)[-1] for tag_line in tag_lines]
# Filter out edge, and only include azure revisions
tag_names = list(
filter(
lambda x: "azure" in x and "edge" not in x and "cvm" not in x and "fde" not in x,
tag_names,
)
)
latest_two_kernels = tag_names[-2:]
update_revisions_for_distro(distro_id, latest_two_kernels)
tag_names = []
for line in repo.git.ls_remote(
"--tags", "--refs", "--sort=v:refname", distro_id
).splitlines():
name = line.split("/", 1)[-1]
if "azure" in name and all(label not in name for label in ("edge", "cvm", "fde")):
tag_names.append(name)
update_revisions_for_distro(distro_id, tag_names[-2:])
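As an aside, the rewritten filter is easy to exercise on its own. A minimal sketch in Python, using hypothetical tag names rather than real Ubuntu remote output:

# Sketch of the tag filter above; the tag names are hypothetical.
def is_tracked_azure_tag(name: str) -> bool:
    """Keep azure tags while excluding edge, cvm, and fde variants."""
    return "azure" in name and all(label not in name for label in ("edge", "cvm", "fde"))

tags = [
    "Ubuntu-azure-5.4.0-1105.111",
    "Ubuntu-azure-edge-5.19.0-1019.20",
    "Ubuntu-azure-cvm-5.15.0-1038.45",
    "Ubuntu-azure-5.4.0-1106.112",
]
# With --sort=v:refname the newest tags sort last, so [-2:] keeps the latest two.
assert [tag for tag in tags if is_tracked_azure_tag(tag)][-2:] == [
    "Ubuntu-azure-5.4.0-1105.111",
    "Ubuntu-azure-5.4.0-1106.112",
]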
# TODO: Refactor this function into a smaller set of pieces.
def monitor_subject(monitoring_subject, repo, reference=None):
def get_missing_cherries(repo, reference):
"""
Update the missing patches in the database for this monitoring_subject
monitoring_subject: The MonitoringSubject we are updating
repo: The git repo object pointing to relevant upstream linux repo
Get a list of cherry-picked commits missing from the downstream reference
"""
reference = monitoring_subject.revision if reference is None else reference
# Get all upstream commits on tracked paths within window
upstream_commits = set(
repo.git.log(
@@ -114,53 +108,30 @@ def monitor_subject(monitoring_subject, repo, reference=None):
).splitlines()
)
missing_cherries &= upstream_commits
del upstream_commits
return missing_cherries & upstream_commits
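For context, the intersection on the return line is doing the real work: one git log produces the upstream commits touching tracked paths, while missing_cherries (built in the elided hunk, presumably via a cherry-pick-aware git log) holds commits with no downstream equivalent. A plain-set sketch with hypothetical hashes:

# Sketch of the set intersection get_missing_cherries returns.
upstream_on_tracked_paths = {"abc123", "def456", "0f0f0f"}
no_downstream_equivalent = {"def456", "0f0f0f", "999999"}
missing = no_downstream_equivalent & upstream_on_tracked_paths
assert missing == {"def456", "0f0f0f"}  # only tracked-path commits survive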
def monitor_subject(monitoring_subject, repo, reference=None):
"""
Update the missing patches in the database for this monitoring_subject
monitoring_subject: The MonitoringSubject we are updating
repo: The git repo object pointing to relevant upstream linux repo
"""
reference = monitoring_subject.revision if reference is None else reference
missing_cherries = get_missing_cherries(repo, reference)
logging.debug("Found %d missing patches through cherry-pick.", len(missing_cherries))
# Run extra checks on these missing commits
with DatabaseDriver.get_session() as session:
patches = (
session.query(PatchData)
.filter(PatchData.commitID.in_(missing_cherries))
.order_by(PatchData.commitTime)
.all()
)
# We only want to check downstream patches as old as the
# oldest upstream missing patch, as an optimization.
earliest_commit_date = min(p.commitTime for p in patches).isoformat()
logging.debug("Processing commits since %s", earliest_commit_date)
# Get the downstream commits for this revision (these are
# distinct from upstream because they've been cherry-picked).
#
# NOTE: This is slow but necessary!
downstream_patches = process_commits(
revision=reference,
since=earliest_commit_date,
)
logging.info("Starting confidence matching for %d upstream patches...", len(patches))
# Double check the missing cherries using our fuzzy algorithm.
missing_patches = [p for p in patches if not patch_matches(downstream_patches, p)]
missing_patch_ids = [p.patchID for p in missing_patches]
logging.info(
"Identified %d missing patches:\n %s",
len(missing_patch_ids),
"\n ".join(sorted(patch.commitID for patch in missing_patches)),
)
subject_id = monitoring_subject.monitoringSubjectID
missing_patch_ids = get_missing_patch_ids(missing_cherries, reference)
logging.info("Identified %d missing patches", len(missing_patch_ids))
# Delete patches that are no longer missing.
#
# NOTE: We do this in separate sessions in order to cleanly expire
# their objects and commit the changes to the database. There is
# surely another way to do this, but it works.
# NOTE: We do this in separate sessions in order to cleanly expire their objects and commit the
# changes to the database. There is surely another way to do this, but it works.
subject_id = monitoring_subject.monitoringSubjectID
with DatabaseDriver.get_session() as session:
patches = session.query(MonitoringSubjectsMissingPatches).filter_by(
monitoringSubjectID=subject_id
@@ -181,9 +152,8 @@ def monitor_subject(monitoring_subject, repo, reference=None):
)
new_missing_patches = 0
for patch_id in missing_patch_ids:
# Only add if it doesn't already exist. We're dealing with
# patches on the scale of 100, so the number of queries
# and inserts here doesn't matter.
# Only add if it doesn't already exist. We're dealing with patches on the scale of 100,
# so the number of queries and inserts here doesn't matter.
if patches.filter_by(patchID=patch_id).first() is None:
new_missing_patches += 1
session.add(
@@ -194,60 +164,53 @@ def monitor_subject(monitoring_subject, repo, reference=None):
logging.info("Adding %d patches that are now missing.", new_missing_patches)
def monitor_downstream():
print("Monitoring downstream...")
repo = get_linux_repo()
def get_missing_patch_ids(missing_cherries, reference):
"""
Attempt to determine which patches are missing from a list of missing cherries
"""
# Add repos as a remote origin if not already added
current_remotes = repo.git.remote()
with DatabaseDriver.get_session() as session:
for distro_id, url in session.query(Distros.distroID, Distros.repoLink).all():
# Debian we handle differently
if distro_id not in current_remotes and not distro_id.startswith("Debian"):
logging.debug("Adding remote origin for %s from %s", distro_id, url)
repo.create_remote(distro_id, url=url)
patches = (
session.query(PatchData)
.filter(PatchData.commitID.in_(missing_cherries))
.order_by(PatchData.commitTime)
.all()
)
# We only want to check downstream patches as old as the oldest upstream missing patch
earliest_commit_date = min(patch.commitTime for patch in patches).isoformat()
logging.debug("Processing commits since %s", earliest_commit_date)
logging.info("Updating tracked revisions for each repo.")
# Update stored revisions for repos as appropriate
with DatabaseDriver.get_session() as session:
for (distro_id,) in session.query(Distros.distroID).all():
update_tracked_revisions(distro_id, repo)
# Get the downstream commits for this revision (these are distinct from upstream because
# they've been cherry-picked). This is slow but necessary!
downstream_patches = process_commits(revision=reference, since=earliest_commit_date)
approx_date_since = approxidate.approx(config.since)
with DatabaseDriver.get_session() as session:
for subject in session.query(MonitoringSubjects).all():
if subject.distroID.startswith("Debian"):
# TODO don't skip debian
logging.debug("skipping Debian")
continue
# Double check the missing cherries using our fuzzy algorithm.
logging.info("Starting confidence matching for %d upstream patches...", len(patches))
missing_patches = [p.patchID for p in patches if not patch_matches(downstream_patches, p)]
remote = repo.remote(subject.distroID)
return missing_patches
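The comprehension on the return path keeps only candidates the fuzzy matcher cannot pair with any downstream patch. A self-contained sketch, where fake_matches stands in for comma.downstream.matcher.patch_matches and the patch records are hypothetical:

from types import SimpleNamespace

def fake_matches(downstream, patch):
    # Stand-in for patch_matches: here, a trivial exact-ID comparison.
    return patch.commitID in {d.commitID for d in downstream}

downstream = [SimpleNamespace(commitID="abc123")]
candidates = [
    SimpleNamespace(patchID=1, commitID="abc123"),
    SimpleNamespace(patchID=2, commitID="def456"),
]
assert [p.patchID for p in candidates if not fake_matches(downstream, p)] == [2]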
# Use distro name for local refs to prevent duplicates
if subject.revision.startswith(f"{subject.distroID}/"):
local_ref = subject.revision
remote_ref = subject.revision.split("/", 1)[-1]
else:
local_ref = f"{subject.distroID}/{subject.revision}"
remote_ref = subject.revision
def fetch_remote_ref(repo: git.Repo, name: str, local_ref: str, remote_ref: str) -> None:
"""
Shallow fetch remote reference so it is available locally
"""
remote = repo.remote(name)
# Initially fetch revision at depth 1
# TODO: Is there a better way to get the date of the last commit?
logging.info(
"Fetching remote ref %s from remote %s at depth 1", remote_ref, subject.distroID
)
fetch_info = remote.fetch(
remote_ref, depth=1, verbose=True, progress=GitProgressPrinter()
)
logging.info("Fetching remote ref %s from remote %s at depth 1", remote_ref, remote)
fetch_info = remote.fetch(remote_ref, depth=1, verbose=True, progress=GitProgressPrinter())
# If last commit for revision is in the fetch window, expand depth
# This check is necessary because some servers will throw an error when there are
# no commits in the fetch window
if fetch_info[-1].commit.committed_date >= approx_date_since:
if fetch_info[-1].commit.committed_date >= approxidate.approx(config.since):
logging.info(
'Fetching ref %s from remote %s shallow since "%s"',
remote_ref,
subject.distroID,
remote,
config.since,
)
try:
@@ -257,7 +220,7 @@ def monitor_downstream():
verbose=True,
progress=GitProgressPrinter(),
)
except GitCommandError as e:
except git.GitCommandError as e:
# ADO repos do not currently support --shallow-since, only depth
if "Server does not support --shallow-since" in e.stderr:
logging.warning(
@@ -270,7 +233,7 @@ def monitor_downstream():
logging.info(
'Newest commit for ref %s from remote %s is older than fetch window "%s"',
remote_ref,
subject.distroID,
remote,
config.since,
)
@@ -278,10 +241,51 @@ def monitor_downstream():
if not hasattr(repo.references, local_ref):
repo.create_tag(local_ref, "FETCH_HEAD")
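fetch_remote_ref boils down to: probe at depth 1, deepen with --shallow-since when the newest commit falls inside the fetch window, and fall back for servers (such as ADO) that reject --shallow-since. A condensed sketch omitting the window check; the fixed fallback depth of 1000 is hypothetical, since the commit's actual fallback lives in the elided hunk:

import git

def shallow_fetch(remote: git.Remote, ref: str, since: str) -> None:
    remote.fetch(ref, depth=1)  # cheap probe for the newest commit on the ref
    try:
        remote.fetch(ref, shallow_since=since)  # deepen to the fetch window
    except git.GitCommandError as exc:
        if "Server does not support --shallow-since" in exc.stderr:
            remote.fetch(ref, depth=1000)  # hypothetical fixed-depth fallback
        else:
            raise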
def monitor_downstream():
"""
Cycle through downstream remotes and search for missing commits
"""
print("Monitoring downstream...")
repo = get_linux_repo()
# Add repos as a remote if not already added
with DatabaseDriver.get_session() as session:
for distro_id, url in session.query(Distros.distroID, Distros.repoLink).all():
# Skip Debian for now
if distro_id not in repo.remotes and not distro_id.startswith("Debian"):
logging.debug("Adding remote %s from %s", distro_id, url)
repo.create_remote(distro_id, url=url)
# Update stored revisions for repos as appropriate
logging.info("Updating tracked revisions for each repo.")
with DatabaseDriver.get_session() as session:
for (distro_id,) in session.query(Distros.distroID).all():
update_tracked_revisions(distro_id, repo)
with DatabaseDriver.get_session() as session:
for subject in session.query(MonitoringSubjects).all():
if subject.distroID.startswith("Debian"):
# TODO don't skip debian
logging.debug("skipping Debian")
continue
# Use distro name for local refs to prevent duplicates
if subject.revision.startswith(f"{subject.distroID}/"):
local_ref = subject.revision
remote_ref = subject.revision.split("/", 1)[-1]
else:
local_ref = f"{subject.distroID}/{subject.revision}"
remote_ref = subject.revision
fetch_remote_ref(repo, subject.distroID, local_ref, remote_ref)
logging.info(
"Monitoring Script starting for Distro: %s, revision: %s",
subject.distroID,
remote_ref,
)
monitor_subject(subject, repo, local_ref)
print("Finished monitoring downstream!")