Move upstream operations to class

2023-06-14 17:58:11 -04:00 · 2023-06-14 17:58:11 -04:00 · 393a70ed47
--- a/comma/cli/init.py
+++ b/comma/cli/init.py
@ -11,7 +11,7 @@ from comma.cli.parser import parse_args
 from comma.database.driver import DatabaseDriver
 from comma.database.model import Distros, MonitoringSubjects
 from comma.downstream import Downstream
-from comma.upstream import process_commits
+from comma.upstream import Upstream
 from comma.util import config
 from comma.util.spreadsheet import export_commits, import_commits, update_commits
 from comma.util.symbols import get_missing_commits
@ -39,7 +39,7 @@ def run(options):

    if options.upstream:
        LOGGER.info("Begin monitoring upstream")
-        process_commits(add_to_database=True, since=config.since)
+        Upstream(config, DatabaseDriver()).process_commits()
        LOGGER.info("Finishing monitoring upstream")

    if options.downstream:
--- a/comma/database/model.py
+++ b/comma/database/model.py
@ -4,10 +4,18 @@
 ORM models for database objects
 """

+from datetime import datetime
+
+import git
 from sqlalchemy import Column, DateTime, ForeignKey, Integer, String
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship

+from comma.util import format_diffs
+from comma.util.tracking import get_filenames
+
+
+IGNORED_IN_CMSG = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:"

 Base = declarative_base()

@ -46,6 +54,46 @@ class PatchData(Base):
        lazy="dynamic",
    )

+    @classmethod
+    def create(cls, commit: git.Commit, paths) -> "PatchData":
+        """
+        Create patch object from a commit object
+        """
+
+        patch = cls(
+            commitID=commit.hexsha,
+            author=commit.author.name,
+            authorEmail=commit.author.email,
+            authorTime=datetime.utcfromtimestamp(commit.authored_date),
+            commitTime=datetime.utcfromtimestamp(commit.committed_date),
+        )
+
+        description = []
+        fixed_patches = []
+        for num, line in enumerate(commit.message.splitlines()):
+            line = line.strip()  # pylint: disable=redefined-loop-name
+            if not num:
+                patch.subject = line
+                continue
+
+            if line.lower().startswith(IGNORED_IN_CMSG):
+                continue
+
+            description.append(line)
+
+            # Check if this patch fixes other patches
+            if line.lower().startswith("fixes:"):
+                words = line.split(" ")
+                if len(words) > 1:
+                    fixed_patches.append(words[1])
+
+        patch.description = "\n".join(description)
+        patch.fixedPatches = " ".join(fixed_patches)  # e.g. "SHA1 SHA2 SHA3"
+        patch.affectedFilenames = " ".join(get_filenames(commit))
+        patch.commitDiffs = format_diffs(commit, paths)
+
+        return patch
+

 class PatchDataMeta(Base):
    """
--- a/comma/downstream/init.py
+++ b/comma/downstream/init.py
@ -15,7 +15,6 @@ from comma.database.model import (
    PatchData,
 )
 from comma.downstream.matcher import patch_matches
-from comma.upstream import process_commits
 from comma.util.tracking import get_linux_repo


@ -186,6 +185,8 @@ class Downstream:
        Attempt to determine which patches are missing from a list of missing cherries
        """

+        paths = self.repo.get_tracked_paths(self.config.sections)
+
        with self.database.get_session() as session:
            patches = (
                session.query(PatchData)
@ -199,7 +200,19 @@ class Downstream:

            # Get the downstream commits for this revision (these are distinct from upstream because
            # they have been cherry-picked). This is slow but necessary!
-            downstream_patches = process_commits(revision=reference, since=earliest_commit_date)
+
+            LOGGER.info("Determining downstream commits from tracked files")
+            # We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits
+            downstream_patches = tuple(
+                PatchData.create(commit, paths)
+                for commit in self.repo.iter_commits(
+                    rev=reference,
+                    paths=paths,
+                    min_parents=1,
+                    max_parents=1,
+                    since=earliest_commit_date,
+                )
+            )

            # Double check the missing cherries using our fuzzy algorithm.
            LOGGER.info("Starting confidence matching for %d upstream patches...", len(patches))
--- a/comma/downstream/matcher.py
+++ b/comma/downstream/matcher.py
@ -6,7 +6,7 @@ Functions to compare commits for similarities

 import logging
 import os
-from typing import Iterable, List
+from typing import Iterable

 from fuzzywuzzy import fuzz

@ -64,7 +64,7 @@ def calculate_filenames_confidence(
    return total_filepaths_match / len(upstream_filepaths)


-def patch_matches(downstream_patches: List[PatchData], upstream: PatchData) -> bool:
+def patch_matches(downstream_patches: Iterable[PatchData], upstream: PatchData) -> bool:
    """Check if 'upstream' has an equivalent in 'downstream_patches'."""

    # Preprocessing for matching filenames
--- a/comma/upstream.py
+++ b/comma/upstream.py
@ -5,144 +5,59 @@ Functions for parsing commit objects into patch objects
 """

 import logging
-from datetime import datetime
-from typing import List, Optional, Set
+from functools import cached_property

-from comma.database.driver import DatabaseDriver
 from comma.database.model import PatchData
-from comma.util import config
-from comma.util.tracking import get_filenames, get_linux_repo
+from comma.util.tracking import get_linux_repo


-IGNORED_FIELDS = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:"
 LOGGER = logging.getLogger(__name__)


-def format_diffs(commit, paths):
+class Upstream:
    """
-    Format diffs from commit object into string
+    Parent object for downstream operations
    """

-    diffs = []
-    # We are ignoring merges so all commits should have a single parent
-    for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True):
-        if diff.a_path is not None:
-            # The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..."
-            lines = "\n".join(
-                line
-                for line in diff.diff.decode("utf-8").splitlines()
-                if line.startswith(("+", "-"))
-            )
-            diffs.append(f"{diff.a_path}\n{lines}")
+    def __init__(self, config, database) -> None:
+        self.config = config
+        self.database = database

-    return "\n".join(diffs)
+    @cached_property
+    def repo(self):
+        """
+        Get repo when first accessed
+        """
+        return get_linux_repo(since=self.config.since)

+    def process_commits(self):
+        """
+        Generate patches for commits affecting tracked paths
+        """

-def create_patch(commit, paths) -> PatchData:
-    """
-    Create patch object from a commit object
-    """
+        paths = self.repo.get_tracked_paths(self.config.sections)
+        added = 0
+        total = 0

-    patch: PatchData = PatchData(
-        commitID=commit.hexsha,
-        author=commit.author.name,
-        authorEmail=commit.author.email,
-        authorTime=datetime.utcfromtimestamp(commit.authored_date),
-        commitTime=datetime.utcfromtimestamp(commit.committed_date),
-    )
-
-    description = []
-    fixed_patches = []
-    for num, line in enumerate(commit.message.splitlines()):
-        line = line.strip()  # pylint: disable=redefined-loop-name
-        if not num:
-            patch.subject = line
-            continue
-
-        if line.lower().startswith(IGNORED_FIELDS):
-            continue
-
-        description.append(line)
-
-        # Check if this patch fixes other patches
-        if line.lower().startswith("fixes:"):
-            words = line.split(" ")
-            if len(words) > 1:
-                fixed_patches.append(words[1])
-
-    patch.description = "\n".join(description)
-    patch.fixedPatches = " ".join(fixed_patches)  # e.g. "SHA1 SHA2 SHA3"
-    patch.affectedFilenames = " ".join(get_filenames(commit))
-    patch.commitDiffs = format_diffs(commit, paths)
-
-    return patch
-
-
-def process_commits(
-    commit_ids: Optional[Set[str]] = None,
-    revision: str = "origin/master",
-    add_to_database: bool = False,
-    since: str = config.since,
-) -> List[PatchData]:
-    """
-    Look at all commits in the given repo and handle based on distro.
-
-    commit_ids: Set of commits to process
-    revision: revision we want to see the commits of, or None
-    add_to_database: whether or not to add to database (side-effect)
-    since: if provided, will only process commits after this commit
-    """
-
-    repo = get_linux_repo(since=since)
-    paths = repo.get_tracked_paths(config.sections)
-
-    if commit_ids is None:
        # We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits.
-        LOGGER.info("Determining commits from tracked files")
-        commits = repo.iter_commits(
-            rev=revision,
+        LOGGER.info("Determining upstream commits from tracked files")
+        for commit in self.repo.iter_commits(
+            rev="origin/master",
            paths=paths,
            min_parents=1,
            max_parents=1,
-            since=since,
-        )
-    else:
-        # If given a set of commit SHAs, get the commit objects.
-        commits = []
-        for id_ in commit_ids:
-            try:
-                commits.append(repo.commit(id_))
-            except ValueError:
-                LOGGER.warning("Commit '%s' does not exist in the repo! Skipping...", id_)
+            since=self.config.since,
+        ):
+            total += 1
+            with self.database.get_session() as session:
+                # See if commit ID is in database
+                if (
+                    session.query(PatchData.commitID)
+                    .filter_by(commitID=commit.hexsha)
+                    .one_or_none()
+                    is None
+                ):
+                    session.add(PatchData.create(commit, paths))
+                    added += 1

-    LOGGER.info("Starting commit processing...")
-
-    all_patches = []
-    num_patches_added = 0
-    for num, commit in enumerate(commits, 1):
-        # Skip root (initial) commit since it should always be present
-        # TODO (Issue 54): This can probably be removed
-        if commit.parents:
-            LOGGER.debug("Parsing commit %s", commit.hexsha)
-            patch: PatchData = create_patch(commit, paths)
-
-            if add_to_database:
-                with DatabaseDriver.get_session() as session:
-                    if (
-                        session.query(PatchData.commitID)
-                        .filter_by(commitID=patch.commitID)
-                        .one_or_none()
-                        is None
-                    ):
-                        session.add(patch)
-                        num_patches_added += 1
-            else:
-                all_patches.append(patch)
-
-        if not num % 250:
-            LOGGER.debug(" %d commits processed...", num)
-
-    if add_to_database:
-        LOGGER.info("%d patches added to database.", num_patches_added)
-
-    return all_patches
+        LOGGER.info("%d of %d patches added to database.", added, total)
--- a/comma/util/init.py
+++ b/comma/util/init.py
@ -5,6 +5,26 @@ Utility functions and classes
 """


+def format_diffs(commit, paths):
+    """
+    Format diffs from commit object into string
+    """
+
+    diffs = []
+    # We are ignoring merges so all commits should have a single parent
+    for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True):
+        if diff.a_path is not None:
+            # The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..."
+            lines = "\n".join(
+                line
+                for line in diff.diff.decode("utf-8").splitlines()
+                if line.startswith(("+", "-"))
+            )
+            diffs.append(f"{diff.a_path}\n{lines}")
+
+    return "\n".join(diffs)
+
+
 class PatchDiff:
    """
    Representation of code changes in a patch