diff --git a/comma/cli/__init__.py b/comma/cli/__init__.py index 5e5a8ca..cacb82f 100644 --- a/comma/cli/__init__.py +++ b/comma/cli/__init__.py @@ -11,7 +11,7 @@ from comma.cli.parser import parse_args from comma.database.driver import DatabaseDriver from comma.database.model import Distros, MonitoringSubjects from comma.downstream import Downstream -from comma.upstream import process_commits +from comma.upstream import Upstream from comma.util import config from comma.util.spreadsheet import export_commits, import_commits, update_commits from comma.util.symbols import get_missing_commits @@ -39,7 +39,7 @@ def run(options): if options.upstream: LOGGER.info("Begin monitoring upstream") - process_commits(add_to_database=True, since=config.since) + Upstream(config, DatabaseDriver()).process_commits() LOGGER.info("Finishing monitoring upstream") if options.downstream: diff --git a/comma/database/model.py b/comma/database/model.py index cb11514..172db73 100644 --- a/comma/database/model.py +++ b/comma/database/model.py @@ -4,10 +4,18 @@ ORM models for database objects """ +from datetime import datetime + +import git from sqlalchemy import Column, DateTime, ForeignKey, Integer, String from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship +from comma.util import format_diffs +from comma.util.tracking import get_filenames + + +IGNORED_IN_CMSG = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:" Base = declarative_base() @@ -46,6 +54,46 @@ class PatchData(Base): lazy="dynamic", ) + @classmethod + def create(cls, commit: git.Commit, paths) -> "PatchData": + """ + Create patch object from a commit object + """ + + patch = cls( + commitID=commit.hexsha, + author=commit.author.name, + authorEmail=commit.author.email, + authorTime=datetime.utcfromtimestamp(commit.authored_date), + commitTime=datetime.utcfromtimestamp(commit.committed_date), + ) + + description = [] + fixed_patches = [] + for num, line in enumerate(commit.message.splitlines()): + line = line.strip() # pylint: disable=redefined-loop-name + if not num: + patch.subject = line + continue + + if line.lower().startswith(IGNORED_IN_CMSG): + continue + + description.append(line) + + # Check if this patch fixes other patches + if line.lower().startswith("fixes:"): + words = line.split(" ") + if len(words) > 1: + fixed_patches.append(words[1]) + + patch.description = "\n".join(description) + patch.fixedPatches = " ".join(fixed_patches) # e.g. "SHA1 SHA2 SHA3" + patch.affectedFilenames = " ".join(get_filenames(commit)) + patch.commitDiffs = format_diffs(commit, paths) + + return patch + class PatchDataMeta(Base): """ diff --git a/comma/downstream/__init__.py b/comma/downstream/__init__.py index a1a83fa..097b850 100755 --- a/comma/downstream/__init__.py +++ b/comma/downstream/__init__.py @@ -15,7 +15,6 @@ from comma.database.model import ( PatchData, ) from comma.downstream.matcher import patch_matches -from comma.upstream import process_commits from comma.util.tracking import get_linux_repo @@ -186,6 +185,8 @@ class Downstream: Attempt to determine which patches are missing from a list of missing cherries """ + paths = self.repo.get_tracked_paths(self.config.sections) + with self.database.get_session() as session: patches = ( session.query(PatchData) @@ -199,7 +200,19 @@ class Downstream: # Get the downstream commits for this revision (these are distinct from upstream because # they have been cherry-picked). This is slow but necessary! - downstream_patches = process_commits(revision=reference, since=earliest_commit_date) + + LOGGER.info("Determining downstream commits from tracked files") + # We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits + downstream_patches = tuple( + PatchData.create(commit, paths) + for commit in self.repo.iter_commits( + rev=reference, + paths=paths, + min_parents=1, + max_parents=1, + since=earliest_commit_date, + ) + ) # Double check the missing cherries using our fuzzy algorithm. LOGGER.info("Starting confidence matching for %d upstream patches...", len(patches)) diff --git a/comma/downstream/matcher.py b/comma/downstream/matcher.py index 3d1f36c..e451cee 100755 --- a/comma/downstream/matcher.py +++ b/comma/downstream/matcher.py @@ -6,7 +6,7 @@ Functions to compare commits for similarities import logging import os -from typing import Iterable, List +from typing import Iterable from fuzzywuzzy import fuzz @@ -64,7 +64,7 @@ def calculate_filenames_confidence( return total_filepaths_match / len(upstream_filepaths) -def patch_matches(downstream_patches: List[PatchData], upstream: PatchData) -> bool: +def patch_matches(downstream_patches: Iterable[PatchData], upstream: PatchData) -> bool: """Check if 'upstream' has an equivalent in 'downstream_patches'.""" # Preprocessing for matching filenames diff --git a/comma/upstream.py b/comma/upstream.py index 5c54afa..d9b7faf 100755 --- a/comma/upstream.py +++ b/comma/upstream.py @@ -5,144 +5,59 @@ Functions for parsing commit objects into patch objects """ import logging -from datetime import datetime -from typing import List, Optional, Set +from functools import cached_property -from comma.database.driver import DatabaseDriver from comma.database.model import PatchData -from comma.util import config -from comma.util.tracking import get_filenames, get_linux_repo +from comma.util.tracking import get_linux_repo -IGNORED_FIELDS = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:" LOGGER = logging.getLogger(__name__) -def format_diffs(commit, paths): +class Upstream: """ - Format diffs from commit object into string + Parent object for downstream operations """ - diffs = [] - # We are ignoring merges so all commits should have a single parent - for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True): - if diff.a_path is not None: - # The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..." - lines = "\n".join( - line - for line in diff.diff.decode("utf-8").splitlines() - if line.startswith(("+", "-")) - ) - diffs.append(f"{diff.a_path}\n{lines}") + def __init__(self, config, database) -> None: + self.config = config + self.database = database - return "\n".join(diffs) + @cached_property + def repo(self): + """ + Get repo when first accessed + """ + return get_linux_repo(since=self.config.since) + def process_commits(self): + """ + Generate patches for commits affecting tracked paths + """ -def create_patch(commit, paths) -> PatchData: - """ - Create patch object from a commit object - """ + paths = self.repo.get_tracked_paths(self.config.sections) + added = 0 + total = 0 - patch: PatchData = PatchData( - commitID=commit.hexsha, - author=commit.author.name, - authorEmail=commit.author.email, - authorTime=datetime.utcfromtimestamp(commit.authored_date), - commitTime=datetime.utcfromtimestamp(commit.committed_date), - ) - - description = [] - fixed_patches = [] - for num, line in enumerate(commit.message.splitlines()): - line = line.strip() # pylint: disable=redefined-loop-name - if not num: - patch.subject = line - continue - - if line.lower().startswith(IGNORED_FIELDS): - continue - - description.append(line) - - # Check if this patch fixes other patches - if line.lower().startswith("fixes:"): - words = line.split(" ") - if len(words) > 1: - fixed_patches.append(words[1]) - - patch.description = "\n".join(description) - patch.fixedPatches = " ".join(fixed_patches) # e.g. "SHA1 SHA2 SHA3" - patch.affectedFilenames = " ".join(get_filenames(commit)) - patch.commitDiffs = format_diffs(commit, paths) - - return patch - - -def process_commits( - commit_ids: Optional[Set[str]] = None, - revision: str = "origin/master", - add_to_database: bool = False, - since: str = config.since, -) -> List[PatchData]: - """ - Look at all commits in the given repo and handle based on distro. - - commit_ids: Set of commits to process - revision: revision we want to see the commits of, or None - add_to_database: whether or not to add to database (side-effect) - since: if provided, will only process commits after this commit - """ - - repo = get_linux_repo(since=since) - paths = repo.get_tracked_paths(config.sections) - - if commit_ids is None: # We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits. - LOGGER.info("Determining commits from tracked files") - commits = repo.iter_commits( - rev=revision, + LOGGER.info("Determining upstream commits from tracked files") + for commit in self.repo.iter_commits( + rev="origin/master", paths=paths, min_parents=1, max_parents=1, - since=since, - ) - else: - # If given a set of commit SHAs, get the commit objects. - commits = [] - for id_ in commit_ids: - try: - commits.append(repo.commit(id_)) - except ValueError: - LOGGER.warning("Commit '%s' does not exist in the repo! Skipping...", id_) + since=self.config.since, + ): + total += 1 + with self.database.get_session() as session: + # See if commit ID is in database + if ( + session.query(PatchData.commitID) + .filter_by(commitID=commit.hexsha) + .one_or_none() + is None + ): + session.add(PatchData.create(commit, paths)) + added += 1 - LOGGER.info("Starting commit processing...") - - all_patches = [] - num_patches_added = 0 - for num, commit in enumerate(commits, 1): - # Skip root (initial) commit since it should always be present - # TODO (Issue 54): This can probably be removed - if commit.parents: - LOGGER.debug("Parsing commit %s", commit.hexsha) - patch: PatchData = create_patch(commit, paths) - - if add_to_database: - with DatabaseDriver.get_session() as session: - if ( - session.query(PatchData.commitID) - .filter_by(commitID=patch.commitID) - .one_or_none() - is None - ): - session.add(patch) - num_patches_added += 1 - else: - all_patches.append(patch) - - if not num % 250: - LOGGER.debug(" %d commits processed...", num) - - if add_to_database: - LOGGER.info("%d patches added to database.", num_patches_added) - - return all_patches + LOGGER.info("%d of %d patches added to database.", added, total) diff --git a/comma/util/__init__.py b/comma/util/__init__.py index 6876bdc..4273e80 100755 --- a/comma/util/__init__.py +++ b/comma/util/__init__.py @@ -5,6 +5,26 @@ Utility functions and classes """ +def format_diffs(commit, paths): + """ + Format diffs from commit object into string + """ + + diffs = [] + # We are ignoring merges so all commits should have a single parent + for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True): + if diff.a_path is not None: + # The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..." + lines = "\n".join( + line + for line in diff.diff.decode("utf-8").splitlines() + if line.startswith(("+", "-")) + ) + diffs.append(f"{diff.a_path}\n{lines}") + + return "\n".join(diffs) + + class PatchDiff: """ Representation of code changes in a patch