Move upstream operations to class

Avram Lubkin 2023-06-14 17:58:11 -04:00 committed by Avram Lubkin
Parent ae8dde7186
Commit 393a70ed47
6 changed files with 124 additions and 128 deletions

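A minimal sketch of how the call site changes with this commit, using only names that appear in the diff below (the old process_commits keyword arguments come from the removed line in the CLI module):

from comma.database.driver import DatabaseDriver
from comma.upstream import Upstream
from comma.util import config

# Old entry point: a module-level function driven by keyword arguments
# process_commits(add_to_database=True, since=config.since)

# New entry point: an Upstream object that owns its config and database driver
Upstream(config, DatabaseDriver()).process_commits()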
View file

@@ -11,7 +11,7 @@ from comma.cli.parser import parse_args
from comma.database.driver import DatabaseDriver
from comma.database.model import Distros, MonitoringSubjects
from comma.downstream import Downstream
from comma.upstream import process_commits
from comma.upstream import Upstream
from comma.util import config
from comma.util.spreadsheet import export_commits, import_commits, update_commits
from comma.util.symbols import get_missing_commits
@@ -39,7 +39,7 @@ def run(options):
if options.upstream:
LOGGER.info("Begin monitoring upstream")
process_commits(add_to_database=True, since=config.since)
Upstream(config, DatabaseDriver()).process_commits()
LOGGER.info("Finishing monitoring upstream")
if options.downstream:

View file

@@ -4,10 +4,18 @@
ORM models for database objects
"""
from datetime import datetime
import git
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from comma.util import format_diffs
from comma.util.tracking import get_filenames
IGNORED_IN_CMSG = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:"
Base = declarative_base()
@@ -46,6 +54,46 @@ class PatchData(Base):
lazy="dynamic",
)
@classmethod
def create(cls, commit: git.Commit, paths) -> "PatchData":
"""
Create patch object from a commit object
"""
patch = cls(
commitID=commit.hexsha,
author=commit.author.name,
authorEmail=commit.author.email,
authorTime=datetime.utcfromtimestamp(commit.authored_date),
commitTime=datetime.utcfromtimestamp(commit.committed_date),
)
description = []
fixed_patches = []
for num, line in enumerate(commit.message.splitlines()):
line = line.strip() # pylint: disable=redefined-loop-name
if not num:
patch.subject = line
continue
if line.lower().startswith(IGNORED_IN_CMSG):
continue
description.append(line)
# Check if this patch fixes other patches
if line.lower().startswith("fixes:"):
words = line.split(" ")
if len(words) > 1:
fixed_patches.append(words[1])
patch.description = "\n".join(description)
patch.fixedPatches = " ".join(fixed_patches) # e.g. "SHA1 SHA2 SHA3"
patch.affectedFilenames = " ".join(get_filenames(commit))
patch.commitDiffs = format_diffs(commit, paths)
return patch
class PatchDataMeta(Base):
"""

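The new PatchData.create classmethod is consumed by both the upstream and downstream code paths later in this diff. A rough standalone usage sketch, assuming GitPython is available; the repository path and tracked paths below are hypothetical, not the project's real configuration:

import git

from comma.database.model import PatchData

repo = git.Repo("/path/to/linux")        # hypothetical local clone
paths = ["drivers/hv", "tools/hv"]       # illustrative tracked paths
commit = repo.commit("HEAD")             # any ordinary (non-merge, non-root) commit
patch = PatchData.create(commit, paths)
print(patch.commitID, patch.subject)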
View file

@@ -15,7 +15,6 @@ from comma.database.model import (
PatchData,
)
from comma.downstream.matcher import patch_matches
from comma.upstream import process_commits
from comma.util.tracking import get_linux_repo
@@ -186,6 +185,8 @@ class Downstream:
Attempt to determine which patches are missing from a list of missing cherries
"""
paths = self.repo.get_tracked_paths(self.config.sections)
with self.database.get_session() as session:
patches = (
session.query(PatchData)
@@ -199,7 +200,19 @@
# Get the downstream commits for this revision (these are distinct from upstream because
# they have been cherry-picked). This is slow but necessary!
downstream_patches = process_commits(revision=reference, since=earliest_commit_date)
LOGGER.info("Determining downstream commits from tracked files")
# We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits
downstream_patches = tuple(
PatchData.create(commit, paths)
for commit in self.repo.iter_commits(
rev=reference,
paths=paths,
min_parents=1,
max_parents=1,
since=earliest_commit_date,
)
)
# Double check the missing cherries using our fuzzy algorithm.
LOGGER.info("Starting confidence matching for %d upstream patches...", len(patches))

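As an aside on the min-parents/max-parents comment above: GitPython forwards iter_commits keyword arguments to git rev-list flags, so the loop only visits ordinary single-parent commits. A small illustration with hypothetical values (not the project's actual revision, paths, or date):

import git

repo = git.Repo("/path/to/linux")   # hypothetical local clone
# Roughly: git rev-list --min-parents=1 --max-parents=1 --since=<date> origin/master -- drivers/hv
commits = repo.iter_commits(
    rev="origin/master",            # revision to walk
    paths=["drivers/hv"],           # illustrative tracked path
    min_parents=1,                  # skip root and graft commits, which have no parent
    max_parents=1,                  # skip merge commits, which have two or more parents
    since="2023-01-01",             # only commits after this date
)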
View file

@@ -6,7 +6,7 @@ Functions to compare commits for similarities
import logging
import os
from typing import Iterable, List
from typing import Iterable
from fuzzywuzzy import fuzz
@@ -64,7 +64,7 @@ def calculate_filenames_confidence(
return total_filepaths_match / len(upstream_filepaths)
def patch_matches(downstream_patches: List[PatchData], upstream: PatchData) -> bool:
def patch_matches(downstream_patches: Iterable[PatchData], upstream: PatchData) -> bool:
"""Check if 'upstream' has an equivalent in 'downstream_patches'."""
# Preprocessing for matching filenames

View file

@@ -5,144 +5,59 @@ Functions for parsing commit objects into patch objects
"""
import logging
from datetime import datetime
from typing import List, Optional, Set
from functools import cached_property
from comma.database.driver import DatabaseDriver
from comma.database.model import PatchData
from comma.util import config
from comma.util.tracking import get_filenames, get_linux_repo
from comma.util.tracking import get_linux_repo
IGNORED_FIELDS = "reported-by:", "signed-off-by:", "reviewed-by:", "acked-by:", "cc:"
LOGGER = logging.getLogger(__name__)
def format_diffs(commit, paths):
class Upstream:
"""
Format diffs from commit object into string
Parent object for upstream operations
"""
diffs = []
# We are ignoring merges so all commits should have a single parent
for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True):
if diff.a_path is not None:
# The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..."
lines = "\n".join(
line
for line in diff.diff.decode("utf-8").splitlines()
if line.startswith(("+", "-"))
)
diffs.append(f"{diff.a_path}\n{lines}")
def __init__(self, config, database) -> None:
self.config = config
self.database = database
return "\n".join(diffs)
@cached_property
def repo(self):
"""
Get repo when first accessed
"""
return get_linux_repo(since=self.config.since)
def process_commits(self):
"""
Generate patches for commits affecting tracked paths
"""
def create_patch(commit, paths) -> PatchData:
"""
Create patch object from a commit object
"""
paths = self.repo.get_tracked_paths(self.config.sections)
added = 0
total = 0
patch: PatchData = PatchData(
commitID=commit.hexsha,
author=commit.author.name,
authorEmail=commit.author.email,
authorTime=datetime.utcfromtimestamp(commit.authored_date),
commitTime=datetime.utcfromtimestamp(commit.committed_date),
)
description = []
fixed_patches = []
for num, line in enumerate(commit.message.splitlines()):
line = line.strip() # pylint: disable=redefined-loop-name
if not num:
patch.subject = line
continue
if line.lower().startswith(IGNORED_FIELDS):
continue
description.append(line)
# Check if this patch fixes other patches
if line.lower().startswith("fixes:"):
words = line.split(" ")
if len(words) > 1:
fixed_patches.append(words[1])
patch.description = "\n".join(description)
patch.fixedPatches = " ".join(fixed_patches) # e.g. "SHA1 SHA2 SHA3"
patch.affectedFilenames = " ".join(get_filenames(commit))
patch.commitDiffs = format_diffs(commit, paths)
return patch
def process_commits(
commit_ids: Optional[Set[str]] = None,
revision: str = "origin/master",
add_to_database: bool = False,
since: str = config.since,
) -> List[PatchData]:
"""
Look at all commits in the given repo and handle based on distro.
commit_ids: Set of commits to process
revision: revision we want to see the commits of, or None
add_to_database: whether or not to add to database (side-effect)
since: if provided, will only process commits after this commit
"""
repo = get_linux_repo(since=since)
paths = repo.get_tracked_paths(config.sections)
if commit_ids is None:
# We use `--min-parents=1 --max-parents=1` to avoid both merges and graft commits.
LOGGER.info("Determining commits from tracked files")
commits = repo.iter_commits(
rev=revision,
LOGGER.info("Determining upstream commits from tracked files")
for commit in self.repo.iter_commits(
rev="origin/master",
paths=paths,
min_parents=1,
max_parents=1,
since=since,
)
else:
# If given a set of commit SHAs, get the commit objects.
commits = []
for id_ in commit_ids:
try:
commits.append(repo.commit(id_))
except ValueError:
LOGGER.warning("Commit '%s' does not exist in the repo! Skipping...", id_)
since=self.config.since,
):
total += 1
with self.database.get_session() as session:
# See if commit ID is in database
if (
session.query(PatchData.commitID)
.filter_by(commitID=commit.hexsha)
.one_or_none()
is None
):
session.add(PatchData.create(commit, paths))
added += 1
LOGGER.info("Starting commit processing...")
all_patches = []
num_patches_added = 0
for num, commit in enumerate(commits, 1):
# Skip root (initial) commit since it should always be present
# TODO (Issue 54): This can probably be removed
if commit.parents:
LOGGER.debug("Parsing commit %s", commit.hexsha)
patch: PatchData = create_patch(commit, paths)
if add_to_database:
with DatabaseDriver.get_session() as session:
if (
session.query(PatchData.commitID)
.filter_by(commitID=patch.commitID)
.one_or_none()
is None
):
session.add(patch)
num_patches_added += 1
else:
all_patches.append(patch)
if not num % 250:
LOGGER.debug(" %d commits processed...", num)
if add_to_database:
LOGGER.info("%d patches added to database.", num_patches_added)
return all_patches
LOGGER.info("%d of %d patches added to database.", added, total)

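One design note on the new class: repo is a functools.cached_property, so get_linux_repo() runs only on first access and the same repository object is reused for every later call. A tiny generic illustration of that behavior (not project code):

from functools import cached_property

class Lazy:
    @cached_property
    def repo(self):
        print("opening repository once")
        return object()      # stand-in for a real repository handle

lazy = Lazy()
first = lazy.repo     # prints the message and caches the result
second = lazy.repo    # reuses the cached value; nothing is printed
assert first is second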
View file

@@ -5,6 +5,26 @@ Utility functions and classes
"""
def format_diffs(commit, paths):
"""
Format diffs from commit object into string
"""
diffs = []
# We are ignoring merges so all commits should have a single parent
for diff in commit.tree.diff(commit.parents[0], paths=paths, create_patch=True):
if diff.a_path is not None:
# The patch commit diffs are stored as "(filename1)\n(diff1)\n(filename2)\n(diff2)..."
lines = "\n".join(
line
for line in diff.diff.decode("utf-8").splitlines()
if line.startswith(("+", "-"))
)
diffs.append(f"{diff.a_path}\n{lines}")
return "\n".join(diffs)
class PatchDiff:
"""
Representation of code changes in a patch