Move spreadsheet operations to class

This commit is contained in:
Avram Lubkin 2023-06-15 09:57:40 -04:00 коммит произвёл Avram Lubkin
Родитель 91f113f15f
Коммит 1edd645781
3 изменённых файлов: 212 добавлений и 240 удалений

Просмотреть файл

@ -13,7 +13,7 @@ from comma.database.model import Distros, MonitoringSubjects
from comma.downstream import Downstream
from comma.upstream import Upstream
from comma.util import config
from comma.util.spreadsheet import export_commits, import_commits, update_commits
from comma.util.spreadsheet import Spreadsheet
from comma.util.symbols import Symbols
from comma.util.tracking import get_linux_repo
@ -87,12 +87,11 @@ def main(args: Optional[Sequence[str]] = None):
run(options)
if options.subcommand == "spreadsheet":
if args.import_commits:
import_commits(args.in_file)
spreadsheet = Spreadsheet(config, DatabaseDriver())
if args.export_commits:
export_commits(args.in_file, args.out_file)
spreadsheet.export_commits(args.in_file, args.out_file)
if args.update_commits:
update_commits(args.in_file, args.out_file)
spreadsheet.update_commits(args.in_file, args.out_file)
if __name__ == "__main__":

Просмотреть файл

@ -83,12 +83,6 @@ def get_spreadsheet_parser():
parser = ArgumentParser(
"spreadsheet", description="Export to Excel spreadsheet", parents=BASE_PARSERS.values()
)
parser.add_argument(
"-i",
"--import-commits",
action="store_true",
help="Import commits from spreadsheet into database.",
)
parser.add_argument(
"-e",
"--export-commits",

Просмотреть файл

@ -8,8 +8,9 @@ import logging
import re
import sys
from datetime import datetime
from functools import cached_property
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple
from typing import Any, Dict, List, Tuple
import git
import openpyxl
@ -18,45 +19,13 @@ from openpyxl.cell.cell import Cell
from openpyxl.workbook.workbook import Workbook
from openpyxl.worksheet.worksheet import Worksheet
from comma.database.driver import DatabaseDriver
from comma.database.model import Distros, MonitoringSubjects, PatchData
from comma.util import config, tracking
from comma.util.tracking import get_filenames, get_linux_repo
LOGGER = logging.getLogger(__name__)
def get_db_commits() -> Dict[str, int]:
    """Return a mapping of commit hash to patch ID from the 'PatchData' table.

    Rows whose ``affectedFilenames`` contain ``fs/cifs`` are left out.
    """
    # Exclude ~1000 CIFS patches.
    not_cifs = ~PatchData.affectedFilenames.like("%fs/cifs%")
    with DatabaseDriver.get_session() as session:  # type: sqlalchemy.orm.session.Session
        rows = session.query(PatchData.commitID, PatchData.patchID).filter(not_cifs)
        return dict(rows)
def get_workbook(in_file: str) -> Tuple[Workbook, Worksheet]:
    """Load the spreadsheet at ``in_file`` and locate its 'git log' worksheet.

    The first pivot table on the 'Pivot' worksheet is flagged to refresh when
    the file is next opened (per the original note, this keeps the spreadsheet
    from crashing).

    Returns:
        A ``(workbook, worksheet)`` pair, the worksheet being 'git log'.

    Exits the process with status 1 (after logging an error) when ``in_file``
    does not exist.
    """
    source = Path(in_file)
    if not source.exists():
        LOGGER.error("The file %s does not exist", in_file)
        sys.exit(1)

    book = openpyxl.load_workbook(filename=in_file)

    # Force refresh of pivot table in the "Pivot" worksheet.
    LOGGER.debug("Finding worksheet named 'Pivot'...")
    pivot_sheet = book["Pivot"]
    first_pivot = pivot_sheet._pivots[0]  # pylint: disable=protected-access
    first_pivot.cache.refreshOnLoad = True

    # The worksheet is manually named "git log".
    LOGGER.debug("Finding worksheet named 'git log'...")
    git_log = book["git log"]
    return book, git_log
def get_column(worksheet: Worksheet, name: str) -> Cell:
"""Gets the header cell for the given column name.
@ -70,209 +39,219 @@ def get_column(worksheet: Worksheet, name: str) -> Cell:
return next(cell for cell in worksheet[1] if cell.value == name)
def get_wb_commits(worksheet: Worksheet) -> Set[str]:
    """Return the set of non-empty values found in the 'Commit ID' column."""
    letter = get_column(worksheet, "Commit ID").column_letter
    found = set()
    # The first cell of the column is the header; empty cells hold None.
    for cell in worksheet[letter][1:]:
        if cell.value is not None:
            found.add(cell.value)
    return found
def import_commits(in_file: str) -> None:
    """Add commits from the spreadsheet to the database (currently unsupported).

    The intent is to track commits that were added to the spreadsheet by hand
    but were not found by CommA's upstream monitoring logic. For now the
    function only logs an error naming ``in_file`` and exits with status 1.
    """
    LOGGER.error("Importing is not supported at this time! filename: %s", in_file)
    sys.exit(1)

    # TODO (Issue 55): Implement the import from the spreadsheet to the
    # database. Adding commits to the database may therefore affect
    # untracked paths. Sketch of the intended implementation:
    #
    # from comma.upstream import process_commits
    # LOGGER.info(f"Importing commits from spreadsheet '{in_file}'...")
    # workbook, worksheet = get_workbook(in_file)
    # wb_commits = get_wb_commits(worksheet)
    # db_commits = get_db_commits()
    # missing_commits = wb_commits - db_commits.keys()
    # LOGGER.info(f"Adding {len(missing_commits)} commits to database...")
    # process_commits(commit_ids=missing_commits, add_to_database=True)
    # LOGGER.info("Finished importing!")
def include_commit(sha: str, repo: tracking.Repo, base_commit: git.Commit) -> bool:
    """Decide whether the commit ``sha`` should be exported.

    A commit is rejected when ``sha`` is ``None``, when the repo cannot
    resolve it, when ``base_commit`` is given and is not an ancestor of it,
    or when any file it touches lives under ``tools/hv/``.
    """
    # Empty values show up when a blank cell's value is passed in.
    if sha is None:
        LOGGER.warning("Given SHA was 'None'!")
        return False

    # The commit has to exist in the repo.
    try:
        candidate = repo.commit(sha)
    except ValueError:
        LOGGER.warning("Commit '%s' not in repo!", sha)
        return False

    # Anything that does not descend from the chosen base is too old.
    if base_commit and not repo.is_ancestor(base_commit, candidate):
        LOGGER.debug("Commit '%s' is too old!", sha)
        return False

    # Commits to the tools directory are not exported.
    for path in tracking.get_filenames(candidate):
        if path.startswith("tools/hv/"):
            LOGGER.debug("Commit '%s' is in 'tools/hv/'!", sha)
            return False

    return True
def get_release(sha: str, repo: tracking.Repo) -> str:
    """Get the v5.7 from v5.7-rc1-2-gc81992e7f.

    Asks Git for the name of the first tag that contains ``sha`` and trims it
    down to the leading ``vX.Y`` part.

    Returns:
        The release string, or ``"N/A"`` when Git cannot describe the commit
        or the description contains no ``v...`` component.
    """
    try:
        # NOTE: This must be ordered "--contains <SHA>" for Git.
        tag = repo.git.describe("--contains", sha)
    except git.GitCommandError:
        return "N/A"

    # The description may carry "-rcN", "~N" or "^N" suffixes (an exactly
    # tagged commit is reported as e.g. "v5.7^0"), so stop at the first of
    # '-', '~' or '^' -- or at the end of the string.
    # Use "v[^-~^]+(-rc[0-9]+)?" to include -rcX  # pylint: disable=wrong-spelling-in-comment
    match = re.search(r"v[^-~^]*", tag)
    # A description without any "v" used to crash on None[1]; report it as unknown.
    return match[0] if match else "N/A"
def create_commit_row(sha: str, repo: tracking.Repo, worksheet: Worksheet) -> Dict[str, Any]:
    """Build a worksheet row holding the commit's SHA, date, release and title.

    Returns:
        A dict keyed by column letter, suitable for ``worksheet.append``.
        Only the 'Commit ID', 'Date', 'Release' and 'Commit Title' columns are
        filled in; the title is cut to at most 120 characters.
    """
    commit = repo.commit(sha)

    # TODO (Issue 40): Some (but not all) of this info is available in the
    # database, so if we add the release to the database we can skip using
    # the commit here.
    authored_on = datetime.utcfromtimestamp(commit.authored_date).date()
    subject = commit.message.split("\n", 1)[0]

    def letter_of(header: str) -> str:
        return get_column(worksheet, header).column_letter

    # The worksheet has additional columns with manually entered info, which
    # we cannot insert, so they are skipped.
    return {
        letter_of("Commit ID"): sha,
        letter_of("Date"): authored_on,
        letter_of("Release"): get_release(sha, repo),
        letter_of("Commit Title"): subject[:120],
    }
def export_commits(in_file: str, out_file: str) -> None:
    """Append database commits that the spreadsheet lacks and save the result.

    This lets us automatically update the spreadsheet with commits CommA
    found. Only the basic information available from the commit itself is
    written. The workbook is read from ``in_file`` and saved to ``out_file``.
    """
    workbook, worksheet = get_workbook(in_file)
    known = get_wb_commits(worksheet)
    db_commits = get_db_commits()
    repo = tracking.get_linux_repo(since=config.since)

    # Commits older than this tag are skipped when the tag is available.
    tag = "v4.15"
    base_commit = None
    if tag in repo.references:
        LOGGER.info("Skipping commits before tag '%s'!", tag)
        base_commit = repo.commit(tag)
    else:
        LOGGER.warning("Tag '%s' not in local repo, not limiting commits by age", tag)

    # Commits in the database, not in the workbook, that we want to include.
    missing = []
    for sha in db_commits.keys() - known:
        if include_commit(sha, repo, base_commit):
            missing.append(sha)

    # Each missing commit becomes a new row at the end of the worksheet.
    LOGGER.info("Exporting %d commits to %s", len(missing), out_file)
    for sha in missing:
        worksheet.append(create_commit_row(sha, repo, worksheet))

    workbook.save(out_file)
    LOGGER.info("Finished exporting!")
def get_distros() -> List[str]:
    """Return the IDs of the distros in the database, leaving out Debian ones."""
    with DatabaseDriver.get_session() as session:
        tracked = []
        for (distro,) in session.query(Distros.distroID):
            # TODO (Issue 51): Handle Debian.
            if distro.startswith("Debian"):
                continue
            tracked.append(distro)
        return tracked
def get_fixed_patches(commit: str, commits: Dict[str, int]) -> str:
    """Return the patches fixed by ``commit`` as a comma-separated string.

    ``commits`` maps commit hashes to patch IDs. ``None`` is returned when the
    patch record has no fixed patches.
    """
    with DatabaseDriver.get_session() as session:
        patch = session.query(PatchData).filter_by(patchID=commits[commit]).one()
        if not patch.fixedPatches:
            return None
        # The database separates entries with whitespace; the spreadsheet
        # wants commas.
        entries = patch.fixedPatches.split()
        return ", ".join(entries)
def get_revision(distro: str, commit: str, commits: Dict[str, int]) -> str:
    """Return the kernel revision of ``distro`` that includes ``commit``, or 'Absent'.

    ``commits`` maps commit hashes to patch IDs.
    """
    with DatabaseDriver.get_session() as session:
        # NOTE: For some distros (e.g. Ubuntu), we continually add new
        # revisions (Git tags) as they become available, so we need the max
        # ID, which is the most recent.
        newest = session.query(
            MonitoringSubjects,
            sqlalchemy.func.max(MonitoringSubjects.monitoringSubjectID),
        ).filter_by(distroID=distro)
        subject, _ = newest.one()

        # TODO (Issue 40): We could try to simplify this using the
        # monitoringSubject relationship on the PatchData table, but because
        # the database tracks what is missing, it becomes hard to state where
        # the patch is present.
        missing = subject.missingPatches.filter_by(patchID=commits[commit]).one_or_none()
        if missing is not None:
            return "Absent"
        return subject.revision
def get_cell(worksheet, name: str, row) -> Cell:
    """Return the cell at the intersection of the column headed ``name`` and ``row``."""
    letter = get_column(worksheet, name).column_letter
    coordinate = f"{letter}{row}"
    return worksheet[coordinate]
def update_commits(in_file: str, out_file: str) -> None:
    """Update each row with the 'Fixes' and distro information.

    The workbook is read from ``in_file`` and saved to ``out_file``. Rows whose
    commit is not in the database get "Unknown" in every distro column and
    keep their 'Fixes' cell untouched.

    Exits the process with status 1 (after logging an error) when the
    worksheet lacks a column for one of the tracked distros.
    """
    workbook, worksheet = get_workbook(in_file)
    commits = get_db_commits()
    distros = get_distros()

    # Every tracked distro needs a column before any row is touched.
    for distro in distros:
        try:
            get_column(worksheet, distro)
        except StopIteration:
            # Logger methods are lower-case; ``LOGGER.ERROR`` does not exist.
            LOGGER.error("No column with distro '%s', please fix spreadsheet!", distro)
            sys.exit(1)

    commit_column = get_column(worksheet, "Commit ID").column_letter

    for commit_cell in worksheet[commit_column][1:]:  # Skip the header row
        commit = commit_cell.value
        if commit is None:
            continue  # Ignore empty rows.

        known = commit in commits

        # Update 'Fixes' column.
        if known:
            get_cell(worksheet, "Fixes", commit_cell.row).value = get_fixed_patches(commit, commits)

        # Update all distro columns.
        for distro in distros:
            get_cell(worksheet, distro, commit_cell.row).value = (
                get_revision(distro, commit, commits) if known else "Unknown"
            )

    workbook.save(out_file)
    LOGGER.info("Finished updating!")


def get_workbook(in_file: str) -> Tuple[Workbook, Worksheet]:
    """Open the spreadsheet and return it and the 'git log' worksheet.

    Also fix the pivot table so the spreadsheet doesn't crash.

    Exits the process with status 1 (after logging an error) when ``in_file``
    does not exist.
    """
    if not Path(in_file).exists():
        LOGGER.error("The file %s does not exist", in_file)
        sys.exit(1)

    workbook = openpyxl.load_workbook(filename=in_file)

    # Force refresh of pivot table in the "Pivot" worksheet.
    LOGGER.debug("Finding worksheet named 'Pivot'...")
    pivot = workbook["Pivot"]._pivots[0]  # pylint: disable=protected-access
    pivot.cache.refreshOnLoad = True

    # The worksheet is manually named "git log".
    LOGGER.debug("Finding worksheet named 'git log'...")
    worksheet = workbook["git log"]

    return (workbook, worksheet)


class Spreadsheet:
    """
    Parent object for spreadsheet operations

    Holds the configuration and database handle used to export commits to,
    and update commits in, the tracking spreadsheet.
    """

    def __init__(self, config, database) -> None:
        # Configuration object; only its ``since`` attribute is read here.
        self.config = config
        # Database handle; must provide ``get_session()`` as a context manager.
        self.database = database

    @cached_property
    def repo(self):
        """
        Get repo when first accessed
        """
        return get_linux_repo(since=self.config.since)

    def get_db_commits(self) -> Dict[str, int]:
        """Query the 'PatchData' table for all commit hashes and IDs."""
        with self.database.get_session() as session:  # type: sqlalchemy.orm.session.Session
            return dict(
                session.query(PatchData.commitID, PatchData.patchID).filter(
                    # Exclude ~1000 CIFS patches.
                    ~PatchData.affectedFilenames.like("%fs/cifs%")
                )
            )

    def include_commit(self, sha: str, base_commit: git.Commit) -> bool:
        """Determine if we should export the commit.

        A commit is rejected when ``sha`` is ``None``, when the repo cannot
        resolve it, when ``base_commit`` is given and is not an ancestor of
        it, or when any file it touches lives under ``tools/hv/``.
        """
        # Skip empty values (such as if cell.value was passed).
        if sha is None:
            LOGGER.warning("Given SHA was 'None'!")
            return False

        # Skip commits that are not in the repo.
        try:
            commit = self.repo.commit(sha)
        except ValueError:
            LOGGER.warning("Commit '%s' not in repo!", sha)
            return False

        # Skip commits before the chosen base.
        if base_commit and not self.repo.is_ancestor(base_commit, commit):
            LOGGER.debug("Commit '%s' is too old!", sha)
            return False

        # Skip commits to tools.
        filenames = get_filenames(commit)
        if any(f.startswith("tools/hv/") for f in filenames):
            LOGGER.debug("Commit '%s' is in 'tools/hv/'!", sha)
            return False

        return True

    def get_release(self, sha: str) -> str:
        """Get the v5.7 from v5.7-rc1-2-gc81992e7f.

        Returns ``"N/A"`` when Git cannot describe the commit or the
        description contains no ``v...`` component.
        """
        try:
            # NOTE: This must be ordered "--contains <SHA>" for Git.
            tag = self.repo.git.describe("--contains", sha)
        except git.GitCommandError:
            return "N/A"

        # The description may carry "-rcN", "~N" or "^N" suffixes (an exactly
        # tagged commit is reported as e.g. "v5.7^0"), so stop at the first of
        # '-', '~' or '^' -- or at the end of the string.
        # Use "v[^-~^]+(-rc[0-9]+)?" to include -rcX  # pylint: disable=wrong-spelling-in-comment
        match = re.search(r"v[^-~^]*", tag)
        # A description without any "v" used to crash on None[1].
        return match[0] if match else "N/A"

    def create_commit_row(self, sha: str, worksheet: Worksheet) -> Dict[str, Any]:
        """Create a row with the commit's SHA, date, release and title.

        Returns a dict keyed by column letter, suitable for
        ``worksheet.append``. The title is cut to at most 120 characters.
        """
        commit = self.repo.commit(sha)
        # TODO (Issue 40): Some (but not all) of this info is available in the
        # database, so if we add the release to the database we can skip
        # using the commit here.
        date = datetime.utcfromtimestamp(commit.authored_date).date()
        title = commit.message.split("\n")[0]

        # The worksheet has additional columns with manually entered
        # info, which we can't insert, so we skip them.
        def get_letter(name: str) -> str:
            return get_column(worksheet, name).column_letter

        return {
            get_letter("Commit ID"): sha,
            get_letter("Date"): date,
            get_letter("Release"): self.get_release(sha),
            get_letter("Commit Title"): title[:120],
        }

    def export_commits(self, in_file: str, out_file: str) -> None:
        """This adds commits from the database to the spreadsheet.

        This lets us automatically update the spreadsheet by adding commits which CommA found.
        It adds the basic information available from the commit. The workbook is read from
        ``in_file`` and saved to ``out_file``.
        """
        workbook, worksheet = get_workbook(in_file)

        # Every non-empty value below the header of the 'Commit ID' column.
        column = get_column(worksheet, "Commit ID").column_letter
        wb_commits = {cell.value for cell in worksheet[column][1:] if cell.value is not None}

        # Collect the commits in the database and not in the workbook, but that we want to include.
        db_commits = self.get_db_commits()
        tag = "v4.15"
        if tag in self.repo.references:
            LOGGER.info("Skipping commits before tag '%s'!", tag)
            base_commit = self.repo.commit(tag)
        else:
            LOGGER.warning("Tag '%s' not in local repo, not limiting commits by age", tag)
            base_commit = None

        missing_commits = [
            commit
            for commit in db_commits.keys() - wb_commits
            if self.include_commit(commit, base_commit)
        ]

        # Append each missing commit as a new row to the commits worksheet.
        LOGGER.info("Exporting %d commits to %s", len(missing_commits), out_file)
        for commit in missing_commits:
            worksheet.append(self.create_commit_row(commit, worksheet))

        workbook.save(out_file)
        LOGGER.info("Finished exporting!")

    def get_distros(self) -> List[str]:
        """Collect the distros we're tracking in the database."""
        with self.database.get_session() as session:
            # TODO (Issue 51): Handle Debian.
            return [
                distro
                for (distro,) in session.query(Distros.distroID)
                if not distro.startswith("Debian")
            ]

    def get_fixed_patches(self, commit: str, commits: Dict[str, int]) -> str:
        """Get the fixed patches for the given commit.

        ``commits`` maps commit hashes to patch IDs. Returns ``None`` when the
        patch record has no fixed patches.
        """
        with self.database.get_session() as session:
            patch = session.query(PatchData).filter_by(patchID=commits[commit]).one()
            # The database stores these separated by a space, but we want commas in the spreadsheet.
            return ", ".join(patch.fixedPatches.split()) if patch.fixedPatches else None

    def get_revision(self, distro: str, commit: str, commits: Dict[str, int]) -> str:
        """Get the kernel revision which includes commit or 'Absent'."""
        # NOTE: For some distros (e.g. Ubuntu), we continually add new revisions (Git tags) as they
        # become available, so we need the max ID, which is the most recent.
        with self.database.get_session() as session:
            subject, _ = (
                session.query(
                    MonitoringSubjects,
                    sqlalchemy.func.max(MonitoringSubjects.monitoringSubjectID),
                )
                .filter_by(distroID=distro)
                .one()
            )
            # TODO (Issue 40): We could try to simplify this using the monitoringSubject
            # relationship on the PatchData table, but because the database tracks what's missing,
            # it becomes hard to state where the patch is present.
            missing_patch = subject.missingPatches.filter_by(patchID=commits[commit]).one_or_none()
            return subject.revision if missing_patch is None else "Absent"

    def update_commits(self, in_file: str, out_file: str) -> None:
        """Update each row with the 'Fixes' and distro information.

        The workbook is read from ``in_file`` and saved to ``out_file``. Rows
        whose commit is not in the database get "Unknown" in every distro
        column and keep their 'Fixes' cell untouched.

        Exits the process with status 1 (after logging an error) when the
        worksheet lacks a column for one of the tracked distros.
        """
        workbook, worksheet = get_workbook(in_file)
        commits = self.get_db_commits()
        distros = self.get_distros()

        # Every tracked distro needs a column before any row is touched.
        for distro in distros:
            try:
                get_column(worksheet, distro)
            except StopIteration:
                LOGGER.error("No column with distro '%s', please fix spreadsheet!", distro)
                sys.exit(1)

        commit_column = get_column(worksheet, "Commit ID").column_letter

        for commit_cell in worksheet[commit_column][1:]:  # Skip the header row
            commit = commit_cell.value
            if commit is None:
                continue  # Ignore empty rows.

            known = commit in commits

            # Update 'Fixes' column.
            if known:
                get_cell(worksheet, "Fixes", commit_cell.row).value = self.get_fixed_patches(
                    commit, commits
                )

            # Update all distro columns.
            for distro in distros:
                get_cell(worksheet, distro, commit_cell.row).value = (
                    self.get_revision(distro, commit, commits) if known else "Unknown"
                )

        workbook.save(out_file)
        LOGGER.info("Finished updating!")