Refactor downstream matcher
This commit is contained in:
Родитель
eda435ec0c
Коммит
825f6966b2
|
@ -1,109 +1,100 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Functions to compare commits for similarities
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
import os
|
||||
from typing import Iterable, List
|
||||
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from comma.database.model import PatchData
|
||||
from comma.util.patch_diffs import PatchDiffs
|
||||
|
||||
# Confidence weights
|
||||
AUTHOR_WEIGHT = 0.2
|
||||
AUTHOR_DATE_WEIGHT = 0.01 # This addresses some edge cases of identical other fields
|
||||
COMMIT_DATE_WEIGHT = 0.01 # This addresses some edge cases of identical other fields
|
||||
DESCRIPTION_WEIGHT = 0.1
|
||||
FILENAMES_WEIGHT = 0.2
|
||||
SUBJECT_WEIGHT = 0.48
|
||||
|
||||
CONFIDENCE_THRESHOLD = 0.75 # Threshold that we must hit to return a match
|
||||
|
||||
|
||||
def calculate_filenames_confidence(
    downstream_filepaths: Iterable[str], upstream_filepaths: Iterable[str]
) -> float:
    """
    Calculate filenames confidence

    Roughly the percent of upstream filepaths present in downstream filepaths.

    Each upstream filepath scores 0.5 when some downstream filepath has the
    same file name, plus up to another 0.5 from fuzzy-matching the two
    directory parts. The best score found for each upstream filepath is
    averaged over all upstream filepaths, so the result lies in 0.0-1.0.

    Both arguments may be any iterable, including one-shot generators. A side
    that holds no non-empty entry (for example the ``[""]`` produced by
    ``"".split(" ")``) is treated as empty.
    """
    # Materialize once: the inputs are compared, iterated and measured below,
    # which a one-shot iterable would not survive.
    downstream = list(downstream_filepaths)
    upstream = list(upstream_filepaths)

    # If they are the same, for example empty, confidence is high
    if downstream == upstream:
        return 1.0

    # If only one is empty, confidence is low. Emptiness is judged on the
    # entries, not on the container, because splitting an empty string on
    # spaces yields [""] rather than [].
    downstream_is_empty = not any(downstream)
    upstream_is_empty = not any(upstream)
    if downstream_is_empty or upstream_is_empty:
        return 1.0 if downstream_is_empty and upstream_is_empty else 0.0

    downstream_file_components = [os.path.split(filepath) for filepath in downstream]

    total_filepaths_match = 0.0
    for filepath in upstream:
        upstream_path, upstream_name = os.path.split(filepath)
        # Find best matching downstream filepath:
        # 0.5 for matching filename, the paths are fuzzymatched and scaled
        # 0.0-0.5 for the remaining match. No same-named file scores 0.0.
        total_filepaths_match += max(
            (
                0.5 + fuzz.partial_ratio(upstream_path, downstream_path) / 200.0
                for downstream_path, downstream_name in downstream_file_components
                if downstream_name == upstream_name
            ),
            default=0.0,
        )

    # upstream is guaranteed non-empty here, so the division is safe.
    return total_filepaths_match / len(upstream)
|
||||
|
||||
|
||||
def patch_matches(downstream_patches: List[PatchData], upstream: PatchData) -> bool:
|
||||
"""Check if 'upstream' has an equivalent in 'downstream_patches'."""
|
||||
# Confidence weights
|
||||
author_weight = 0.2
|
||||
subject_weight = 0.48
|
||||
description_weight = 0.1
|
||||
filenames_weight = 0.2
|
||||
author_date_weight = 0.01 # This addresses some edge cases of identical other fields
|
||||
commit_date_weight = 0.01 # This addresses some edge cases of identical other fields
|
||||
|
||||
# Threshold that we must hit to return a match
|
||||
threshold = 0.75
|
||||
|
||||
# Preprocessing for matching filenames
|
||||
upstream_filepaths = upstream.affectedFilenames.split(" ")
|
||||
upstream_file_components = [
|
||||
_get_filepath_components(filepath) for filepath in upstream_filepaths
|
||||
]
|
||||
|
||||
logging.debug("Upstream missing patch, %s", upstream.commitID)
|
||||
for downstream in downstream_patches:
|
||||
# Calculate confidence that our upstream patch matches this downstream patch
|
||||
|
||||
# Calculate filenames confidence, which is roughly the percent
|
||||
# of upstream filepaths present in downstream patch.
|
||||
if downstream.affectedFilenames == "" or upstream.affectedFilenames == "":
|
||||
filenames_confidence = (
|
||||
1.0 if (downstream.affectedFilenames == upstream.affectedFilenames) else 0.0
|
||||
)
|
||||
else:
|
||||
total_filepaths_match = 0
|
||||
downstream_filepaths = downstream.affectedFilenames.split(" ")
|
||||
downstream_file_components = [
|
||||
_get_filepath_components(filepath) for filepath in downstream_filepaths
|
||||
]
|
||||
|
||||
for upstream_path, upstream_name in upstream_file_components:
|
||||
max_match = 0.0
|
||||
# Find best matching downstream filepath
|
||||
for (
|
||||
downstream_path,
|
||||
downstream_name,
|
||||
) in downstream_file_components:
|
||||
if upstream_name == downstream_name:
|
||||
# 0.5 for matching filename, the paths are
|
||||
# fuzzymatched scaled 0.0-0.5 for remaining
|
||||
# match
|
||||
match = 0.5 + (fuzz.partial_ratio(upstream_path, downstream_path) / 200.0)
|
||||
else:
|
||||
match = 0.0
|
||||
|
||||
if match > max_match:
|
||||
max_match = match
|
||||
total_filepaths_match += max_match
|
||||
|
||||
filenames_confidence = float(total_filepaths_match) / len(upstream_filepaths)
|
||||
|
||||
author_confidence = fuzz.token_set_ratio(upstream.author, downstream.author) / 100.0
|
||||
subject_confidence = fuzz.partial_ratio(upstream.subject, downstream.subject) / 100.0
|
||||
# Temporarily for description only checking exact string is in
|
||||
if upstream.description == "":
|
||||
description_confidence = 1.0 if downstream.description == "" else 0.0
|
||||
else:
|
||||
description_confidence = 1.0 if upstream.description in downstream.description else 0.0
|
||||
author_date_confidence = 1.0 if upstream.authorTime == downstream.authorTime else 0.0
|
||||
commit_date_confidence = 1.0 if upstream.commitTime == downstream.commitTime else 0.0
|
||||
|
||||
confidence = (
|
||||
author_weight * author_confidence
|
||||
+ subject_weight * subject_confidence
|
||||
+ description_weight * description_confidence
|
||||
+ filenames_weight * filenames_confidence
|
||||
+ author_date_confidence * author_date_weight
|
||||
+ commit_date_confidence * commit_date_weight
|
||||
# Temporarily for description only checking exact string is in
|
||||
description_confidence = 1.0 if upstream.description in downstream.description else 0.0
|
||||
filenames_confidence = calculate_filenames_confidence(
|
||||
downstream.affectedFilenames.split(" "), upstream_filepaths
|
||||
)
|
||||
if confidence >= threshold:
|
||||
subject_confidence = fuzz.partial_ratio(upstream.subject, downstream.subject) / 100.0
|
||||
|
||||
if (
|
||||
AUTHOR_WEIGHT * author_confidence
|
||||
+ AUTHOR_DATE_WEIGHT * author_date_confidence
|
||||
+ COMMIT_DATE_WEIGHT * commit_date_confidence
|
||||
+ DESCRIPTION_WEIGHT * description_confidence
|
||||
+ FILENAMES_WEIGHT * filenames_confidence
|
||||
+ SUBJECT_WEIGHT * subject_confidence
|
||||
) >= CONFIDENCE_THRESHOLD:
|
||||
return True
|
||||
|
||||
# TODO just do this part?...
|
||||
# Check for code matching
|
||||
upstream_diffs = PatchDiffs(upstream.commitDiffs)
|
||||
for downstream in downstream_patches:
|
||||
downstream_diffs = PatchDiffs(downstream.commitDiffs)
|
||||
code_match_confidence = upstream_diffs.percent_present_in(downstream_diffs)
|
||||
if code_match_confidence > threshold:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _get_filepath_components(filepath):
|
||||
"""
|
||||
Splits filepath to return (path, filename)
|
||||
"""
|
||||
components = filepath.rsplit("/", 1)
|
||||
if len(components) == 1:
|
||||
return (None, components[0])
|
||||
return (components[0], components[1])
|
||||
return any(
|
||||
upstream_diffs.percent_present_in(PatchDiffs(downstream.commitDiffs)) > CONFIDENCE_THRESHOLD
|
||||
for downstream in downstream_patches
|
||||
)
|
||||
|
|
Загрузка…
Ссылка в новой задаче