Refactor downstream matcher

2023-05-24 16:28:25 -04:00 · 2023-05-24 16:28:25 -04:00 · 825f6966b2
--- a/comma/downstream/matcher.py
+++ b/comma/downstream/matcher.py
@@ -1,109 +1,100 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+"""
+Functions to compare commits for similarities
+"""
+
 import logging
-from typing import List
+import os
+from typing import Iterable, List

 from fuzzywuzzy import fuzz

 from comma.database.model import PatchData
 from comma.util.patch_diffs import PatchDiffs

+# Confidence weights
+AUTHOR_WEIGHT = 0.2
+AUTHOR_DATE_WEIGHT = 0.01  # This addresses some edge cases of identical other fields
+COMMIT_DATE_WEIGHT = 0.01  # This addresses some edge cases of identical other fields
+DESCRIPTION_WEIGHT = 0.1
+FILENAMES_WEIGHT = 0.2
+SUBJECT_WEIGHT = 0.48
+
+CONFIDENCE_THRESHOLD = 0.75  # Threshold that we must hit to return a match
+
+
+def calculate_filenames_confidence(
+    downstream_filepaths: Iterable[str], upstream_filepaths: Iterable[str]
+) -> float:
+    """
+    Calculate filenames confidence
+    Roughly the percent of upstream filepaths present in downstream filepaths
+    """
+
+    # If they are the same, for example empty, confidence is high
+    if downstream_filepaths == upstream_filepaths:
+        return 1.0
+
+    # If only one is empty, confidence is low
+    if "" in (downstream_filepaths, upstream_filepaths):
+        return 0.0
+
+    total_filepaths_match = 0
+    downstream_file_components = [os.path.split(filepath) for filepath in downstream_filepaths]
+
+    for upstream_path, upstream_name in (
+        os.path.split(filepath) for filepath in upstream_filepaths
+    ):
+        max_match = 0.0
+        # Find best matching downstream filepath
+        for downstream_path, downstream_name in downstream_file_components:
+            if upstream_name != downstream_name:
+                continue
+            # 0.5 for matching filename
+            # The paths are fuzzymatched scaled 0.0-0.5 for remaining match
+            match = 0.5 + (fuzz.partial_ratio(upstream_path, downstream_path) / 200.0)
+            if match > max_match:
+                max_match = match
+        total_filepaths_match += max_match
+
+    return total_filepaths_match / len(upstream_filepaths)
+

 def patch_matches(downstream_patches: List[PatchData], upstream: PatchData) -> bool:
    """Check if 'upstream' has an equivalent in 'downstream_patches'."""
-    # Confidence weights
-    author_weight = 0.2
-    subject_weight = 0.48
-    description_weight = 0.1
-    filenames_weight = 0.2
-    author_date_weight = 0.01  # This addresses some edge cases of identical other fields
-    commit_date_weight = 0.01  # This addresses some edge cases of identical other fields
-
-    # Threshold that we must hit to return a match
-    threshold = 0.75

    # Preprocessing for matching filenames
    upstream_filepaths = upstream.affectedFilenames.split(" ")
-    upstream_file_components = [
-        _get_filepath_components(filepath) for filepath in upstream_filepaths
-    ]

    logging.debug("Upstream missing patch, %s", upstream.commitID)
    for downstream in downstream_patches:
        # Calculate confidence that our upstream patch matches this downstream patch

-        # Calculate filenames confidence, which is roughly the percent
-        # of upstream filepaths present in downstream patch.
-        if downstream.affectedFilenames == "" or upstream.affectedFilenames == "":
-            filenames_confidence = (
-                1.0 if (downstream.affectedFilenames == upstream.affectedFilenames) else 0.0
-            )
-        else:
-            total_filepaths_match = 0
-            downstream_filepaths = downstream.affectedFilenames.split(" ")
-            downstream_file_components = [
-                _get_filepath_components(filepath) for filepath in downstream_filepaths
-            ]
-
-            for upstream_path, upstream_name in upstream_file_components:
-                max_match = 0.0
-                # Find best matching downstream filepath
-                for (
-                    downstream_path,
-                    downstream_name,
-                ) in downstream_file_components:
-                    if upstream_name == downstream_name:
-                        # 0.5 for matching filename, the paths are
-                        # fuzzymatched scaled 0.0-0.5 for remaining
-                        # match
-                        match = 0.5 + (fuzz.partial_ratio(upstream_path, downstream_path) / 200.0)
-                    else:
-                        match = 0.0
-
-                    if match > max_match:
-                        max_match = match
-                total_filepaths_match += max_match
-
-            filenames_confidence = float(total_filepaths_match) / len(upstream_filepaths)
-
        author_confidence = fuzz.token_set_ratio(upstream.author, downstream.author) / 100.0
-        subject_confidence = fuzz.partial_ratio(upstream.subject, downstream.subject) / 100.0
-        # Temporarily for description only checking exact string is in
-        if upstream.description == "":
-            description_confidence = 1.0 if downstream.description == "" else 0.0
-        else:
-            description_confidence = 1.0 if upstream.description in downstream.description else 0.0
        author_date_confidence = 1.0 if upstream.authorTime == downstream.authorTime else 0.0
        commit_date_confidence = 1.0 if upstream.commitTime == downstream.commitTime else 0.0
-
-        confidence = (
-            author_weight * author_confidence
-            + subject_weight * subject_confidence
-            + description_weight * description_confidence
-            + filenames_weight * filenames_confidence
-            + author_date_confidence * author_date_weight
-            + commit_date_confidence * commit_date_weight
+        # Temporarily for description only checking exact string is in
+        description_confidence = 1.0 if upstream.description in downstream.description else 0.0
+        filenames_confidence = calculate_filenames_confidence(
+            downstream.affectedFilenames.split(" "), upstream_filepaths
        )
-        if confidence >= threshold:
+        subject_confidence = fuzz.partial_ratio(upstream.subject, downstream.subject) / 100.0
+
+        if (
+            AUTHOR_WEIGHT * author_confidence
+            + AUTHOR_DATE_WEIGHT * author_date_confidence
+            + COMMIT_DATE_WEIGHT * commit_date_confidence
+            + DESCRIPTION_WEIGHT * description_confidence
+            + FILENAMES_WEIGHT * filenames_confidence
+            + SUBJECT_WEIGHT * subject_confidence
+        ) >= CONFIDENCE_THRESHOLD:
            return True

    # TODO just do this part?...
    # Check for code matching
    upstream_diffs = PatchDiffs(upstream.commitDiffs)
-    for downstream in downstream_patches:
-        downstream_diffs = PatchDiffs(downstream.commitDiffs)
-        code_match_confidence = upstream_diffs.percent_present_in(downstream_diffs)
-        if code_match_confidence > threshold:
-            return True
-
-    return False
-
-
-def _get_filepath_components(filepath):
-    """
-    Splits filepath to return (path, filename)
-    """
-    components = filepath.rsplit("/", 1)
-    if len(components) == 1:
-        return (None, components[0])
-    return (components[0], components[1])
+    return any(
+        upstream_diffs.percent_present_in(PatchDiffs(downstream.commitDiffs)) > CONFIDENCE_THRESHOLD
+        for downstream in downstream_patches
+    )