[inline_comments_data_collection] Add CL arguments for patch and diff length thresholds (#4470)

This commit is contained in:
Benjamin Mah 2024-09-06 17:21:14 -04:00 коммит произвёл GitHub
Родитель b63e883762
Коммит 6bfef0ce42
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 33 добавлений и 4 удалений

Просмотреть файл

@ -1,3 +1,4 @@
import argparse
import logging import logging
import os import os
import re import re
@ -118,7 +119,7 @@ def extract_relevant_diff(patch_diff, filename):
return None return None
def process_comments(patch_threshold, diff_length_threshold): def process_comments(limit, diff_length_limit):
patch_count = 0 patch_count = 0
for patch_id, comments in review_data.get_all_inline_comments(lambda c: True): for patch_id, comments in review_data.get_all_inline_comments(lambda c: True):
@ -159,7 +160,7 @@ def process_comments(patch_threshold, diff_length_threshold):
logger.error(f"Failed to fetch diff: {e}") logger.error(f"Failed to fetch diff: {e}")
continue continue
if len(patch_diff) > diff_length_threshold: if len(patch_diff) > diff_length_limit:
continue continue
relevant_diff = extract_relevant_diff(patch_diff, comment.filename) relevant_diff = extract_relevant_diff(patch_diff, comment.filename)
@ -178,16 +179,35 @@ def process_comments(patch_threshold, diff_length_threshold):
yield data yield data
patch_count += 1 patch_count += 1
if patch_count >= patch_threshold: if patch_count >= limit:
break break
def main():
    """Build the fixed inline comments dataset from the command line.

    Command-line options:
        --limit: maximum number of patches to process. When omitted (or
            given as 0) no limit is applied.
        --diff-length-limit: maximum ``len()`` of a patch diff that is still
            processed. Defaults to 1000; 0 disables the limit.

    Creates the ``patches`` and ``data`` directories if needed, writes every
    record yielded by ``process_comments`` to ``phabricator.FIXED_COMMENTS_DB``
    as one orjson-serialized line each, then passes that path to
    ``zstd_compress``.
    """
    parser = argparse.ArgumentParser(description="Process patch reviews.")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit the number of patches to process. No limit if not specified.",
    )
    parser.add_argument(
        "--diff-length-limit",
        type=int,
        default=1000,
        # The previous help text claimed "No limit if not specified", which
        # contradicted the default of 1000 below.
        help=(
            "Limit the maximum allowed diff length (default: %(default)s). "
            "Pass 0 to disable the limit."
        ),
    )
    args = parser.parse_args()

    # ``or`` maps both a missing value (None) and an explicit 0 to infinity,
    # so the ``>`` / ``>=`` comparisons in process_comments never trigger.
    limit = args.limit or float("inf")
    diff_length_limit = args.diff_length_limit or float("inf")

    os.makedirs("patches", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    with open(phabricator.FIXED_COMMENTS_DB, "wb") as dataset_file_handle:
        for data in process_comments(limit=limit, diff_length_limit=diff_length_limit):
            # One serialized record per line.
            dataset_file_handle.write(orjson.dumps(data) + b"\n")

    zstd_compress(phabricator.FIXED_COMMENTS_DB)

Просмотреть файл

@ -22,6 +22,14 @@ ls -lh data
# Remove it to ensure the commit retrieval works as expected # Remove it to ensure the commit retrieval works as expected
rm data/commit* rm data/commit*
# Then generate a test dataset of fixed inline comments
bugbug-fixed-comments --limit 150
ls -lh
ls -lh data
# Remove DB to ensure it works as expected
rm data/fixed_comments.json
# Then retrieve a subset of commit data # Then retrieve a subset of commit data
bugbug-data-commits --limit 500 "${CACHE_DIR:-cache}" bugbug-data-commits --limit 500 "${CACHE_DIR:-cache}"
test -d ${CACHE_DIR:-cache}/mozilla-central test -d ${CACHE_DIR:-cache}/mozilla-central

Просмотреть файл

@ -62,6 +62,7 @@ setup(
"bugbug-generate-landings-risk-report = scripts.generate_landings_risk_report:main", "bugbug-generate-landings-risk-report = scripts.generate_landings_risk_report:main",
"bugbug-shadow-scheduler-stats = scripts.shadow_scheduler_stats:main", "bugbug-shadow-scheduler-stats = scripts.shadow_scheduler_stats:main",
"bugbug-data-github = scripts.github_issue_retriever:main", "bugbug-data-github = scripts.github_issue_retriever:main",
"bugbug-fixed-comments = scripts.inline_comments_data_collection:main",
] ]
}, },
classifiers=[ classifiers=[