diff --git a/bugbug/phabricator.py b/bugbug/phabricator.py
index d4faba1f..0a438790 100644
--- a/bugbug/phabricator.py
+++ b/bugbug/phabricator.py
@@ -27,6 +27,14 @@ db.register(
     4,
 )
 
+# Dataset produced by scripts/inline_comments_data_collection.py.
+FIXED_COMMENTS_DB = "data/fixed_comments.json"
+db.register(
+    FIXED_COMMENTS_DB,
+    "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.fixed_comments.latest/artifacts/public/fixed_comments.json.zst",
+    1,
+)
+
 PHABRICATOR_API = None
 
 TESTING_PROJECTS = {
diff --git a/infra/data-pipeline.yml b/infra/data-pipeline.yml
index 3d4086ec..2e6a7898 100644
--- a/infra/data-pipeline.yml
+++ b/infra/data-pipeline.yml
@@ -234,6 +234,45 @@ tasks:
       owner: release-mgmt-analysis@mozilla.com
       source: ${repository}/raw/master/data-pipeline.yml
 
+  - ID: fixed-comments-retrieval
+    created: { $fromNow: "" }
+    deadline: { $fromNow: "2 days" }
+    expires: { $fromNow: "1 year" }
+    provisionerId: proj-bugbug
+    workerType: compute-small
+    dependencies:
+      - revisions-retrieval
+    payload:
+      env:
+        TC_SECRET_ID: project/bugbug/production
+      maxRunTime: 86400
+      image: mozilla/bugbug-base:${version}
+      command:
+        - "bugbug-fixed-comments"
+
+      artifacts:
+        public/fixed_comments.json.zst:
+          path: /data/fixed_comments.json.zst
+          type: file
+        public/fixed_comments.json.version:
+          path: /data/fixed_comments.json.version
+          type: file
+
+      features:
+        taskclusterProxy: true
+    scopes:
+      - "secrets:get:project/bugbug/production"
+    routes:
+      - notify.email.release-mgmt-analysis@mozilla.com.on-failed
+      - notify.irc-channel.#bugbug.on-failed
+      - index.project.bugbug.fixed_comments.${version}
+      - index.project.bugbug.fixed_comments.latest
+    metadata:
+      name: bugbug fixed comments retrieval
+      description: bugbug fixed comments retrieval
+      owner: release-mgmt-analysis@mozilla.com
+      source: ${repository}/raw/master/data-pipeline.yml
+
   - ID: issues-retrieval
     created: { $fromNow: "" }
     deadline: { $fromNow: "2 days" }
diff --git a/scripts/inline_comments_data_collection.py b/scripts/inline_comments_data_collection.py
index 60f6661f..3b7bdeb0 100644
--- a/scripts/inline_comments_data_collection.py
+++ b/scripts/inline_comments_data_collection.py
@@ -1,13 +1,14 @@
-import json
 import logging
 import os
 import re
 
+import orjson
 import requests
 from libmozdata.phabricator import PhabricatorAPI
 
+from bugbug import phabricator
 from bugbug.tools.code_review import PhabricatorReviewData
-from bugbug.utils import get_secret
+from bugbug.utils import get_secret, zstd_compress
 
 review_data = PhabricatorReviewData()
 
@@ -183,11 +184,14 @@ def process_comments(patch_threshold, diff_length_threshold):
 
 def main():
+    """Write processed comments to the fixed-comments DB file, then compress it."""
     os.makedirs("patches", exist_ok=True)
-    os.makedirs("dataset", exist_ok=True)
-    dataset_file_path = "dataset/inline_comment_dataset2.json"
-    with open(dataset_file_path, "a") as dataset_file_handle:
+    os.makedirs("data", exist_ok=True)
+
+    with open(phabricator.FIXED_COMMENTS_DB, "wb") as dataset_file_handle:
         for data in process_comments(patch_threshold=1000, diff_length_threshold=5000):
-            dataset_file_handle.write(json.dumps(data) + "\n")
+            dataset_file_handle.write(orjson.dumps(data) + b"\n")
+
+    zstd_compress(phabricator.FIXED_COMMENTS_DB)
 
 
 if __name__ == "__main__":