bugbug/scripts/microannotate_generator.py

155 строки
4.8 KiB
Python

# -*- coding: utf-8 -*-
import argparse
import os
import subprocess
from logging import INFO, basicConfig, getLogger
import tenacity
from microannotate import generator
from bugbug import db, repository
from bugbug.utils import ThreadPoolExecutorResult, get_secret, upload_s3
# Configure logging at import time so INFO-level messages from this script are emitted.
basicConfig(level=INFO)
logger = getLogger(__name__)

# When updating the version, the git repositories will be recreated from scratch.
# This is useful when new meaningful versions of rust-code-analysis or microannotate
# are used.
VERSION = 2

# Value passed as `limit` to every microannotate generation pass; the mirror is
# pushed after each pass, so this also bounds how much work a single push carries.
COMMITS_STEP = 5000
class MicroannotateGenerator(object):
    """Build and publish a microannotated git mirror of mozilla-central.

    The mirror is generated in batches of ``COMMITS_STEP`` commits by
    ``microannotate.generator.generate`` and pushed to ``repo_url`` after
    every batch.
    """

    def __init__(self, cache_root, repo_url, tokenize, remove_comments):
        """Store the configuration and derive the local paths.

        Args:
            cache_root: Existing directory under which mozilla-central is cloned.
            repo_url: URL of the mirror git repository to update.
            tokenize: Forwarded as ``tokenize`` to the microannotate generator.
            remove_comments: Forwarded as ``remove_comments`` to the generator.

        Raises:
            AssertionError: If ``cache_root`` is not an existing directory.
        """
        self.cache_root = cache_root
        self.repo_url = repo_url
        # Strip trailing slashes first: basename("https://host/repo/") would
        # otherwise be the empty string.
        self.git_repo_path = os.path.basename(self.repo_url.rstrip("/"))
        self.tokenize = tokenize
        self.remove_comments = remove_comments

        # Raised explicitly (rather than via ``assert``) so the check also runs
        # under ``python -O``; the exception type and message are unchanged.
        if not os.path.isdir(cache_root):
            raise AssertionError(f"Cache root {cache_root} is not a dir.")

        self.repo_dir = os.path.join(cache_root, "mozilla-central")

    def generate(self):
        """Generate the mirror in batches, pushing after each batch.

        When the registered DB schema is outdated, the mirror is recreated
        from scratch with ``git init`` and force-pushed; otherwise the
        existing mirror is cloned and extended. Once every batch has been
        pushed, the ``.version`` file of the DB is uploaded to S3.
        """
        db_path = os.path.join("data", self.git_repo_path)
        db.register(
            db_path,
            "https://s3-us-west-2.amazonaws.com/communitytc-bugbug/data/",
            VERSION,
        )

        is_old_version = db.is_old_schema(db_path)

        # Clone mozilla-central and prepare the mirror repository concurrently.
        with ThreadPoolExecutorResult(max_workers=2) as executor:
            cloner = executor.submit(repository.clone, self.repo_dir)
            cloner.add_done_callback(
                lambda future: logger.info("mozilla-central cloned")
            )

            git_user = get_secret("GIT_USER")
            git_password = get_secret("GIT_PASSWORD")

            # NOTE(review): the credentials end up in the push command line, so
            # they may surface in error messages if the push fails - confirm
            # that logs are not public.
            repo_push_url = self.repo_url.replace(
                "https://", f"https://{git_user}:{git_password}@"
            )

            if is_old_version:
                executor.submit(self.init_git_repo)
            else:
                executor.submit(self.clone_git_repo)

        subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )

        push_args = ["git", "push", repo_push_url, "master"]
        if is_old_version:
            # The mirror was recreated from scratch, so its history diverges
            # from what is currently on the remote.
            push_args.append("--force")

        done = False
        while not done:
            done = generator.generate(
                self.repo_dir,
                self.git_repo_path,
                limit=COMMITS_STEP,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            # Push after every batch, retrying up to 5 times with exponential
            # back-off between attempts.
            tenacity.retry(
                wait=tenacity.wait_exponential(multiplier=1, min=16, max=64),
                stop=tenacity.stop_after_attempt(5),
            )(lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True))()

        # We are not using db.upload as we don't need to upload the git repo.
        upload_s3([f"{db_path}.version"])

    def init_git_repo(self):
        """Create an empty local mirror and point its ``origin`` at ``repo_url``."""
        subprocess.run(["git", "init", self.git_repo_path], check=True)

        subprocess.run(
            ["git", "remote", "add", "origin", self.repo_url],
            cwd=self.git_repo_path,
            check=True,
        )

    def clone_git_repo(self):
        """Clone the existing mirror and pull its ``master`` branch.

        A mirror that has no ``master`` branch yet (empty repository) is
        accepted silently; any other pull failure is propagated.

        Raises:
            tenacity.RetryError: If the clone still fails after 5 attempts.
            subprocess.CalledProcessError: If the pull fails for a reason other
                than the remote ``master`` ref being absent.
        """
        tenacity.retry(
            wait=tenacity.wait_exponential(multiplier=1, min=16, max=64),
            stop=tenacity.stop_after_attempt(5),
        )(
            lambda: subprocess.run(
                ["git", "clone", "--quiet", self.repo_url, self.git_repo_path],
                check=True,
            )
        )()

        try:
            tenacity.retry(
                # An empty mirror will not gain a master branch by retrying,
                # so give up immediately on that specific failure.
                retry=tenacity.retry_if_exception(
                    lambda error: not self._is_missing_master_error(error)
                ),
                wait=tenacity.wait_exponential(multiplier=1, min=16, max=64),
                stop=tenacity.stop_after_attempt(5),
                # Surface the original CalledProcessError instead of wrapping
                # it in RetryError, otherwise the handler below never runs.
                reraise=True,
            )(
                lambda: subprocess.run(
                    ["git", "pull", "--quiet", self.repo_url, "master"],
                    cwd=self.git_repo_path,
                    capture_output=True,
                    check=True,
                )
            )()
        except subprocess.CalledProcessError as e:
            # When the repo is empty there is nothing to pull; anything else
            # is a real failure.
            if not self._is_missing_master_error(e):
                raise

    @staticmethod
    def _is_missing_master_error(error):
        """Return True if ``error`` is git reporting that remote ``master`` is absent.

        Both captured streams are inspected because git reports fatal errors
        on stderr, and the comparison is case-insensitive because the
        capitalisation of the message differs between git versions.
        """
        if not isinstance(error, subprocess.CalledProcessError):
            return False

        marker = b"couldn't find remote ref master"
        return any(
            marker in (stream or b"").lower()
            for stream in (error.stdout, error.stderr)
        )
def main():
    """Parse the command line and run the microannotate mirror generation."""
    parser = argparse.ArgumentParser(
        description="Generate a mirror git repository where content is split by word"
    )

    # Required positional arguments.
    parser.add_argument("cache-root", help="Cache for repository clones.")
    parser.add_argument("repo-url", help="Mirror repository URL.")

    # Optional boolean switches, both off by default.
    switches = (
        ("--tokenize", "Enable word-level tokenization."),
        ("--remove-comments", "Enable comment removal."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    # The positional names contain a dash, so they cannot be read with
    # attribute syntax; look everything up in the namespace dict instead.
    options = vars(parser.parse_args())

    mirror_builder = MicroannotateGenerator(
        options["cache-root"],
        options["repo-url"],
        options["tokenize"],
        options["remove_comments"],
    )
    mirror_builder.generate()


if __name__ == "__main__":
    main()