Add number of previous commits touching the same modules as a feature (#229)

This commit is contained in:
Shubham Garg 2019-04-05 20:34:23 +05:30 коммит произвёл Marco
Родитель 981ee61140
Коммит 8ffe134db3
1 изменённых файлов: 70 добавлений и 10 удалений

Просмотреть файл

@ -23,13 +23,16 @@ from bugbug import db
COMMITS_DB = 'data/commits.json'
db.register(COMMITS_DB, 'https://www.dropbox.com/s/mz3afgncx0siijc/commits.json.xz?dl=1', 'v1')
COMPONENTS = {}
path_to_component = {}
Commit = namedtuple('Commit', ['node', 'author', 'desc', 'date', 'bug', 'backedoutby', 'author_email'])
Commit = namedtuple('Commit', ['node', 'author', 'desc', 'date', 'bug', 'backedoutby', 'author_email', 'files', 'file_copies'])
author_experience = {}
author_experience_90_days = {}
components_touched_prev = defaultdict(int)
components_touched_prev_90_days = defaultdict(int)
def get_commits():
return db.read(COMMITS_DB)
@ -63,9 +66,11 @@ def _transform(commit):
'files_modified_num': 0,
'types': set(),
'components': list(),
'author_experience': author_experience[commit],
'author_experience_90_days': author_experience_90_days[commit],
'author_experience': author_experience[commit.node],
'author_experience_90_days': author_experience_90_days[commit.node],
'author_email': commit.author_email.decode('utf-8'),
'components_touched_prev': components_touched_prev[commit.node],
'components_touched_prev_90_days': components_touched_prev_90_days[commit.node],
}
patch = HG.export(revs=[commit.node], git=True)
@ -103,13 +108,13 @@ def _transform(commit):
# Covert to a list, as a set is not JSON-serializable.
obj['types'] = list(obj['types'])
obj['components'] = list(set('::'.join(COMPONENTS[fl]) for fl in patch_data.keys() if COMPONENTS.get(fl)))
obj['components'] = list(set(path_to_component[path] for path in patch_data.keys() if path_to_component.get(path)))
return obj
def _hg_log(revs):
template = '{node}\\0{author}\\0{desc}\\0{date}\\0{bug}\\0{backedoutby}\\0{author|email}\\0'
template = '{node}\\0{author}\\0{desc}\\0{date}\\0{bug}\\0{backedoutby}\\0{author|email}\\0{join(files,"|")}\\0{join(file_copies,"|")}\\0'
args = hglib.util.cmdbuilder(b'log', template=template, no_merges=True, rev=revs[0] + b':' + revs[-1], branch='central')
x = HG.rawcommand(args)
@ -120,6 +125,16 @@ def _hg_log(revs):
posixtime = float(rev[3].split(b'.', 1)[0])
dt = datetime.fromtimestamp(posixtime)
file_copies = {}
for file_copy in rev[8].decode('utf-8').split('|'):
if not file_copy:
continue
parts = file_copy.split(' (')
copied = parts[0]
orig = parts[1][:-1]
file_copies[orig] = copied
revs.append(Commit(
node=rev[0],
author=rev[1],
@ -128,6 +143,8 @@ def _hg_log(revs):
bug=rev[4],
backedoutby=rev[5],
author_email=rev[6],
files=rev[7].decode('utf-8').split('|'),
file_copies=file_copies,
))
return revs
@ -199,7 +216,7 @@ def download_commits(repo_dir, date_from):
global author_experience
global author_experience_90_days
for commit in commits:
author_experience[commit] = total_commits_by_author[commit.author]
author_experience[commit.node] = total_commits_by_author[commit.author]
# We don't want to consider backed out commits when calculating author/reviewer experience.
if not commit.backedoutby:
total_commits_by_author[commit.author] += 1
@ -216,15 +233,58 @@ def download_commits(repo_dir, date_from):
if cut is not None:
commits_by_author[commit.author] = commits_by_author[commit.author][cut + 1:]
author_experience_90_days[commit] = len(commits_by_author[commit.author])
author_experience_90_days[commit.node] = len(commits_by_author[commit.author])
if not commit.backedoutby:
commits_by_author[commit.author].append(commit)
global COMPONENTS
global path_to_component
r = requests.get('https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json')
r.raise_for_status()
COMPONENTS = r.json()
path_to_component = r.json()
path_to_component = {path: '::'.join(component) for path, component in path_to_component.items()}
global components_touched_prev
global components_touched_prev_90_days
components_touched = defaultdict(int)
prev_commits_90_days = []
for commit in commits:
components = set(path_to_component[path] for path in commit.files if path in path_to_component)
for component in components:
components_touched_prev[commit.node] += components_touched[component]
components_touched[component] += 1
if len(commit.file_copies) > 0:
for orig, copied in commit.file_copies.items():
if orig in path_to_component and copied in path_to_component:
components_touched[path_to_component[copied]] = components_touched[path_to_component[orig]]
for i, prev_commit in enumerate(prev_commits_90_days):
if (commit.date - prev_commit.date).days <= 90:
break
cut = i
if cut is not None:
prev_commits_90_days = prev_commits_90_days[cut + 1:]
components_touched_90_days = defaultdict(int)
for prev_commit in prev_commits_90_days:
components_prev = set(path_to_component[path] for path in prev_commit.files if path in path_to_component)
for component_prev in components_prev:
components_touched_90_days[component_prev] += 1
if len(prev_commit.file_copies) > 0:
for orig, copied in prev_commit.file_copies.items():
if orig in path_to_component and copied in path_to_component:
components_touched_90_days[path_to_component[copied]] = components_touched_90_days[path_to_component[orig]]
components_touched_prev_90_days[commit.node] = sum(components_touched_90_days[component] for component in components)
prev_commits_90_days.append(commit)
print(f'Mining commits using {multiprocessing.cpu_count()} processes...')