зеркало из https://github.com/mozilla/bugbug.git
Perform augmentation directly when retrieving labels
Former-commit-id: 8d0d63403b
This commit is contained in:
Родитель
8492939b08
Коммит
887514cafa
18
get_bugs.py
18
get_bugs.py
|
@@ -116,7 +116,7 @@ def download_bugs(bug_ids):
|
|||
append_db(BUGS_DB, new_bugs.values())
|
||||
|
||||
|
||||
def get_labels():
|
||||
def get_labels(augmentation=False):
|
||||
with open('classes.csv', 'r') as f:
|
||||
classes = dict([row for row in csv.reader(f)][1:])
|
||||
|
||||
|
@@ -137,8 +137,22 @@ def get_labels():
|
|||
# Turn bug IDs into integers and labels into booleans.
|
||||
classes = {int(bug_id): True if label == 'True' else False for bug_id, label in classes.items()}
|
||||
|
||||
# Use bugs marked as 'regression' or 'feature', as they are basically labelled.
|
||||
bug_ids = set()
|
||||
for bug in read_db(BUGS_DB):
|
||||
bug_id = int(bug['id'])
|
||||
|
||||
bug_ids.add(bug_id)
|
||||
|
||||
if bug_id in classes:
|
||||
continue
|
||||
|
||||
if any(keyword in bug['keywords'] for keyword in ['regression', 'talos-regression']) or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
|
||||
classes[bug_id] = True
|
||||
elif any(keyword in bug['keywords'] for keyword in ['feature']):
|
||||
classes[bug_id] = False
|
||||
|
||||
# Remove labels which belong to bugs for which we have no data.
|
||||
bug_ids = set([int(bug['id']) for bug in read_db(BUGS_DB)])
|
||||
classes = {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
|
||||
|
||||
return classes
|
||||
|
|
12
run.py
12
run.py
|
@@ -40,21 +40,11 @@ class SpacyVectorizer(TfidfVectorizer):
|
|||
|
||||
def go(lemmatization=False):
|
||||
# Get labels.
|
||||
classes = get_labels()
|
||||
classes = get_labels(augmentation=True)
|
||||
|
||||
# Retrieve bugs from the local db.
|
||||
bugs_map = get_bugs()
|
||||
|
||||
# Use bugs marked as 'regression' or 'feature', as they are basically labelled.
|
||||
for bug_id, bug in bugs_map.items():
|
||||
if bug_id in classes:
|
||||
continue
|
||||
|
||||
if any(keyword in bug['keywords'] for keyword in ['regression', 'talos-regression']) or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
|
||||
classes[bug_id] = True
|
||||
elif any(keyword in bug['keywords'] for keyword in ['feature']):
|
||||
classes[bug_id] = False
|
||||
|
||||
# Turn the classes map into a numpy array for scikit-learn consumption.
|
||||
y = np.array([1 if is_bug is True else 0 for bug_id, is_bug in classes.items()])
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче