Perform augmentation directly when retrieving labels

Former-commit-id: 8d0d63403b
This commit is contained in:
Marco Castelluccio 2018-10-11 20:12:43 +02:00
Родитель 8492939b08
Коммит 887514cafa
2 изменённых файла: 17 добавлений и 13 удалений

Просмотреть файл

@@ -116,7 +116,7 @@ def download_bugs(bug_ids):
append_db(BUGS_DB, new_bugs.values())
def get_labels():
def get_labels(augmentation=False):
with open('classes.csv', 'r') as f:
classes = dict([row for row in csv.reader(f)][1:])
@@ -137,8 +137,22 @@ def get_labels():
# Turn bug IDs into integers and labels into booleans.
classes = {int(bug_id): True if label == 'True' else False for bug_id, label in classes.items()}
# Use bugs marked as 'regression' or 'feature', as they are basically labelled.
bug_ids = set()
for bug in read_db(BUGS_DB):
bug_id = int(bug['id'])
bug_ids.add(bug_id)
if bug_id in classes:
continue
if any(keyword in bug['keywords'] for keyword in ['regression', 'talos-regression']) or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
classes[bug_id] = True
elif any(keyword in bug['keywords'] for keyword in ['feature']):
classes[bug_id] = False
# Remove labels which belong to bugs for which we have no data.
bug_ids = set([int(bug['id']) for bug in read_db(BUGS_DB)])
classes = {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
return classes

12
run.py
Просмотреть файл

@@ -40,21 +40,11 @@ class SpacyVectorizer(TfidfVectorizer):
def go(lemmatization=False):
# Get labels.
classes = get_labels()
classes = get_labels(augmentation=True)
# Retrieve bugs from the local db.
bugs_map = get_bugs()
# Use bugs marked as 'regression' or 'feature', as they are basically labelled.
for bug_id, bug in bugs_map.items():
if bug_id in classes:
continue
if any(keyword in bug['keywords'] for keyword in ['regression', 'talos-regression']) or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
classes[bug_id] = True
elif any(keyword in bug['keywords'] for keyword in ['feature']):
classes[bug_id] = False
# Turn the classes map into a numpy array for scikit-learn consumption.
y = np.array([1 if is_bug is True else 0 for bug_id, is_bug in classes.items()])