Move label gathering code into the models themselves

This commit is contained in:
Marco Castelluccio 2019-01-02 13:29:37 +01:00
Родитель 569a4ab8f1
Коммит a6f442b7b2
11 изменённых файлов: 481 добавлений и 215 удалений

Просмотреть файл

@ -7,128 +7,16 @@ import csv
import os
import sys
from bugbug import bugzilla
def get_labels_dir():
    """Return the path of the 'labels' directory of the enclosing package.

    The directory is resolved relative to the file of the module that is
    registered in ``sys.modules`` under this module's ``__package__`` name.
    """
    package_module = sys.modules[__package__]
    package_root = os.path.dirname(package_module.__file__)
    return os.path.join(package_root, 'labels')
def get_tracking_labels():
    """Build a ``{bug_id: bool}`` mapping from the tracking-flag history of each bug.

    For every history change whose field name starts with
    'cf_tracking_firefox', an added value of 'blocking' or '+' sets the label
    to True and an added value of '-' sets it to False; later changes
    overwrite earlier ones. Bugs without any such change are labelled False.
    """
    result = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for event in bug['history']:
            for delta in event['changes']:
                if not delta['field_name'].startswith('cf_tracking_firefox'):
                    continue

                added = delta['added']
                if added in ['blocking', '+']:
                    result[key] = True
                elif added == '-':
                    result[key] = False

        # Bugs that never received a tracking decision default to False.
        result.setdefault(key, False)

    return result
def get_qa_needed_labels():
    """Build a ``{bug_id: bool}`` mapping from each bug's history.

    A bug is labelled True when some history change has an 'added' value
    starting with 'qawanted', or when, for a change that does not, the
    enclosing history entry carries a 'flags' list containing a flag whose
    name starts with 'qe-verify'. Every other bug is labelled False.
    """
    result = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for event in bug['history']:
            for delta in event['changes']:
                if delta['added'].startswith('qawanted'):
                    result[key] = True
                    continue

                if 'flags' not in event:
                    continue

                # The entry's flags are only inspected from within the
                # per-change loop, exactly as the labels were derived before.
                for flag in event['flags']:
                    if flag['name'].startswith('qe-verify'):
                        result[key] = True

        result.setdefault(key, False)

    return result
def get_bugbug_labels(kind='bug', augmentation=False):
assert kind in ['bug', 'regression']
classes = {}
with open(os.path.join(get_labels_dir(), 'bug_nobug.csv'), 'r') as f:
def get_labels(file_name):
with open(os.path.join(get_labels_dir(), '{}.csv'.format(file_name)), 'r') as f:
reader = csv.reader(f)
next(reader)
for bug_id, category in reader:
assert category in ['True', 'False'], 'unexpected category {}'.format(category)
if kind == 'bug':
classes[int(bug_id)] = True if category == 'True' else False
elif kind == 'regression':
if category == 'False':
classes[int(bug_id)] = False
with open(os.path.join(get_labels_dir(), 'regression_bug_nobug.csv'), 'r') as f:
reader = csv.reader(f)
next(reader)
for bug_id, category in reader:
assert category in ['nobug', 'bug_unknown_regression', 'bug_no_regression', 'regression'], 'unexpected category {}'.format(category)
if kind == 'bug':
classes[int(bug_id)] = True if category != 'nobug' else False
elif kind == 'regression':
if category == 'bug_unknown_regression':
continue
classes[int(bug_id)] = True if category == 'regression' else False
bug_ids = set()
for bug in bugzilla.get_bugs():
bug_id = int(bug['id'])
bug_ids.add(bug_id)
if bug_id in classes:
continue
# If augmentation is enabled, use bugs marked as 'regression' or 'feature',
# as they are basically labelled.
if not augmentation:
continue
if any(keyword in bug['keywords'] for keyword in ['regression', 'talos-regression']) or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
classes[bug_id] = True
elif any(keyword in bug['keywords'] for keyword in ['feature']):
classes[bug_id] = False
elif kind == 'regression':
for history in bug['history']:
for change in history['changes']:
if change['field_name'] == 'keywords' and change['removed'] == 'regression':
classes[bug_id] = False
# Remove labels which belong to bugs for which we have no data.
return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
def get_uplift_labels():
    """Build a ``{bug_id: bool}`` mapping from attachment approval flags.

    Only attachment flags whose name starts with 'approval-mozilla-' and whose
    status is '+' or '-' are considered: '+' yields True, '-' yields False,
    and later flags overwrite earlier ones. Bugs without such a flag are left
    out of the mapping.
    """
    decisions = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for attachment in bug['attachments']:
            for flag in attachment['flags']:
                is_approval = flag['name'].startswith('approval-mozilla-')
                if is_approval and flag['status'] in ['+', '-']:
                    decisions[key] = flag['status'] == '+'

    return decisions
yield from reader
def get_all_bug_ids():

318
bugbug/labels/str.csv Normal file
Просмотреть файл

@ -0,0 +1,318 @@
bug_id,comment_num,has_str
641030,0,y
1041822,0,n
1041822,1,n
1041822,2,n
1041822,3,n
1041822,4,n
1041822,5,n
1041822,6,n
1041830,0,n
1041830,1,n
1041830,2,n
1041830,3,n
1041830,4,n
1041830,5,n
1041830,6,n
1041830,7,n
1041830,8,n
1041836,0,n
1041836,1,n
1041836,2,n
1041836,3,n
1041836,4,n
1041836,5,n
1041836,6,n
1041836,7,n
1041836,8,n
1044586,0,y
1046143,0,y
1046255,0,y
1049215,0,n
1051830,0,y
1052947,0,y
1055310,14,n
1055973,0,y
1058713,0,y
1060929,0,y
1067153,0,y
1068439,0,n
1069724,0,y
1070722,0,y
1070988,0,y
1071686,0,y
1072193,0,y
1072954,0,y
1072991,0,y
1073991,0,y
1077369,0,y
1078743,0,y
1080971,0,y
1081514,0,y
1081677,0,y
1082249,0,y
1083996,0,y
1091109,0,y
1092626,0,y
1100966,42,n
1103635,0,y
1105066,0,y
1107702,0,n
1109155,0,y
1111337,0,y
1113238,0,y
1113834,0,y
1114774,1,n
1117984,0,y
1120065,0,y
1121706,0,y
1124221,0,y
1131463,0,n
1131685,0,y
1132918,0,y
1133356,0,y
1134531,0,y
1137234,0,y
1137906,0,y
1140616,0,y
1143644,0,y
1149608,0,n
1152469,0,y
1156636,0,y
1159259,0,y
1162372,0,y
1166133,0,y
1167105,0,y
1168081,0,y
1173548,0,y
1173792,0,y
1175941,0,y
1176018,0,y
1176551,0,y
1177619,0,y
1185886,0,y
1185927,0,y
1187232,0,y
1187404,0,y
1189715,0,y
1190112,0,y
1191539,0,y
1192539,0,y
1192720,0,y
1193695,0,y
1197569,0,y
1198723,0,y
1199296,0,y
1200602,0,y
1204483,0,y
1204809,0,y
1205476,0,n
1205833,0,y
1206545,0,y
1207536,0,y
1207546,0,y
1207821,0,y
1213688,19,n
1214261,0,y
1215948,0,y
1216366,0,y
1221030,0,y
1224936,0,y
1225125,0,y
1225882,0,y
1229742,0,y
1231758,0,y
1232087,0,n
1232328,0,y
1232346,0,y
1233429,0,y
1233625,0,y
1238427,0,y
1240561,0,y
1243034,0,y
1243413,0,y
1243657,0,y
1243657,14,n
1244597,25,n
1246606,0,y
1248948,0,y
1252974,0,y
1253399,0,y
1257063,0,n
1260022,0,y
1261228,0,y
1262125,0,y
1266372,0,y
1268069,0,y
1268141,0,y
1271173,0,y
1273024,0,y
1273882,0,y
1274362,0,y
1275880,0,y
1276717,0,n
1277295,0,y
1278221,0,y
1278581,0,y
1279036,0,y
1279744,0,n
1279928,0,y
1283721,0,y
1286459,0,n
1287066,0,y
1291270,8,n
1292337,0,y
1292904,0,y
1294602,0,y
1295193,0,y
1295354,0,y
1297549,0,y
1298205,0,n
1299324,0,y
1300805,0,y
1302414,0,y
1303727,0,y
1309219,0,y
1309413,0,y
1313272,0,n
1314128,0,y
1314491,0,y
1315608,0,y
1319911,0,y
1320502,0,y
1320565,0,y
1321069,0,y
1322274,17,n
1322441,0,y
1325955,0,y
1326163,0,y
1328023,0,y
1329386,0,y
1329631,0,y
1330609,0,y
1330836,0,y
1334677,0,n
1338005,0,y
1339497,0,y
1340127,0,y
1341521,0,y
1348148,0,n
1348701,0,y
1349552,19,n
1351102,0,y
1352004,0,y
1352108,0,y
1353039,0,y
1353041,0,y
1356883,0,y
1356921,0,y
1357098,0,y
1358964,0,y
1362764,0,n
1362984,8,n
1363406,0,y
1364727,0,y
1365133,213,n
1366824,0,y
1367688,0,y
1368464,0,y
1368852,0,y
1369246,0,y
1372043,0,y
1372448,0,y
1373528,0,y
1373823,0,y
1373937,0,n
1374584,0,y
1374653,0,y
1376406,0,y
1377597,0,y
1379624,0,n
1380323,0,y
1381682,0,y
1382719,22,n
1383363,0,y
1385440,0,y
1386483,0,y
1387476,0,y
1388043,0,y
1388394,0,y
1389377,0,y
1390863,0,y
1392659,0,y
1397241,0,y
1397737,0,y
1399388,0,y
1399400,0,y
1399559,0,y
1399651,0,y
1400165,0,y
1400556,0,y
1400604,0,y
1401224,0,y
1401943,0,y
1402244,0,y
1403166,0,y
1403750,0,y
1404497,0,y
1405696,0,y
1406414,0,y
1406509,0,y
1407748,0,y
1407983,6,n
1408613,0,y
1408834,0,y
1409634,0,n
1410028,0,y
1410225,0,y
1412213,0,y
1417014,0,y
1418814,0,y
1419173,0,y
1421240,0,y
1421905,0,y
1422215,0,y
1422478,0,y
1423810,0,y
1426081,0,y
1428174,0,y
1435451,0,y
1436311,0,y
1437310,1,y
1437832,0,y
1439285,0,y
1439857,0,y
1443224,0,y
1443583,10,n
1445207,0,y
1446445,0,y
1447052,0,y
1449887,0,y
1450248,0,y
1451683,0,y
1451688,0,y
1452392,0,y
1452673,0,y
1452819,0,y
1453718,0,y
1454572,0,y
1457039,0,y
1458615,0,y
1458866,0,y
1461635,0,y
1461881,0,y
1462906,0,y
1463694,1,n
1464789,0,y
1465616,117,n
1471935,0,y
1474284,0,y
1478754,0,y
1480934,0,y
1483148,0,y
1483593,0,n
1483865,6,n
1485178,0,y
1487263,0,y
1488105,0,y
1498940,0,y
1 bug_id comment_num has_str
2 641030 0 y
3 1041822 0 n
4 1041822 1 n
5 1041822 2 n
6 1041822 3 n
7 1041822 4 n
8 1041822 5 n
9 1041822 6 n
10 1041830 0 n
11 1041830 1 n
12 1041830 2 n
13 1041830 3 n
14 1041830 4 n
15 1041830 5 n
16 1041830 6 n
17 1041830 7 n
18 1041830 8 n
19 1041836 0 n
20 1041836 1 n
21 1041836 2 n
22 1041836 3 n
23 1041836 4 n
24 1041836 5 n
25 1041836 6 n
26 1041836 7 n
27 1041836 8 n
28 1044586 0 y
29 1046143 0 y
30 1046255 0 y
31 1049215 0 n
32 1051830 0 y
33 1052947 0 y
34 1055310 14 n
35 1055973 0 y
36 1058713 0 y
37 1060929 0 y
38 1067153 0 y
39 1068439 0 n
40 1069724 0 y
41 1070722 0 y
42 1070988 0 y
43 1071686 0 y
44 1072193 0 y
45 1072954 0 y
46 1072991 0 y
47 1073991 0 y
48 1077369 0 y
49 1078743 0 y
50 1080971 0 y
51 1081514 0 y
52 1081677 0 y
53 1082249 0 y
54 1083996 0 y
55 1091109 0 y
56 1092626 0 y
57 1100966 42 n
58 1103635 0 y
59 1105066 0 y
60 1107702 0 n
61 1109155 0 y
62 1111337 0 y
63 1113238 0 y
64 1113834 0 y
65 1114774 1 n
66 1117984 0 y
67 1120065 0 y
68 1121706 0 y
69 1124221 0 y
70 1131463 0 n
71 1131685 0 y
72 1132918 0 y
73 1133356 0 y
74 1134531 0 y
75 1137234 0 y
76 1137906 0 y
77 1140616 0 y
78 1143644 0 y
79 1149608 0 n
80 1152469 0 y
81 1156636 0 y
82 1159259 0 y
83 1162372 0 y
84 1166133 0 y
85 1167105 0 y
86 1168081 0 y
87 1173548 0 y
88 1173792 0 y
89 1175941 0 y
90 1176018 0 y
91 1176551 0 y
92 1177619 0 y
93 1185886 0 y
94 1185927 0 y
95 1187232 0 y
96 1187404 0 y
97 1189715 0 y
98 1190112 0 y
99 1191539 0 y
100 1192539 0 y
101 1192720 0 y
102 1193695 0 y
103 1197569 0 y
104 1198723 0 y
105 1199296 0 y
106 1200602 0 y
107 1204483 0 y
108 1204809 0 y
109 1205476 0 n
110 1205833 0 y
111 1206545 0 y
112 1207536 0 y
113 1207546 0 y
114 1207821 0 y
115 1213688 19 n
116 1214261 0 y
117 1215948 0 y
118 1216366 0 y
119 1221030 0 y
120 1224936 0 y
121 1225125 0 y
122 1225882 0 y
123 1229742 0 y
124 1231758 0 y
125 1232087 0 n
126 1232328 0 y
127 1232346 0 y
128 1233429 0 y
129 1233625 0 y
130 1238427 0 y
131 1240561 0 y
132 1243034 0 y
133 1243413 0 y
134 1243657 0 y
135 1243657 14 n
136 1244597 25 n
137 1246606 0 y
138 1248948 0 y
139 1252974 0 y
140 1253399 0 y
141 1257063 0 n
142 1260022 0 y
143 1261228 0 y
144 1262125 0 y
145 1266372 0 y
146 1268069 0 y
147 1268141 0 y
148 1271173 0 y
149 1273024 0 y
150 1273882 0 y
151 1274362 0 y
152 1275880 0 y
153 1276717 0 n
154 1277295 0 y
155 1278221 0 y
156 1278581 0 y
157 1279036 0 y
158 1279744 0 n
159 1279928 0 y
160 1283721 0 y
161 1286459 0 n
162 1287066 0 y
163 1291270 8 n
164 1292337 0 y
165 1292904 0 y
166 1294602 0 y
167 1295193 0 y
168 1295354 0 y
169 1297549 0 y
170 1298205 0 n
171 1299324 0 y
172 1300805 0 y
173 1302414 0 y
174 1303727 0 y
175 1309219 0 y
176 1309413 0 y
177 1313272 0 n
178 1314128 0 y
179 1314491 0 y
180 1315608 0 y
181 1319911 0 y
182 1320502 0 y
183 1320565 0 y
184 1321069 0 y
185 1322274 17 n
186 1322441 0 y
187 1325955 0 y
188 1326163 0 y
189 1328023 0 y
190 1329386 0 y
191 1329631 0 y
192 1330609 0 y
193 1330836 0 y
194 1334677 0 n
195 1338005 0 y
196 1339497 0 y
197 1340127 0 y
198 1341521 0 y
199 1348148 0 n
200 1348701 0 y
201 1349552 19 n
202 1351102 0 y
203 1352004 0 y
204 1352108 0 y
205 1353039 0 y
206 1353041 0 y
207 1356883 0 y
208 1356921 0 y
209 1357098 0 y
210 1358964 0 y
211 1362764 0 n
212 1362984 8 n
213 1363406 0 y
214 1364727 0 y
215 1365133 213 n
216 1366824 0 y
217 1367688 0 y
218 1368464 0 y
219 1368852 0 y
220 1369246 0 y
221 1372043 0 y
222 1372448 0 y
223 1373528 0 y
224 1373823 0 y
225 1373937 0 n
226 1374584 0 y
227 1374653 0 y
228 1376406 0 y
229 1377597 0 y
230 1379624 0 n
231 1380323 0 y
232 1381682 0 y
233 1382719 22 n
234 1383363 0 y
235 1385440 0 y
236 1386483 0 y
237 1387476 0 y
238 1388043 0 y
239 1388394 0 y
240 1389377 0 y
241 1390863 0 y
242 1392659 0 y
243 1397241 0 y
244 1397737 0 y
245 1399388 0 y
246 1399400 0 y
247 1399559 0 y
248 1399651 0 y
249 1400165 0 y
250 1400556 0 y
251 1400604 0 y
252 1401224 0 y
253 1401943 0 y
254 1402244 0 y
255 1403166 0 y
256 1403750 0 y
257 1404497 0 y
258 1405696 0 y
259 1406414 0 y
260 1406509 0 y
261 1407748 0 y
262 1407983 6 n
263 1408613 0 y
264 1408834 0 y
265 1409634 0 n
266 1410028 0 y
267 1410225 0 y
268 1412213 0 y
269 1417014 0 y
270 1418814 0 y
271 1419173 0 y
272 1421240 0 y
273 1421905 0 y
274 1422215 0 y
275 1422478 0 y
276 1423810 0 y
277 1426081 0 y
278 1428174 0 y
279 1435451 0 y
280 1436311 0 y
281 1437310 1 y
282 1437832 0 y
283 1439285 0 y
284 1439857 0 y
285 1443224 0 y
286 1443583 10 n
287 1445207 0 y
288 1446445 0 y
289 1447052 0 y
290 1449887 0 y
291 1450248 0 y
292 1451683 0 y
293 1451688 0 y
294 1452392 0 y
295 1452673 0 y
296 1452819 0 y
297 1453718 0 y
298 1454572 0 y
299 1457039 0 y
300 1458615 0 y
301 1458866 0 y
302 1461635 0 y
303 1461881 0 y
304 1462906 0 y
305 1463694 1 n
306 1464789 0 y
307 1465616 117 n
308 1471935 0 y
309 1474284 0 y
310 1478754 0 y
311 1480934 0 y
312 1483148 0 y
313 1483593 0 n
314 1483865 6 n
315 1485178 0 y
316 1487263 0 y
317 1488105 0 y
318 1498940 0 y

Просмотреть файл

@ -27,16 +27,18 @@ class Model():
return []
def train(self):
classes = self.get_labels()
# Get bugs.
def bugs_all():
return bugzilla.get_bugs()
# Filter out bugs for which we have no labels.
def bugs():
return (bug for bug in bugs_all() if bug['id'] in self.classes)
return (bug for bug in bugs_all() if bug['id'] in classes)
# Calculate labels.
y = np.array([1 if self.classes[bug['id']] else 0 for bug in bugs()])
y = np.array([1 if classes[bug['id']] else 0 for bug in bugs()])
# Extract features from the bugs.
X = self.extraction_pipeline.fit_transform(bugs())

Просмотреть файл

@ -9,6 +9,7 @@ from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from bugbug import bug_features
from bugbug import bugzilla
from bugbug import labels
from bugbug.model import Model
from bugbug.utils import DictSelector
@ -18,8 +19,6 @@ class BugModel(Model):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
self.classes = labels.get_bugbug_labels(kind='bug', augmentation=True)
feature_extractors = [
bug_features.has_str(),
bug_features.severity(),
@ -65,6 +64,56 @@ class BugModel(Model):
])
self.clf = xgboost.XGBClassifier(n_jobs=16)
self.clf.set_params(tree_method='exact', predictor='cpu_predictor')
def get_bugbug_labels(self, kind='bug'):
    """Collect ``{bug_id: bool}`` labels for the 'bug' or 'regression' task.

    Labels come from the 'bug_nobug' and 'regression_bug_nobug' label files
    and are then augmented with bugs whose keywords (or keyword history)
    already settle the answer. Labels for bugs that are not returned by
    ``bugzilla.get_bugs()`` are dropped.
    """
    assert kind in ['bug', 'regression']

    labelled = {}

    for raw_id, category in labels.get_labels('bug_nobug'):
        assert category in ['True', 'False'], 'unexpected category {}'.format(category)
        if kind == 'bug':
            labelled[int(raw_id)] = category == 'True'
        elif kind == 'regression' and category == 'False':
            # Something that is not a bug cannot be a regression either.
            labelled[int(raw_id)] = False

    for raw_id, category in labels.get_labels('regression_bug_nobug'):
        assert category in ['nobug', 'bug_unknown_regression', 'bug_no_regression', 'regression'], 'unexpected category {}'.format(category)
        if kind == 'bug':
            labelled[int(raw_id)] = category != 'nobug'
        elif kind == 'regression' and category != 'bug_unknown_regression':
            # 'bug_unknown_regression' gives no usable regression label.
            labelled[int(raw_id)] = category == 'regression'

    # Augment labels by using bugs marked as 'regression' or 'feature', as they are basically labelled.
    known_ids = set()
    for bug in bugzilla.get_bugs():
        current = int(bug['id'])
        known_ids.add(current)

        if current in labelled:
            continue

        keywords = bug['keywords']
        if 'regression' in keywords or 'talos-regression' in keywords or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
            labelled[current] = True
        elif 'feature' in keywords:
            labelled[current] = False
        elif kind == 'regression':
            # A 'regression' keyword that was later removed marks a non-regression.
            for change in (c for entry in bug['history'] for c in entry['changes']):
                if change['field_name'] == 'keywords' and change['removed'] == 'regression':
                    labelled[current] = False

    # Remove labels which belong to bugs for which we have no data.
    return {key: value for key, value in labelled.items() if key in known_ids}
def get_labels(self):
    """Return the labels of this model: ``get_bugbug_labels`` called with 'bug'."""
    bug_labels = self.get_bugbug_labels('bug')
    return bug_labels
def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\

Просмотреть файл

@ -9,7 +9,7 @@ from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from bugbug import bug_features
from bugbug import labels
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector
@ -18,8 +18,6 @@ class QANeededModel(Model):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
self.classes = labels.get_qa_needed_labels()
feature_extractors = [
bug_features.has_str(),
bug_features.has_regression_range(),
@ -65,6 +63,26 @@ class QANeededModel(Model):
self.clf = xgboost.XGBClassifier(n_jobs=16)
def get_labels(self):
    """Build the ``{bug_id: bool}`` labels of this model from each bug's history.

    A bug is labelled True when some history change has an 'added' value
    starting with 'qawanted', or when, for a change that does not, the
    enclosing history entry carries a 'flags' list containing a flag whose
    name starts with 'qe-verify'. Every other bug is labelled False.
    """
    qa_labels = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for event in bug['history']:
            for delta in event['changes']:
                if delta['added'].startswith('qawanted'):
                    qa_labels[key] = True
                    continue

                if 'flags' not in event:
                    continue

                # Flags are looked at once per change of the entry, not once
                # per entry; the outcome is the same label either way.
                for flag in event['flags']:
                    if flag['name'].startswith('qe-verify'):
                        qa_labels[key] = True

        qa_labels.setdefault(key, False)

    return qa_labels
def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\

Просмотреть файл

@ -3,80 +3,12 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import xgboost
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from bugbug import bug_features
from bugbug import labels
from bugbug.model import Model
from bugbug.utils import DictSelector
from bugbug.models.bug import BugModel
class RegressionModel(Model):
class RegressionModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.classes = labels.get_bugbug_labels(kind='regression', augmentation=True)
feature_extractors = [
bug_features.has_str(),
bug_features.severity(),
# Ignore keywords that would make the ML completely skewed
# (we are going to use them as 100% rules in the evaluation phase).
bug_features.keywords(set(['regression', 'talos-regression', 'feature'])),
bug_features.is_coverity_issue(),
bug_features.has_crash_signature(),
bug_features.has_url(),
bug_features.has_w3c_url(),
bug_features.has_github_url(),
bug_features.whiteboard(),
bug_features.patches(),
bug_features.landings(),
bug_features.title(),
bug_features.comments(),
]
self.data_vectorizer = DictVectorizer()
self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.comments_vectorizer = self.text_vectorizer(stop_words='english')
self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', DictSelector(key='data')),
('vect', self.data_vectorizer),
])),
('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),
('comments', Pipeline([
('selector', DictSelector(key='comments')),
('tfidf', self.comments_vectorizer),
])),
],
)),
])
self.clf = xgboost.XGBClassifier(n_jobs=16)
self.clf.set_params(tree_method='exact', predictor='cpu_predictor')
def get_feature_names(self):
    """Return the feature names of the three vectorizers as one list.

    Names are prefixed with 'data_', 'title_' and 'comments_' respectively and
    appear in that order.
    """
    sources = (
        ('data_', self.data_vectorizer),
        ('title_', self.title_vectorizer),
        ('comments_', self.comments_vectorizer),
    )

    names = []
    for prefix, vectorizer in sources:
        names.extend(prefix + name for name in vectorizer.get_feature_names())
    return names
def overwrite_classes(self, bugs, classes, probabilities):
    """Force the prediction of bugs whose metadata already settles the answer.

    Args:
        bugs: iterable of bug dicts; each has a 'keywords' entry and may have
            a 'cf_has_regression_range' entry.
        classes: mutable, indexable predictions aligned with ``bugs``; it is
            updated in place.
        probabilities: truthy when ``classes`` holds per-class probability
            pairs rather than class indices.

    Returns:
        The same ``classes`` object, after the overrides.
    """
    for i, bug in enumerate(bugs):
        keywords = bug['keywords']
        if 'regression' in keywords or 'talos-regression' in keywords or ('cf_has_regression_range' in bug and bug['cf_has_regression_range'] == 'yes'):
            # Positive class (index 1). A probability pair is ordered by class
            # index, so certainty for class 1 goes in the second position.
            classes[i] = 1 if not probabilities else [0., 1.]
        elif 'feature' in keywords:
            # Negative class (index 0): certainty goes in the first position.
            classes[i] = 0 if not probabilities else [1., 0.]

    return classes
def get_labels(self):
    """Return the labels of this model: ``get_bugbug_labels`` called with 'regression'."""
    regression_labels = self.get_bugbug_labels('regression')
    return regression_labels

Просмотреть файл

@ -9,7 +9,7 @@ from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from bugbug import bug_features
from bugbug import labels
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector
@ -18,8 +18,6 @@ class TrackingModel(Model):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
self.classes = labels.get_tracking_labels()
feature_extractors = [
bug_features.has_str(),
bug_features.has_regression_range(),
@ -65,6 +63,25 @@ class TrackingModel(Model):
self.clf = xgboost.XGBClassifier(n_jobs=16)
def get_labels(self):
    """Build the ``{bug_id: bool}`` labels of this model from tracking-flag history.

    For every history change whose field name starts with
    'cf_tracking_firefox', an added value of 'blocking' or '+' sets the label
    to True and an added value of '-' sets it to False; later changes
    overwrite earlier ones. Bugs without any such change are labelled False.
    """
    tracking = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for event in bug['history']:
            for delta in event['changes']:
                if not delta['field_name'].startswith('cf_tracking_firefox'):
                    continue

                added = delta['added']
                if added in ['blocking', '+']:
                    tracking[key] = True
                elif added == '-':
                    tracking[key] = False

        # No tracking decision recorded for this bug: label it False.
        tracking.setdefault(key, False)

    return tracking
def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\

Просмотреть файл

@ -9,7 +9,7 @@ from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from bugbug import bug_features
from bugbug import labels
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector
@ -18,8 +18,6 @@ class UpliftModel(Model):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
self.classes = labels.get_uplift_labels()
feature_extractors = [
bug_features.has_str(),
bug_features.has_regression_range(),
@ -65,6 +63,24 @@ class UpliftModel(Model):
self.clf = xgboost.XGBClassifier(n_jobs=16)
def get_labels(self):
    """Build the ``{bug_id: bool}`` labels of this model from attachment approval flags.

    Only attachment flags whose name starts with 'approval-mozilla-' and whose
    status is '+' or '-' are considered: '+' yields True, '-' yields False,
    and later flags overwrite earlier ones. Bugs without such a flag get no
    label at all.
    """
    uplift = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        for attachment in bug['attachments']:
            for flag in attachment['flags']:
                is_approval = flag['name'].startswith('approval-mozilla-')
                if is_approval and flag['status'] in ['+', '-']:
                    uplift[key] = flag['status'] == '+'

    return uplift
def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\

19
tests/test_bug.py Normal file
Просмотреть файл

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.bug import BugModel
def test_get_tracking_labels():
    """Check a handful of known labels returned by ``BugModel.get_labels``."""
    # NOTE(review): the name mentions tracking labels but the body exercises
    # BugModel; this looks like a copy-paste leftover. Confirm before renaming.
    labels_by_bug = BugModel().get_labels()

    # labels from bug_nobug.csv
    assert labels_by_bug[1087488]
    assert not labels_by_bug[1101825]

    # labels from regression_bug_nobug.csv
    assert not labels_by_bug[447581]  # nobug
    assert labels_by_bug[518272]  # regression
    assert labels_by_bug[528988]  # bug_unknown_regression
    assert labels_by_bug[1037762]  # bug_no_regression

Просмотреть файл

@ -3,22 +3,16 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import os
from bugbug import labels
def test_get_tracking_labels():
    """Check one negative and one positive label from ``labels.get_tracking_labels``."""
    tracking = labels.get_tracking_labels()

    assert not tracking[1101825]
    assert tracking[1042096]
def test_get_labels_dir():
    """Check that the labels directory is an absolute path ending in 'labels'."""
    labels_dir = labels.get_labels_dir()

    assert os.path.isabs(labels_dir)
    assert labels_dir.endswith('labels')
def test_get_labels():
    """Check a handful of known labels returned by ``labels.get_bugbug_labels``."""
    labels_by_bug = labels.get_bugbug_labels()

    # labels from bug_nobug.csv
    assert labels_by_bug[1087488]
    assert not labels_by_bug[1101825]

    # labels from regression_bug_nobug.csv
    assert not labels_by_bug[447581]  # nobug
    assert labels_by_bug[518272]  # regression
    assert labels_by_bug[528988]  # bug_unknown_regression
    assert labels_by_bug[1037762]  # bug_no_regression
def test_get_all_bug_ids():
# Smoke test: only calls labels.get_all_bug_ids(); the result is not inspected.
labels.get_all_bug_ids()

13
tests/test_tracking.py Normal file
Просмотреть файл

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.tracking import TrackingModel
def test_get_tracking_labels():
    """Check one negative and one positive label from ``TrackingModel.get_labels``."""
    tracking = TrackingModel().get_labels()

    assert not tracking[1101825]
    assert tracking[1042096]