Perform under-sampling of the majority class

Former-commit-id: 8d3c7c3ba4
2018-09-24 00:11:49 +01:00 · 2018-09-24 00:11:49 +01:00 · 540b7ebaa7
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ scikit-learn==0.19.2
 xgboost==0.80
 requests==2.19.1
 numpy==1.15.2
+imbalanced-learn==0.3.3
--- a/run.py
+++ b/run.py
@@ -7,6 +7,7 @@ from typing import Dict

 import numpy as np
 import xgboost
+from imblearn.under_sampling import RandomUnderSampler
 from sklearn import metrics
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -109,6 +110,9 @@ extraction_pipeline = Pipeline([

 X = extraction_pipeline.fit_transform(bugs)

+# Under-sample the 'bug' class, as there are too many compared to 'feature'.
+X, y = RandomUnderSampler().fit_sample(X, y)
+
 # Split dataset in training and test.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
 print(X_train.shape, y_train.shape)