Perform under-sampling of the majority class

Former-commit-id: 8d3c7c3ba4
This commit is contained in:
Marco Castelluccio 2018-09-24 00:11:49 +01:00
Родитель f9c03d0f8f
Коммит 540b7ebaa7
2 изменённых файлов: 5 добавлений и 0 удалений

Просмотреть файл

@ -4,3 +4,4 @@ scikit-learn==0.19.2
xgboost==0.80
requests==2.19.1
numpy==1.15.2
imbalanced-learn==0.3.3

4
run.py
Просмотреть файл

@ -7,6 +7,7 @@ from typing import Dict
import numpy as np
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
@ -109,6 +110,9 @@ extraction_pipeline = Pipeline([
X = extraction_pipeline.fit_transform(bugs)
# Under-sample the 'bug' class, as there are too many compared to 'feature'.
# The sampler is seeded so that the same rows are kept on every run; without
# a seed the seeded train/test split below would still yield different data.
# NOTE(review): resampling happens before the split, so the test set is drawn
# from the under-sampled data too -- confirm that this is intended.
X, y = RandomUnderSampler(random_state=0).fit_sample(X, y)
# Split dataset in training and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print(X_train.shape, y_train.shape)