From 1bf5a8edf30b26e3fde6f0bc123c38b4adcb6e82 Mon Sep 17 00:00:00 2001 From: Marco Castelluccio Date: Sat, 12 Oct 2019 00:32:02 +0100 Subject: [PATCH] Make train/test split configurable by subclasses of Model Some models might want to implement a different way to split between train and test sets. For example, for test scheduling we'll need to use the more distant past as the training set, and the recent past as the test set. We can't mix them, or the evaluation would be invalid. --- bugbug/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index c6e13229..f19432dc 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -314,6 +314,9 @@ class Model: return feature_report + def train_test_split(self, X, y): + return train_test_split(X, y, test_size=0.1, random_state=0) + def train(self, importance_cutoff=0.15, limit=None): classes, self.class_names = self.get_labels() self.class_names = sort_class_names(self.class_names) @@ -337,9 +340,7 @@ class Model: is_binary = len(self.class_names) == 2 # Split dataset in training and test. - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=0 - ) + X_train, X_test, y_train, y_test = self.train_test_split(X, y) if self.sampler is not None: pipeline = make_pipeline(self.sampler, self.clf) else: