Make train/test split configurable by subclasses of Model

Some models might want to split the data into training and test sets in a different way. For example, for test scheduling we'll need to use the more distant past as the training set and the recent past as the test set. We can't mix them, or the evaluation will be invalid (the model would be evaluated on data older than some of the data it was trained on).
2019-10-12 00:32:02 +01:00 · 2019-10-12 00:32:02 +01:00 · 1bf5a8edf3
--- a/bugbug/model.py
+++ b/bugbug/model.py
@ -314,6 +314,9 @@ class Model:

        return feature_report

+    def train_test_split(self, X, y):
+        return train_test_split(X, y, test_size=0.1, random_state=0)
+
    def train(self, importance_cutoff=0.15, limit=None):
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)
@ -337,9 +340,7 @@ class Model:
        is_binary = len(self.class_names) == 2

        # Split dataset in training and test.
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.1, random_state=0
-        )
+        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else: