From 1bf5a8edf30b26e3fde6f0bc123c38b4adcb6e82 Mon Sep 17 00:00:00 2001
From: Marco Castelluccio <mcastelluccio@mozilla.com>
Date: Sat, 12 Oct 2019 00:32:02 +0100
Subject: [PATCH] Make train/test split configurable by subclasses of Model

Some models might want to implement a different way to split between
train and test sets. For example, for test scheduling we'll need to
use the distant past as the training set, and the recent past as the
test set. We can't mix them or the evaluation will be falsified.
---
 bugbug/model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bugbug/model.py b/bugbug/model.py
index c6e13229..f19432dc 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -314,6 +314,9 @@ class Model:
 
         return feature_report
 
+    def train_test_split(self, X, y):  # subclasses may override the default split
+        return train_test_split(X, y, test_size=0.1, random_state=0)
+
     def train(self, importance_cutoff=0.15, limit=None):
         classes, self.class_names = self.get_labels()
         self.class_names = sort_class_names(self.class_names)
@@ -337,9 +340,7 @@ class Model:
         is_binary = len(self.class_names) == 2
 
         # Split dataset in training and test.
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.1, random_state=0
-        )
+        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
         if self.sampler is not None:
             pipeline = make_pipeline(self.sampler, self.clf)
         else: