* adding automl.score

* fixing the metric name in train_with_config

* adding pickle after score

* fixing a bug in automl.pickle
Xueqing Liu 2022-03-25 17:00:08 -04:00 committed by GitHub
Parent 1d029436e7
Commit 5f97532986
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 375 additions and 39 deletions
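A minimal usage sketch of the API surface this commit adds (`automl.score` and the fixed `automl.pickle`); the dataset, time budget, and metric below are illustrative and not part of the commit:

```python
from flaml import AutoML
import pickle
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Illustrative data; any classification dataset works the same way.
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10, metric="accuracy")

# New in this commit: score the best trained estimator on held-out data.
print(automl.score(X_val, y_val, metric="accuracy"))

# New in this commit: automl.pickle strips per-estimator training functions
# before dumping, so the object round-trips through pickle.
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
    automl = pickle.load(f)
```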

4
.gitignore vendored
View file

@ -156,3 +156,7 @@ automl.pkl
.idea/*
.DS_Store
test/nlp/testtmp.py
test/nlp/testtmpfl.py

View file

@ -246,11 +246,6 @@ class AutoMLState:
* sample_size
/ state.data_size[0]
)
# raise Exception("bbbbb", state.time_budget, budget)
if _is_nlp_task(state.task):
state.fit_kwargs["X_val"] = state.X_val
state.fit_kwargs["y_val"] = state.y_val
(
trained_estimator,
@ -344,7 +339,7 @@ class AutoMLState:
estimator_class=self.learner_classes.get(estimator),
budget=budget,
fit_kwargs=self.fit_kwargs,
eval_metric="train_time",
eval_metric=self.metric if hasattr(self, "metric") else "train_time",
)
if sampled_weight is not None:
@ -699,6 +694,16 @@ class AutoML(BaseEstimator):
"""Time taken to find best model in seconds."""
return self.__dict__.get("_time_taken_best_iter")
def score(self, X: pd.DataFrame, y: pd.Series, **kwargs):
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
return None
X = self._preprocess(X)
return estimator.score(X, y, **kwargs)
def predict(
self,
X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
@ -1259,7 +1264,7 @@ class AutoML(BaseEstimator):
record_id: An integer of the record ID in the file,
0 corresponds to the first trial.
task: A string of the task type,
'binary', 'multi', 'regression', 'ts_forecast', 'rank'.
'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'.
Returns:
An estimator object for the given configuration.
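As a hedged sketch (mirroring the test update near the end of this diff; the log file path here is illustrative), the corrected task string is used like this:

```python
from flaml import AutoML

automl = AutoML()
# Rebuild the estimator recorded as the first trial in an existing search log;
# the task is now spelled "multiclass" rather than "multi".
estimator = automl.get_estimator_from_log(
    "test/iris.log", record_id=0, task="multiclass"
)
print(estimator)
```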
@ -1645,8 +1650,12 @@ class AutoML(BaseEstimator):
estimator_to_training_function = {}
for estimator in self.estimator_list:
search_state = self._search_states[estimator]
estimator_to_training_function[estimator] = search_state.training_function
del search_state.training_function
if hasattr(search_state, "training_function"):
estimator_to_training_function[
estimator
] = search_state.training_function
del search_state.training_function
with open(output_file_name, "wb") as f:
pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
@ -1781,7 +1790,7 @@ class AutoML(BaseEstimator):
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the follwing signature:
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
@ -2114,7 +2123,7 @@ class AutoML(BaseEstimator):
metric = load_default_huggingface_metric_for_task(self._state.task)
elif "binary" in self._state.task:
metric = "roc_auc"
elif "multi" in self._state.task:
elif "multiclass" in self._state.task:
metric = "log_loss"
elif self._state.task in TS_FORECAST:
metric = "mape"
@ -2838,7 +2847,7 @@ class AutoML(BaseEstimator):
estimators = []
if self._ensemble and self._state.task in (
"binary",
"multi",
"multiclass",
"regression",
):
search_states = list(

View file

@ -18,7 +18,7 @@ MULTICHOICECLASSIFICATION = "multichoice-classification"
TOKENCLASSIFICATION = "token-classification"
CLASSIFICATION = (
"binary",
"multi",
"multiclass",
"classification",
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,

View file

@ -25,7 +25,7 @@ def load_config_predictor(estimator_name, task, location=None):
predictor = CONFIG_PREDICTORS.get(key)
if predictor:
return predictor
task = "multiclass" if task == "multi" else task
task = "multiclass" if task == "multi" else task # TODO: multi -> multiclass?
try:
location = location or LOCATION
with open(f"{location}/{estimator_name}/{task}.json", "r") as f:

View file

@ -219,6 +219,13 @@ def is_in_sklearn_metric_name_set(metric_name):
return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
def is_min_metric(metric_name):
return (
metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
or huggingface_metric_to_mode.get(metric_name, None) == "min"
)
def sklearn_metric_loss_score(
metric_name,
y_predict,
@ -565,6 +572,8 @@ def compute_estimator(
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
fit_kwargs["X_val"] = X_val
fit_kwargs["y_val"] = y_val
if "holdout" == eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
@ -633,7 +642,7 @@ def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = "binary"
else:
objective_name = "multi"
objective_name = "multiclass"
return objective_name
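A small sketch of the new is_min_metric helper, which the estimator-level score method further down uses to decide whether to report the loss directly or 1 - loss:

```python
from flaml.ml import is_min_metric

# "mse" is in the minimization set, so score() reports the loss itself.
assert is_min_metric("mse")
# "accuracy" is a maximization metric, so score() reports 1 - loss instead.
assert not is_min_metric("accuracy")
```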

View file

@ -88,7 +88,9 @@ class BaseEstimator:
Args:
task: A string of the task type, one of
'binary', 'multi', 'regression', 'rank', 'forecast'.
'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
'seq-regression', 'token-classification', 'multichoice-classification',
'summarization', 'ts_forecast', 'ts_forecast_classification'.
config: A dictionary containing the hyperparameter names, 'n_jobs' as keys.
n_jobs is the number of parallel threads.
"""
@ -234,6 +236,56 @@ class BaseEstimator:
X = self._preprocess(X)
return self._model.predict_proba(X)
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
"""Report the evaluation score of a trained estimator.
Args:
X_val: A pandas dataframe of the validation input data.
y_val: A pandas series of the validation label.
kwargs: keyword argument of the evaluation function, for example:
- metric: A string of the metric name or a function
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If metric is given, the score will report the user specified metric.
If metric is not given, the metric is set to accuracy for classification and r2
for regression.
You can also pass a customized metric function; for examples of how to do so, see
[test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
[test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).
Returns:
The evaluation score on the validation dataset.
"""
from .ml import metric_loss_score
from .ml import is_min_metric
if self._model is not None:
if self._task == "rank":
raise NotImplementedError(
"AutoML.score() is not implemented for ranking"
)
else:
X_val = self._preprocess(X_val)
metric = kwargs.get("metric", None)
if metric:
y_pred = self.predict(X_val, **kwargs)
if is_min_metric(metric):
return metric_loss_score(metric, y_pred, y_val)
else:
return 1.0 - metric_loss_score(metric, y_pred, y_val)
else:
return self._model.score(X_val, y_val, **kwargs)
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
return 0.0
def cleanup(self):
del self._model
self._model = None
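A hedged illustration of the two branches described in the score docstring above, assuming an automl object already fit on a regression task and that automl.model exposes the wrapped best estimator (an assumption about the surrounding API, not something this commit changes):

```python
# automl.model is assumed to be the trained BaseEstimator wrapper chosen by fit().
best = automl.model

# "mse" is a minimization metric, so the reported value is the loss itself.
val_mse = best.score(X_val, y_val, metric="mse")

# "r2" is a maximization metric, so the reported value is 1 - loss, i.e. R^2.
val_r2 = best.score(X_val, y_val, metric="r2")

# With no metric, the underlying model's own score() is used (R^2 for regressors).
default_score = best.score(X_val, y_val)
```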
@ -244,7 +296,7 @@ class BaseEstimator:
Args:
data_size: A tuple of two integers, number of rows and columns.
task: A str of the task type, e.g., "binary", "multi", "regression".
task: A str of the task type, e.g., "binary", "multiclass", "regression".
Returns:
A dictionary of the search space.
@ -518,7 +570,6 @@ class TransformersEstimator(BaseEstimator):
else self.hf_args.model_path,
self._task,
)
self._metric = kwargs["metric"]
try:
@ -720,15 +771,11 @@ class TransformersEstimator(BaseEstimator):
metric_dict["automl_metric"] = loss
return metric_dict
def _init_model_for_predict(self, X_test):
from datasets import Dataset
def _init_model_for_predict(self):
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForPredict
from .nlp.utils import load_model
X_test, _ = self._preprocess(X_test, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
this_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
@ -750,25 +797,56 @@ class TransformersEstimator(BaseEstimator):
)
if self._task in NLG_TASKS:
setattr(new_trainer, "_is_seq2seq", True)
return new_trainer, test_dataset, training_args
return new_trainer, training_args
def predict_proba(self, X, **kwargs):
from datasets import Dataset
self._update_hf_args(kwargs)
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
new_trainer, test_dataset, _ = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, _ = self._init_model_for_predict()
predictions = new_trainer.predict(test_dataset)
return predictions.predictions
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._metric = kwargs["metric"]
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
new_trainer, training_args = self._init_model_for_predict()
return new_trainer.evaluate(eval_dataset)
def predict(self, X, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._update_hf_args(kwargs)
new_trainer, test_dataset, training_args = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, training_args = self._init_model_for_predict()
if self._task not in NLG_TASKS:
predictions = new_trainer.predict(test_dataset)
@ -1677,6 +1755,17 @@ class Prophet(SKLearnEstimator):
)
return np.ones(X.shape[0])
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
from sklearn.metrics import r2_score
from .ml import metric_loss_score
y_pred = self.predict(X_val)
self._metric = kwargs.get("metric", None)
if self._metric:
return metric_loss_score(self._metric, y_pred, y_val)
else:
return r2_score(y_pred, y_val)
class ARIMA(Prophet):
"""The class for tuning ARIMA."""

View file

@ -128,9 +128,9 @@
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter->flaml[notebook]) (0.8.2)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (21.2.0)\n",
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (0.18.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n",
"\u001b[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
"\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\n",
"\u001B[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@ -863,7 +863,7 @@
" \n",
" Args:\n",
" task: A string of the task type, one of\n",
" 'binary', 'multi', 'regression'\n",
" 'binary', 'multiclass', 'regression'\n",
" config: A dictionary containing the hyperparameter names\n",
" and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n",
" '''\n",
@ -1283,4 +1283,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -203,7 +203,7 @@ class TestMultiClass(unittest.TestCase):
print(automl_experiment.best_estimator)
automl_experiment = AutoML()
estimator = automl_experiment.get_estimator_from_log(
automl_settings["log_file_name"], record_id=0, task="multi"
automl_settings["log_file_name"], record_id=0, task="multiclass"
)
print(estimator)
(

218
test/automl/test_score.py Normal file
View file

@ -0,0 +1,218 @@
from flaml import AutoML
import pandas as pd
from sklearn.datasets import fetch_california_housing, fetch_openml
class TestScore:
def test_forecast(self, budget=5):
import pickle
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
data = (
data.fillna(data.bfill())
.to_frame()
.reset_index()
.rename(columns={"index": "ds", "co2": "y"})
)
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_test = data[split_idx:]["ds"]
y_test = data[split_idx:]["y"]
df = data[:split_idx]
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": "mape", # primary metric
"task": "ts_forecast", # task type
"log_file_name": "test/CO2_forecast.log", # flaml log file
"eval_method": "holdout",
"label": "y",
}
"""The main flaml automl API"""
try:
import prophet
automl.fit(
dataframe=df,
estimator_list=["prophet", "arima", "sarimax"],
**settings,
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
except ImportError:
print("not using prophet due to ImportError")
automl.fit(
dataframe=df,
**settings,
estimator_list=["arima", "sarimax"],
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
def test_classification(self):
X = pd.DataFrame(
{
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
"f2": [
3.0,
16.0,
10.0,
12.0,
3.0,
14.0,
11.0,
12.0,
5.0,
14.0,
20.0,
16.0,
15.0,
11.0,
],
"f3": [
"a",
"b",
"a",
"c",
"c",
"b",
"b",
"b",
"b",
"a",
"b",
1.0,
1.0,
"a",
],
"f4": [
True,
True,
False,
True,
True,
False,
False,
False,
True,
True,
False,
False,
True,
True,
],
}
)
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
for each_estimator in [
"catboost",
"lrl2",
"lrl1",
"rf",
"lgbm",
"extra_tree",
"kneighbor",
"xgboost",
]:
automl_settings = {
"time_budget": 6,
"task": "classification",
"n_jobs": 1,
"estimator_list": [each_estimator],
"metric": "accuracy",
"log_training_metric": True,
}
automl.score(X, y) # for covering the case no estimator is trained
automl.fit(X, y, **automl_settings)
automl.score(X, y)
automl.score(X, y, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
def test_regression(self):
automl_experiment = AutoML()
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
for each_estimator in [
"lgbm",
"xgboost",
"rf",
"extra_tree",
"catboost",
"kneighbor",
]:
automl_settings = {
"time_budget": 2,
"task": "regression",
"log_file_name": "test/california.log",
"log_training_metric": True,
"estimator_list": [each_estimator],
"n_jobs": 1,
"model_history": True,
}
automl_experiment.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings,
)
automl_experiment.score(X_train[n:], y_train[n:], **{"metric": "mse"})
automl_experiment.pickle("automl.pkl")
def test_rank(self):
from sklearn.externals._arff import ArffException
dataset = "credit-g"
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
y = y.cat.codes
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
import numpy as np
automl = AutoML()
n = 500
for each_estimator in ["lgbm", "xgboost"]:
automl_settings = {
"time_budget": 2,
"task": "rank",
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"groups": np.array([0] * 200 + [1] * 200 + [2] * 100), # group labels
"learner_selector": "roundrobin",
"estimator_list": [each_estimator],
}
automl.fit(X[:n], y[:n], **automl_settings)
try:
automl.score(X[n:], y[n:])
automl.pickle("automl.pkl")
except NotImplementedError:
pass
if __name__ == "__main__":
test = TestScore()
test.test_forecast()

View file

@ -102,6 +102,8 @@ def test_hf_data():
y_val=y_val,
**automl_settings
)
automl.score(X_val, y_val, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
except requests.exceptions.HTTPError:
return
@ -113,10 +115,6 @@ def test_hf_data():
record_id=0,
**automl_settings
)
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
automl.predict(X_test)
automl.predict(["test test", "test test"])
automl.predict(
@ -183,8 +181,6 @@ def _test_custom_data():
]
)
import pickle
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:

View file

@ -19,7 +19,7 @@ def custom_metric(
from flaml.model import TransformersEstimator
if estimator._trainer is None:
trainer, _, _ = estimator._init_model_for_predict(X_test)
trainer, _ = estimator._init_model_for_predict()
estimator._trainer = None
else:
trainer = estimator._trainer
@ -93,6 +93,14 @@ def test_custom_metric():
# testing when max_iter=1 and do retrain only without hpo
try:
import ray
if not ray.is_initialized():
ray.init()
except ImportError:
return
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
@ -100,6 +108,7 @@ def test_custom_metric():
"task": "seq-classification",
"metric": custom_metric,
"log_file_name": "seqclass.log",
"use_ray": {"local_dir": "data/outut/"},
}
automl_settings["hf_args"] = {
@ -126,6 +135,8 @@ def test_custom_metric():
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.score(X_val, y_val, **{"metric": custom_metric})
automl.pickle("automl.pkl")
del automl