From 492990655d09a144d99e91de4dedda761a70302e Mon Sep 17 00:00:00 2001 From: "Chi Wang (MSR)" Date: Fri, 4 Dec 2020 09:40:27 -0800 Subject: [PATCH] v0.1.0 --- .coveragerc | 5 + .flake8 | 5 + .github/workflows/python-package.yml | 59 ++ .gitignore | 150 +++++ CODE_OF_CONDUCT.md | 9 + LICENSE | 21 + README.md | 123 ++++ SECURITY.md | 41 ++ flaml/__init__.py | 70 +++ flaml/automl.py | 897 +++++++++++++++++++++++++++ flaml/config.py | 31 + flaml/data.py | 256 ++++++++ flaml/ml.py | 241 +++++++ flaml/model.py | 515 +++++++++++++++ flaml/search.py | 675 ++++++++++++++++++++ flaml/space.py | 249 ++++++++ flaml/training_log.py | 168 +++++ flaml/version.py | 1 + notebook/flaml_demo.ipynb | 611 ++++++++++++++++++ settings.json | 4 + setup.py | 56 ++ test/__init__.py | 0 test/test_automl.py | 235 +++++++ test/test_split.py | 45 ++ test/test_version.py | 14 + 25 files changed, 4481 insertions(+) create mode 100644 .coveragerc create mode 100644 .flake8 create mode 100644 .github/workflows/python-package.yml create mode 100644 .gitignore create mode 100644 CODE_OF_CONDUCT.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 flaml/__init__.py create mode 100644 flaml/automl.py create mode 100644 flaml/config.py create mode 100644 flaml/data.py create mode 100644 flaml/ml.py create mode 100644 flaml/model.py create mode 100644 flaml/search.py create mode 100644 flaml/space.py create mode 100644 flaml/training_log.py create mode 100644 flaml/version.py create mode 100644 notebook/flaml_demo.ipynb create mode 100644 settings.json create mode 100644 setup.py create mode 100644 test/__init__.py create mode 100644 test/test_automl.py create mode 100644 test/test_split.py create mode 100644 test/test_version.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..ca97263d2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[run] +branch = True +source = flaml +omit = + *tests* \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..f35990a3d --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +ignore = E203, E266, E501, W503, F403, F401, C901 +max-line-length = 127 +max-complexity = 10 +select = B,C,E,F,W,T4,B9 \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..813197a9e --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,59 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: ['*'] + pull_request: + branches: ['*'] + +jobs: + build: + + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-2019] + python-version: [3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: If mac, install libomp to facilitate lgbm install + if: matrix.os == 'macOS-latest' + run: | + brew install libomp + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ + export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" + export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" + export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" + export LDFLAGS="$LDFLAGS 
-Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" + - name: Install packages and dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest coverage + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest test + - name: Coverage + run: | + coverage run -a -m pytest test + coverage xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + file: ./coverage.xml + flags: unittests \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b57bcdb36 --- /dev/null +++ b/.gitignore @@ -0,0 +1,150 @@ +# Project +/.vs +.vscode + +# Log files +*.log + +# Python virtualenv +.venv + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ +/catboost_info +notebook/*.pkl diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f9ba8cf65 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..9e841e7a2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/README.md b/README.md new file mode 100644 index 000000000..190064ff3 --- /dev/null +++ b/README.md @@ -0,0 +1,123 @@ +# FLAML - Fast and Lightweight AutoML + +FLAML is a Python library designed to automatically produce accurate machine +learning models with low computational cost. It frees users from selecting +learners and hyperparameters for each learner. It is fast and cheap. +The simple and lightweight design makes it easy to extend, such as +adding customized learners or metrics. FLAML is powered by a new, cost-effective +hyperparameter optimization and learner selection method invented by +Microsoft Research. +FLAML is easy to use: + +1. With three lines of code, you can start using this economical and fast +AutoML engine as a scikit-learn style estimator. +```python +from flaml import AutoML +automl = AutoML() +automl.fit(X_train, y_train, task="classification") +``` + +2. You can restrict the learners and use FLAML as a fast hyperparameter tuning +tool for XGBoost, LightGBM, Random Forest etc. or a customized learner. +```python +automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"]) +``` + +3. 
You can embed FLAML in self-tuning software for just-in-time tuning with +low latency & resource consumption. +```python +automl.fit(X_train, y_train, task="regression", time_budget=60) +``` + +## Installation + +FLAML requires **Python version >= 3.6**. It can be installed from pip: + +```bash +pip install flaml +``` + +To run the [`notebook example`](https://github.com/microsoft/FLAML/tree/main/notebook), +install flaml with the [notebook] option: + +```bash +pip install flaml[notebook] +``` + +## Examples + +A basic classification example. + +```python +from flaml import AutoML +from sklearn.datasets import load_iris +# Initialize the FLAML learner. +automl = AutoML() +# Provide configurations. +automl_settings = { + "time_budget": 10, # in seconds + "metric": 'accuracy', + "task": 'classification', + "log_file_name": "test/iris.log", +} +X_train, y_train = load_iris(return_X_y=True) +# Train with labeled input data. +automl.fit(X_train=X_train, y_train=y_train, + **automl_settings) +# Predict +print(automl.predict_proba(X_train)) +# Export the best model. +print(automl.model) +``` + +A basic regression example. + +```python +from flaml import AutoML +from sklearn.datasets import load_boston +# Initialize the FLAML learner. +automl = AutoML() +# Provide configurations. +automl_settings = { + "time_budget": 10, # in seconds + "metric": 'r2', + "task": 'regression', + "log_file_name": "test/boston.log", +} +X_train, y_train = load_boston(return_X_y=True) +# Train with labeled input data. +automl.fit(X_train=X_train, y_train=y_train, + **automl_settings) +# Predict +print(automl.predict(X_train)) +# Export the best model. +print(automl.model) +``` + +More examples: see the [notebook](https://github.com/microsoft/FLAML/tree/main/notebook/flaml_demo.ipynb) + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit . + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Authors + +* Chi Wang +* Qingyun Wu +* Erkang Zhu + +Contributors: Markus Weimer, Silu Huang, Haozhe Zhang, Alex Deng. + +## License + +[MIT License](LICENSE) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..f7b89984f --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
+ +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
+ + \ No newline at end of file diff --git a/flaml/__init__.py b/flaml/__init__.py new file mode 100644 index 000000000..2de9646dc --- /dev/null +++ b/flaml/__init__.py @@ -0,0 +1,70 @@ +from flaml.automl import AutoML +from flaml.model import BaseEstimator +from flaml.data import get_output_from_log + +from flaml.version import __version__ + +import logging +from os.path import join, exists +import datetime as dt +from os import listdir, remove, mkdir +import pathlib +import json + +root = pathlib.Path(__file__).parent.parent.absolute() +jsonfilepath = join(root, "settings.json") + +with open(jsonfilepath) as f: + settings = json.load(f) + +logging_level = settings["logging_level"] + +if logging_level == "info": + logging_level = logging.INFO +elif logging_level == "debug": + logging_level = logging.DEBUG +elif logging_level == "error": + logging_level = logging.ERROR +elif logging_level == "warning": + logging_level = logging.WARNING +elif logging_level == "critical": + logging_level = logging.CRITICAL +else: + logging_level = logging.NOTSET + +keep_max_logfiles = settings["keep_max_logfiles"] + +log_dir = join(root, "logs") + +if not exists(log_dir): + mkdir(log_dir) + +del_logs = sorted([int(x.split("_")[0]) for x in listdir(log_dir) if ".log" in + x], reverse=True)[keep_max_logfiles:] + +for l in del_logs: + try: + remove(join(log_dir, str(l) + "_flaml.log")) + except Exception as e: + continue + +b = dt.datetime.now() +a = dt.datetime(2020, 4, 1, 0, 0, 0) +secs = int((b-a).total_seconds()) +name = str(secs) + +logger = logging.getLogger(__name__) +logger.setLevel(logging_level) +fh = logging.FileHandler(join(log_dir, name + "_" + __name__ + ".log")) +fh.setLevel(logging_level) +ch = logging.StreamHandler() +ch.setLevel(logging_level) +# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') +formatter = logging.Formatter( + '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s', + '%m-%d %H:%M:%S') +ch.setFormatter(formatter) +fh.setFormatter(formatter) +logger.addHandler(ch) +logger.addHandler(fh) +logger.propagate = True diff --git a/flaml/automl.py b/flaml/automl.py new file mode 100644 index 000000000..d9f44f9d6 --- /dev/null +++ b/flaml/automl.py @@ -0,0 +1,897 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the + * project root for license information. +''' +import time +import warnings +from functools import partial +import ast +import numpy as np +import scipy.sparse +from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \ + RepeatedKFold +from sklearn.utils import shuffle +import pandas as pd + +from .ml import compute_estimator, train_estimator, get_classification_objective +from .config import MIN_SAMPLE_TRAIN, MEM_THRES, ETI_INI, \ + SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS +from .data import concat +from .search import ParamSearch +from .training_log import training_log_reader, training_log_writer + +import logging +logger = logging.getLogger(__name__) + + +class AutoML: + '''The AutoML class + + Attributes: + model: An object with predict() and predict_proba() method (for + classification), storing the best trained model. 
+ model_history: A dictionary of iter->model, storing the models when + the best model is updated each time + config_history: A dictionary of iter->(estimator, config, time), + storing the best estimator, config, and the time when the best + model is updated each time + classes_: A list of n_classes elements for class labels + best_iteration: An integer of the iteration number where the best + config is found + best_estimator: A string indicating the best estimator found. + best_config: A dictionary of the best configuration. + best_config_train_time: A float of the seconds taken by training the + best config + + Typical usage example: + + automl = AutoML() + automl_settings = { + "time_budget": 60, + "metric": 'accuracy', + "task": 'classification', + "log_file_name": 'test/mylog.log', + } + automl.fit(X_train = X_train, y_train = y_train, + **automl_settings) + ''' + + def __init__(self): + self._eti_ini = ETI_INI + self._custom_learners = {} + self._config_space_info = {} + self._custom_size_estimate = {} + self._track_iter = 0 + + @property + def model_history(self): + return self._model_history + + @property + def config_history(self): + return self._config_history + + @property + def model(self): + if self._trained_estimator: + return self._trained_estimator.model + else: + return None + + @property + def best_estimator(self): + return self._best_estimator + + @property + def best_iteration(self): + return self._best_iteration + + @property + def best_config(self): + return self._selected.best_config[0] + + @property + def best_loss(self): + return self._best_loss + + @property + def best_config_train_time(self): + return self.best_train_time + + @property + def classes_(self): + if self.label_transformer: + return self.label_transformer.classes_.tolist() + if self._trained_estimator: + return self._trained_estimator.model.classes_.tolist() + return None + + def predict(self, X_test): + '''Predict label from features. + + Args: + X_test: A numpy array of featurized instances, shape n*m. + + Returns: + A numpy array of shape n*1 -- each element is a predicted class + label for an instance. + ''' + X_test = self.preprocess(X_test) + y_pred = self._trained_estimator.predict(X_test) + if y_pred.ndim > 1: + y_pred = y_pred.flatten() + if self.label_transformer: + return self.label_transformer.inverse_transform(pd.Series( + y_pred)) + else: + return y_pred + + def predict_proba(self, X_test): + '''Predict the probability of each class from features, only works for + classification problems. + + Args: + X_test: A numpy array of featurized instances, shape n*m. + + Returns: + A numpy array of shape n*c. c is the # classes. Each element at + (i,j) is the probability for instance i to be in class j. 
+ ''' + X_test = self.preprocess(X_test) + proba = self._trained_estimator.predict_proba(X_test) + return proba + + def preprocess(self, X): + if scipy.sparse.issparse(X): + X = X.tocsr() + if self.transformer: + X = self.transformer.transform(X) + return X + + def _validate_data(self, X_train_all, y_train_all, dataframe, label, + X_val=None, y_val=None): + if X_train_all is not None and y_train_all is not None: + if not (isinstance(X_train_all, np.ndarray) + or scipy.sparse.issparse(X_train_all) + or isinstance(X_train_all, pd.DataFrame) + ): + raise ValueError( + "X_train_all must be a numpy array, a pandas dataframe, " + "or Scipy sparse matrix.") + if not (isinstance(y_train_all, np.ndarray) + or isinstance(y_train_all, pd.Series)): + raise ValueError( + "y_train_all must be a numpy array or a pandas series.") + if X_train_all.size == 0 or y_train_all.size == 0: + raise ValueError("Input data must not be empty.") + if isinstance(y_train_all, np.ndarray): + y_train_all = y_train_all.flatten() + if X_train_all.shape[0] != y_train_all.shape[0]: + raise ValueError( + "# rows in X_train must match length of y_train.") + self.df = isinstance(X_train_all, pd.DataFrame) + self.nrow, self.ndim = X_train_all.shape + X, y = X_train_all, y_train_all + elif dataframe is not None and label is not None: + if not isinstance(dataframe, pd.DataFrame): + raise ValueError("dataframe must be a pandas DataFrame") + if not label in dataframe.columns: + raise ValueError("label must a column name in dataframe") + self.df = True + self.dataframe, self.label = dataframe, label + X = dataframe.drop(columns=label) + self.nrow, self.ndim = X.shape + y = dataframe[label] + else: + raise ValueError( + "either X_train_all+y_train_all or dataframe+label need to be provided.") + if scipy.sparse.issparse(X_train_all): + self.transformer = self.label_transformer = False + self.X_train_all, self.y_train_all = X, y + else: + from .data import DataTransformer + self.transformer = DataTransformer() + self.X_train_all, self.y_train_all = self.transformer.fit_transform( + X, y, self.task) + self.label_transformer = self.transformer.label_transformer + + if X_val is not None and y_val is not None: + if not (isinstance(X_val, np.ndarray) + or scipy.sparse.issparse(X_val) + or isinstance(X_val, pd.DataFrame) + ): + raise ValueError( + "X_val must be None, a numpy array, a pandas dataframe, " + "or Scipy sparse matrix.") + if not (isinstance(y_val, np.ndarray) + or isinstance(y_val, pd.Series)): + raise ValueError( + "y_val must be None, a numpy array or a pandas series.") + if X_val.size == 0 or y_val.size == 0: + raise ValueError( + "Validation data are expected to be nonempty. 
" + "Use None for X_val and y_val if no validation data.") + if isinstance(y_val, np.ndarray): + y_val = y_val.flatten() + if X_val.shape[0] != y_val.shape[0]: + raise ValueError( + "# rows in X_val must match length of y_val.") + if self.transformer: + self.X_val = self.transformer.transform(X_val) + else: + self.X_val = X_val + if self.label_transformer: + self.y_val = self.label_transformer.transform(y_val) + else: + self.y_val = y_val + else: + self.X_val = self.y_val = None + + def _prepare_data(self, + eval_method, + split_ratio, + n_splits): + X_val, y_val = self.X_val, self.y_val + if scipy.sparse.issparse(X_val): + X_val = X_val.tocsr() + X_train_all, y_train_all = self.X_train_all, self.y_train_all + if scipy.sparse.issparse(X_train_all): + X_train_all = X_train_all.tocsr() + + if self.task != 'regression': + # logger.info(f"label {pd.unique(y_train_all)}") + label_set, counts = np.unique(y_train_all, return_counts=True) + # augment rare classes + rare_threshld = 20 + rare = counts < rare_threshld + rare_label, rare_counts = label_set[rare], counts[rare] + for i, label in enumerate(rare_label): + count = rare_count = rare_counts[i] + rare_index = y_train_all == label + n = len(y_train_all) + while count < rare_threshld: + if self.df: + X_train_all = concat(X_train_all, + X_train_all.iloc[:n].loc[rare_index]) + else: + X_train_all = concat(X_train_all, + X_train_all[:n][rare_index, :]) + if isinstance(y_train_all, pd.Series): + y_train_all = concat(y_train_all, + y_train_all.iloc[:n].loc[rare_index]) + else: + y_train_all = np.concatenate([y_train_all, + y_train_all[:n][rare_index]]) + count += rare_count + logger.debug( + f"class {label} augmented from {rare_count} to {count}") + X_train_all, y_train_all = shuffle( + X_train_all, y_train_all, random_state=202020) + if self.df: + X_train_all.reset_index(drop=True, inplace=True) + if isinstance(y_train_all, pd.Series): + y_train_all.reset_index(drop=True, inplace=True) + + X_train, y_train = X_train_all, y_train_all + if X_val is None: + if self.task != 'regression' and eval_method == 'holdout': + label_set, first = np.unique(y_train_all, return_index=True) + rest = [] + last = 0 + first.sort() + for i in range(len(first)): + rest.extend(range(last, first[i])) + last = first[i] + 1 + rest.extend(range(last, len(y_train_all))) + X_first = X_train_all.iloc[first] if self.df else X_train_all[ + first] + X_rest = X_train_all.iloc[rest] if self.df else X_train_all[rest] + y_rest = y_train_all.iloc[rest] if isinstance( + y_train_all, pd.Series) else y_train_all[rest] + stratify = y_rest if self.split_type == 'stratified' else None + X_train, X_val, y_train, y_val = train_test_split( + X_rest, + y_rest, + test_size=split_ratio, + stratify=stratify, + random_state=1) + X_train = concat(X_first, X_train) + y_train = concat(label_set, + y_train) if self.df else np.concatenate([label_set, y_train]) + X_val = concat(X_first, X_val) + y_val = concat(label_set, + y_val) if self.df else np.concatenate([label_set, y_val]) + _, y_train_counts_elements = np.unique(y_train, + return_counts=True) + _, y_val_counts_elements = np.unique(y_val, + return_counts=True) + logger.debug( + f"""{self.split_type} split for y_train \ + {y_train_counts_elements}, \ + y_val {y_val_counts_elements}""") + elif eval_method == 'holdout' and self.task == 'regression': + X_train, X_val, y_train, y_val = train_test_split( + X_train_all, + y_train_all, + test_size=split_ratio, + random_state=1) + self.data_size = X_train.shape[0] + self.X_train, self.y_train, self.X_val, 
self.y_val = ( + X_train, y_train, X_val, y_val) + if self.split_type == "stratified": + logger.info("Using StratifiedKFold") + self.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, + random_state=202020) + else: + logger.info("Using RepeatedKFold") + self.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, + random_state=202020) + + def prepare_sample_train_data(self, sample_size): + full_size = len(self.y_train) + if sample_size <= full_size: + if isinstance(self.X_train, pd.DataFrame): + sampled_X_train = self.X_train.iloc[:sample_size] + else: + sampled_X_train = self.X_train[:sample_size] + sampled_y_train = self.y_train[:sample_size] + else: + sampled_X_train = concat(self.X_train, self.X_val) + sampled_y_train = np.concatenate([self.y_train, self.y_val]) + return sampled_X_train, sampled_y_train + + def _compute_with_config_base(self, + metric, + compute_train_loss, + estimator, + config, + sample_size): + sampled_X_train, sampled_y_train = self.prepare_sample_train_data( + sample_size) + time_left = self.time_budget - self.time_from_start + budget = time_left if sample_size == self.data_size else \ + time_left / 2 * sample_size / self.data_size + return compute_estimator(sampled_X_train, + sampled_y_train, + self.X_val, + self.y_val, + budget, + self.kf, + config, + self.task, + estimator, + self.eval_method, + metric, + self._best_loss, + self.n_jobs, + self._custom_learners.get(estimator), + compute_train_loss) + + def _train_with_config(self, estimator, config, sample_size): + sampled_X_train, sampled_y_train = self.prepare_sample_train_data( + sample_size) + budget = None if self.time_budget is None else (self.time_budget + - self.time_from_start) + model, train_time = train_estimator( + sampled_X_train, + sampled_y_train, + config, + self.task, + estimator, + self.n_jobs, + self._custom_learners.get(estimator), + budget) + return model, train_time + + def add_learner(self, + learner_name, + learner_class, + size_estimate=lambda config: 'unknown', + cost_relative2lgbm=1): + '''Add a customized learner + + Args: + learner_name: A string of the learner's name + learner_class: A subclass of BaseEstimator + size_estimate: A function from a config to its memory size in float + cost_relative2lgbm: A float number for the training cost ratio with + respect to lightgbm (when both use the initial config) + ''' + self._custom_learners[learner_name] = learner_class + self._eti_ini[learner_name] = cost_relative2lgbm + self._config_space_info[learner_name] = \ + learner_class.params_configsearch_info + self._custom_size_estimate[learner_name] = size_estimate + + def get_estimator_from_log(self, log_file_name, record_id, objective): + '''Get the estimator from log file + + Args: + log_file_name: A string of the log file name + record_id: An integer of the record ID in the file, + 0 corresponds to the first trial + objective: A string of the objective name, + 'binary', 'multi', or 'regression' + + Returns: + An estimator object for the given configuration + ''' + + with training_log_reader(log_file_name) as reader: + record = reader.get_record(record_id) + estimator = record.learner + config = record.config + + estimator, _ = train_estimator( + None, None, config, objective, estimator, + estimator_class=self._custom_learners.get(estimator) + ) + return estimator + + def retrain_from_log(self, + log_file_name, + X_train=None, + y_train=None, + dataframe=None, + label=None, + time_budget=0, + task='classification', + eval_method='auto', + split_ratio=SPLIT_RATIO, + n_splits=N_SPLITS, 
+ split_type="stratified", + n_jobs=1, + train_best=True, + train_full=False, + record_id=-1): + '''Retrain from log file + + Args: + time_budget: A float number of the time budget in seconds + log_file_name: A string of the log file name + X_train: A numpy array of training data in shape n*m + y_train: A numpy array of labels in shape n*1 + task: A string of the task type, e.g., + 'classification', 'regression' + eval_method: A string of resampling strategy, one of + ['auto', 'cv', 'holdout'] + split_ratio: A float of the validation data percentage for holdout + n_splits: An integer of the number of folds for cross-validation + n_jobs: An integer of the number of threads for training + train_best: A boolean of whether to train the best config in the + time budget; if false, train the last config in the budget + train_full: A boolean of whether to train on the full data. If true, + eval_method and sample_size in the log file will be ignored + record_id: the ID of the training log record from which the model will + be retrained. By default `record_id = -1` which means this will be + ignored. `record_id = 0` corresponds to the first trial, and + when `record_id >= 0`, `time_budget` will be ignored. + ''' + self.task = task + self._validate_data(X_train, y_train, dataframe, label) + + logger.info('log file name {}'.format(log_file_name)) + + best_config = None + best_val_loss = float('+inf') + best_estimator = None + sample_size = None + time_used = 0.0 + training_duration = 0 + best = None + with training_log_reader(log_file_name) as reader: + if record_id >= 0: + best = reader.get_record(record_id) + else: + for record in reader.records(): + time_used = record.total_search_time + if time_used > time_budget: + break + training_duration = time_used + val_loss = record.validation_loss + if val_loss <= best_val_loss or not train_best: + if val_loss == best_val_loss and train_best: + size = record.sample_size + if size > sample_size: + best = record + best_val_loss = val_loss + sample_size = size + else: + best = record + size = record.sample_size + best_val_loss = val_loss + sample_size = size + if not training_duration: + from .model import BaseEstimator + self._trained_estimator = BaseEstimator() + self._trained_estimator.model = None + return training_duration + if not best: return + best_estimator = best.learner + best_config = best.config + sample_size = len(self.y_train_all) if train_full \ + else best.sample_size + + logger.info( + 'estimator = {}, config = {}, #training instances = {}'.format( + best_estimator, best_config, sample_size)) + # Partially copied from fit() function + # Initilize some attributes required for retrain_from_log + np.random.seed(0) + self.task = task + if self.task == 'classification': + self.task = get_classification_objective( + len(np.unique(self.y_train_all))) + assert split_type in ["stratified", "uniform"] + self.split_type = split_type + else: + self.split_type = "uniform" + if record_id >= 0: + eval_method = 'cv' + elif eval_method == 'auto': + eval_method = self._decide_eval_method(time_budget) + self.modelcount = 0 + self._prepare_data(eval_method, split_ratio, n_splits) + self.time_budget = None + self.n_jobs = n_jobs + self._trained_estimator = self._train_with_config( + best_estimator, best_config, sample_size)[0] + return training_duration + + def _decide_eval_method(self, time_budget): + if self.X_val is not None: + return 'holdout' + nrow, dim = self.nrow, self.ndim + if nrow * dim / 0.9 < SMALL_LARGE_THRES * ( + time_budget / 3600) and nrow < 
CV_HOLDOUT_THRESHOLD: + # time allows or sampling can be used and cv is necessary + return 'cv' + else: + return 'holdout' + + def fit(self, + X_train=None, + y_train=None, + dataframe=None, + label=None, + metric='auto', + task='classification', + n_jobs=-1, + log_file_name='default.log', + estimator_list='auto', + time_budget=60, + max_iter=1000000, + sample=True, + ensemble=False, + eval_method='auto', + log_type='better', + model_history=False, + split_ratio=SPLIT_RATIO, + n_splits=N_SPLITS, + log_training_metric=False, + mem_thres=MEM_THRES, + X_val=None, + y_val=None, + retrain_full=True, + split_type="stratified", + learner_selector='sample', + ): + '''Find a model for a given task + + Args: + X_train: A numpy array or a pandas dataframe of training data in + shape n*m + y_train: A numpy array or a pandas series of labels in shape n*1 + dataframe: A dataframe of training data including label column + label: A str of the label column name + Note: If X_train and y_train are provided, + dataframe and label are ignored; + If not, dataframe and label must be provided. + metric: A string of the metric name or a function, + e.g., 'accuracy','roc_auc','f1','log_loss','mae','mse','r2' + if passing a customized metric function, the function needs to + have the follwing signature + + def metric(X_test, y_test, estimator, labels, X_train, y_train): + return metric_to_minimize, metrics_to_log + + which returns a float number as the minimization objective, + and a tuple of floats as the metrics to log + task: A string of the task type, e.g., + 'classification', 'regression' + n_jobs: An integer of the number of threads for training + log_file_name: A string of the log file name + estimator_list: A list of strings for estimator names, or 'auto' + e.g., ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'] + time_budget: A float number of the time budget in seconds + max_iter: An integer of the maximal number of iterations + sample: A boolean of whether to sample the training data during + search + eval_method: A string of resampling strategy, one of + ['auto', 'cv', 'holdout'] + split_ratio: A float of the valiation data percentage for holdout + n_splits: An integer of the number of folds for cross-validation + log_type: A string of the log type, one of ['better', 'all', 'new'] + 'better' only logs configs with better loss than previos iters + 'all' logs all the tried configs + 'new' only logs non-redundant configs + model_history: A boolean of whether to keep the history of best + models in the history property. Make sure memory is large + enough if setting to True. + log_training_metric: A boolean of whether to log the training + metric for each model. 
+ mem_thres: A float of the memory size constraint in bytes + X_val: None | a numpy array or a pandas dataframe of validation data + y_val: None | a numpy array or a pandas series of validation labels + ''' + self.task = task + self._validate_data(X_train, y_train, dataframe, label, X_val, y_val) + self.start_time_flag = time.time() + np.random.seed(0) + self.learner_selector = learner_selector + + if self.task == 'classification': + self.task = get_classification_objective( + len(np.unique(self.y_train_all))) + assert split_type in ["stratified", "uniform"] + self.split_type = split_type + else: + self.split_type = "uniform" + + if 'auto' == estimator_list: + estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree'] + if 'regression' != self.task: + estimator_list += ['lrl1', ] + logger.info( + "List of ML learners in AutoML Run: {}".format(estimator_list)) + + if eval_method == 'auto' or self.X_val is not None: + eval_method = self._decide_eval_method(time_budget) + self.eval_method = eval_method + logger.info("Evaluation method: {}".format(eval_method)) + + self.retrain_full = retrain_full and (eval_method == 'holdout' + and self.X_val is None) + self.sample = sample and (eval_method != 'cv') + if 'auto' == metric: + if 'binary' in task: + metric = 'roc_auc' + elif 'multi' in task: + metric = 'log_loss' + else: + metric = 'r2' + if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']: + error_metric = f"1-{metric}" + elif isinstance(metric, str): + error_metric = metric + else: + error_metric = 'customized metric' + logger.info(f'Minimizing error metric: {error_metric}') + + with training_log_writer(log_file_name) as save_helper: + self.save_helper = save_helper + self._prepare_data(eval_method, split_ratio, n_splits) + self._compute_with_config = partial(AutoML._compute_with_config_base, + self, + metric, + log_training_metric) + self.time_budget = time_budget + self.estimator_list = estimator_list + self.ensemble = ensemble + self.max_iter = max_iter + self.mem_thres = mem_thres + self.log_type = log_type + self.split_ratio = split_ratio + self.save_model_history = model_history + self.n_jobs = n_jobs + self.search() + logger.info("fit succeeded") + + def search(self): + self.searchers = {} + # initialize the searchers + self.eti = [] + self._best_loss = float('+inf') + self.best_train_time = 0 + self.time_from_start = 0 + self.estimator_index = -1 + self._best_iteration = 0 + self._model_history = {} + self._config_history = {} + self.max_iter_per_learner = 10000 # TODO + self.iter_per_learner = dict([(e, 0) for e in self.estimator_list]) + self.fullsize = False + self._trained_estimator = None + if self.ensemble: + self.best_model = {} + for self._track_iter in range(self.max_iter): + if self.estimator_index == -1: + estimator = self.estimator_list[0] + else: + estimator = self._select_estimator(self.estimator_list) + if not estimator: + break + logger.info(f"iteration {self._track_iter}" + f" current learner {estimator}") + if estimator in self.searchers: + model = self.searchers[estimator].trained_estimator + improved = self.searchers[estimator].search1step( + global_best_loss=self._best_loss, + retrain_full=self.retrain_full, + mem_thres=self.mem_thres) + else: + model = improved = None + self.searchers[estimator] = ParamSearch( + estimator, + self.data_size, + self._compute_with_config, + self._train_with_config, + self.save_helper, + MIN_SAMPLE_TRAIN if self.sample else self.data_size, + self.task, + self.log_type, + self._config_space_info.get(estimator), + 
self._custom_size_estimate.get(estimator), + self.split_ratio) + self.searchers[estimator].search_begin(self.time_budget, + self.start_time_flag) + if self.estimator_index == -1: + eti_base = self._eti_ini[estimator] + self.eti.append( + self.searchers[estimator] + .expected_time_improvement_search()) + for e in self.estimator_list[1:]: + self.eti.append( + self._eti_ini[e] / eti_base * self.eti[0]) + self.estimator_index = 0 + self.time_from_start = time.time() - self.start_time_flag + # logger.info(f"{self.searchers[estimator].sample_size}, {data_size}") + if self.searchers[estimator].sample_size == self.data_size: + self.iter_per_learner[estimator] += 1 + if not self.fullsize: + self.fullsize = True + if self.searchers[estimator].best_loss < self._best_loss: + self._best_loss = self.searchers[estimator].best_loss + self._best_estimator = estimator + self.best_train_time = self.searchers[estimator].train_time + self._config_history[self._track_iter] = ( + estimator, + self.searchers[estimator].best_config[0], + self.time_from_start) + if self.save_model_history: + self._model_history[self._track_iter] = self.searchers[ + estimator].trained_estimator.model + elif self._trained_estimator: + del self._trained_estimator + self._trained_estimator = None + self._trained_estimator = self.searchers[ + estimator].trained_estimator + self._best_iteration = self._track_iter + if model and improved and not self.save_model_history: + model.cleanup() + + logger.info( + " at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format( + self.time_from_start, + estimator, + self.searchers[estimator].best_loss, + self._best_estimator, + self._best_loss)) + + if self.time_from_start >= self.time_budget: + break + if self.ensemble: + time_left = self.time_from_start - self.time_budget + time_ensemble = self.searchers[self._best_estimator].train_time + if time_left < time_ensemble < 2 * time_left: + break + if self.searchers[ + estimator].train_time > self.time_budget - self.time_from_start: + self.iter_per_learner[estimator] = self.max_iter_per_learner + + # Add a checkpoint for the current best config to the log. 
+ self.save_helper.checkpoint() + + if self.searchers: + self._selected = self.searchers[self._best_estimator] + self._trained_estimator = self._selected.trained_estimator + self.modelcount = sum(self.searchers[estimator].model_count + for estimator in self.searchers) + logger.info(self._trained_estimator.model) + if self.ensemble: + searchers = list(self.searchers.items()) + searchers.sort(key=lambda x: x[1].best_loss) + estimators = [(x[0], x[1].trained_estimator) for x in searchers[ + :2]] + estimators += [(x[0], x[1].trained_estimator) for x in searchers[ + 2:] if x[1].best_loss < 4 * self._selected.best_loss] + logger.info(estimators) + if self.task != "regression": + from sklearn.ensemble import StackingClassifier as Stacker + for e in estimators: + e[1]._estimator_type = 'classifier' + else: + from sklearn.ensemble import StackingRegressor as Stacker + best_m = self._trained_estimator + stacker = Stacker(estimators, best_m, n_jobs=self.n_jobs, + passthrough=True) + stacker.fit(self.X_train_all, self.y_train_all) + self._trained_estimator = stacker + self._trained_estimator.model = stacker + else: + self._selected = self._trained_estimator = None + self.modelcount = 0 + + def __del__(self): + if hasattr(self, '_trained_estimator') and self._trained_estimator \ + and hasattr(self._trained_estimator, 'cleanup'): + self._trained_estimator.cleanup() + del self._trained_estimator + + def _select_estimator(self, estimator_list): + time_left = self.time_budget - self.time_from_start + if self.best_train_time < time_left < 2 * self.best_train_time: + best_searcher = self.searchers[self._best_estimator] + config_sig = best_searcher.get_hist_config_sig( + best_searcher.sample_size_full, + best_searcher.best_config[0]) + if config_sig not in best_searcher.config_tried: + # trainAll + return self._best_estimator + if self.learner_selector == 'roundrobin': + self.estimator_index += 1 + if self.estimator_index == len(estimator_list): + self.estimator_index = 0 + return estimator_list[self.estimator_index] + min_expected_time, selected = np.Inf, None + inv = [] + for i, estimator in enumerate(estimator_list): + if estimator in self.searchers: + searcher = self.searchers[estimator] + if self.iter_per_learner[estimator] >= self.max_iter_per_learner: + inv.append(0) + continue + eti_searcher = min(2 * searcher.train_time, + searcher.expected_time_improvement_search()) + gap = searcher.best_loss - self._best_loss + if gap > 0 and not self.ensemble: + delta_loss = searcher.old_loss - searcher.new_loss + delta_time = searcher.old_loss_time + \ + searcher.new_loss_time - searcher.old_train_time + speed = delta_loss / float(delta_time) + try: + expected_time = max(gap / speed, searcher.train_time) + except ZeroDivisionError: + warnings.warn("ZeroDivisionError: need to debug ", + "speed: {0}, " + "old_loss: {1}, " + "new_loss: {2}" + .format(speed, + searcher.old_loss, + searcher.new_loss)) + expected_time = 0.0 + expected_time = 2 * max(expected_time, eti_searcher) + else: + expected_time = eti_searcher + if expected_time == 0: + expected_time = 1e-10 + inv.append(1 / expected_time) + else: + expected_time = self.eti[i] + inv.append(0) + if expected_time < min_expected_time: + min_expected_time = expected_time + selected = estimator + if len(self.searchers) < len(estimator_list) or not selected: + if selected not in self.searchers: + # print('select',selected,'eti',min_expected_time) + return selected + s = sum(inv) + p = np.random.random() + q = 0 + for i in range(len(inv)): + if inv[i]: + q += inv[i] / s 
+ if p < q: + return estimator_list[i] diff --git a/flaml/config.py b/flaml/config.py new file mode 100644 index 000000000..4785f7dd3 --- /dev/null +++ b/flaml/config.py @@ -0,0 +1,31 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + +N_SPLITS = 5 +RANDOM_SEED = 1 +SPLIT_RATIO = 0.1 +HISTORY_SIZE = 10000000 +MEM_THRES = 4*(1024**3) +SMALL_LARGE_THRES = 10000000 +MIN_SAMPLE_TRAIN = 10000 +MIN_SAMPLE_VAL = 10000 +CV_HOLDOUT_THRESHOLD = 100000 + +BASE_Const = 2 +BASE_LOWER_BOUND = 2**(0.01) + +ETI_INI = { + 'lgbm':1, + 'xgboost':1.6, + 'xgboost_nb':1.6, + 'rf':2, + 'lrl1':160, + 'lrl2':25, + 'linear_svc':16, + 'kneighbor':30, + 'catboost':15, + 'extra_tree':1.9, + 'nn':50, +} \ No newline at end of file diff --git a/flaml/data.py b/flaml/data.py new file mode 100644 index 000000000..ca24d05d9 --- /dev/null +++ b/flaml/data.py @@ -0,0 +1,256 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + +import numpy as np +from scipy.sparse import vstack, issparse +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from .training_log import training_log_reader + + +def load_openml_dataset(dataset_id, data_dir=None, random_state=0): + '''Load dataset from open ML. + + If the file is not cached locally, download it from open ML. + + Args: + dataset_id: An integer of the dataset id in openml + data_dir: A string of the path to store and load the data + random_state: An integer of the random seed for splitting data + + Returns: + X_train: A 2d numpy array of training data + X_test: A 2d numpy array of test data + y_train: A 1d numpy arrya of labels for training data + y_test: A 1d numpy arrya of labels for test data + ''' + import os + import openml + import pickle + from sklearn.model_selection import train_test_split + + filename = 'openml_ds' + str(dataset_id) + '.pkl' + filepath = os.path.join(data_dir, filename) + if os.path.isfile(filepath): + print('load dataset from', filepath) + with open(filepath, 'rb') as f: + dataset = pickle.load(f) + else: + print('download dataset from openml') + dataset = openml.datasets.get_dataset(dataset_id) + if not os.path.exists(data_dir): + os.makedirs(data_dir) + with open(filepath, 'wb') as f: + pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) + print('Dataset name:', dataset.name) + X, y, * \ + __ = dataset.get_data( + target=dataset.default_target_attribute, dataset_format='array') + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=random_state) + print( + 'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format( + X_train.shape, y_train.shape, X_test.shape, y_test.shape, + ) + ) + return X_train, X_test, y_train, y_test + + +def load_openml_task(task_id, data_dir): + '''Load task from open ML. + + Use the first fold of the task. + If the file is not cached locally, download it from open ML. 
+ + Args: + task_id: An integer of the task id in openml + data_dir: A string of the path to store and load the data + + Returns: + X_train: A 2d numpy array of training data + X_test: A 2d numpy array of test data + y_train: A 1d numpy arrya of labels for training data + y_test: A 1d numpy arrya of labels for test data + ''' + import os + import openml + import pickle + task = openml.tasks.get_task(task_id) + filename = 'openml_task' + str(task_id) + '.pkl' + filepath = os.path.join(data_dir, filename) + if os.path.isfile(filepath): + print('load dataset from', filepath) + with open(filepath, 'rb') as f: + dataset = pickle.load(f) + else: + print('download dataset from openml') + dataset = task.get_dataset() + with open(filepath, 'wb') as f: + pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) + X, y, _, _ = dataset.get_data(task.target_name, dataset_format='array') + train_indices, test_indices = task.get_train_test_split_indices( + repeat=0, + fold=0, + sample=0, + ) + X_train = X[train_indices] + y_train = y[train_indices] + X_test = X[test_indices] + y_test = y[test_indices] + print( + 'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format( + X_train.shape, y_train.shape, X_test.shape, y_test.shape, + ) + ) + return X_train, X_test, y_train, y_test + + +def get_output_from_log(filename, time_budget): + '''Get output from log file + + Args: + filename: A string of the log file name + time_budget: A float of the time budget in seconds + + Returns: + training_time_list: A list of the finished time of each logged iter + best_error_list: + A list of the best validation error after each logged iter + error_list: A list of the validation error of each logged iter + config_list: + A list of the estimator, sample size and config of each logged iter + logged_metric_list: A list of the logged metric of each logged iter + ''' + import ast + + best_config = None + best_learner = None + best_val_loss = float('+inf') + training_duration = 0.0 + + training_time_list = [] + config_list = [] + best_error_list = [] + error_list = [] + logged_metric_list = [] + best_config_list = [] + with training_log_reader(filename) as reader: + for record in reader.records(): + time_used = record.total_search_time + training_duration = time_used + val_loss = record.validation_loss + config = record.config + learner = record.learner.split('_')[0] + sample_size = record.sample_size + train_loss = record.logged_metric + + if time_used < time_budget: + if val_loss < best_val_loss: + best_val_loss = val_loss + best_config = config + best_learner = learner + best_config_list.append(best_config) + training_time_list.append(training_duration) + best_error_list.append(best_val_loss) + logged_metric_list.append(train_loss) + error_list.append(val_loss) + config_list.append({"Current Learner": learner, + "Current Sample": sample_size, + "Current Hyper-parameters": record.config, + "Best Learner": best_learner, + "Best Hyper-parameters": best_config}) + + return (training_time_list, best_error_list, error_list, config_list, + logged_metric_list) + + +def concat(X1, X2): + '''concatenate two matrices vertically + ''' + if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series): + if isinstance(X1, pd.DataFrame): + cat_columns = X1.select_dtypes( + include='category').columns + df = pd.concat([X1, X2], sort=False) + df.reset_index(drop=True, inplace=True) + if isinstance(X1, pd.DataFrame) and len(cat_columns): + df[cat_columns] = df[cat_columns].astype('category') + return df + if issparse(X1): + return 
vstack((X1, X2)) + else: + return np.concatenate([X1, X2]) + + +class DataTransformer: + '''transform X, y + ''' + + def fit_transform(self, X, y, objective): + if isinstance(X, pd.DataFrame): + X = X.copy() + n = X.shape[0] + cat_columns, num_columns = [], [] + for column in X.columns: + if X[column].dtype.name in ('object', 'category'): + if X[column].nunique() == 1 or X[column].nunique( + dropna=True) == n - X[column].isnull().sum(): + X.drop(columns=column, inplace=True) + elif X[column].dtype.name == 'category': + current_categories = X[column].cat.categories + if '__NAN__' not in current_categories: + X[column] = X[column].cat.add_categories( + '__NAN__').fillna('__NAN__') + cat_columns.append(column) + else: + X[column].fillna('__NAN__', inplace=True) + cat_columns.append(column) + else: + # print(X[column].dtype.name) + if X[column].nunique(dropna=True) < 2: + X.drop(columns=column, inplace=True) + else: + X[column].fillna(np.nan, inplace=True) + num_columns.append(column) + X = X[cat_columns + num_columns] + if cat_columns: + X[cat_columns] = X[cat_columns].astype('category') + if num_columns: + from sklearn.impute import SimpleImputer + from sklearn.compose import ColumnTransformer + self.transformer = ColumnTransformer([( + 'continuous', + SimpleImputer(missing_values=np.nan, strategy='median'), + num_columns)]) + X[num_columns] = self.transformer.fit_transform(X) + self.cat_columns, self.num_columns = cat_columns, num_columns + + if objective == 'regression': + self.label_transformer = None + else: + from sklearn.preprocessing import LabelEncoder + self.label_transformer = LabelEncoder() + y = self.label_transformer.fit_transform(y) + return X, y + + def transform(self, X): + if isinstance(X, pd.DataFrame): + cat_columns, num_columns = self.cat_columns, self.num_columns + X = X[cat_columns + num_columns].copy() + for column in cat_columns: + # print(column, X[column].dtype.name) + if X[column].dtype.name == 'object': + X[column].fillna('__NAN__', inplace=True) + elif X[column].dtype.name == 'category': + current_categories = X[column].cat.categories + if '__NAN__' not in current_categories: + X[column] = X[column].cat.add_categories( + '__NAN__').fillna('__NAN__') + if cat_columns: + X[cat_columns] = X[cat_columns].astype('category') + if num_columns: + X[num_columns].fillna(np.nan, inplace=True) + X[num_columns] = self.transformer.transform(X) + return X diff --git a/flaml/ml.py b/flaml/ml.py new file mode 100644 index 000000000..ec927d1da --- /dev/null +++ b/flaml/ml.py @@ -0,0 +1,241 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
+''' + +from .model import * +import time +from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \ + accuracy_score, mean_absolute_error, log_loss, average_precision_score, \ + f1_score +import numpy as np +from sklearn.model_selection import RepeatedStratifiedKFold + + +def get_estimator_class(objective_name, estimator_name): + ''' when adding a new learner, need to add an elif branch ''' + + + if 'xgboost' in estimator_name: + if 'regression' in objective_name: + estimator_class = XGBoostEstimator + else: + estimator_class = XGBoostSklearnEstimator + elif 'rf' in estimator_name: + estimator_class = RandomForestEstimator + elif 'lgbm' in estimator_name: + estimator_class = LGBMEstimator + elif 'lrl1' in estimator_name: + estimator_class = LRL1Classifier + elif 'lrl2' in estimator_name: + estimator_class = LRL2Classifier + elif 'catboost' in estimator_name: + estimator_class = CatBoostEstimator + elif 'extra_tree' in estimator_name: + estimator_class = ExtraTreeEstimator + elif 'kneighbor' in estimator_name: + estimator_class = KNeighborsEstimator + else: + raise ValueError(estimator_name + ' is not a built-in learner. ' + 'Please use AutoML.add_learner() to add a customized learner.') + return estimator_class + + +def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None): + '''Loss using the specified metric + + Args: + metric_name: A string of the mtric name, one of + 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss', + 'f1', 'ap' + y_predict: A 1d or 2d numpy array of the predictions which can be + used to calculate the metric. E.g., 2d for log_loss and 1d + for others. + y_true: A 1d numpy array of the true labels + labels: A 1d numpy array of the unique labels + + Returns: + score: A float number of the loss, the lower the better + ''' + metric_name = metric_name.lower() + if 'r2' in metric_name: + score = 1.0 - r2_score(y_true, y_predict) + elif metric_name == 'rmse': + score = np.sqrt(mean_squared_error(y_true, y_predict)) + elif metric_name == 'mae': + score = mean_absolute_error(y_true, y_predict) + elif metric_name == 'mse': + score = mean_squared_error(y_true, y_predict) + elif metric_name == 'accuracy': + score = 1.0 - accuracy_score(y_true, y_predict) + elif 'roc_auc' in metric_name: + score = 1.0 - roc_auc_score(y_true, y_predict) + elif 'log_loss' in metric_name: + score = log_loss(y_true, y_predict, labels=labels) + elif 'f1' in metric_name: + score = 1 - f1_score(y_true, y_predict) + elif 'ap' in metric_name: + score = 1 - average_precision_score(y_true, y_predict) + else: + raise ValueError(metric_name+' is not a built-in metric, ' + 'currently built-in metrics are: ' + 'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. 
' + 'please pass a customized metric function to AutoML.fit(metric=func)') + return score + + +def get_y_pred(estimator, X, eval_metric, obj): + if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj: + y_pred_classes = estimator.predict_proba(X) + y_pred = y_pred_classes[:, + 1] if y_pred_classes.ndim>1 else y_pred_classes + elif eval_metric in ['log_loss', 'roc_auc']: + y_pred = estimator.predict_proba(X) + else: + y_pred = estimator.predict(X) + return y_pred + + +def get_test_loss(estimator, X_train, y_train, X_test, y_test, eval_metric, obj, + labels=None, budget=None, train_loss=False): + start = time.time() + train_time = estimator.fit(X_train, y_train, budget) + if isinstance(eval_metric, str): + test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj) + test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test, + labels) + if train_loss != False: + test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj) + train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, + y_train, labels) + else: # customized metric function + test_loss, train_loss = eval_metric( + X_test, y_test, estimator, labels, X_train, y_train) + train_time = time.time()-start + return test_loss, train_time, train_loss + + +def train_model(estimator, X_train, y_train, budget): + train_time = estimator.fit(X_train, y_train, budget) + return train_time + + +def evaluate_model(estimator, X_train, y_train, X_val, y_val, budget, kf, + objective_name, eval_method, eval_metric, best_val_loss, train_loss=False): + if 'holdout' in eval_method: + val_loss, train_loss, train_time = evaluate_model_holdout( + estimator, X_train, y_train, X_val, y_val, budget, + objective_name, eval_metric, best_val_loss, train_loss=train_loss) + else: + val_loss, train_loss, train_time = evaluate_model_CV( + estimator, X_train, y_train, budget, kf, objective_name, + eval_metric, best_val_loss, train_loss=train_loss) + return val_loss, train_loss, train_time + + +def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, budget, + objective_name, eval_metric, best_val_loss, train_loss=False): + val_loss, train_time, train_loss = get_test_loss( + estimator, X_train, y_train, X_val, y_val, eval_metric, objective_name, + budget = budget, train_loss=train_loss) + return val_loss, train_loss, train_time + + +def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf, + objective_name, eval_metric, best_val_loss, train_loss=False): + start_time = time.time() + total_val_loss = total_train_loss = 0 + train_time = 0 + valid_fold_num = 0 + n = kf.get_n_splits() + X_train_split, y_train_split = X_train_all, y_train_all + if objective_name=='regression': + labels = None + else: + labels = np.unique(y_train_all) + + if isinstance(kf, RepeatedStratifiedKFold): + kf = kf.split(X_train_split, y_train_split) + else: + kf = kf.split(X_train_split) + rng = np.random.RandomState(2020) + val_loss_list = [] + budget_per_train = budget / (n+1) + for train_index, val_index in kf: + train_index = rng.permutation(train_index) + if isinstance(X_train_all, pd.DataFrame): + X_train, X_val = X_train_split.iloc[ + train_index], X_train_split.iloc[val_index] + else: + X_train, X_val = X_train_split[ + train_index], X_train_split[val_index] + if isinstance(y_train_all, pd.Series): + y_train, y_val = y_train_split.iloc[ + train_index], y_train_split.iloc[val_index] + else: + y_train, y_val = y_train_split[ + train_index], y_train_split[val_index] + estimator.cleanup() + val_loss_i, train_time_i, train_loss_i = get_test_loss( 
+ estimator, X_train, y_train, X_val, y_val, eval_metric, + objective_name, labels, budget_per_train, train_loss=train_loss) + valid_fold_num += 1 + total_val_loss += val_loss_i + if train_loss != False: + if total_train_loss != 0: total_train_loss += train_loss_i + else: total_train_loss = train_loss_i + train_time += train_time_i + if valid_fold_num == n: + val_loss_list.append(total_val_loss/valid_fold_num) + total_val_loss = valid_fold_num = 0 + elif time.time() - start_time >= budget: + val_loss_list.append(total_val_loss/valid_fold_num) + break + val_loss = np.max(val_loss_list) + if train_loss != False: train_loss = total_train_loss/n + budget -= time.time() - start_time + if val_loss < best_val_loss and budget > budget_per_train: + estimator.cleanup() + train_time_full = estimator.fit(X_train_all, y_train_all, budget) + train_time += train_time_full + return val_loss, train_loss, train_time + + +def compute_estimator(X_train, y_train, X_val, y_val, budget, kf, + config_dic, objective_name, estimator_name, eval_method, eval_metric, + best_val_loss = np.Inf, n_jobs=1, estimator_class=None, train_loss=False): + start_time = time.time() + estimator_class = estimator_class or get_estimator_class( + objective_name, estimator_name) + estimator = estimator_class( + **config_dic, objective_name = objective_name, n_jobs=n_jobs) + val_loss, train_loss, train_time = evaluate_model( + estimator, X_train, y_train, X_val, y_val, budget, kf, objective_name, + eval_method, eval_metric, best_val_loss, train_loss=train_loss) + all_time = time.time() - start_time + return estimator, val_loss, train_loss, train_time, all_time + + +def train_estimator(X_train, y_train, config_dic, objective_name, + estimator_name, n_jobs=1, estimator_class=None, budget=None): + start_time = time.time() + estimator_class = estimator_class or get_estimator_class(objective_name, + estimator_name) + estimator = estimator_class(**config_dic, objective_name = objective_name, + n_jobs=n_jobs) + if X_train is not None: + train_time = train_model(estimator, X_train, y_train, budget) + else: + estimator = estimator.estimator_class(**estimator.params) + train_time = time.time() - start_time + return estimator, train_time + + +def get_classification_objective(num_labels: int) -> str: + if num_labels == 2: + objective_name = 'binary:logistic' + else: + objective_name = 'multi:softmax' + return objective_name + + diff --git a/flaml/model.py b/flaml/model.py new file mode 100644 index 000000000..327214585 --- /dev/null +++ b/flaml/model.py @@ -0,0 +1,515 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
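# ---------------------------------------------------------------------------
# Editor's illustrative example (not part of the patch): the customized-metric
# branch of get_test_loss() above calls the user-supplied function as
#   eval_metric(X_test, y_test, estimator, labels, X_train, y_train)
# and expects a (test_loss, train_loss) pair, lower being better. A metric
# following that convention could look like the sketch below; wiring it in via
# AutoML.fit(metric=func) is what the error message in
# sklearn_metric_loss_score suggests, with AutoML.fit itself defined in
# flaml/automl.py outside this excerpt.
import numpy as np
from sklearn.metrics import mean_squared_error

def custom_rmse(X_test, y_test, estimator, labels, X_train, y_train):
    # Both losses use the trained estimator's own predict(); `labels` is
    # ignored because RMSE is a regression-style metric.
    test_loss = np.sqrt(mean_squared_error(y_test, estimator.predict(X_test)))
    train_loss = np.sqrt(mean_squared_error(y_train, estimator.predict(X_train)))
    return test_loss, train_loss
# ---------------------------------------------------------------------------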
+''' + +import numpy as np +import xgboost as xgb +from xgboost import XGBClassifier, XGBRegressor +import time +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from lightgbm import LGBMClassifier, LGBMRegressor +import scipy.sparse +import pandas as pd + + +class BaseEstimator: + '''The abstract class for all learners + + Typical example: + XGBoostEstimator: for regression + XGBoostSklearnEstimator: for classification + LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: + for both regression and classification + ''' + + def __init__(self, objective_name = 'binary:logistic', + **params): + '''Constructor + + Args: + objective_name: A string of the objective name, one of + 'binary:logistic', 'multi:softmax', 'regression' + n_jobs: An integer of the number of parallel threads + params: A dictionary of the hyperparameter names and values + ''' + self.params = params + self.estimator_class = None + self.objective_name = objective_name + if '_estimator_type' in params: + self._estimator_type = params['_estimator_type'] + else: + self._estimator_type = "regressor" if objective_name=='regression' \ + else "classifier" + + def get_params(self, deep=False): + params = self.params.copy() + params["objective_name"] = self.objective_name + if hasattr(self, '_estimator_type'): + params['_estimator_type'] = self._estimator_type + return params + + @property + def classes_(self): + return self.model.classes_ + + def preprocess(self, X): + return X + + def _fit(self, X_train, y_train): + + curent_time = time.time() + X_train = self.preprocess(X_train) + model = self.estimator_class(**self.params) + model.fit(X_train, y_train) + train_time = time.time() - curent_time + self.model = model + return train_time + + def fit(self, X_train, y_train, budget=None): + '''Train the model from given training data + + Args: + X_train: A numpy array of training data in shape n*m + y_train: A numpy array of labels in shape n*1 + budget: A float of the time budget in seconds + + Returns: + train_time: A float of the training time in seconds + ''' + return self._fit(X_train, y_train) + + def predict(self, X_test): + '''Predict label from features + + Args: + X_test: A numpy array of featurized instances, shape n*m + + Returns: + A numpy array of shape n*1. + Each element is the label for a instance + ''' + X_test = self.preprocess(X_test) + return self.model.predict(X_test) + + def predict_proba(self, X_test): + '''Predict the probability of each class from features + + Only works for classification problems + + Args: + model: An object of trained model with method predict_proba() + X_test: A numpy array of featurized instances, shape n*m + + Returns: + A numpy array of shape n*c. 
c is the # classes + Each element at (i,j) is the probability for instance i to be in + class j + ''' + if 'regression' in self.objective_name: + print('Regression tasks do not support predict_prob') + raise ValueError + else: + X_test = self.preprocess(X_test) + return self.model.predict_proba(X_test) + + def cleanup(self): pass + + +class SKLearnEstimator(BaseEstimator): + + + def preprocess(self, X): + if isinstance(X, pd.DataFrame): + X = X.copy() + cat_columns = X.select_dtypes(include=['category']).columns + X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes) + return X + + +class LGBMEstimator(BaseEstimator): + + + def __init__(self, objective_name='binary:logistic', n_jobs=1, + n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1, + subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, + colsample_bytree=1.0, log_max_bin=8, **params): + super().__init__(objective_name, **params) + # Default: ‘regression’ for LGBMRegressor, + # ‘binary’ or ‘multiclass’ for LGBMClassifier + if 'regression' in objective_name: + final_objective_name = 'regression' + elif 'binary' in objective_name: + final_objective_name = 'binary' + elif 'multi' in objective_name: + final_objective_name = 'multiclass' + else: + final_objective_name = 'regression' + self.params = { + "n_estimators": int(round(n_estimators)), + "num_leaves": params[ + 'num_leaves'] if 'num_leaves' in params else int( + round(max_leaves)), + 'objective': params[ + "objective"] if "objective" in params else final_objective_name, + 'n_jobs': n_jobs, + 'learning_rate': float(learning_rate), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'colsample_bytree':float(colsample_bytree), + 'subsample': float(subsample), + } + self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else ( + 1<4) and budget is not None: + self.params["n_estimators"] = 1 + self.t1 = self._fit(X_train, y_train) + if self.t1 >= budget: + self.params["n_estimators"] = n_iter + return self.t1 + self.params["n_estimators"] = 4 + self.t2 = self._fit(X_train, y_train) + self.time_per_iter = (self.t2 - self.t1)/( + self.params["n_estimators"]-1) if self.t2 > self.t1 \ + else self.t1 if self.t1 else 0.001 + self.train_size = X_train.shape[0] + if self.t1+self.t2>=budget or n_iter==self.params["n_estimators"]: + self.params["n_estimators"] = n_iter + return time.time() - start_time + if budget is not None: + self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ + start_time-self.t1)/self.time_per_iter+1)) + if self.params["n_estimators"] > 0: + self._fit(X_train, y_train) + self.params["n_estimators"] = n_iter + train_time = time.time() - start_time + return train_time + + +class XGBoostEstimator(SKLearnEstimator): + ''' not using sklearn API, used for regression ''' + + + def __init__(self, objective_name='regression', all_thread=False, n_jobs=1, + n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1, + learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, + colsample_bytree=1.0, tree_method='auto', **params): + super().__init__(objective_name, **params) + self.n_estimators = int(round(n_estimators)) + self.max_leaves = int(round(max_leaves)) + self.grids = [] + self.params = { + 'max_leaves': int(round(max_leaves)), + 'max_depth': 0, + 'grow_policy': params[ + "grow_policy"] if "grow_policy" in params else 'lossguide', + 'tree_method':tree_method, + 'verbosity': 0, + 'nthread':n_jobs, + 'learning_rate': 
float(learning_rate), + 'subsample': float(subsample), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'booster': params['booster'] if 'booster' in params else 'gbtree', + 'colsample_bylevel': float(colsample_bylevel), + 'colsample_bytree':float(colsample_bytree), + } + if all_thread: + del self.params['nthread'] + + def get_params(self, deep=False): + params = super().get_params() + params["n_jobs"] = params['nthread'] + return params + + def fit(self, X_train, y_train, budget=None): + curent_time = time.time() + if not scipy.sparse.issparse(X_train): + self.params['tree_method'] = 'hist' + X_train = self.preprocess(X_train) + dtrain = xgb.DMatrix(X_train, label=y_train) + if self.max_leaves>0: + xgb_model = xgb.train(self.params, dtrain, self.n_estimators) + del dtrain + train_time = time.time() - curent_time + self.model = xgb_model + return train_time + else: + return None + + def predict(self, X_test): + if not scipy.sparse.issparse(X_test): + X_test = self.preprocess(X_test) + dtest = xgb.DMatrix(X_test) + return super().predict(dtest) + + +class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): + ''' using sklearn API, used for classification ''' + + + def __init__(self, objective_name='binary:logistic', n_jobs=1, + n_estimators=4, max_leaves=4, subsample=1.0, + min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, + colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist', + **params): + super().__init__(objective_name, **params) + self.params = { + "n_estimators": int(round(n_estimators)), + 'max_leaves': int(round(max_leaves)), + 'max_depth': 0, + 'grow_policy': params[ + "grow_policy"] if "grow_policy" in params else 'lossguide', + 'tree_method':tree_method, + 'verbosity': 0, + 'n_jobs': n_jobs, + 'learning_rate': float(learning_rate), + 'subsample': float(subsample), + 'reg_alpha': float(reg_alpha), + 'reg_lambda': float(reg_lambda), + 'min_child_weight': float(min_child_weight), + 'booster': params['booster'] if 'booster' in params else 'gbtree', + 'colsample_bylevel': float(colsample_bylevel), + 'colsample_bytree': float(colsample_bytree), + } + + if 'regression' in objective_name: + self.estimator_class = XGBRegressor + else: + self.estimator_class = XGBClassifier + self.time_per_iter = None + self.train_size = 0 + + def fit(self, X_train, y_train, budget=None): + if scipy.sparse.issparse(X_train): + self.params['tree_method'] = 'auto' + return super().fit(X_train, y_train, budget) + + +class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): + + + def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, + n_estimators = 4, max_leaves = 4, max_features = 1.0, + min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params): + super().__init__(objective_name, **params) + self.params = { + "n_estimators": int(round(n_estimators)), + "n_jobs": n_jobs, + 'max_features': float(max_features), + } + if 'regression' in objective_name: + self.estimator_class = RandomForestRegressor + else: + self.estimator_class = RandomForestClassifier + self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini' + self.time_per_iter = None + self.train_size = 0 + + def get_params(self, deep=False): + params = super().get_params() + params["criterion"] = 1 if params["criterion"]=='gini' else 2 + return params + + +class ExtraTreeEstimator(RandomForestEstimator): + + + def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, + n_estimators = 4, max_leaves = 4, 
max_features = 1.0, + min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params): + super().__init__(objective_name, **params) + self.params = { + "n_estimators": int(round(n_estimators)), + "n_jobs": n_jobs, + 'max_features': float(max_features), + } + if 'regression' in objective_name: + from sklearn.ensemble import ExtraTreesRegressor + self.estimator_class = ExtraTreesRegressor + else: + from sklearn.ensemble import ExtraTreesClassifier + self.estimator_class = ExtraTreesClassifier + self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini' + self.time_per_iter = None + self.train_size = 0 + + +class LRL1Classifier(SKLearnEstimator): + + + def __init__(self, tol=0.0001, C=1.0, + objective_name='binary:logistic', n_jobs=1, **params): + super().__init__(objective_name, **params) + self.params = { + 'penalty': 'l1', + 'tol': float(tol), + 'C': float(C), + 'solver': 'saga', + 'n_jobs': n_jobs, + } + if 'regression' in objective_name: + self.estimator_class = None + print('Does not support regression task') + raise NotImplementedError + else: + self.estimator_class = LogisticRegression + + +class LRL2Classifier(SKLearnEstimator): + + + def __init__(self, tol=0.0001, C=1.0, + objective_name='binary:logistic', n_jobs=1, **params): + super().__init__(objective_name, **params) + self.params = { + 'penalty': 'l2', + 'tol': float(tol), + 'C': float(C), + 'solver': 'lbfgs', + 'n_jobs': n_jobs, + } + if 'regression' in objective_name: + self.estimator_class = None + print('Does not support regression task') + raise NotImplementedError + else: + self.estimator_class = LogisticRegression + + +class CatBoostEstimator(BaseEstimator): + + + time_per_iter = None + train_size = 0 + + def __init__(self, objective_name = 'binary:logistic', n_jobs=1, + n_estimators=8192, exp_max_depth=64, learning_rate=0.1, rounds=4, + l2_leaf_reg=3, **params): + super().__init__(objective_name, **params) + self.params = { + "early_stopping_rounds": int(round(rounds)), + "n_estimators": n_estimators, + 'learning_rate': learning_rate, + 'thread_count': n_jobs, + 'verbose': False, + 'random_seed': params[ + "random_seed"] if "random_seed" in params else 10242048, + } + # print(n_estimators) + if 'regression' in objective_name: + from catboost import CatBoostRegressor + self.estimator_class = CatBoostRegressor + else: + from catboost import CatBoostClassifier + self.estimator_class = CatBoostClassifier + + def get_params(self, deep=False): + params = super().get_params() + params['n_jobs'] = params['thread_count'] + params['rounds'] = params['early_stopping_rounds'] + return params + + def fit(self, X_train, y_train, budget=None): + start_time = time.time() + n_iter = self.params["n_estimators"] + if isinstance(X_train, pd.DataFrame): + cat_features = list(X_train.select_dtypes( + include='category').columns) + else: + cat_features = [] + if (not CatBoostEstimator.time_per_iter or + abs(CatBoostEstimator.train_size-len(y_train))>4) and budget: + # measure the time per iteration + self.params["n_estimators"] = 1 + CatBoostEstimator.model = self.estimator_class(**self.params) + CatBoostEstimator.model.fit(X_train, y_train, + cat_features=cat_features) + CatBoostEstimator.t1 = time.time() - start_time + if CatBoostEstimator.t1 >= budget: + self.params["n_estimators"] = n_iter + self.model = CatBoostEstimator.model + return CatBoostEstimator.t1 + self.params["n_estimators"] = 4 + CatBoostEstimator.model = self.estimator_class(**self.params) + CatBoostEstimator.model.fit(X_train, y_train, + 
cat_features=cat_features) + CatBoostEstimator.time_per_iter = (time.time() - start_time - + CatBoostEstimator.t1)/(self.params["n_estimators"]-1) + if CatBoostEstimator.time_per_iter <= 0: + CatBoostEstimator.time_per_iter = CatBoostEstimator.t1 + CatBoostEstimator.train_size = len(y_train) + if time.time()-start_time>=budget or n_iter==self.params[ + "n_estimators"]: + self.params["n_estimators"] = n_iter + self.model = CatBoostEstimator.model + return time.time()-start_time + if budget: + train_times = 1 + self.params["n_estimators"] = min(n_iter, int((budget-time.time()+ + start_time-CatBoostEstimator.t1)/train_times/ + CatBoostEstimator.time_per_iter+1)) + self.model = CatBoostEstimator.model + if self.params["n_estimators"] > 0: + l = max(int(len(y_train)*0.9), len(y_train)-1000) + X_tr, y_tr = X_train[:l], y_train[:l] + from catboost import Pool + model = self.estimator_class(**self.params) + model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool( + data=X_train[l:], label=y_train[l:], cat_features=cat_features)) + # print(self.params["n_estimators"], model.get_best_iteration()) + self.model = model + self.params["n_estimators"] = n_iter + train_time = time.time() - start_time + # print(budget, train_time) + return train_time + + +class KNeighborsEstimator(BaseEstimator): + + + def __init__(self, objective_name='binary:logistic', n_jobs=1, + n_neighbors=5, **params): + super().__init__(objective_name, **params) + self.params= { + 'n_neighbors': int(round(n_neighbors)), + 'weights': 'distance', + 'n_jobs': n_jobs, + } + if 'regression' in objective_name: + from sklearn.neighbors import KNeighborsRegressor + self.estimator_class = KNeighborsRegressor + else: + from sklearn.neighbors import KNeighborsClassifier + self.estimator_class = KNeighborsClassifier + + def preprocess(self, X): + if isinstance(X, pd.DataFrame): + cat_columns = X.select_dtypes(['category']).columns + # print(X.dtypes) + # print(cat_columns) + if X.shape[1] == len(cat_columns): + raise ValueError( + "kneighbor requires at least one numeric feature") + X = X.drop(cat_columns, axis=1) + return X diff --git a/flaml/search.py b/flaml/search.py new file mode 100644 index 000000000..5c90ad7af --- /dev/null +++ b/flaml/search.py @@ -0,0 +1,675 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
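# ---------------------------------------------------------------------------
# Editor's illustrative example (not part of the patch): get_estimator_class()
# in flaml/ml.py directs unknown learner names to AutoML.add_learner(). A
# minimal custom learner can follow the same pattern as the estimators above:
# subclass SKLearnEstimator, fill self.params with arguments accepted by the
# wrapped sklearn class, and choose estimator_class by objective. The class
# name and the max_depth hyperparameter here are hypothetical, and the exact
# signature of add_learner() (defined in flaml/automl.py) is not shown in this
# excerpt.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from flaml.model import SKLearnEstimator

class DecisionTreeEstimator(SKLearnEstimator):

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 max_depth=4, **params):
        # n_jobs is accepted because compute_estimator() passes it, but the
        # sklearn decision trees do not use it, so it is not forwarded.
        super().__init__(objective_name, **params)
        self.params = {'max_depth': int(round(max_depth))}
        if 'regression' in objective_name:
            self.estimator_class = DecisionTreeRegressor
        else:
            self.estimator_class = DecisionTreeClassifier
# ---------------------------------------------------------------------------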
+''' + +from functools import partial +from .ml import train_estimator +import time +import math +import numpy as np +from .space import config_space, estimator_size, get_config_values, \ + generate_config_ini, generate_config_max, generate_config_min +from .config import SPLIT_RATIO, MIN_SAMPLE_TRAIN, \ + HISTORY_SIZE, MEM_THRES, BASE_Const, BASE_LOWER_BOUND +from random import gauss + + +def rand_vector_unit_sphere(dims): + vec = [gauss(0, 1) for i in range(dims)] + mag = sum(x**2 for x in vec) ** .5 + return [x / mag for x in vec] + + +def rand_vector_gaussian(dims): + vec = [gauss(0, 1) for i in range(dims)] + return vec + + +class ParamSearch: + ''' + the class for searching params for 1 learner + ''' + + def __init__(self, estimator, data_size, + compute_with_config, train_with_config, save_info_helper=None, + init_sample_size=MIN_SAMPLE_TRAIN, objective_name='regression', + log_type='better', config_space_info=None, size_estimator=None, + split_ratio=SPLIT_RATIO, base_change='sqrtK', use_dual_dir=True, + move_type='geo'): + self.log_type = log_type + self.base_change = base_change + if init_sample_size > data_size: + init_sample_size = data_size + self.next_sample_size = {} + self.prev_sample_size = {} + s = init_sample_size + self.prev_sample_size[s] = s + self.estimator_configspace = config_space_info or config_space( + estimator, data_size, objective_name) + self.get_size_for_config = size_estimator or ( + lambda x: estimator_size(x, estimator)) + config_min_dic_primary, config_min_dic_more, config_min_dic = \ + generate_config_min(estimator, self.estimator_configspace, None) + self.min_config_primary = np.array( + list(config_min_dic_primary.values())) + self.min_config_more = np.array(list(config_min_dic_more.values())) + self.min_config = np.array(list(config_min_dic.values())) + # init configurations for different sample size + config_init_dic_primary, config_init_dic_more, _, config_type_dic = \ + generate_config_ini(estimator, self.estimator_configspace) + self.init_config_dic_primary = {s: config_init_dic_primary} + self.init_config_dic_more = {s: config_init_dic_more} + self.init_config_dic_type_dic = {'primary': { + s: config_init_dic_primary}, 'more': {s: config_init_dic_more}} + self.init_config_dic = { + **self.init_config_dic_type_dic['primary'], + **self.init_config_dic_type_dic['more'] + } + self.config_type_dic = config_type_dic + # max configurations for different sample size + config_max_dic_primary, config_max_dic_more, config_max_dic = \ + generate_config_max( + estimator, self.estimator_configspace, int(s)) + self.max_config_dic_primary = {s: np.array( + list(config_max_dic_primary.values()))} + self.max_config_dic_more = {s: np.array( + list(config_max_dic_more.values()))} + self.max_config_dic = {s: np.array(list(config_max_dic.values()))} + self.dims = (len(self.min_config_primary), len(self.min_config_more)) + # print(self.dims) + if self.dims[1] > 0 and self.dims[0] > 0: + self.base_upper_bound = { + s: + max( + max( + (self.max_config_dic_primary[s][i] / self.min_config_primary[i]) + ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) + ), + max( + (self.max_config_dic_more[s][i] / self.min_config_more[i]) + ** math.sqrt(self.dims[1]) for i in range(self.dims[1])) + ) + } + elif self.dims[0] > 0: + self.base_upper_bound = { + s: + max( + (self.max_config_dic_primary[s][i] / self.min_config_primary[i]) + ** (math.sqrt(self.dims[0])) for i in range(self.dims[0]) + ) + } + else: + self.base_upper_bound = { + s: + max( + (self.max_config_dic_more[s][i] 
/ self.min_config_more[i]) + ** (math.sqrt(self.dims[1])) for i in range(self.dims[1]) + ) + } + + # create sample size sequence + while s < data_size: + s2 = self.next_sample_size[s] = s * 2 if s * 2 <= data_size else data_size + self.prev_sample_size[s2] = s + s = s2 + + config_max_dic_primary, config_max_dic_more, config_max_dic = \ + generate_config_max( + estimator, self.estimator_configspace, int(s)) + self.max_config_dic_primary[s] = np.array( + list(config_max_dic_primary.values())) + self.max_config_dic_more[s] = np.array( + list(config_max_dic_more.values())) + self.max_config_dic[s] = np.array(list(config_max_dic.values())) + if self.dims[1] > 0 and self.dims[0] > 0: + self.base_upper_bound[s] = max( + max( + (self.max_config_dic_primary[s][i] + / self.min_config_primary[i]) + ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) + ), + max( + (self.max_config_dic_more[s][i] + / self.min_config_more[i]) + ** math.sqrt(self.dims[1]) for i in range(self.dims[1]) + ) + ) + elif self.dims[0] > 0: + self.base_upper_bound[s] = max( + (self.max_config_dic_primary[s][i] + / self.min_config_primary[i]) + ** math.sqrt(self.dims[0]) for i in range(self.dims[0]) + ) + else: + self.base_upper_bound[s] = max( + (self.max_config_dic_more[s][i] / self.min_config_more[i]) + ** math.sqrt(self.dims[1]) for i in range(self.dims[1]) + ) + + self.init_sample_size = init_sample_size + self.data_size = data_size + self.sample_size_full = int(self.data_size / (1.0 - split_ratio)) + + self.compute_with_config = compute_with_config + self.estimator = estimator + + # for logging + self.save_helper = save_info_helper + self.estimator_type_list = ['primary', 'more'] + self.dim = self.dims[0] if self.dims[0] > 0 else self.dims[1] + self.b = BASE_Const**(math.sqrt(self.dim)) + self.base_ini = self.b + self.total_dim = sum(self.dims) + + self.epo = 2**(self.dim - 1) + # keys are [sample size, config], values are (loss, train_time) + self.config_tried = {} + self.train_with_config = train_with_config + + self.current_config_loss = None + self.use_dual_dir = use_dual_dir + self.move_type = move_type + + def evaluate_config(self, config, sample_size, move='_pos'): + ''' + evaluate a configuration, update search state, + and return whether the state is changed + ''' + if self.time_from_start >= self.time_budget or move != '_ini' and \ + self.train_time > self.time_budget - self.time_from_start: + return False + + model, val_loss, new_train_time, from_history, train_loss = \ + self.evaluate_proposed_config(config, sample_size, move) + # update current config + self.update_current_config(config, val_loss, sample_size) + # update best model statistics, including statistics about loss and time + improved = self.update_search_state_best( + config, sample_size, model, val_loss, new_train_time, from_history) + self.time_from_start = time.time() - self.start_time + if self.save_helper is not None: + if from_history: + move = move + '_from_hist' + self.save_helper.append(self.model_count, + train_loss, + new_train_time, + self.time_from_start, + val_loss, + config, + self.best_loss, + self.best_config[0], + self.estimator, + sample_size) + return improved + + def get_hist_config_sig(self, sample_size, config): + config_values = get_config_values(config, self.config_type_dic) + config_sig = str(sample_size) + '_' + str(config_values) + return config_sig + + def evaluate_proposed_config(self, config, sample_size, move): + self.model_count += 1 + config_sig = self.get_hist_config_sig(sample_size, config) + d = self.total_dim 
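+        # Note: self.config_tried caches (val_loss, train_time) keyed by
+        # '<sample_size>_<config_values>', so a configuration proposed again
+        # for the same sample size is looked up instead of being retrained;
+        # new entries stop being recorded once the cache already holds
+        # HISTORY_SIZE entries per search dimension.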
+ history_size_per_d = len(self.config_tried) / float(d) + if config_sig in self.config_tried: + val_loss, new_train_time = self.config_tried[config_sig] + # print(config_sig,'found in history') + model = train_loss = None + from_history = True + else: + model, val_loss, train_loss, new_train_time, _ = \ + self.compute_with_config(self.estimator, config, sample_size) + from_history = False + if history_size_per_d < HISTORY_SIZE: + self.config_tried[config_sig] = (val_loss, new_train_time) + + if self.first_move: + self.init_config_dic[sample_size] = config + move = '_ini' + self.base = self.base_ini + self.num_noimprovement = 0 + move = str(self.estimator) + move + return model, val_loss, new_train_time, from_history, train_loss + + def update_current_config(self, config, val_loss, sample_size): + if self.first_move or val_loss < self.current_config_loss: + self.first_move = False + # update current config and coressponding sample_size + self.sample_size = sample_size + self.config = config + self.config_primary = {x: config[x] + for x in self.config_primary.keys()} + try: + self.config_more = {x: config[x] + for x in self.config_more.keys()} + except: + self.config_more = {} + self.current_config_loss = val_loss + + def update_reset_best_config_loss(self, sample_size, config, val_loss): + if sample_size == self.data_size: + if self.best_config_loss_dic_full_reset[1] is None: + self.best_config_loss_dic_full_reset = [ + config, val_loss, self.model_count] + else: + full_reset_best_loss = self.best_config_loss_dic_full_reset[1] + if val_loss < full_reset_best_loss: + self.best_config_loss_dic_full_reset = [ + config, full_reset_best_loss, self.model_count] + + def update_search_state_best(self, config, sample_size, model, val_loss, + new_train_time, from_history): + # upate the loss statistics for a particular sample size + if sample_size not in self.best_config_loss_samplesize_dic: + self.best_config_loss_samplesize_dic[sample_size] = [ + config, val_loss, self.model_count] + else: + s_best_loss = self.best_config_loss_samplesize_dic[sample_size][1] + if val_loss < s_best_loss: + self.best_config_loss_samplesize_dic[sample_size] = [ + config, val_loss, self.model_count] + + self.update_reset_best_config_loss(sample_size, config, val_loss) + + # update best model statistics, including statistics about loss and time + if val_loss < self.new_loss: + self.old_loss = self.new_loss if self.new_loss < float( + 'inf') else 2 * val_loss + self.new_loss = val_loss + self.old_loss_time = self.new_loss_time + self.old_train_time = self.train_time + self.new_loss_time = self.train_time = new_train_time + if val_loss < self.best_loss: + self.best_config = [self.config, self.model_count] + if not from_history: + self.trained_estimator = model + # print(model) + else: + print(val_loss, self.best_loss) + self.best_loss = val_loss + self.time_best_found = self.time_from_start + return True + else: + if not from_history: + self.new_loss_time += new_train_time + return False + + def get_proposal(self, current_config, rand_vector_func, base, move_type): + rand_vector = rand_vector_func(len(current_config)) + rand_vector = [i for i in rand_vector] + rand_vector_neg = [-i for i in rand_vector] + + move_vector = {} + move_vector_neg = {} + + index_ = 0 + for k, v in current_config.items(): + if 'geo' in move_type: + # get the move vector using the proposed random vector + move_vector[k] = v * (base**(rand_vector[index_])) + move_vector_neg[k] = v * (base**(rand_vector_neg[index_])) + else: + move_vector[k] = v + 
(base * (rand_vector[index_])) + move_vector_neg[k] = v + (base * (rand_vector_neg[index_])) + index_ += 1 + + # as long as one of the proposed model (+ or -) is within the mem_limit + # we will proceed + if not self.use_dual_dir: + move_vector_neg = None + return move_vector, move_vector_neg + + def get_config_from_move_vector(self, v, estimator_type): + if v != None: + if 'all' in estimator_type: + v = v + elif 'primary' in estimator_type: + v = {**v, **self.config_more} + else: + v = {**self.config_primary, **v} + + bounded_v = self.get_v_within_min_max(v) + else: + bounded_v = None + return bounded_v + + def dual_direction_sample(self, base, current_search_config, + estimator_type='primary', rand_vector_func=rand_vector_unit_sphere, + mem_thres=MEM_THRES, move_type='geo'): + current_config = current_search_config + if len(current_config) == 0: + return None, None + bounded_v_list = [None, None] + while not bounded_v_list[0] and not bounded_v_list[ + 1] and self.time_from_start < self.time_budget: + move_vector, move_vector_neg = self.get_proposal( + current_config, rand_vector_func, + base, move_type) + bounded_v_list = [move_vector, move_vector_neg] + for i, v in enumerate(bounded_v_list): + bounded_v = self.get_config_from_move_vector(v, estimator_type) + proposed_model_size = self.get_size_for_config(bounded_v) + proposed_model_size = 0 if not isinstance( + proposed_model_size, float) else proposed_model_size + if proposed_model_size > mem_thres: + # print(bounded_v, proposed_model_size, mem_thres) + bounded_v = None + bounded_v_list[i] = bounded_v + self.time_from_start = time.time() - self.start_time + return bounded_v_list + + def get_v_within_min_max(self, v): + index_ = 0 + bounded_v = {} + for key, value in v.items(): + new_value = min(max( + value, self.min_config[index_]), self.max_config_dic[ + self.sample_size][index_]) + bounded_v[key] = new_value + index_ += 1 + return bounded_v + + def expected_time_improvement_search(self): + return max(self.old_loss_time - self.old_train_time + self.train_time, + self.new_loss_time) + + def increase_sample_size(self): + ''' + whether it's time to increase sample size + ''' + expected_time_improvement_sample = 2 * self.train_time + self.increase = self.sample_size < self.data_size and ( + self.estimator_type == 0 or self.dims[0] == 0) and ( + not self.improved + or expected_time_improvement_sample + < self.expected_time_improvement_search() + ) + return self.increase + + def search_begin(self, time_budget, start_time=None): + self.time_budget = time_budget + if not start_time: + self.start_time = time.time() + else: + self.start_time = start_time + # the time to train the last selected config + self.old_train_time = self.train_time = 0 + self.time_from_start = 0 + # search states + self.first_move = True + self.improved = True + self.estimator_type = 0 if self.dims[0] > 0 else 1 + + self.old_loss = self.new_loss = self.best_loss = float('+inf') + # new_loss_time is the time from the beginning of training self.config to + # now, + # old_loss_time is the time from the beginning of training the old + # self.config to the beginning of training self.config + self.old_loss_time = self.new_loss_time = 0 + + self.trained_estimator = None + self.model_count = 0 + self.K = 0 + self.old_modelcount = 0 + + # self.config has two parts: config_primary contain the configs + # that are related with model complexity, config_more contains the + # configs that is not related with model complexity + self.config_primary = 
self.init_config_dic_primary[self.init_sample_size] + self.config_more = self.init_config_dic_more[self.init_sample_size] + self.config = {**self.config_primary, **self.config_more} + self.best_config = [None, None] + # key: sample size, value: [best_config, best_loss, model_count] under + # sample size in the key + self.best_config_loss_samplesize_dic = { + self.init_sample_size: [self.config, self.old_loss, self.model_count]} + # key: sample size, value: [best_config, best_loss, model_count] under + # sample size in the key + self.best_config_loss_dic_full_reset = [None, None, None] + self.sample_size = self.init_sample_size + self.base_change_bound = 1 + self.base_change_count = 0 + self.evaluate_config(self.config, self.sample_size, '_ini') + self.increase = False + + def train_config(self, config, sample_size): + ''' + train a configuration + ''' + # print('Evalute Config') + if self.time_from_start >= self.time_budget: + return False + config_sig = self.get_hist_config_sig(sample_size, config) + if not config_sig in self.config_tried: + _, new_train_time = self.train_with_config( + self.estimator, config, sample_size) + train_loss, val_loss, move = None, self.new_loss, str( + self.estimator) + '_trainAll' + self.time_from_start = time.time() - self.start_time + if self.save_helper is not None: + self.save_helper.append(self.model_count, + train_loss, + new_train_time, + self.time_from_start, + val_loss, + config, + self.best_loss, + self.best_config, + move, + sample_size) + self.config_tried[config_sig] = (val_loss, new_train_time) + + def try_increase_sample_size(self): + # print( self.estimator, self.sample_size) + if self.sample_size in self.next_sample_size: + if self.increase_sample_size(): + self.first_move = True + self.improved = True + self.estimator_type = 0 if self.dims[0] > 0 else 1 + self.evaluate_config( + self.config, self.next_sample_size[self.sample_size]) + if not self.old_modelcount and self.sample_size == self.data_size: + self.old_modelcount = self.model_count + + def setup_current_search_config(self): + estimator_type = self.estimator_type_list[self.estimator_type] + if 'all' in estimator_type: + current_search_config = self.config + elif 'primary' in estimator_type: + current_search_config = self.config_primary + else: + current_search_config = self.config_more + # print(self.config_more) + return estimator_type, current_search_config + + def search1step(self, global_best_loss=float('+inf'), + retrain_full=True, mem_thres=MEM_THRES, reset_type='init_gaussian'): + # try to increase sample size + self.try_increase_sample_size() + # decide current_search_config according to estimator_type + estimator_type, current_search_config = \ + self.setup_current_search_config() + time_left = self.time_budget - self.time_from_start + if time_left < self.train_time: + return False + if retrain_full and self.train_time < time_left < 2 * self.train_time \ + and self.best_loss <= global_best_loss: + self.train_config(self.best_config[0], self.sample_size_full) + + move_vector, move_vector_neg = self.dual_direction_sample( + self.base, current_search_config, estimator_type, + rand_vector_unit_sphere, mem_thres, self.move_type) + if move_vector is None: + if move_vector_neg is None: + self.improved = False + else: + self.improved = self.evaluate_config( + move_vector_neg, self.sample_size, '_neg' + str( + estimator_type)) + else: + self.improved = self.evaluate_config( + move_vector, self.sample_size, '_pos' + str(estimator_type)) + if not self.improved: + if move_vector_neg is 
None: + pass + else: + self.improved = self.evaluate_config( + move_vector_neg, self.sample_size, '_neg' + str( + estimator_type)) + self.update_noimprovement_stat( + global_best_loss, retrain_full, reset_type) + return self.improved + + def update_noimprovement_stat(self, global_best_loss, retrain_full, + reset_type): + if self.improved: + self.num_noimprovement = 0 + else: + self.estimator_type = 1 - self.estimator_type + if self.dims[self.estimator_type] == 0: + self.estimator_type = 1 - self.estimator_type + if self.estimator_type == 1 or self.dims[1] == 0: + self.noimprovement(global_best_loss, retrain_full, reset_type) + + def noimprovement(self, global_best_loss, retrain_full, reset_type='org'): + if self.sample_size == self.data_size: + # Do not wait until full sample size to update num_noimprovement? + self.num_noimprovement += 1 + if self.num_noimprovement >= self.epo: + self.num_noimprovement = 0 + # print(self.num_noimprovement, self.epo) + if self.base_change == 'squareroot': + self.base = math.sqrt(self.base) + else: + if self.K == 0: # first time + oldK = self.best_config_loss_dic_full_reset[2] - \ + self.old_modelcount + else: + oldK = self.K + self.K = self.model_count + 1 - self.old_modelcount + if self.base_change == 'K': + self.base **= oldK / self.K + else: + self.base **= math.sqrt(oldK / self.K) + if self.dims[1] > 0 and self.dims[0] > 0: + base_lower_bound = min( + min( + (1.0 + self.estimator_configspace[i].min_change + / self.config_primary[i]) + ** math.sqrt(self.dims[0]) + for i in self.config_primary.keys() + ), + min( + (1.0 + self.estimator_configspace[i].min_change + / self.config_more[i]) + ** math.sqrt(self.dims[1]) + for i in self.config_more.keys() + ) + ) + elif self.dims[0] > 0: + base_lower_bound = min( + (1.0 + self.estimator_configspace[i].min_change + / self.config_primary[i]) + ** math.sqrt(self.dims[0]) + for i in self.config_primary.keys() + ) + else: + base_lower_bound = min( + (1.0 + self.estimator_configspace[i].min_change + / self.config_more[i]) + ** math.sqrt(self.dims[1]) + for i in self.config_more.keys() + ) + if np.isinf(base_lower_bound): + base_lower_bound = BASE_LOWER_BOUND + self.base_change_count += 1 + if self.base <= base_lower_bound or \ + self.base_change_count == self.base_change_bound: + if retrain_full and self.sample_size == self.data_size: + if self.best_loss <= global_best_loss: + # Only train on full data when the curent estimator + # is the best estimator + # print('best estimator and train on full data') + self.train_config( + self.best_config[0], self.sample_size_full) + # remaining time is more than enough for another trial + if self.time_budget - self.time_from_start > self.train_time: + self.base_change_bound <<= 1 + self.base_change_count = 0 + self.K = 0 + self.old_modelcount = self.model_count + self.best_config_loss_dic_full_reset = [None, None, + None] + self.first_move = True + self.improved = True + self.base_ini = min( + self.base_ini * 2, self.base_upper_bound[ + self.sample_size]) + self.estimator_type = 0 if self.dims[0] > 0 else 1 + reset_config, reset_sample_size = self.get_reset_config( + self.init_sample_size, reset_type) + self.sample_size = reset_sample_size + # print('reset sample size', reset_sample_size) + self.evaluate_config(reset_config, self.sample_size, + '_ini') + + def get_reset_config(self, sample_size, reset_type): + init_config = self.init_config_dic[self.sample_size] + reset_sample_size = sample_size + if 'org' in reset_type: + reset_config = init_config + else: + if 'init_gaussian' 
in reset_type: + reset_config = init_config + reset_sample_size = self.get_reset_sample_size(reset_config) + config_values = get_config_values( + reset_config, self.config_type_dic) + config_sig = str(reset_sample_size) + '_' + str(config_values) + count = 0 + while config_sig in self.config_tried and \ + self.time_from_start < self.time_budget and count < 1000: + # TODO: check exhaustiveness? use time as condition? + count += 1 + move, move_neg = self.dual_direction_sample( + base=self.b, current_search_config=init_config, + estimator_type='all', + rand_vector_func=rand_vector_gaussian, + move_type=self.move_type) + if move: + reset_config = move_neg + elif move_neg: + reset_config = move_neg + else: + continue + reset_sample_size = self.get_reset_sample_size( + reset_config) + config_values = get_config_values( + reset_config, self.config_type_dic) + config_sig = str(reset_sample_size) + \ + '_' + str(config_values) + self.time_from_start = time.time() - self.start_time + else: + raise NotImplementedError + return reset_config, reset_sample_size + + def get_reset_sample_size(self, reset_config): + if not reset_config: + print('reset_config is none') + reset_config_size = self.get_size_for_config(reset_config) + + candidate_sample_size_list = [] + for sample_size, config_and_bestloss in \ + self.best_config_loss_samplesize_dic.items(): + s_best_config = config_and_bestloss[0] + if not s_best_config: + print('best config is none', sample_size) + s_best_config_model_size = self.get_size_for_config(s_best_config) + if s_best_config_model_size >= reset_config_size: + candidate_sample_size_list.append(sample_size) + + if len(candidate_sample_size_list) != 0: + return min(candidate_sample_size_list) + else: + return self.data_size diff --git a/flaml/space.py b/flaml/space.py new file mode 100644 index 000000000..8bc2cb1ad --- /dev/null +++ b/flaml/space.py @@ -0,0 +1,249 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. +''' + + +class ConfigSearchInfo: + '''The class of the search space of a hyperparameters: + + Attributes: + name: A string of the name of the hyperparameter + type: data type of the hyperparameter + lower: A number of the lower bound of the value + upper: A number of the upper bound of the value + init: A number of the initial value. For hyperparameters related to + complexity, the init value needs to correspond to the lowest + complexity + change_tpe: A string of the change type, 'linear' or 'log' + min_change: A number of the minimal change required. 
Could be inf if + no such requirement + ''' + + def __init__(self, name, type, lower, upper, init, change_type = 'log', + complexity_related = True, min_change = None): + self.name = name + self.type = type + self.lower = lower + self.upper = upper + self.init = init + self.change_type = change_type + self.complexity_related = complexity_related + # default setting of min_change: if type is int, min_change + # should be 1, otherwise +inf + if min_change is None: + if self.type == int: + self.min_change = 1.0 #minimum change required, + else: + self.min_change = float('+inf') + else: + self.min_change = min_change + + +def config_space(estimator, data_size, objective_name = "regression"): + CS = {} + n_estimators_upper = min(32768,int(data_size)) + max_leaves_upper = min(32768,int(data_size)) + # exp_max_depth_upper = min(32768,data_size) + if 'xgboost' in estimator: + CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', + type = int, lower = 4, init = 4, upper = n_estimators_upper, + change_type = 'log') + CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int, + lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log') + CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight', + type = float, lower = 0.001, init = 20.0, upper = 20.0, + change_type = 'log') + + CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', + type = float, lower = 0.01, init = 0.1, upper = 1.0, + change_type = 'log') + CS['subsample'] = ConfigSearchInfo(name = 'subsample', type = float, + lower = 0.6, init = 1.0, upper = 1.0, change_type = 'linear') + CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float, + lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log', + complexity_related = True) + CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float, + lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log') + CS['colsample_bylevel'] = ConfigSearchInfo(name = 'colsample_bylevel', + type = float, lower = 0.6, init = 1.0, upper = 1.0, + change_type = 'linear') + CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree', + type = float, lower = 0.7, init = 1.0, upper = 1.0, + change_type = 'linear') + elif estimator in ('rf', 'extra_tree'): + n_estimators_upper = min(2048, n_estimators_upper) + # max_leaves_upper = min(2048, max_leaves_upper) + CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', + type = int, lower = 4, init = 4, upper = n_estimators_upper, + change_type = 'log') + if objective_name != 'regression': + CS['criterion'] = ConfigSearchInfo(name = 'criterion', + type = int, lower = 1, init = 1, upper = 2, + change_type = 'log') + + # CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int, + # lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log', + # complexity_related = True) + + CS['max_features'] = ConfigSearchInfo(name = 'max_features', type = float, + lower = 0.1, init = 1.0, upper = 1.0, change_type = 'log') + # CS['min_samples_split'] = ConfigSearchInfo(name = 'min_samples_split', + # type = int, lower = 2, init = 2, upper = 20, change_type = 'log', + # complexity_related = True) + # CS['min_samples_leaf'] = ConfigSearchInfo(name = 'min_samples_leaf', + # type = int, lower = 1, init = 1, upper = 20, change_type = 'log', + # complexity_related = True) + elif 'lgbm' in estimator: + CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int, + lower = 4, init = 4, upper = n_estimators_upper, change_type = 'log') + CS['max_leaves'] = ConfigSearchInfo(name = 
'max_leaves', type = int, + lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log') + CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight', + type = float, lower = 0.001, init = 20, upper = 20.0, + change_type = 'log') + + CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', + type = float, lower = 0.01, init = 0.1, upper = 1.0, + change_type = 'log') + CS['subsample'] = ConfigSearchInfo(name = 'subsample', type = float, + lower = 0.6, init = 1.0, upper = 1.0, change_type = 'log', + complexity_related = True) + CS['log_max_bin'] = ConfigSearchInfo(name = 'log_max_bin', type = int, + lower = 3, init = 8, upper = 10, change_type = 'log', + complexity_related = True) + CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float, + lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log', + complexity_related = True) + CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float, + lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log') + CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree', + type = float, lower = 0.7, init = 1.0, upper = 1.0, + change_type = 'log') + elif 'lr' in estimator: + CS['C'] = ConfigSearchInfo(name = 'C', type =float, lower = 0.03125, + init = 1.0, upper = 32768.0, change_type = 'log', + complexity_related = True) + elif 'catboost' in estimator: + # CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int, + # lower = 4, init = 64, upper = n_estimators_upper, change_type = 'log', + # complexity_related = True) + early_stopping_rounds = max(min(round(1500000/data_size),150), 10) + CS['rounds'] = ConfigSearchInfo(name = 'rounds', type = int, + lower = 10, init = 10, + upper = early_stopping_rounds, change_type = 'log') + # CS['exp_max_depth'] = ConfigSearchInfo(name = 'exp_max_depth', type = int, + # lower = 32, init = 64, upper = 256, change_type = 'log', + # complexity_related = True) + + CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', + type = float, lower = 0.005, init = 0.1, upper = .2, + change_type = 'log') + # CS['l2_leaf_reg'] = ConfigSearchInfo(name = 'l2_leaf_reg', + # type = float, lower = 1, init = 3, upper = 5, + # change_type = 'log') + elif 'nn' == estimator: + CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate', + type = float, lower = 1e-4, init = 3e-4, upper = 3e-2, + change_type = 'log') + CS['weight_decay'] = ConfigSearchInfo(name = 'weight_decay', + type = float, lower = 1e-12, init = 1e-6, upper = .1, + change_type = 'log') + CS['dropout_prob'] = ConfigSearchInfo(name = 'dropout_prob', + type = float, lower = 1.0, init = 1.1, upper = 1.5, + change_type = 'log') + elif 'kneighbor' in estimator: + n_neighbors_upper = min(512,int(data_size/2)) + CS['n_neighbors'] = ConfigSearchInfo(name = 'n_neighbors', type = int, + lower = 1, init = 5, upper = n_neighbors_upper, change_type = 'log') + else: + raise NotImplementedError + + return CS + + +def estimator_size(config, estimator): + if estimator in ['xgboost', 'lgbm', 'rf', 'extra_tree']: + try: + max_leaves = int(round(config['max_leaves'])) + n_estimators = int(round(config['n_estimators'])) + model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)* + n_estimators*8) + except: + model_size = 0 + return model_size + elif 'catboost' in estimator: + # if config is None: raise Exception("config is none") + n_estimators = int(round(config.get('n_estimators',8192))) + max_leaves = int(round(config.get('exp_max_depth',64))) + model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)* + 
n_estimators*8) + return model_size + else: + model_size = 1.0 + # raise NotImplementedError + return model_size + + +def generate_config_ini(estimator, estimator_configspace): + + + config_dic = {} + config_dic_more = {} + config_type_dic = {} + for _, config in estimator_configspace.items(): + name, init = config.name, config.init + type_, complexity_related = config.type, config.complexity_related + config_type_dic[name] = type_ + if complexity_related: + config_dic[name] = init + else: + config_dic_more[name] = init + return config_dic, config_dic_more, {**config_dic, **config_dic_more}, \ + config_type_dic + + +def generate_config_min(estimator,estimator_configspace, max_config_size): + + + config_dic = {} + config_dic_more = {} + for _, config in estimator_configspace.items(): + name, lower = config.name, config.lower + complexity_related = config.complexity_related + if complexity_related: + config_dic[name] = lower + else: + config_dic_more[name] = lower + + return config_dic, config_dic_more, {**config_dic, **config_dic_more} + + +def generate_config_max(estimator, estimator_configspace, max_config_size): + + + config_dic = {} + config_dic_more = {} + for _, config in estimator_configspace.items(): + name, upper = config.name, config.upper + complexity_related = config.complexity_related + if complexity_related: + if name in ('n_estimators', 'max_leaves'): + config_dic[name] = min(upper, max_config_size) + else: + config_dic[name] = upper + else: + config_dic_more[name] = upper + return config_dic, config_dic_more, {**config_dic, **config_dic_more} + + +def get_config_values(config_dic, config_type_dic): + value_list = [] + for k in config_dic.keys(): + org_v = config_dic[k] + if config_type_dic[k] == int: + v = int(round(org_v)) + value_list.append(v) + else: + value_list.append(org_v) + return value_list diff --git a/flaml/training_log.py b/flaml/training_log.py new file mode 100644 index 000000000..8ad5fb3fb --- /dev/null +++ b/flaml/training_log.py @@ -0,0 +1,168 @@ +'''! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
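# ---------------------------------------------------------------------------
# Editor's illustrative example (not part of the patch): how the helpers in
# flaml/space.py above fit together for a single learner. The data_size value
# is arbitrary and the import assumes the flaml package from this patch is
# installed.
from flaml.space import (config_space, estimator_size, generate_config_ini,
                         generate_config_max, generate_config_min)

cs = config_space('lgbm', data_size=10000, objective_name='binary:logistic')
init_primary, init_more, init_all, types = generate_config_ini('lgbm', cs)
min_primary, min_more, min_all = generate_config_min('lgbm', cs, None)
max_primary, max_more, max_all = generate_config_max('lgbm', cs, 10000)

# With the defaults above every lgbm hyperparameter is complexity_related, so
# the '*_more' dictionaries come back empty and the initial configuration is
# the low-complexity point n_estimators=4, max_leaves=4, log_max_bin=8, etc.
# Its model-size estimate, the quantity compared against MEM_THRES during the
# search, is estimator_size(init_all, 'lgbm')
#   = (4*3 + (4-1)*4 + 1) * 4 * 8 = 800.0
print(init_all, estimator_size(init_all, 'lgbm'))
# ---------------------------------------------------------------------------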
+''' + +import json +from typing import IO +from contextlib import contextmanager +import warnings + + +class TrainingLogRecord(object): + + def __init__(self, + record_id: int, + iter_per_learner: int, + logged_metric: float, + trial_time: float, + total_search_time: float, + validation_loss, + config, + best_validation_loss, + best_config, + learner, + sample_size): + self.record_id = record_id + self.iter_per_learner = iter_per_learner + self.logged_metric = logged_metric + self.trial_time = trial_time + self.total_search_time = total_search_time + self.validation_loss = validation_loss + self.config = config + self.best_validation_loss = best_validation_loss + self.best_config = best_config + self.learner = learner + self.sample_size = sample_size + + def dump(self, fp: IO[str]): + d = vars(self) + return json.dump(d, fp) + + @classmethod + def load(cls, json_str: str): + d = json.loads(json_str) + return cls(**d) + + +class TrainingLogCheckPoint(TrainingLogRecord): + + def __init__(self, curr_best_record_id: int): + self.curr_best_record_id = curr_best_record_id + + +class TrainingLogWriter(object): + + def __init__(self, output_filename: str): + self.output_filename = output_filename + self.file = None + self.current_best_loss_record_id = None + self.current_best_loss = float('+inf') + self.current_sample_size = None + self.current_record_id = 0 + + def open(self): + self.file = open(self.output_filename, 'w') + + def append(self, + it_counter: int, + train_loss: float, + trial_time: float, + total_search_time: float, + validation_loss, + config, + best_validation_loss, + best_config, + learner, + sample_size): + if self.file is None: + raise IOError("Call open() to open the outpute file first.") + if validation_loss is None: + raise ValueError('TEST LOSS NONE ERROR!!!') + record = TrainingLogRecord(self.current_record_id, + it_counter, + train_loss, + trial_time, + total_search_time, + validation_loss, + config, + best_validation_loss, + best_config, + learner, + sample_size) + if validation_loss < self.current_best_loss or \ + validation_loss == self.current_best_loss and \ + sample_size > self.current_sample_size: + self.current_best_loss = validation_loss + self.current_sample_size = sample_size + self.current_best_loss_record_id = self.current_record_id + self.current_record_id += 1 + record.dump(self.file) + self.file.write('\n') + self.file.flush() + + def checkpoint(self): + if self.file is None: + raise IOError("Call open() to open the outpute file first.") + if self.current_best_loss_record_id is None: + warnings.warn("checkpoint() called before any record is written, " + "skipped.") + return + record = TrainingLogCheckPoint(self.current_best_loss_record_id) + record.dump(self.file) + self.file.write('\n') + self.file.flush() + + def close(self): + self.file.close() + + +class TrainingLogReader(object): + + def __init__(self, filename: str): + self.filename = filename + self.file = None + + def open(self): + self.file = open(self.filename) + + def records(self): + if self.file is None: + raise IOError("Call open() before reading log file.") + for line in self.file: + data = json.loads(line) + if len(data) == 1: + # Skip checkpoints. 
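+                # A checkpoint line is the single-field JSON written by
+                # TrainingLogWriter.checkpoint() via TrainingLogCheckPoint.dump(),
+                # e.g. {"curr_best_record_id": 3}, whereas a full record
+                # carries every TrainingLogRecord field, so len(data) > 1.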
+ continue + yield TrainingLogRecord(**data) + + def close(self): + self.file.close() + + def get_record(self, record_id) -> TrainingLogRecord: + if self.file is None: + raise IOError("Call open() before reading log file.") + for rec in self.records(): + if rec.record_id == record_id: + return rec + raise ValueError(f"Cannot find record with id {record_id}.") + + +@contextmanager +def training_log_writer(filename: str): + try: + w = TrainingLogWriter(filename) + w.open() + yield w + finally: + w.close() + + +@contextmanager +def training_log_reader(filename: str): + try: + r = TrainingLogReader(filename) + r.open() + yield r + finally: + r.close() diff --git a/flaml/version.py b/flaml/version.py new file mode 100644 index 000000000..40692a7a9 --- /dev/null +++ b/flaml/version.py @@ -0,0 +1 @@ +__version__="0.1.0" diff --git a/notebook/flaml_demo.ipynb b/notebook/flaml_demo.ipynb new file mode 100644 index 000000000..22e6f32fd --- /dev/null +++ b/notebook/flaml_demo.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) 2020 Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Demo of AutoML with FLAML Library\n", + "\n", + "\n", + "## 1. Introduction\n", + "\n", + "FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n", + "with low computational cost. It is fast and cheap. The simple and lightweight design makes it easy \n", + "to use and extend, such as adding new learners. FLAML can \n", + "- serve as an economical AutoML engine,\n", + "- be used as a fast hyperparameter tuning tool, or \n", + "- be embedded in self-tuning software that requires low latency & resource in repetitive\n", + " tuning tasks.\n", + "\n", + "In this notebook, we use one real data example (binary classification) to showcase how to ues FLAML library.\n", + "\n", + "FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the [notebook] option:\n", + "```bash\n", + "pip install flaml[notebook]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 2. Real Data Example\n", + "### Load data and preprocess\n", + "\n", + "Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "load dataset from ./openml_ds1169.pkl\n", + "Dataset name: airlines\n", + "X_train.shape: (404537, 7), y_train.shape: (404537,);\n", + "X_test.shape: (134846, 7), y_test.shape: (134846,)\n" + ] + } + ], + "source": [ + "from flaml.data import load_openml_dataset\n", + "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id = 1169, data_dir = './')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Run FLAML\n", + "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. 
All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' import AutoML class from flaml package '''\n", + "from flaml import AutoML\n", + "automl = AutoML()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','f1','log_loss','mae','mse','r2']\n", + " \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n", + " \"task\": 'classification', # task type \n", + " \"sample\": False, # whether to subsample training data\n", + " \"log_file_name\": 'airlines_experiment.log', # cache directory of flaml log files \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[flaml.automl: 11-22 10:30:17] {649} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 11-22 10:30:17] {654} INFO - Evaluation method: holdout\n", + "[flaml.automl: 11-22 10:30:17] {672} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 11-22 10:30:20] {326} INFO - Using StratifiedKFold\n", + "[flaml.automl: 11-22 10:30:20] {717} INFO - iteration 0 current learner lgbm\n", + "[flaml.automl: 11-22 10:30:22] {782} INFO - at 4.8s,\tbest lgbm's error=0.3748,\tbest lgbm's error=0.3748\n", + "[flaml.automl: 11-22 10:30:22] {717} INFO - iteration 1 current learner lgbm\n", + "[flaml.automl: 11-22 10:30:25] {782} INFO - at 8.3s,\tbest lgbm's error=0.3638,\tbest lgbm's error=0.3638\n", + "[flaml.automl: 11-22 10:30:25] {717} INFO - iteration 2 current learner xgboost\n", + "[flaml.automl: 11-22 10:30:29] {782} INFO - at 11.8s,\tbest xgboost's error=0.3742,\tbest lgbm's error=0.3638\n", + "[flaml.automl: 11-22 10:30:29] {717} INFO - iteration 3 current learner xgboost\n", + "[flaml.automl: 11-22 10:30:39] {782} INFO - at 21.6s,\tbest xgboost's error=0.3742,\tbest lgbm's error=0.3638\n", + "[flaml.automl: 11-22 10:30:39] {717} INFO - iteration 4 current learner lgbm\n", + "[flaml.automl: 11-22 10:30:45] {782} INFO - at 28.4s,\tbest lgbm's error=0.3609,\tbest lgbm's error=0.3609\n", + "[flaml.automl: 11-22 10:30:45] {717} INFO - iteration 5 current learner rf\n", + "[flaml.automl: 11-22 10:31:05] {782} INFO - at 47.8s,\tbest rf's error=0.3882,\tbest lgbm's error=0.3609\n", + "[flaml.automl: 11-22 10:31:05] {717} INFO - iteration 6 current learner lgbm\n", + "[flaml.automl: 11-22 10:31:10] {782} INFO - at 52.8s,\tbest lgbm's error=0.3579,\tbest lgbm's error=0.3579\n", + "[flaml.automl: 11-22 10:31:10] {717} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 11-22 10:31:13] {782} INFO - at 56.3s,\tbest lgbm's error=0.3474,\tbest lgbm's error=0.3474\n", + "[flaml.automl: 11-22 10:31:13] {717} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 11-22 10:31:16] {782} INFO - at 59.3s,\tbest lgbm's error=0.3474,\tbest lgbm's error=0.3474\n", + "[flaml.automl: 11-22 10:31:16] {717} INFO - iteration 9 current learner 
xgboost\n", + "[flaml.automl: 11-22 10:31:16] {782} INFO - at 59.3s,\tbest xgboost's error=0.3742,\tbest lgbm's error=0.3474\n", + "[flaml.automl: 11-22 10:31:16] {803} INFO - LGBMClassifier(colsample_bytree=0.7, learning_rate=0.7508368515284745,\n", + " max_bin=1023, min_child_weight=2.57801629551926, n_estimators=15,\n", + " num_leaves=17, objective='binary',\n", + " reg_alpha=1.832070610572943e-10, reg_lambda=0.3606535801605071,\n", + " subsample=0.699879308565092)\n", + "[flaml.automl: 11-22 10:31:16] {691} INFO - fit succeeded\n" + ] + } + ], + "source": [ + "'''The main flaml automl API'''\n", + "automl.fit(X_train = X_train, y_train = y_train, **settings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Best model and metric" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best ML leaner: lgbm\nBest hyperparmeter config: {'n_estimators': 15.192392947194397, 'max_leaves': 17.203047569920084, 'min_child_weight': 2.57801629551926, 'learning_rate': 0.7508368515284745, 'subsample': 0.699879308565092, 'log_max_bin': 10.0, 'reg_alpha': 1.832070610572943e-10, 'reg_lambda': 0.3606535801605071, 'colsample_bytree': 0.7}\nBest accuracy on validation data: 0.6526\nTraining duration of best run: 3.485 s\n" + ] + } + ], + "source": [ + "''' retrieve best config and best learner'''\n", + "print('Best ML leaner:', automl.best_estimator)\n", + "print('Best hyperparmeter config:', automl.best_config)\n", + "print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\n", + "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LGBMClassifier(colsample_bytree=0.7, learning_rate=0.7508368515284745,\n", + " max_bin=1023, min_child_weight=2.57801629551926, n_estimators=15,\n", + " num_leaves=17, objective='binary',\n", + " reg_alpha=1.832070610572943e-10, reg_lambda=0.3606535801605071,\n", + " subsample=0.699879308565092)" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "automl.model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' pickle and save the best model '''\n", + "import pickle\n", + "with open('best_model.pkl', 'wb') as f:\n", + " pickle.dump(automl.model, f, pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Predicted labels [1 0 1 ... 1 0 0]\nTrue labels [0 0 0 ... 
0 1 0]\n" + ] + } + ], + "source": [ + "''' compute predictions of testing dataset ''' \n", + "y_pred = automl.predict(X_test)\n", + "print('Predicted labels', y_pred)\n", + "print('True labels', y_test)\n", + "y_pred_proba = automl.predict_proba(X_test)[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "accuracy = 0.6529077614463907\n", + "roc_auc = 0.6991857682861167\n", + "log_loss = 0.6207438299015984\n", + "f1 = 0.5468504928063822\n" + ] + } + ], + "source": [ + "''' compute different metric values on testing dataset'''\n", + "from flaml.ml import sklearn_metric_loss_score\n", + "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n", + "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n", + "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))\n", + "print('f1', '=', 1 - sklearn_metric_loss_score('f1', y_pred, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Log history" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20, 'learning_rate': 0.1, 'subsample': 1.0, 'log_max_bin': 8, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 20.0, 'learning_rate': 0.1, 'subsample': 1.0, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 19.407062304628294, 'max_leaves': 4.089279288647953, 'min_child_weight': 20.0, 'learning_rate': 0.0666254583409074, 'subsample': 0.8953637096714, 'reg_alpha': 1e-10, 'reg_lambda': 0.5362533759049211, 'colsample_bylevel': 0.7360077369961437, 
'colsample_bytree': 0.8727182620355596}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'xgboost', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4.0, 'max_leaves': 4.0, 'min_child_weight': 8.75767132182036, 'learning_rate': 0.15009277608016236, 'subsample': 1.0, 'reg_alpha': 1.984021711625501e-10, 'reg_lambda': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 23.12482594118475, 'max_leaves': 13.137233040107322, 'min_child_weight': 16.737337377896562, 'learning_rate': 0.04193971066903862, 'subsample': 1.0, 'log_max_bin': 10.0, 'reg_alpha': 5.090318687618562e-10, 'reg_lambda': 0.6027318887059488, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 14.910702833861574, 'max_leaves': 11.190080057422913, 'min_child_weight': 20.0, 'learning_rate': 0.08941779365546668, 'subsample': 1.0, 'log_max_bin': 8.148457575491062, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 9.614301943945227, 'max_leaves': 9.53152702013049, 'min_child_weight': 20.0, 'learning_rate': 0.19064370484830762, 'subsample': 0.6629253372107331, 'log_max_bin': 4.903424989804441, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 9.614301943945227, 'max_leaves': 9.53152702013049, 'min_child_weight': 20.0, 'learning_rate': 0.19064370484830762, 'subsample': 0.6629253372107331, 'log_max_bin': 4.903424989804441, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'rf', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 4, 'criterion': 1, 'max_features': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 9.614301943945227, 'max_leaves': 9.53152702013049, 'min_child_weight': 20.0, 'learning_rate': 0.19064370484830762, 'subsample': 0.6629253372107331, 'log_max_bin': 4.903424989804441, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 12.936352853193041, 'max_leaves': 5.9593857069945635, 'min_child_weight': 20.0, 'learning_rate': 0.0681181870320628, 'subsample': 0.6088343986090358, 'log_max_bin': 5.171176894908052, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 9.614301943945227, 'max_leaves': 9.53152702013049, 'min_child_weight': 20.0, 'learning_rate': 0.19064370484830762, 'subsample': 0.6629253372107331, 'log_max_bin': 4.903424989804441, 'reg_alpha': 1e-10, 'reg_lambda': 1.0, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 7.145352551707305, 'max_leaves': 
15.244861098493168, 'min_child_weight': 6.579398853250162, 'learning_rate': 0.533558272494558, 'subsample': 0.7218219005364884, 'log_max_bin': 4.6495366759381, 'reg_alpha': 1.0256261116727895e-10, 'reg_lambda': 0.4566694152359654, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 7.145352551707305, 'max_leaves': 15.244861098493168, 'min_child_weight': 6.579398853250162, 'learning_rate': 0.533558272494558, 'subsample': 0.7218219005364884, 'log_max_bin': 4.6495366759381, 'reg_alpha': 1.0256261116727895e-10, 'reg_lambda': 0.4566694152359654, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 15.192392947194397, 'max_leaves': 17.203047569920084, 'min_child_weight': 2.57801629551926, 'learning_rate': 0.7508368515284745, 'subsample': 0.699879308565092, 'log_max_bin': 10.0, 'reg_alpha': 1.832070610572943e-10, 'reg_lambda': 0.3606535801605071, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 15.192392947194397, 'max_leaves': 17.203047569920084, 'min_child_weight': 2.57801629551926, 'learning_rate': 0.7508368515284745, 'subsample': 0.699879308565092, 'log_max_bin': 10.0, 'reg_alpha': 1.832070610572943e-10, 'reg_lambda': 0.3606535801605071, 'colsample_bytree': 0.7}}\n{'Current Learner': 'lgbm', 'Current Sample': 404536, 'Current Hyper-parameters': {'n_estimators': 15.192392947194397, 'max_leaves': 17.203047569920084, 'min_child_weight': 2.57801629551926, 'learning_rate': 0.7508368515284745, 'subsample': 0.699879308565092, 'log_max_bin': 10.0, 'reg_alpha': 1.832070610572943e-10, 'reg_lambda': 0.3606535801605071, 'colsample_bytree': 0.7}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 15.192392947194397, 'max_leaves': 17.203047569920084, 'min_child_weight': 2.57801629551926, 'learning_rate': 0.7508368515284745, 'subsample': 0.699879308565092, 'log_max_bin': 10.0, 'reg_alpha': 1.832070610572943e-10, 'reg_lambda': 0.3606535801605071, 'colsample_bytree': 0.7}}\n" + ] + } + ], + "source": [ + "from flaml import get_output_from_log\n", + "time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n", + " get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n", + "\n", + "for config in config_history:\n", + " print(config)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3deZxU1Z338c+XBhTcAEXDpmAEIkYFbcUsJmpMwCWCxn2MThaNz8TEJBMy6IwZTZ5MTJisEw2PEhKzGKMGcBkDOiow4xihSSOLiCIhQIMsKoqIsvTv+ePewrKs7q6Crq6u6u/79epX1T333HvPwbJ+de7yO4oIzMzMCtWp3A0wM7PK4sBhZmZFceAwM7OiOHCYmVlRHDjMzKwoDhxmZlYUBw6zVibpZElLy90Os1Jx4LCqImmFpNPL2YaI+O+IGFqq/UsaJWm2pM2SNkiaJemcUh3PLJcDh1mRJNWU8djnA/cAvwb6A4cA3wQ+uRv7kiR/B1jR/KGxDkFSJ0njJb0g6SVJd0vqlbX+HkkvSno1/TV/VNa6X0n6uaSHJG0BTk1HNl+XtCDd5g+S9k7rnyJpddb2TdZN139D0lpJayR9XlJIOiJPHwT8EPh2REyKiFcjojEiZkXElWmdGyX9Nmubgen+OqfLMyV9R9ITwBvA9ZLqco7zVUn3p+/3kvTvklZKWidpoqRue/ifwyqcA4d1FF8GxgIfBfoCrwC3ZK3/EzAYOBj4C/C7nO0vBb4D7Af8T1p2ITAaGAQcA/x9M8fPW1fSaOBrwOnAEWn7mjIUGADc20ydQnwauIqkL/8BDJU0OGv9pcCd6fvvAUOA4Wn7+pGMcKwDc+CwjuILwD9HxOqIeAu4ETg/80s8IiZHxOasdcdKOiBr+/si4on0F/6badlPI2JNRLwMPEDy5dqUpupeCPwyIhZHxBvATc3s48D0dW3Bvc7vV+nxdkTEq8B9wCUAaQB5H3B/OsK5EvhqRLwcEZuBfwMu3sPjW4Vz4LCO4jBgqqRNkjYBS4CdwCGSaiTdnJ7Geg1YkW5zUNb2q/Ls88Ws928A+zZz/Kbq9s3Zd77jZLyUvvZppk4hco9xJ2ngIBltTEuDWG+gOzAv699telpuHZgDh3UUq4AzIqJH1t/eEdFA8mU5huR00QHAwHQbZW1fqjTSa0kucmcMaKbuUpJ+fKqZOltIvuwz3pOnTm5fHgYOkjScJIBkTlNtBLYCR2X9mx0QEc0FSOsAHDisGnWRtHfWX2dgIvAdSYcBSOotaUxafz/gLZJf9N1JTse0lbuBz0g6UlJ3mrl+EMkcCF8DbpD0GUn7pxf9PyzptrTafOAjkg5NT7Vd11IDImIHyXWTCUAv4JG0vBG4HfiRpIMBJPWTNGq3e2tVwYHDqtFDJL+UM383Aj8B7gcelrQZ+DMwMq3/a+BvQAPwTLquTUTEn4CfAo8Dy4An01VvNVH/XuAi4LPAGmAd8H9JrlMQEY8AfwAWAPOABwtsyp0kI6570kCS8U9pu/6cnsb7L5KL9NaByRM5mbUfko4EFgF75XyBm7UbHnGYlZmkcyV1ldST5PbXBxw0rD1z4DArvy8AG4AXSO70+j/lbY5Z83yqyszMiuIRh5mZFaVzuRvQFg466KAYOHBguZthZlZR5s2btzEi3vXAZ4cIHAMHDqSurq7limZmtoukv+Ur96kqMzMrigOHmZkVxYHDzMyK4sBhZmZFceAwM7OidIi7qszMKsm0+gYmzFjKmk1b6dujG+NGDWXsiH5tvo+mOHCYmbUj0+obuG7KQrZu3wlAw6atXDdlIUDBX/ytsY/mOHCYmbUjE2Ys3fWFn7F1+05umLaI5RteL2gfv3xiRd59TJix1IHDzKzarNm0NW/55rd28B+PLytoH02lIGxq38Vy4DAza0f69uhGQ54v+H49uvHE+NMK2seHbn4s7z769ui2x+0D31VlZtaujBs1lBrpHWXdutQwblThEy+OGzWUbl1q9mgfzfGIw8ysHfnYkQfTqRPsVVPD1m07d+uOqExd31VlZtYBPLhgLdt3BvdcfRLDB/TY7f2MHdGv1QJFLp+qMjNrR/4wdxVDDtmXY/sfUO6mNMmBw8ysnXhu3Wbmr9rEhbUDUM51jvbEgcPMrJ24e+4qOncS55boFFNrceAwM2sHtu1oZGp9A6cfeQgH7rtXuZvTLAcOM7N24LFn1/HSlm1cdMKAcjelRSUNHJJGS1oqaZmk8U3UOUXSfEmLJc3KKl8haWG6ri6rvJekRyQ9n772LGUfzMzawt11qzlk/704efBB5W5Ki0oWOCTVALcAZwDDgEskDcup0wO4FTgnIo4CLsjZzakRMTwiarPKxgOPRsRg4NF02cysYr346pvMXLqe84/vT+ea9n8iqJQtPBFYFhHLI2IbcBcwJqfOpcCUiFgJEBHrC9jvGOCO9P0dwNhWaq+ZWVn88S+raQy44Pj2f5oKShs4+gGrspZXp2XZhgA9Jc2UNE/S5VnrAng4Lb8qq/yQiFgLkL4enO/gkq6SVCepbsOGDXvcGTOzUogI7qlbxchBvRh40D7lbk5BSvnkeL6bkHNzNnYGjgc+BnQDnpT054h4DvhQRKyRdDDwiKRnI2J2oQePiNuA2wBqa2ubyBVpZlZec/76MiteeoMvnTa43E0pWClHHKuB7HFXf2BNnjrTI2JLRGwEZgPHAkTEmvR1PTCV5NQXwDpJfQDS10JOb5mZtUt3161m3706c8bR7yl3UwpWysAxFxgsaZCkrsDFwP05de4DTpbUWVJ3YCSwRNI+kvYDkLQP8AlgUbrN/cAV6fsr0n2YmVWczW9u56GFa/nksX3p3rVyUgeWrKURsUPSNcAMoAaYHBGLJV2drp8YEUskTQcWAI3ApIhYJOlwYGr6yH1n4M6ImJ7u+mbgbkmfA1by7juxzMwqwoML1rJ1+04urO1f7qYURdHUVFFVpLa2Nurq6lquaGbWhsbe8gRvbNvBjK98pF3mppI0L+dxCMBPjpuZlUWlJDTMx4HDzKwMKiWhYT4OHGZmbaySEhrm48BhZtbGKimhYT4OHGZmbaySEhrm48BhZtaGMgkNP3VcZSQ0zKcyW21mVqEyCQ0vrK3M01TgwGFm1mYyCQ1PrKCEhvk4cJiZtZFMQsOLKni0AQ4cZmZtphITGubjwGFm1gYqNaFhPg4cZmZtoFITGubjwGFm1gb+MHcVQw7Zl+EDepS7KXvMgcPMrMQqOaFhPg4cZmYllkloOLYCExrmU9LAIWm0pKWSlkka30SdUyTNl7RY0qycdTWS6iU9mFV2o6SGdJv5ks4sZR/MzPZEdkLDgyowoWE+Jbu0L6kGuAX4OMnc4nMl3R8Rz2TV6QHcCoyOiJWSDs7ZzbXAEmD/nPIfRcS/l6rtZmatJZPQ8MITKv+ieEYpRxwnAssiYnlEbAPuAsbk1LkUmBIRKwEiYn1mhaT+wFnApBK20cyspDIJDT8yuHe5m9JqShk4+gGrspZXp2XZhgA9Jc2UNE/S5Vnrfgx8g2Qu8l
zXSFogabKknvkOLukqSXWS6jZs2LAH3TAz2z3rXqv8hIb5lLIn+W4dyJ3gvDNwPMnIYhRwg6Qhks4G1kfEvDz7+DnwXmA4sBb4Qb6DR8RtEVEbEbW9e1dPpDezynHvvMpPaJhPKR9fXA1k/2v1B9bkqbMxIrYAWyTNBo4FjgPOSS987w3sL+m3EXFZRKzLbCzpduBBzMzamWpJaJhPKUccc4HBkgZJ6gpcDNyfU+c+4GRJnSV1B0YCSyLiuojoHxED0+0ei4jLACT1ydr+XGBRCftgZrZbqiWhYT4lG3FExA5J1wAzgBpgckQslnR1un5iRCyRNB1YQHItY1JEtBQIvi9pOMlprxXAF0rVBzOz3VUtCQ3zKWmmrYh4CHgop2xizvIEYEIz+5gJzMxa/nSrNtLMrJVlEhqOHVH5CQ3zqZ7L/GZm7cTbCQ2r7zQVOHCYmbW6P8xdxeCDqyOhYT4OHGZmrSiT0PCiE6ojoWE+DhxmZq2o2hIa5uPAYWbWSqoxoWE+DhxmZq1gWn0DJ/3bo7y0ZRtzV7zMtPqGcjepZKrvPjEzszY2rb6B66YsZOv2nQC8tGUb101ZCFCVp6w84jAz20MTZizdFTQytm7fyYQZS8vUotJqMXBI6tUWDTEzq1QNm7bmLV/TRHmlK2TE8ZSkeySdqWq9t8zMbDdtfP0tOnfK/9XYt0e3Nm5N2ygkcAwBbgM+DSyT9G+ShpS2WWZm7d+mN7Zx2aSnkKBrznwb3brUMG7U0DK1rLRaDByReCQiLgE+D1wBzJE0S9IHSt5CM7N26LU3t3P55Dks37CFyX9/At8//xj69eiGgH49uvHd846uygvjUMBdVZIOBC4jGXGsA75Ekh59OHAPMKiUDTQza2/e2LaDz/5yLs+seY2Jlx3Pyem0sNUaKHIVcjvuk8BvgLERsTqrvE7SxCa2MTOrSm9u38nn76jjLytf4T8uOY7Thx1S7ia1uUICx9CIyJ3yFYCI+F4rt8fMrN3atqOR//PbeTy5/CV+cMGxnHVMn5Y3qkKFXBx/WNKuFI+SekqaUcjOJY2WtFTSMknjm6hziqT5khZLmpWzrkZSvaQHs8p6SXpE0vPpa89C2mJmtid27Gzky7+v5/GlG/jO2KM577j+5W5S2RQSOHpHxKbMQkS8Ahzc0kaSaoBbgDOAYcAlkobl1OkB3AqcExFHARfk7OZaYElO2Xjg0YgYDDyaLpuZlczOxuAf73ma6Ytf5JtnD+PSkYeWu0llVUjg2Clp17+SpMNIpm1tyYnAsohYHhHbgLuAMTl1LgWmRMRKgIhYn3Wc/sBZwKScbcYAd6Tv7wDGFtAWM7Pd0tgYXD9lIffNX8M3Rg/lsx/2/UCFXOP4Z+B/sk4jfQS4qoDt+gGrspZXAyNz6gwBukiaCewH/CQifp2u+zHwjbQ82yERsRYgItZKyjv6kXRVpp2HHtqxfx2Y2e6JCG56YDF/qFvFl087gn845YhyN6ldaDFwRMR0SccBJwECvhoRGwvYd75HKXNHKp2B44GPAd2AJyX9mSSgrI+IeZJOKeBY+dp9G8mDi9TW1hYyQjIz2yUiuPlPz3LHk3/jypMH8dWP+7nnjEKz4+4E1gN7A8MkERGzW9hmNZA94W5/YE2eOhsjYguwRdJs4FjgOOAcSWemx9xf0m8j4jJgnaQ+6WijT9ous5KaVt/AhBlLWbNpK317dGPcqKEd5p79jurH//U8/2/2ci476VCuP/PIqp3Nb3cUkuTw88BsYAZwU/p6YwH7ngsMljRIUlfgYpIHB7PdB5wsqbOk7iSnspZExHUR0T8iBqbbPZYGDdJ9XJG+vyLdh1nJZFJmN2zaSpAktLtuysKqnm+ho/v5zBf4yaPPc/7x/fnWOe930MhRyIjjWuAE4M8Rcaqk95EEkGZFxA5J15AEmhpgckQslnR1un5iRCyRNB1YADQCkyJiUQu7vhm4W9LngJW8+04ss1bVVMrsr/xhPt+4d0GZWmWltG1nI2cf04fvfeoYOjWRwLAjKyRwvBkRb0pC0l4R8aykgjJ3RcRDwEM5ZRNzlicAE5rZx0xgZtbySyTXRMzaRFMpswE+d7LvsKlGB+27F5d/4DBqHDTyKiRwrE6ft5gGPCLpFd59rcKsKv3XM+uaXNevRzf+afT72rA1Zu1DIXdVnZu+vVHS48ABwPSStsqsHbh33mr+6Y8LGNCzGxs2v8WbOxp3ravmlNlmLWn24rikTpJ2XXOIiFkRcX/6QJ9Z1bp99nK+fs/TnHR4L/70lY9w86c6Tspss5Y0O+KIiEZJT0s6NPN0t1k1iwi+N30pE2e9wFlH9+GHFx3LXp1rGDuinwOFWaqQaxx9gMWS5gBbMoURcU7JWmVWBjt2NnL91IXcXbeavxt5KN8a835fHDXLo5DA0eKtt2aV7s3tO/ny7+t5+Jl1fPljg/nq6YN9775ZEwq5OD6rpTpmley1N7dz5R11zFnxMjedcxRXfHBguZtk1q4VMnXsZt7OMdUV6AJsiYj9S9kws7awYfNbXDF5Ds+t28yPLxrOmOG+jmHWkkJGHO/ITitpLEnKdLOKtvKlN/j05KdY/9pb/OLvT+CjQ3qXu0lmFaHQJIe7RMS0pmbzs6Y5SV77smTta1w+eQ7bdzZy55UjGXGoJ5I0K1Qhp6rOy1rsBNRS2EROlsokycvkO8okyQMcPMpg7oqX+eyv5rLvXp258/MfYPAhuVO+mFlzChlxfDLr/Q5gBe+eyc+a0VSSvJseWOzbPdvYS6+/xXf/9Cz9enbjN58bSb8e3crdJLOKU8g1js+0RUOq2ZomkuS98sZ2vvT7+jZujR3T/wB++fcncOC+e5W7KWYVqZBTVXcA10bEpnS5J/CDiPhsqRtXLfr26JY3w+rB++3FnVfmzqZrpSUGHtidzjUtTkVjZk0o5FTVMZmgARARr0gaUcI2VZ0vnXYE49NrGhndutRw/ZlHcsTBPr9uZpWlkJ9dndJRBgCSelHg3ViSRktaKmlZU3diSTpF0nxJiyXNSsv2ljQnzZO1WNJNWfVvlNSQbjM/nV62Xcv8uj1o365OkldG0+ob+NDNjzFo/H/yoZsf8wx+ZrupkADwA+B/Jd1LcjfVhcB3WtpIUg1wC/BxkrnF50q6PyKeyarTA7gVGB0RKyUdnK56CzgtIl6X1AX4H0l/iog/p+t/FBH/XmAfy25afQMDenVj9rhTncaiTHxnm1nraXHEERG/Bj4FrAM2AOdFxG8K2PeJwLKIWJ6mYb+Ld9+NdSkwJZN5NyLWp68REa+ndbqkfxV5C/CLr77JEy9s5Nzh/Rw0yqipO9smzFhaphaZVa4WA4ekk4BVEfGziPgPYJWkQq7o9gNWZS2vTsuyDQF6SpopaZ6ky7OOWyNpPrAeeCQinsra7hpJCyRNzj6NltPuqyTVSarbsGFDAc0tjfvmNxDhX7Xl1tSdbU2Vm1nTCrnG8XPg9azlLWlZS/L9vM4dNXQGjgfOAkYBN0gaAhAROyNiONAfOFHS+7Pa815gOLCW5FTauw8UcVtE1EZEbe/e5
UslMbW+gWMH9ODw3vuWrQ2W3NlWTLmZNa2QwKGI2PWFHxGNFDhXOTAga7k/756rfDUwPSK2RMRGYDZwbHaF9I6umcDodHldGlQagdtpx3mzlqx9jWdf3Mx5Hm2U3bhRQ+nWpeYdZZ7+1Wz3FBI4lkv6sqQu6d+1wPICtpsLDJY0SFJX4GLg/pw69wEnS+osqTswElgiqXd64RxJ3YDTgWfT5T5Z258LLKKdmlbfQOdO4uxj+rRc2Upq7Ih+fPe8oz39q1krKGTkcDXwU+BfSE41PQpc2dJGEbFD0jXADKAGmBwRiyVdna6fGBFLJE0HFgCNwKSIWCTpGOCO9M6sTsDdEfFguuvvSxqetmUF8IXCu9t2djYG0+Y38NEhvf2Ecjvh6V/NWkchKUfWk4wWgF0jgLOBewrY9iHgoZyyiTnLE4AJOWULgLwPGUbEp1s6bnvw5Asvse61t7jhbH9RmVl1KSjvQnqH0xmSfg38FbiotM2qfFPrG9hvr86cfuQh5W6KmVmranbEIekjJM9anAXMAT4EHB4Rb7RB2yrW1m07mb5oLWcd04e9cy7ImplVuiYDh6TVwEqS21/HRcRmSX910GjZw8+8yJZtOzl3RP9yN8XMrNU1d6rqjyQP7F0EfFLSPlTo09ttbWp9A30P2JuRg3qVuylmZq2uycAREdcCA4EfAqcCzwG9JV0oyU+zNWHD5rf47+c3MmZEPzp5kiYzq0LNXhxPc0Y9FhFXkgSRS4GxJLfBWh4PPL2GnY3hh/7MrGoVlB4dICK2Aw8AD6S35FoeU+sbOKrv/p7H2syq1m5NgxYRzgyXx7L1m1nY8CrnerRhZlXM82e2oqn1DXQSnDO8b7mbYmZWMg4craSxMZhWv4YPD+7NwfvtXe7mmJmVTIvXONI05+OAw7LrR8RpJWxXxZm74mUaNm11tlUzq3qFXBy/B5hIksJ8Zwt1O6yp9Q1071rDJ45yihEzq26FBI4dEVHIxE0d1pvbd/KfC9cy+qj30L1rwTeqmZlVpEKucTwg6R8k9ZHUK/NX8pZVkMeeXc/mN3dw7nG+m8rMql8hP4+vSF/HZZUFcHjrN6cyTflLAwfvtxcffO9B5W6KmVnJtTjiiIhBef4KChqSRktaKmmZpPFN1DlF0nxJiyXNSsv2ljRH0tNp+U1Z9XtJekTS8+lrz0I7Wwovb9nGzKXrGTO8LzVOMWJmHUCLgSOdLvbLku5N/66R1KWA7WqAW4AzgGHAJZKG5dTpAdwKnBMRRwEXpKveAk6LiGOB4cBoSSel68YDj0bEYJLZCPMGpLbynwvWsKMxnAnXzDqMQq5x/Bw4nuQL/tb0fSEXy08ElkXE8ojYBtwFjMmpcykwJSJWwq7ZBjM5sl5P63RJ/zKZeccAd6Tv7yDJnVU2U+obGHrIfhzZxylGzKxjKOQaxwnpL/+MxyQ9XcB2/YBVWcurgZE5dYYAXSTNBPYDfhIRv4ZdI5Z5wBHALRHxVLrNIRGxFiAi1ko6uIC2lMSKjVuoX7mJ8We8D8mnqcysYyhkxLFT0nszC5IOp7DnOfJ9k+bO59GZZARzFjAKuCF94JCI2BkRw4H+wImS3l/AMd8+uHSVpDpJdRs2bChm04JNrW9AgjFOMWJmHUghI45xwOOSlpMEg8OAzxSw3WpgQNZyf2BNnjobI2ILsEXSbOBYkrk/AIiITemIZDSwCFgnqU862ugDrM938Ii4DbgNoLa2ttUnoIoIps1v4AOHH0ifA5ws2Mw6jkLuqnoUGAx8Of0bGhGPF7DvucBgSYMkdQUuBu7PqXMfcLKkzpK6k5zKWiKpd3rhnDSF++nAs+k29/P2LcJXpPtoc39ZuYm/vfSGM+GaWYfT3Jzjp0XEY5LOy1n1XklExJTmdhwROyRdA8wAaoDJEbFY0tXp+okRsUTSdGAB0AhMiohFko4B7kivc3QC7o6IB9Nd3wzcLelzJHOiX0AZTK1fzd5dOjH6/e8px+HNzMqmuVNVHwUeAz6ZZ10AzQYOgIh4CHgop2xizvIEYEJO2QJgRBP7fAn4WEvHLqVtOxp5cMFaPj7sPey3d4t3JpuZVZUmA0dE/Gv69lsR8dfsdZIGlbRV7dzMpevZ9MZ2Tw9rZh1SIXdV/TFP2b2t3ZBKMrW+gQP36crJg51ixMw6nuaucbwPOAo4IOc6x/5Ah52p6NWt23l0yXouHXkonWs8D5aZdTzNXeMYCpwN9OCd1zk2A1eWslHt2UML17JtZyPnOROumXVQzV3juA+4T9IHIuLJNmxTuza1voHDe+/D0f0OKHdTzMzKopAHAOslfZHktNWuU1QR8dmStaqdWvXyG8z568t8/RNDnGLEzDqsQk7S/wZ4D0lKkFkkT4BvLmWj2qv75jcAMGa4T1OZWcdVSOA4IiJuALZExB0keaWOLm2z2p+IYGp9AycO7MWAXt3L3Rwzs7IpJHBsT183pYkGDwAGlqxF7dTChld5YcMWTw9rZh1eIdc4bktn2buBJE/UvsA3S9qqdmhqfQNdazpx5tF9yt0UM7OyajFwRMSk9O0sOug84zt2NvLA02v42JEHc0A3pxgxs46tuQcAv9bchhHxw9ZvTvv0389vZOPr25wJ16zKTatvYMKMpazZtJW+PboxbtRQxvr/+3dpbsSRmQt1KHACb6dE/yQwu5SNam+m1jfQo3sXThlatskG36GaP9zV3Ddr36bVN3DdlIVs3Z7MU9ewaSvXTVkI4M9gjuYeALwJQNLDwHERsTldvhG4p01a1w68/tYOHn7mRc4/vj9dO5c/xUg1f7iruW/W/k2YsXTXZy9j6/adTJix1J+/HIV8Ex4KbMta3kYHuqvqTwvX8ub2Rs4d0b/cTQGa/3BXumrum7V/azZtLaq8IyvkrqrfAHMkTSWZh+Nc4NclbVU7kDll0rBpKzWdxMqXtnD8YT3L3ayq/nBXc9+s/evboxsNeT5rfXt4auhchUwd+x2SOcZfATYBn4mIfytk55JGS1oqaZmk8U3UOUXSfEmLJc1KywZIelzSkrT82qz6N0pqSLeZL+nMQtpSjMwpk8yHaGdjcP3URUyrb2jtQxWtqQ9xNXy4q7lv1v6NGzWUbl1q3lHWrUsN40YNLVOL2q8mA4ek/dPXXsAKkpHHb4C/pWXNSqd9vQU4AxgGXCJpWE6dHsCtwDkRcRRvTwO7A/jHiDgSOAn4Ys62P4qI4enfO2YYbA3t+ZRJNX+4q7lv1v6NHdGP7553NP16dENAvx7d+O55R/v6Rh7Nnaq6kySt+jySU1QZSpdbeqbjRGBZRCwHkHQXMAZ4JqvOpcCUiFgJEBHr09e1wNr0/WZJS4B+OduWTHs+ZZL5EFfjnUfV3DerDGNH9PPnrQDN3VV1dvq6u9PE9gNWZS2vBkbm1BkCdJE0k+T2359ExDuun0gaSDL/+FNZxddIuhyoIxmZvJJ7cElXAVcBHHrooUU1vL2f66zmD3c1982sWjR3quq45v4K2He+vOORs9wZOJ4kceIo4AZJQ7LasC/J1LVfiYjX0uKfA+8FhpOMSn6Q7+ARcVtE
1EZEbe/evQto7tt8ysTMrGnNnarK+4WcCuC0Fva9GhiQtdwfWJOnzsaI2AJskTQbOBZ4TlIXkqDxu4iYsuvAEesy7yXdDjzYQjuK5lMmZmZNa+5U1al7uO+5wGBJg4AG4GKSaxrZ7gN+Jqkz0JXkVNaPlMyS9AtgSW5qE0l90msgkNwavGgP25mXT5mYmeVXyHMcpOnUh/HOGQCbfZYjInZIugaYAdQAkyNisaSr0/UTI2KJpOnAAqARmBQRiyR9GPg0sFDS/HSX16d3UH1f0nCSUc8K4AuFd9fMzPaUInIvO+RUkP4VOIUkcDxEcnvt/0TE+SVvXSupra2Nurq6cjfDzKyiSJoXEbW55YWkHDkf+BjwYkR8huQaxF6t3D4zM6sQhQSOrRHRCOxIHwpcTwedl8PMzAq7xlGXPuF9O8nDgK8Dc0raKqt4To9uVr2am8jpZ9dxy/8AAA4KSURBVMCdEfEPadHE9EL2/hGxoE1aZxXJ6dHNqltzp6qeB34gaYWk70kaHhErHDSsJe0515eZ7bkmA0dE/CQiPgB8FHgZ+GWarfab2U93m+Vqz7m+zGzPFZJW/W8R8b2IGEHyAN+5wJKSt8wqltOjm1W3FgOHpC6SPinpd8CfgOeAT5W8ZVaxnOvLrLo1d3H848AlJAkI5wB3AVeleaXMmuRcX2bVrbnbca8nmZPj6xHxchu1x6qEc32ZVa9SJjk0M7MqVMiT42ZmZrs4cJiZWVEcOMzMrCgOHGZmVpSSBg5JoyUtlbRM0vgm6pwiab6kxZJmpWUDJD2ePqm+WNK1WfV7SXpE0vPpa89S9sHMzN6pZIFDUg1wC8nET8OASyQNy6nTA7gVOCcijgIuSFftAP4xIo4ETgK+mLXteODRiBgMPJoum5lZGynliONEYFlELI+IbSQPEI7JqXMpMCUiVgJExPr0dW1E/CV9v5kkxUnmoYAxwB3p+zuAsSXsg5mZ5Shl4OgHrMpaXs3bX/4ZQ4CekmZKmifp8tydSBoIjACeSosOiYi1kAQY4OB8B5d0laQ6SXUbNmzYo46YmdnbShk4lKcsd4LzzsDxJGlNRgE3ZGfelbQv8EfgKxHxWjEHj4jbIqI2Imp79+5dXMvNzKxJpQwcq4EBWcv9gTV56kyPiC0RsRGYTTKnOZK6kASN30XElKxt1knqk9bpQzKVrZmZtZFSBo65wGBJgyR1BS4G7s+pcx9wsqTOkroDI4ElkgT8AlgSET/M2eZ+4Ir0/RXpPszMrI0UMuf4bomIHZKuAWYANcDkiFgs6ep0/cSIWJJOR7sAaAQmRcQiSR8GPg0slDQ/3eX1EfEQcDNwt6TPASt5+04sMzNrA4rIvexQfWpra6Ourq7czTAzqyiS5kVEbW65nxw3M7OiOHCYmVlRHDjMzKwoDhxmZlYUBw4zMyuKA4eZmRXFgcPMzIriwGFmZkVx4DAzs6I4cJiZWVEcOMzMrCgOHGZmVhQHDjMzK4oDh5mZFcWBw8zMilLSwCFptKSlkpZJGt9EnVMkzZe0WNKsrPLJktZLWpRT/0ZJDek28yWdWco+mJnZO5UscEiqAW4BzgCGAZdIGpZTpwdwK3BORBzFO2fz+xUwuond/ygihqd/D7V6483MrEmlHHGcCCyLiOURsQ24CxiTU+dSYEpErASIiPWZFRExG3i5hO0zM7PdUMrA0Q9YlbW8Oi3LNgToKWmmpHmSLi9w39dIWpCezuqZr4KkqyTVSarbsGFD8a03M7O8Shk4lKcsd4LzzsDxwFnAKOAGSUNa2O/PgfcCw4G1wA/yVYqI2yKiNiJqe/fuXVTDzcysaZ1LuO/VwICs5f7Amjx1NkbEFmCLpNnAscBzTe00ItZl3ku6HXiw1VpsZmYtKuWIYy4wWNIgSV2Bi4H7c+rcB5wsqbOk7sBIYElzO5XUJ2vxXGBRU3XNzKz1lWzEERE7JF0DzABqgMkRsVjS1en6iRGxRNJ0YAHQCEyKiEUAkn4PnAIcJGk18K8R8Qvg+5KGk5z2WgF8oVR9MDOzd1NE7mWH6lNbWxt1dXXlboaZWUWRNC8ianPL/eS4mZkVxYHDzMyK4sBhZmZFceAwM7OiOHCYmVlRHDjMzKwoDhxmZlaUUqYcMTMri2n1DUyYsZQ1m7bSt0c3xo0aytgRuTlWbXc5cJhZVZlW38B1UxaydftOABo2beW6KQsBHDxaiU9VmVlVmTBj6a6gkbF1+04mzFhaphZVHwcOM6sqazZtLarciufAYWZVpW+PbkWVW/EcOMysqowbNZRuXWreUdatSw3jRg0tU4uqjy+Om1lVyVwA911VpePAYWZVZ+yIfg4UJVTSU1WSRktaKmmZpPFN1DlF0nxJiyXNyiqfLGm9pEU59XtJekTS8+lrz1L2wczM3qlkgUNSDXALcAYwDLhE0rCcOj2AW4FzIuIo4IKs1b8CRufZ9Xjg0YgYDDyaLpuZWRsp5YjjRGBZRCyPiG3AXcCYnDqXAlMiYiVARKzPrIiI2cDLefY7BrgjfX8HMLa1G25mZk0rZeDoB6zKWl6dlmUbAvSUNFPSPEmXF7DfQyJiLUD6enC+SpKuklQnqW7Dhg270XwzM8unlIFDecpyJzjvDBwPnAWMAm6QNKQ1Dh4Rt0VEbUTU9u7duzV2aWZmlPauqtXAgKzl/sCaPHU2RsQWYIuk2cCxwHPN7HedpD4RsVZSH2B9M3UBmDdv3kZJfyuu+WV1ELCx3I0oIfevsrl/la2Y/h2Wr7CUgWMuMFjSIKABuJjkmka2+4CfSeoMdAVGAj9qYb/3A1cAN6ev97XUkIioqCGHpLqIqC13O0rF/ats7l9la43+lexUVUTsAK4BZgBLgLsjYrGkqyVdndZZAkwHFgBzgEkRsQhA0u+BJ4GhklZL+ly665uBj0t6Hvh4umxmZm1EEbmXHazc/Iunsrl/lc39a5lzVbVPt5W7ASXm/lU296+y7XH/POIwM7OieMRhZmZFceAwM7OiOHCUWb5kjtWSyFHSAEmPS1qSJrG8Ni2vlv7tLWmOpKfT/t2UlldF/zIk1Uiql/Rgulw1/ZO0QtLCNNFqXVpWTf3rIeleSc+m/x9+oDX658BRfr/i3ckcqyWR4w7gHyPiSOAk4Itpostq6d9bwGkRcSwwHBgt6SSqp38Z15LcUp9Rbf07NSKGZ91pVE39+wkwPSLeR/Jw9RJao38R4b8y/wEDgUVZy0uBPun7PsDScrexlfp5H8mzN1XXP6A78BeSh1irpn8kGR8eBU4DHkzLqql/K4CDcsqqon/A/sBfSW+Cas3+ecTRPhWUyLGSSBoIjACeoor6l57GmU+S+uaRiKiq/gE/Br4BNGaVVVP/Ang4TbJ6VVpWLf07HNgA/DI91ThJ0j60Qv8cOKzkJO0L/BH4SkS8Vu72tKaI2BkRw0l+mZ8o6f3lblNrkXQ2sD4i5pW7LSX0oYg4jmTeoC9K+ki5G9SKOgPHAT+PiBHAFlrptJsDR/u0Lk3gSKGJHNsrSV1IgsbvImJKWlw1/cuIiE3ATJLrVdXSvw8B50haQTKfzmm
Sfkv19I+IWJO+rgemkswjVC39Ww2sTkfBAPeSBJI97p8DR/uUSeQIBSZybI8kCfgFsCQifpi1qlr61zudxRJJ3YDTgWepkv5FxHUR0T8iBpIkKX0sIi6jSvonaR9J+2XeA58AFlEl/YuIF4FVkoamRR8DnqEV+ucnx8ssTeZ4Ckmq43XAvwLTgLuBQ4GVwAURkW82xHZN0oeB/wYW8vY58utJrnNUQ/+OIZmFsobkR9jdEfEtSQdSBf3LJukU4OsRcXa19E/S4SSjDEhO69wZEd+plv4BSBoOTCLJPr4c+AzpZ5U96J8Dh5mZFcWnqszMrCgOHGZmVhQHDjMzK4oDh5mZFcWBw8zMiuLAYRVP0o8kfSVreYakSVnLP5D0tWa2/5Wk89P3MyW9a1pNSV0k3ZxmFF2UZsU9I123QtJBu9HuXcdtYv0tadbWZyRtTd/Pl3S+pIcyz5C0Jkl9Mllwm1jfVdJsSZ1b+9hWORw4rBr8L/BBAEmdSJ6JOSpr/QeBJ/bwGN8mSQj3/oh4P/BJYL893GezIuKLaTqTM4EXIsngOjwi7o2IM9On1Vvb14Dbm2nTNpKkhxeV4NhWIRw4rBo8QRo4SALGImCzpJ6S9gKOBOolfVPS3HTEcFv6ZHuLJHUHrgS+FBFvAUTEuoi4O0/dr6X7X5QzCrpc0gIlc3f8Js92305HIAX9P5kZ5UgamM61MCk95u8knS7piXR0dGJafx8lc7/MTRPejWli158CpqfbHJWOrOanbR+c1pkG/F0h7bTq5OGmVbyIWCNph6RDSQLIk0A/4APAq8CCiNgm6WcR8S2A9Mv7bOCBAg5xBLCypQSNko4neTJ3JCDgKUmzgG3AP5Mk1NsoqVfOdt8HDgA+E7v3RO4RwAXAVcBc4FLgw8A5JE/qj02P/1hEfDY9xTVH0n9FxJasdgwCXskER+Bq4CcR8TtJXUmekIckMJ+wG+20KuERh1WLzKgjEziezFr+37TOqZKekrSQZH6Jo/LtaA98GJgaEVsi4nVgCnByeqx7I2IjQE56hxuAHhHxhd0MGgB/jYiFEdEILCaZpCdIUr0MTOt8AhivJAX8TGBvkpQT2fqQpOHOeBK4XtI/AYdFxNa0/TuBbZk8T9bxOHBYtchc5zia5Bfxn0lGHB8EnpC0N3ArcH5EHE1yHn/vAve9DDi0gC/Kpk59iWTeh3zmAsfnjkKK9FbW+8as5UbePqsg4FNZ10kOjYjsWf0AtpL1bxIRd5KMWrYCMySdllV3L+DNPWizVTAHDqsWT5Cceno5nSPjZaAHSfB4kre/EDcqmR+kybuZckXEGyRZfn+anrLJ3H10WU7V2cBYSd3TbKvnkiR5fBS4ME2eR06QmA7cDPxniX/BzwC+lLmuI2lEnjrP8fYIJZMEcHlE/JQko+oxafmBwIaI2F7C9lo75sBh1WIhyd1Uf84pezUiNqZ3IN2elk0j+aVfjH8hOY3zjKRF6T6yT+sQEX8hmUN+DkkG4EkRUR8Ri4HvALMkPQ38MGe7e9K23a8kPXspfBvoAixI2//t3Arp9Y4XJB2RFl0ELEpPb70P+HVafirwUInaaRXA2XHNbBdJ5wLHR8S/NFNnCnBdRCxtu5ZZe+K7qsxsl4iYmjmllk96qm6ag0bH5hGHmZkVxdc4zMysKA4cZmZWFAcOMzMrigOHmZkVxYHDzMyK8v8BrOBFbRF7o8YAAAAASUVORK5CYII=\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "plt.title('Learning Curve')\n", + "plt.xlabel('Wall Clock Time (s)')\n", + "plt.ylabel('Validation Accuracy')\n", + "plt.scatter(time_history, 1-np.array(valid_loss_history))\n", + "plt.plot(time_history, 1-np.array(best_valid_loss_history))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 3. Customized Learner" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Some experienced automl users may have a preferred model to tune or may already have a reasonably by-hand-tuned model before launching the automl experiment. They need to select optimal configurations for the customized model mixed with standard built-in learners. \n", + "\n", + "FLAML can easily incorporate customized/new learners (preferably with sklearn API) provided by users in a real-time manner, as demonstrated below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Example of Regularized Greedy Forest\n", + "\n", + "[Regularized Greedy Forest](https://arxiv.org/abs/1109.0887) (RGF) is a machine learning method currently not included in FLAML. The RGF has many tuning parameters, the most critical of which are: `[max_leaf, n_iter, n_tree_search, opt_interval, min_samples_leaf]`. 
To run a customized/new learner, the user needs to provide the following information:\n", + "* an implementation of the customized/new learner\n", + "* a list of hyperparameter names and types\n", + "* rough ranges of hyperparameters (i.e., upper/lower bounds)\n", + "* choose initial value corresponding to low cost for cost-related hyperparameters (e.g., initial value for max_leaf and n_iter should be small)\n", + "\n", + "In this example, the above information for RGF is wrapped in a python class called *MyRegularizedGreedyForest* that exposes the hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' BaseEstimator is the parent module for a customized learner '''\n", + "from flaml.model import BaseEstimator\n", + "from flaml.space import ConfigSearchInfo\n", + "''' import the original implementation of RGF from rgf.sklearn package'''\n", + "from rgf.sklearn import RGFClassifier, RGFRegressor\n", + "\n", + "\n", + "class MyRegularizedGreedyForest(BaseEstimator):\n", + "\n", + " # search space\n", + " params_configsearch_info = {\n", + " 'max_leaf': ConfigSearchInfo(name = 'max_leaf', type = int, lower = 4, init = 4, upper = 10000),\n", + " 'n_iter': ConfigSearchInfo(name = 'n_iter', type = int, lower = 1, init = 1, upper = 32768),\n", + " 'n_tree_search': ConfigSearchInfo(name = 'n_tree_search', type = int, lower = 1, init = 1, upper = 32768),\n", + " 'opt_interval': ConfigSearchInfo(name = 'opt_interval', type = int, lower = 1, init = 100, upper = 10000),\n", + " 'learning_rate': ConfigSearchInfo(name = 'learning_rate', type = float, lower = 0.01, init = 1.0, upper = 20.0),\n", + " 'min_samples_leaf': ConfigSearchInfo(name = 'min_samples_leaf', type = int, lower = 1, init = 20, upper = 20)\n", + " }\n", + " \n", + " def __init__(self, objective_name = 'binary:logistic', n_jobs = 1, max_leaf = 1000, \n", + " n_iter = 1, n_tree_search = 1, opt_interval = 1, learning_rate = 1.0, min_samples_leaf = 1):\n", + "\n", + " '''regression for RGFRegressor; binary:logistic and multiclass for RGFClassifier'''\n", + " self.objective_name = objective_name\n", + "\n", + " if 'regression' in objective_name:\n", + " self.estimator_class = RGFRegressor\n", + " else:\n", + " self.estimator_class = RGFClassifier\n", + "\n", + " # round integer hyperparameters\n", + " self.params = {\n", + " 'max_leaf': int(round(max_leaf)),\n", + " 'n_iter': int(round(n_iter)),\n", + " 'n_tree_search': int(round(n_tree_search)),\n", + " 'opt_interval': int(round(opt_interval)),\n", + " 'learning_rate': learning_rate,\n", + " 'min_samples_leaf':int(round(min_samples_leaf))\n", + " } \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Add Customized Learner and Run FLAML AutoML\n", + "\n", + "After adding RGF into the list of learners, we run automl by tuning hyperpameters of RGF as well as the default learners. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' add a new learner RGF'''\n", + "automl = AutoML()\n", + "automl.add_learner(learner_name = 'RGF', learner_class = MyRegularizedGreedyForest)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[flaml.automl: 11-22 10:32:04] {649} INFO - List of ML learners in AutoML Run: ['RGF', 'lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 11-22 10:32:04] {654} INFO - Evaluation method: holdout\n", + "[flaml.automl: 11-22 10:32:04] {672} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 11-22 10:32:05] {326} INFO - Using StratifiedKFold\n", + "[flaml.automl: 11-22 10:32:05] {717} INFO - iteration 0 current learner RGF\n", + "[flaml.automl: 11-22 10:32:10] {782} INFO - at 5.9s,\tbest RGF's error=0.3764,\tbest RGF's error=0.3764\n", + "[flaml.automl: 11-22 10:32:10] {717} INFO - iteration 1 current learner RGF\n", + "[flaml.automl: 11-22 10:32:17] {782} INFO - at 13.5s,\tbest RGF's error=0.3764,\tbest RGF's error=0.3764\n", + "[flaml.automl: 11-22 10:32:17] {717} INFO - iteration 2 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:17] {782} INFO - at 13.7s,\tbest lgbm's error=0.3790,\tbest RGF's error=0.3764\n", + "[flaml.automl: 11-22 10:32:17] {717} INFO - iteration 3 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:18] {782} INFO - at 14.1s,\tbest lgbm's error=0.3739,\tbest lgbm's error=0.3739\n", + "[flaml.automl: 11-22 10:32:18] {717} INFO - iteration 4 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:18] {782} INFO - at 14.5s,\tbest lgbm's error=0.3738,\tbest lgbm's error=0.3738\n", + "[flaml.automl: 11-22 10:32:18] {717} INFO - iteration 5 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:18] {782} INFO - at 14.7s,\tbest lgbm's error=0.3657,\tbest lgbm's error=0.3657\n", + "[flaml.automl: 11-22 10:32:18] {717} INFO - iteration 6 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:19] {782} INFO - at 15.3s,\tbest lgbm's error=0.3657,\tbest lgbm's error=0.3657\n", + "[flaml.automl: 11-22 10:32:19] {717} INFO - iteration 7 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:20] {782} INFO - at 16.4s,\tbest lgbm's error=0.3650,\tbest lgbm's error=0.3650\n", + "[flaml.automl: 11-22 10:32:20] {717} INFO - iteration 8 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:23] {782} INFO - at 19.2s,\tbest lgbm's error=0.3562,\tbest lgbm's error=0.3562\n", + "[flaml.automl: 11-22 10:32:23] {717} INFO - iteration 9 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:27] {782} INFO - at 23.4s,\tbest lgbm's error=0.3547,\tbest lgbm's error=0.3547\n", + "[flaml.automl: 11-22 10:32:27] {717} INFO - iteration 10 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:34] {782} INFO - at 30.0s,\tbest lgbm's error=0.3477,\tbest lgbm's error=0.3477\n", + "[flaml.automl: 11-22 10:32:34] {717} INFO - iteration 11 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:42] {782} INFO - at 38.1s,\tbest lgbm's error=0.3477,\tbest lgbm's error=0.3477\n", + "[flaml.automl: 11-22 10:32:42] {717} INFO - iteration 12 current learner lgbm\n", + "[flaml.automl: 11-22 10:32:50] {782} INFO - at 46.8s,\tbest lgbm's error=0.3435,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:32:50] {717} INFO - iteration 13 current learner xgboost\n", + 
"[flaml.automl: 11-22 10:32:51] {782} INFO - at 47.0s,\tbest xgboost's error=0.3740,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:32:51] {717} INFO - iteration 14 current learner xgboost\n", + "[flaml.automl: 11-22 10:32:51] {782} INFO - at 47.4s,\tbest xgboost's error=0.3685,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:32:51] {717} INFO - iteration 15 current learner xgboost\n", + "[flaml.automl: 11-22 10:32:51] {782} INFO - at 47.8s,\tbest xgboost's error=0.3673,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:32:52] {717} INFO - iteration 16 current learner xgboost\n", + "[flaml.automl: 11-22 10:32:52] {782} INFO - at 48.3s,\tbest xgboost's error=0.3662,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:32:52] {717} INFO - iteration 17 current learner RGF\n", + "[flaml.automl: 11-22 10:33:03] {782} INFO - at 59.5s,\tbest RGF's error=0.3764,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:33:03] {717} INFO - iteration 18 current learner lgbm\n", + "[flaml.automl: 11-22 10:33:05] {782} INFO - at 61.1s,\tbest lgbm's error=0.3435,\tbest lgbm's error=0.3435\n", + "[flaml.automl: 11-22 10:33:05] {803} INFO - LGBMClassifier(colsample_bytree=0.7, learning_rate=0.6244209542375836,\n", + " max_bin=1023, min_child_weight=8.99139787892973, n_estimators=8,\n", + " num_leaves=60, objective='binary',\n", + " reg_alpha=1.1018060088007014e-10,\n", + " reg_lambda=0.33075796457184126)\n", + "[flaml.automl: 11-22 10:33:05] {691} INFO - fit succeeded\n" + ] + } + ], + "source": [ + "settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"metric\": 'accuracy', \n", + " \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\n", + " \"task\": 'classification', # task type \n", + " \"sample\": True, # whether to subsample training data\n", + " \"log_file_name\": 'airlines_experiment.log', # cache directory of flaml log files \n", + " \"log_training_metric\": True, # whether to log training metric\n", + "}\n", + "\n", + "'''The main flaml automl API'''\n", + "automl.fit(X_train = X_train, y_train = y_train, **settings)" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.7 64-bit ('flaml': conda)", + "metadata": { + "interpreter": { + "hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd" + } + } + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7-final" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/settings.json b/settings.json new file mode 100644 index 000000000..b9d92de39 --- /dev/null +++ b/settings.json @@ -0,0 +1,4 @@ +{ + "keep_max_logfiles": 30, + "logging_level": "info" +} \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..9b7609260 --- /dev/null +++ b/setup.py @@ -0,0 +1,56 @@ +import setuptools +import os + +here = os.path.abspath(os.path.dirname(__file__)) + +with open("README.md", "r") as fh: + long_description = fh.read() + + +# Get the code version +version = {} +with open(os.path.join(here, "flaml/version.py")) as fp: + exec(fp.read(), version) +__version__ = version["__version__"] + +install_requires = [ + "NumPy>=1.16.2", + "lightgbm>=2.3.1", + "xgboost>=0.90", + "scipy>=1.4.1", + "catboost>=0.23", + "scikit-learn>=0.23", +], + + 
+setuptools.setup( + name="FLAML", + version=__version__, + author="Microsoft Corporation", + author_email="hpo@microsoft.com", + description="A fast and lightweight autoML system", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/microsoft/FLAML", + packages=["flaml"], + install_requires=install_requires, + extras_require={ + "notebook": [ + "openml==0.10.2", + "jupyter", + "matplotlib==3.2.0", + "rgf-python", + ], + "test": [ + "flake8>=3.8.4", + "pytest>=6.1.1", + "coverage>=5.3", + ], + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.6", +) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_automl.py b/test/test_automl.py new file mode 100644 index 000000000..1c0269abf --- /dev/null +++ b/test/test_automl.py @@ -0,0 +1,235 @@ +import unittest + +import numpy as np +import scipy.sparse +from sklearn.datasets import load_boston, load_iris + +from flaml import AutoML, get_output_from_log + + +def custom_metric(X_test, y_test, estimator, labels, X_train, y_train): + from sklearn.metrics import log_loss + y_pred = estimator.predict_proba(X_test) + test_loss = log_loss(y_test, y_pred, labels=labels) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels) + alpha = 0.5 + return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss] + + +class TestAutoML(unittest.TestCase): + + def test_dataframe(self): + self.test_classification(True) + + def test_custom_metric(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 10, + 'eval_method': 'holdout', + "metric": custom_metric, + "task": 'classification', + "log_file_name": "test/iris_custom.log", + "log_training_metric": True, + 'log_type': 'all', + "model_history": True + } + X_train, y_train = load_iris(return_X_y=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict_proba(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + automl_experiment = AutoML() + estimator = automl_experiment.get_estimator_from_log( + automl_settings["log_file_name"], record_id=0, + objective='multi') + print(estimator) + time_history, best_valid_loss_history, valid_loss_history, \ + config_history, train_loss_history = get_output_from_log( + filename=automl_settings['log_file_name'], time_budget=6) + print(train_loss_history) + + def test_classification(self, as_frame=False): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 4, + "metric": 'accuracy', + "task": 'classification', + "log_file_name": "test/iris.log", + "log_training_metric": True, + "model_history": True + } + X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict_proba(X_train)[:5]) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + del automl_settings["metric"] + del 
automl_settings["model_history"] + del automl_settings["log_training_metric"] + automl_experiment = AutoML() + duration = automl_experiment.retrain_from_log( + log_file_name=automl_settings["log_file_name"], + X_train=X_train, y_train=y_train, + train_full=True, record_id=0) + print(duration) + print(automl_experiment.model) + print(automl_experiment.predict_proba(X_train)[:5]) + + def test_regression(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'mse', + "task": 'regression', + "log_file_name": "test/boston.log", + "log_training_metric": True, + "model_history": True + } + X_train, y_train = load_boston(return_X_y=True) + n = len(y_train) + automl_experiment.fit(X_train=X_train[:n >> 1], y_train=y_train[:n >> 1], + X_val=X_train[n >> 1:], y_val=y_train[n >> 1:], + **automl_settings) + assert automl_experiment.y_val.shape[0] == n - (n >> 1) + assert automl_experiment.eval_method == 'holdout' + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + print(get_output_from_log(automl_settings["log_file_name"], 1)) + + def test_sparse_matrix_classification(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'auto', + "task": 'classification', + "log_file_name": "test/sparse_classification.log", + "split_type": "uniform", + "model_history": True + } + X_train = scipy.sparse.random(1554, 21, dtype=int) + y_train = np.random.randint(3, size=1554) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict_proba(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_sparse_matrix_regression(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'mae', + "task": 'regression', + "log_file_name": "test/sparse_regression.log", + "model_history": True + } + X_train = scipy.sparse.random(300, 900, density=0.0001) + y_train = np.random.uniform(size=300) + X_val = scipy.sparse.random(100, 900, density=0.0001) + y_val = np.random.uniform(size=100) + automl_experiment.fit(X_train=X_train, y_train=y_train, + X_val=X_val, y_val=y_val, + **automl_settings) + assert automl_experiment.X_val.shape == X_val.shape + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + print(automl_experiment.best_config) + print(automl_experiment.best_loss) + print(automl_experiment.best_config_train_time) + + def test_sparse_matrix_xgboost(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'ap', + "task": 'classification', + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["xgboost"], + "log_type": "all", + } + X_train = scipy.sparse.eye(900000) + y_train = np.random.randint(2, size=900000) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + 
print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_sparse_matrix_lr(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'f1', + "task": 'classification', + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["lrl1", "lrl2"], + "log_type": "all", + } + X_train = scipy.sparse.random(3000, 900, density=0.1) + y_train = np.random.randint(2, size=3000) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_sparse_matrix_regression_cv(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + 'eval_method': 'cv', + "task": 'regression', + "log_file_name": "test/sparse_regression.log", + "model_history": True + } + X_train = scipy.sparse.random(100, 100) + y_train = np.random.uniform(size=100) + automl_experiment.fit(X_train=X_train, y_train=y_train, + **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_split.py b/test/test_split.py new file mode 100644 index 000000000..77903f259 --- /dev/null +++ b/test/test_split.py @@ -0,0 +1,45 @@ +import unittest + +from sklearn.datasets import fetch_openml +from flaml.automl import AutoML +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score + + +dataset = "Airlines" + + +def _test(split_type): + automl = AutoML() + + automl_settings = { + "time_budget": 2, + # "metric": 'accuracy', + "task": 'classification', + "log_file_name": "test/{}.log".format(dataset), + "model_history": True, + "log_training_metric": True, + "split_type": split_type, + } + + X, y = fetch_openml(name=dataset, return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, + random_state=42) + automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + + pred = automl.predict(X_test) + acc = accuracy_score(y_test, pred) + + print(acc) + + +def test_stratified(): + _test(split_type="stratified") + + +def test_uniform(): + _test(split_type="uniform") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_version.py b/test/test_version.py new file mode 100644 index 000000000..7fd8a9099 --- /dev/null +++ b/test/test_version.py @@ -0,0 +1,14 @@ +import unittest +import flaml + + +class TestVersion(unittest.TestCase): + + + def test_version(self): + self.assertTrue(hasattr(flaml, '__version__')) + self.assertTrue(len(flaml.__version__) > 0) + + +if __name__ == "__main__": + unittest.main()
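
For reference, a minimal usage sketch of the training-log helpers added in flaml/training_log.py above; the file name "example_training.log" and all numeric values are illustrative only.

```python
from flaml.training_log import training_log_writer, training_log_reader

# Write two trial records and a checkpoint marking the current best.
with training_log_writer("example_training.log") as w:
    w.append(it_counter=0, train_loss=0.40, trial_time=1.2, total_search_time=1.2,
             validation_loss=0.38, config={"n_estimators": 4},
             best_validation_loss=0.38, best_config={"n_estimators": 4},
             learner="lgbm", sample_size=10000)
    w.append(it_counter=1, train_loss=0.35, trial_time=1.5, total_search_time=2.7,
             validation_loss=0.33, config={"n_estimators": 8},
             best_validation_loss=0.33, best_config={"n_estimators": 8},
             learner="lgbm", sample_size=10000)
    w.checkpoint()

# Replay the log; TrainingLogReader.records() skips the single-key checkpoint lines.
with training_log_reader("example_training.log") as r:
    for record in r.records():
        print(record.record_id, record.learner, record.validation_loss)
```

The checkpoint line stores only the id of the best record seen so far, which is why the reader treats any single-key line as a marker rather than a trial record.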