Chi Wang (MSR) 2020-12-04 09:40:27 -08:00
Commit 492990655d
25 changed files: 4481 additions and 0 deletions

5
.coveragerc Normal file

@@ -0,0 +1,5 @@
[run]
branch = True
source = flaml
omit =
*tests*

5
.flake8 Normal file

@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, C901
max-line-length = 127
max-complexity = 10
select = B,C,E,F,W,T4,B9

59
.github/workflows/python-package.yml vendored Normal file

@@ -0,0 +1,59 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Python package
on:
push:
branches: ['*']
pull_request:
branches: ['*']
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
python-version: [3.6, 3.7, 3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: If mac, install libomp to facilitate lgbm install
if: matrix.os == 'macOS-latest'
run: |
brew install libomp
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++
export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
- name: Install packages and dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest coverage
pip install -e .
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest test
- name: Coverage
run: |
coverage run -a -m pytest test
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
file: ./coverage.xml
flags: unittests

150
.gitignore vendored Normal file

@@ -0,0 +1,150 @@
# Project
/.vs
.vscode
# Log files
*.log
# Python virtualenv
.venv
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
/catboost_info
notebook/*.pkl

9
CODE_OF_CONDUCT.md Normal file

@@ -0,0 +1,9 @@
# Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
Resources:
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

123
README.md Normal file

@@ -0,0 +1,123 @@
# FLAML - Fast and Lightweight AutoML
FLAML is a Python library designed to automatically produce accurate machine
learning models with low computational cost. It frees users from selecting
learners and hyperparameters for each learner. It is fast and cheap.
The simple and lightweight design makes it easy to extend, such as
adding customized learners or metrics. FLAML is powered by a new, cost-effective
hyperparameter optimization and learner selection method invented by
Microsoft Research.
FLAML is easy to use:
1. With three lines of code, you can start using this economical and fast
AutoML engine as a scikit-learn style estimator.
```python
from flaml import AutoML
automl = AutoML()
automl.fit(X_train, y_train, task="classification")
```
2. You can restrict the learners and use FLAML as a fast hyperparameter tuning
tool for XGBoost, LightGBM, Random Forest, etc., or a customized learner.
```python
automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"])
```
3. You can embed FLAML in self-tuning software for just-in-time tuning with
low latency & resource consumption.
```python
automl.fit(X_train, y_train, task="regression", time_budget=60)
```
## Installation
FLAML requires **Python version >= 3.6**. It can be installed from pip:
```bash
pip install flaml
```
To run the [`notebook example`](https://github.com/microsoft/FLAML/tree/main/notebook),
install flaml with the [notebook] option:
```bash
pip install flaml[notebook]
```
## Examples
A basic classification example.
```python
from flaml import AutoML
from sklearn.datasets import load_iris
# Initialize the FLAML learner.
automl = AutoML()
# Provide configurations.
automl_settings = {
"time_budget": 10, # in seconds
"metric": 'accuracy',
"task": 'classification',
"log_file_name": "test/iris.log",
}
X_train, y_train = load_iris(return_X_y=True)
# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
# Predict
print(automl.predict_proba(X_train))
# Export the best model.
print(automl.model)
```
A basic regression example.
```python
from flaml import AutoML
from sklearn.datasets import load_boston
# Initialize the FLAML learner.
automl = AutoML()
# Provide configurations.
automl_settings = {
"time_budget": 10, # in seconds
"metric": 'r2',
"task": 'regression',
"log_file_name": "test/boston.log",
}
X_train, y_train = load_boston(return_X_y=True)
# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
# Predict
print(automl.predict(X_train))
# Export the best model.
print(automl.model)
```
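FLAML can also take a customized metric: `AutoML.fit` accepts a function in place
of a metric name. The sketch below is illustrative only; it reuses `X_train` and
`y_train` from the regression example above and assumes the estimator passed to the
function exposes `predict`, like the scikit-learn style estimators shown earlier.
The function returns the value to minimize first, followed by a value that is only logged.
```python
from flaml import AutoML
from sklearn.metrics import mean_squared_error

def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    # The first return value is minimized by the search; the second is only logged.
    val_loss = mean_squared_error(y_test, estimator.predict(X_test))
    train_loss = mean_squared_error(y_train, estimator.predict(X_train))
    return val_loss, train_loss

automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, task='regression',
           time_budget=10, metric=custom_metric)
```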
More examples: see the [notebook](https://github.com/microsoft/FLAML/tree/main/notebook/flaml_demo.ipynb)
## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>.
When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
## Authors
* Chi Wang
* Qingyun Wu
* Erkang Zhu
Contributors: Markus Weimer, Silu Huang, Haozhe Zhang, Alex Deng.
## License
[MIT License](LICENSE)

41
SECURITY.md Normal file

@@ -0,0 +1,41 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
## Security
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
## Reporting Security Issues
**Please do not report security vulnerabilities through public GitHub issues.**
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
## Preferred Languages
We prefer all communications to be in English.
## Policy
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
<!-- END MICROSOFT SECURITY.MD BLOCK -->

70
flaml/__init__.py Normal file

@@ -0,0 +1,70 @@
from flaml.automl import AutoML
from flaml.model import BaseEstimator
from flaml.data import get_output_from_log
from flaml.version import __version__
import logging
from os.path import join, exists
import datetime as dt
from os import listdir, remove, mkdir
import pathlib
import json
root = pathlib.Path(__file__).parent.parent.absolute()
jsonfilepath = join(root, "settings.json")
with open(jsonfilepath) as f:
settings = json.load(f)
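# Sketch of the expected settings.json shape, inferred from the keys read below
# (values shown are illustrative, not taken from this commit):
#   {"logging_level": "info", "keep_max_logfiles": 20}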
logging_level = settings["logging_level"]
if logging_level == "info":
logging_level = logging.INFO
elif logging_level == "debug":
logging_level = logging.DEBUG
elif logging_level == "error":
logging_level = logging.ERROR
elif logging_level == "warning":
logging_level = logging.WARNING
elif logging_level == "critical":
logging_level = logging.CRITICAL
else:
logging_level = logging.NOTSET
keep_max_logfiles = settings["keep_max_logfiles"]
log_dir = join(root, "logs")
if not exists(log_dir):
mkdir(log_dir)
del_logs = sorted([int(x.split("_")[0]) for x in listdir(log_dir) if ".log" in
x], reverse=True)[keep_max_logfiles:]
for l in del_logs:
try:
remove(join(log_dir, str(l) + "_flaml.log"))
except Exception as e:
continue
b = dt.datetime.now()
a = dt.datetime(2020, 4, 1, 0, 0, 0)
secs = int((b-a).total_seconds())
name = str(secs)
logger = logging.getLogger(__name__)
logger.setLevel(logging_level)
fh = logging.FileHandler(join(log_dir, name + "_" + __name__ + ".log"))
fh.setLevel(logging_level)
ch = logging.StreamHandler()
ch.setLevel(logging_level)
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
'%m-%d %H:%M:%S')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)
logger.propagate = True

897
flaml/automl.py Normal file

@@ -0,0 +1,897 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the
* project root for license information.
'''
import time
import warnings
from functools import partial
import ast
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold
from sklearn.utils import shuffle
import pandas as pd
from .ml import compute_estimator, train_estimator, get_classification_objective
from .config import MIN_SAMPLE_TRAIN, MEM_THRES, ETI_INI, \
SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS
from .data import concat
from .search import ParamSearch
from .training_log import training_log_reader, training_log_writer
import logging
logger = logging.getLogger(__name__)
class AutoML:
'''The AutoML class
Attributes:
model: An object with predict() and predict_proba() method (for
classification), storing the best trained model.
model_history: A dictionary of iter->model, storing the models when
the best model is updated each time
config_history: A dictionary of iter->(estimator, config, time),
storing the best estimator, config, and the time when the best
model is updated each time
classes_: A list of n_classes elements for class labels
best_iteration: An integer of the iteration number where the best
config is found
best_estimator: A string indicating the best estimator found.
best_config: A dictionary of the best configuration.
best_config_train_time: A float of the seconds taken by training the
best config
Typical usage example:
automl = AutoML()
automl_settings = {
"time_budget": 60,
"metric": 'accuracy',
"task": 'classification',
"log_file_name": 'test/mylog.log',
}
automl.fit(X_train = X_train, y_train = y_train,
**automl_settings)
'''
def __init__(self):
self._eti_ini = ETI_INI
self._custom_learners = {}
self._config_space_info = {}
self._custom_size_estimate = {}
self._track_iter = 0
@property
def model_history(self):
return self._model_history
@property
def config_history(self):
return self._config_history
@property
def model(self):
if self._trained_estimator:
return self._trained_estimator.model
else:
return None
@property
def best_estimator(self):
return self._best_estimator
@property
def best_iteration(self):
return self._best_iteration
@property
def best_config(self):
return self._selected.best_config[0]
@property
def best_loss(self):
return self._best_loss
@property
def best_config_train_time(self):
return self.best_train_time
@property
def classes_(self):
if self.label_transformer:
return self.label_transformer.classes_.tolist()
if self._trained_estimator:
return self._trained_estimator.model.classes_.tolist()
return None
def predict(self, X_test):
'''Predict label from features.
Args:
X_test: A numpy array of featurized instances, shape n*m.
Returns:
A numpy array of shape n*1 -- each element is a predicted class
label for an instance.
'''
X_test = self.preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1:
y_pred = y_pred.flatten()
if self.label_transformer:
return self.label_transformer.inverse_transform(pd.Series(
y_pred))
else:
return y_pred
def predict_proba(self, X_test):
'''Predict the probability of each class from features, only works for
classification problems.
Args:
X_test: A numpy array of featurized instances, shape n*m.
Returns:
A numpy array of shape n*c. c is the # classes. Each element at
(i,j) is the probability for instance i to be in class j.
'''
X_test = self.preprocess(X_test)
proba = self._trained_estimator.predict_proba(X_test)
return proba
def preprocess(self, X):
if scipy.sparse.issparse(X):
X = X.tocsr()
if self.transformer:
X = self.transformer.transform(X)
return X
def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
if X_train_all is not None and y_train_all is not None:
if not (isinstance(X_train_all, np.ndarray)
or scipy.sparse.issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)
):
raise ValueError(
"X_train_all must be a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
if not (isinstance(y_train_all, np.ndarray)
or isinstance(y_train_all, pd.Series)):
raise ValueError(
"y_train_all must be a numpy array or a pandas series.")
if X_train_all.size == 0 or y_train_all.size == 0:
raise ValueError("Input data must not be empty.")
if isinstance(y_train_all, np.ndarray):
y_train_all = y_train_all.flatten()
if X_train_all.shape[0] != y_train_all.shape[0]:
raise ValueError(
"# rows in X_train must match length of y_train.")
self.df = isinstance(X_train_all, pd.DataFrame)
self.nrow, self.ndim = X_train_all.shape
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
if not isinstance(dataframe, pd.DataFrame):
raise ValueError("dataframe must be a pandas DataFrame")
if label not in dataframe.columns:
raise ValueError("label must be a column name in dataframe")
self.df = True
self.dataframe, self.label = dataframe, label
X = dataframe.drop(columns=label)
self.nrow, self.ndim = X.shape
y = dataframe[label]
else:
raise ValueError(
"either X_train_all+y_train_all or dataframe+label need to be provided.")
if scipy.sparse.issparse(X_train_all):
self.transformer = self.label_transformer = False
self.X_train_all, self.y_train_all = X, y
else:
from .data import DataTransformer
self.transformer = DataTransformer()
self.X_train_all, self.y_train_all = self.transformer.fit_transform(
X, y, self.task)
self.label_transformer = self.transformer.label_transformer
if X_val is not None and y_val is not None:
if not (isinstance(X_val, np.ndarray)
or scipy.sparse.issparse(X_val)
or isinstance(X_val, pd.DataFrame)
):
raise ValueError(
"X_val must be None, a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
if not (isinstance(y_val, np.ndarray)
or isinstance(y_val, pd.Series)):
raise ValueError(
"y_val must be None, a numpy array or a pandas series.")
if X_val.size == 0 or y_val.size == 0:
raise ValueError(
"Validation data are expected to be nonempty. "
"Use None for X_val and y_val if no validation data.")
if isinstance(y_val, np.ndarray):
y_val = y_val.flatten()
if X_val.shape[0] != y_val.shape[0]:
raise ValueError(
"# rows in X_val must match length of y_val.")
if self.transformer:
self.X_val = self.transformer.transform(X_val)
else:
self.X_val = X_val
if self.label_transformer:
self.y_val = self.label_transformer.transform(y_val)
else:
self.y_val = y_val
else:
self.X_val = self.y_val = None
def _prepare_data(self,
eval_method,
split_ratio,
n_splits):
X_val, y_val = self.X_val, self.y_val
if scipy.sparse.issparse(X_val):
X_val = X_val.tocsr()
X_train_all, y_train_all = self.X_train_all, self.y_train_all
if scipy.sparse.issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if self.task != 'regression':
# logger.info(f"label {pd.unique(y_train_all)}")
label_set, counts = np.unique(y_train_all, return_counts=True)
# augment rare classes
rare_threshold = 20
rare = counts < rare_threshold
rare_label, rare_counts = label_set[rare], counts[rare]
for i, label in enumerate(rare_label):
count = rare_count = rare_counts[i]
rare_index = y_train_all == label
n = len(y_train_all)
while count < rare_threshold:
if self.df:
X_train_all = concat(X_train_all,
X_train_all.iloc[:n].loc[rare_index])
else:
X_train_all = concat(X_train_all,
X_train_all[:n][rare_index, :])
if isinstance(y_train_all, pd.Series):
y_train_all = concat(y_train_all,
y_train_all.iloc[:n].loc[rare_index])
else:
y_train_all = np.concatenate([y_train_all,
y_train_all[:n][rare_index]])
count += rare_count
logger.debug(
f"class {label} augmented from {rare_count} to {count}")
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=202020)
if self.df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
X_train, y_train = X_train_all, y_train_all
if X_val is None:
if self.task != 'regression' and eval_method == 'holdout':
label_set, first = np.unique(y_train_all, return_index=True)
rest = []
last = 0
first.sort()
for i in range(len(first)):
rest.extend(range(last, first[i]))
last = first[i] + 1
rest.extend(range(last, len(y_train_all)))
X_first = X_train_all.iloc[first] if self.df else X_train_all[
first]
X_rest = X_train_all.iloc[rest] if self.df else X_train_all[rest]
y_rest = y_train_all.iloc[rest] if isinstance(
y_train_all, pd.Series) else y_train_all[rest]
stratify = y_rest if self.split_type == 'stratified' else None
X_train, X_val, y_train, y_val = train_test_split(
X_rest,
y_rest,
test_size=split_ratio,
stratify=stratify,
random_state=1)
X_train = concat(X_first, X_train)
y_train = concat(label_set,
y_train) if self.df else np.concatenate([label_set, y_train])
X_val = concat(X_first, X_val)
y_val = concat(label_set,
y_val) if self.df else np.concatenate([label_set, y_val])
_, y_train_counts_elements = np.unique(y_train,
return_counts=True)
_, y_val_counts_elements = np.unique(y_val,
return_counts=True)
logger.debug(
f"""{self.split_type} split for y_train \
{y_train_counts_elements}, \
y_val {y_val_counts_elements}""")
elif eval_method == 'holdout' and self.task == 'regression':
X_train, X_val, y_train, y_val = train_test_split(
X_train_all,
y_train_all,
test_size=split_ratio,
random_state=1)
self.data_size = X_train.shape[0]
self.X_train, self.y_train, self.X_val, self.y_val = (
X_train, y_train, X_val, y_val)
if self.split_type == "stratified":
logger.info("Using StratifiedKFold")
self.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1,
random_state=202020)
else:
logger.info("Using RepeatedKFold")
self.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1,
random_state=202020)
def prepare_sample_train_data(self, sample_size):
full_size = len(self.y_train)
if sample_size <= full_size:
if isinstance(self.X_train, pd.DataFrame):
sampled_X_train = self.X_train.iloc[:sample_size]
else:
sampled_X_train = self.X_train[:sample_size]
sampled_y_train = self.y_train[:sample_size]
else:
sampled_X_train = concat(self.X_train, self.X_val)
sampled_y_train = np.concatenate([self.y_train, self.y_val])
return sampled_X_train, sampled_y_train
def _compute_with_config_base(self,
metric,
compute_train_loss,
estimator,
config,
sample_size):
sampled_X_train, sampled_y_train = self.prepare_sample_train_data(
sample_size)
time_left = self.time_budget - self.time_from_start
budget = time_left if sample_size == self.data_size else \
time_left / 2 * sample_size / self.data_size
return compute_estimator(sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
budget,
self.kf,
config,
self.task,
estimator,
self.eval_method,
metric,
self._best_loss,
self.n_jobs,
self._custom_learners.get(estimator),
compute_train_loss)
def _train_with_config(self, estimator, config, sample_size):
sampled_X_train, sampled_y_train = self.prepare_sample_train_data(
sample_size)
budget = None if self.time_budget is None else (self.time_budget
- self.time_from_start)
model, train_time = train_estimator(
sampled_X_train,
sampled_y_train,
config,
self.task,
estimator,
self.n_jobs,
self._custom_learners.get(estimator),
budget)
return model, train_time
def add_learner(self,
learner_name,
learner_class,
size_estimate=lambda config: 'unknown',
cost_relative2lgbm=1):
'''Add a customized learner
Args:
learner_name: A string of the learner's name
learner_class: A subclass of BaseEstimator
size_estimate: A function from a config to its memory size in float
cost_relative2lgbm: A float number for the training cost ratio with
respect to lightgbm (when both use the initial config)
'''
self._custom_learners[learner_name] = learner_class
self._eti_ini[learner_name] = cost_relative2lgbm
self._config_space_info[learner_name] = \
learner_class.params_configsearch_info
self._custom_size_estimate[learner_name] = size_estimate
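# Illustrative usage sketch; `MyLearner` is a hypothetical BaseEstimator subclass
# that also defines the `params_configsearch_info` attribute read above:
#   automl = AutoML()
#   automl.add_learner(learner_name='my_learner', learner_class=MyLearner,
#                      cost_relative2lgbm=2)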
def get_estimator_from_log(self, log_file_name, record_id, objective):
'''Get the estimator from log file
Args:
log_file_name: A string of the log file name
record_id: An integer of the record ID in the file,
0 corresponds to the first trial
objective: A string of the objective name,
'binary', 'multi', or 'regression'
Returns:
An estimator object for the given configuration
'''
with training_log_reader(log_file_name) as reader:
record = reader.get_record(record_id)
estimator = record.learner
config = record.config
estimator, _ = train_estimator(
None, None, config, objective, estimator,
estimator_class=self._custom_learners.get(estimator)
)
return estimator
def retrain_from_log(self,
log_file_name,
X_train=None,
y_train=None,
dataframe=None,
label=None,
time_budget=0,
task='classification',
eval_method='auto',
split_ratio=SPLIT_RATIO,
n_splits=N_SPLITS,
split_type="stratified",
n_jobs=1,
train_best=True,
train_full=False,
record_id=-1):
'''Retrain from log file
Args:
time_budget: A float number of the time budget in seconds
log_file_name: A string of the log file name
X_train: A numpy array of training data in shape n*m
y_train: A numpy array of labels in shape n*1
task: A string of the task type, e.g.,
'classification', 'regression'
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout']
split_ratio: A float of the validation data percentage for holdout
n_splits: An integer of the number of folds for cross-validation
n_jobs: An integer of the number of threads for training
train_best: A boolean of whether to train the best config in the
time budget; if false, train the last config in the budget
train_full: A boolean of whether to train on the full data. If true,
eval_method and sample_size in the log file will be ignored
record_id: the ID of the training log record from which the model will
be retrained. By default `record_id = -1` which means this will be
ignored. `record_id = 0` corresponds to the first trial, and
when `record_id >= 0`, `time_budget` will be ignored.
'''
self.task = task
self._validate_data(X_train, y_train, dataframe, label)
logger.info('log file name {}'.format(log_file_name))
best_config = None
best_val_loss = float('+inf')
best_estimator = None
sample_size = None
time_used = 0.0
training_duration = 0
best = None
with training_log_reader(log_file_name) as reader:
if record_id >= 0:
best = reader.get_record(record_id)
else:
for record in reader.records():
time_used = record.total_search_time
if time_used > time_budget:
break
training_duration = time_used
val_loss = record.validation_loss
if val_loss <= best_val_loss or not train_best:
if val_loss == best_val_loss and train_best:
size = record.sample_size
if size > sample_size:
best = record
best_val_loss = val_loss
sample_size = size
else:
best = record
size = record.sample_size
best_val_loss = val_loss
sample_size = size
if not training_duration:
from .model import BaseEstimator
self._trained_estimator = BaseEstimator()
self._trained_estimator.model = None
return training_duration
if not best: return
best_estimator = best.learner
best_config = best.config
sample_size = len(self.y_train_all) if train_full \
else best.sample_size
logger.info(
'estimator = {}, config = {}, #training instances = {}'.format(
best_estimator, best_config, sample_size))
# Partially copied from fit() function
# Initialize some attributes required for retrain_from_log
np.random.seed(0)
self.task = task
if self.task == 'classification':
self.task = get_classification_objective(
len(np.unique(self.y_train_all)))
assert split_type in ["stratified", "uniform"]
self.split_type = split_type
else:
self.split_type = "uniform"
if record_id >= 0:
eval_method = 'cv'
elif eval_method == 'auto':
eval_method = self._decide_eval_method(time_budget)
self.modelcount = 0
self._prepare_data(eval_method, split_ratio, n_splits)
self.time_budget = None
self.n_jobs = n_jobs
self._trained_estimator = self._train_with_config(
best_estimator, best_config, sample_size)[0]
return training_duration
def _decide_eval_method(self, time_budget):
if self.X_val is not None:
return 'holdout'
nrow, dim = self.nrow, self.ndim
if nrow * dim / 0.9 < SMALL_LARGE_THRES * (
time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD:
# time allows or sampling can be used and cv is necessary
return 'cv'
else:
return 'holdout'
def fit(self,
X_train=None,
y_train=None,
dataframe=None,
label=None,
metric='auto',
task='classification',
n_jobs=-1,
log_file_name='default.log',
estimator_list='auto',
time_budget=60,
max_iter=1000000,
sample=True,
ensemble=False,
eval_method='auto',
log_type='better',
model_history=False,
split_ratio=SPLIT_RATIO,
n_splits=N_SPLITS,
log_training_metric=False,
mem_thres=MEM_THRES,
X_val=None,
y_val=None,
retrain_full=True,
split_type="stratified",
learner_selector='sample',
):
'''Find a model for a given task
Args:
X_train: A numpy array or a pandas dataframe of training data in
shape n*m
y_train: A numpy array or a pandas series of labels in shape n*1
dataframe: A dataframe of training data including label column
label: A str of the label column name
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
metric: A string of the metric name or a function,
e.g., 'accuracy','roc_auc','f1','log_loss','mae','mse','r2'
if passing a customized metric function, the function needs to
have the following signature
def metric(X_test, y_test, estimator, labels, X_train, y_train):
return metric_to_minimize, metrics_to_log
which returns a float number as the minimization objective,
and a tuple of floats as the metrics to log
task: A string of the task type, e.g.,
'classification', 'regression'
n_jobs: An integer of the number of threads for training
log_file_name: A string of the log file name
estimator_list: A list of strings for estimator names, or 'auto'
e.g., ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
time_budget: A float number of the time budget in seconds
max_iter: An integer of the maximal number of iterations
sample: A boolean of whether to sample the training data during
search
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout']
split_ratio: A float of the validation data percentage for holdout
n_splits: An integer of the number of folds for cross-validation
log_type: A string of the log type, one of ['better', 'all', 'new']
'better' only logs configs with better loss than previous iters
'all' logs all the tried configs
'new' only logs non-redundant configs
model_history: A boolean of whether to keep the history of best
models in the history property. Make sure memory is large
enough if setting to True.
log_training_metric: A boolean of whether to log the training
metric for each model.
mem_thres: A float of the memory size constraint in bytes
X_val: None | a numpy array or a pandas dataframe of validation data
y_val: None | a numpy array or a pandas series of validation labels
'''
self.task = task
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self.start_time_flag = time.time()
np.random.seed(0)
self.learner_selector = learner_selector
if self.task == 'classification':
self.task = get_classification_objective(
len(np.unique(self.y_train_all)))
assert split_type in ["stratified", "uniform"]
self.split_type = split_type
else:
self.split_type = "uniform"
if 'auto' == estimator_list:
estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
if 'regression' != self.task:
estimator_list += ['lrl1', ]
logger.info(
"List of ML learners in AutoML Run: {}".format(estimator_list))
if eval_method == 'auto' or self.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self.eval_method = eval_method
logger.info("Evaluation method: {}".format(eval_method))
self.retrain_full = retrain_full and (eval_method == 'holdout'
and self.X_val is None)
self.sample = sample and (eval_method != 'cv')
if 'auto' == metric:
if 'binary' in self.task:
metric = 'roc_auc'
elif 'multi' in self.task:
metric = 'log_loss'
else:
metric = 'r2'
if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']:
error_metric = f"1-{metric}"
elif isinstance(metric, str):
error_metric = metric
else:
error_metric = 'customized metric'
logger.info(f'Minimizing error metric: {error_metric}')
with training_log_writer(log_file_name) as save_helper:
self.save_helper = save_helper
self._prepare_data(eval_method, split_ratio, n_splits)
self._compute_with_config = partial(AutoML._compute_with_config_base,
self,
metric,
log_training_metric)
self.time_budget = time_budget
self.estimator_list = estimator_list
self.ensemble = ensemble
self.max_iter = max_iter
self.mem_thres = mem_thres
self.log_type = log_type
self.split_ratio = split_ratio
self.save_model_history = model_history
self.n_jobs = n_jobs
self.search()
logger.info("fit succeeded")
def search(self):
self.searchers = {}
# initialize the searchers
self.eti = []
self._best_loss = float('+inf')
self.best_train_time = 0
self.time_from_start = 0
self.estimator_index = -1
self._best_iteration = 0
self._model_history = {}
self._config_history = {}
self.max_iter_per_learner = 10000 # TODO
self.iter_per_learner = dict([(e, 0) for e in self.estimator_list])
self.fullsize = False
self._trained_estimator = None
if self.ensemble:
self.best_model = {}
for self._track_iter in range(self.max_iter):
if self.estimator_index == -1:
estimator = self.estimator_list[0]
else:
estimator = self._select_estimator(self.estimator_list)
if not estimator:
break
logger.info(f"iteration {self._track_iter}"
f" current learner {estimator}")
if estimator in self.searchers:
model = self.searchers[estimator].trained_estimator
improved = self.searchers[estimator].search1step(
global_best_loss=self._best_loss,
retrain_full=self.retrain_full,
mem_thres=self.mem_thres)
else:
model = improved = None
self.searchers[estimator] = ParamSearch(
estimator,
self.data_size,
self._compute_with_config,
self._train_with_config,
self.save_helper,
MIN_SAMPLE_TRAIN if self.sample else self.data_size,
self.task,
self.log_type,
self._config_space_info.get(estimator),
self._custom_size_estimate.get(estimator),
self.split_ratio)
self.searchers[estimator].search_begin(self.time_budget,
self.start_time_flag)
if self.estimator_index == -1:
eti_base = self._eti_ini[estimator]
self.eti.append(
self.searchers[estimator]
.expected_time_improvement_search())
for e in self.estimator_list[1:]:
self.eti.append(
self._eti_ini[e] / eti_base * self.eti[0])
self.estimator_index = 0
self.time_from_start = time.time() - self.start_time_flag
# logger.info(f"{self.searchers[estimator].sample_size}, {data_size}")
if self.searchers[estimator].sample_size == self.data_size:
self.iter_per_learner[estimator] += 1
if not self.fullsize:
self.fullsize = True
if self.searchers[estimator].best_loss < self._best_loss:
self._best_loss = self.searchers[estimator].best_loss
self._best_estimator = estimator
self.best_train_time = self.searchers[estimator].train_time
self._config_history[self._track_iter] = (
estimator,
self.searchers[estimator].best_config[0],
self.time_from_start)
if self.save_model_history:
self._model_history[self._track_iter] = self.searchers[
estimator].trained_estimator.model
elif self._trained_estimator:
del self._trained_estimator
self._trained_estimator = None
self._trained_estimator = self.searchers[
estimator].trained_estimator
self._best_iteration = self._track_iter
if model and improved and not self.save_model_history:
model.cleanup()
logger.info(
" at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
self.time_from_start,
estimator,
self.searchers[estimator].best_loss,
self._best_estimator,
self._best_loss))
if self.time_from_start >= self.time_budget:
break
if self.ensemble:
time_left = self.time_from_start - self.time_budget
time_ensemble = self.searchers[self._best_estimator].train_time
if time_left < time_ensemble < 2 * time_left:
break
if self.searchers[
estimator].train_time > self.time_budget - self.time_from_start:
self.iter_per_learner[estimator] = self.max_iter_per_learner
# Add a checkpoint for the current best config to the log.
self.save_helper.checkpoint()
if self.searchers:
self._selected = self.searchers[self._best_estimator]
self._trained_estimator = self._selected.trained_estimator
self.modelcount = sum(self.searchers[estimator].model_count
for estimator in self.searchers)
logger.info(self._trained_estimator.model)
if self.ensemble:
searchers = list(self.searchers.items())
searchers.sort(key=lambda x: x[1].best_loss)
estimators = [(x[0], x[1].trained_estimator) for x in searchers[
:2]]
estimators += [(x[0], x[1].trained_estimator) for x in searchers[
2:] if x[1].best_loss < 4 * self._selected.best_loss]
logger.info(estimators)
if self.task != "regression":
from sklearn.ensemble import StackingClassifier as Stacker
for e in estimators:
e[1]._estimator_type = 'classifier'
else:
from sklearn.ensemble import StackingRegressor as Stacker
best_m = self._trained_estimator
stacker = Stacker(estimators, best_m, n_jobs=self.n_jobs,
passthrough=True)
stacker.fit(self.X_train_all, self.y_train_all)
self._trained_estimator = stacker
self._trained_estimator.model = stacker
else:
self._selected = self._trained_estimator = None
self.modelcount = 0
def __del__(self):
if hasattr(self, '_trained_estimator') and self._trained_estimator \
and hasattr(self._trained_estimator, 'cleanup'):
self._trained_estimator.cleanup()
del self._trained_estimator
def _select_estimator(self, estimator_list):
time_left = self.time_budget - self.time_from_start
if self.best_train_time < time_left < 2 * self.best_train_time:
best_searcher = self.searchers[self._best_estimator]
config_sig = best_searcher.get_hist_config_sig(
best_searcher.sample_size_full,
best_searcher.best_config[0])
if config_sig not in best_searcher.config_tried:
# trainAll
return self._best_estimator
if self.learner_selector == 'roundrobin':
self.estimator_index += 1
if self.estimator_index == len(estimator_list):
self.estimator_index = 0
return estimator_list[self.estimator_index]
min_expected_time, selected = np.Inf, None
inv = []
for i, estimator in enumerate(estimator_list):
if estimator in self.searchers:
searcher = self.searchers[estimator]
if self.iter_per_learner[estimator] >= self.max_iter_per_learner:
inv.append(0)
continue
eti_searcher = min(2 * searcher.train_time,
searcher.expected_time_improvement_search())
gap = searcher.best_loss - self._best_loss
if gap > 0 and not self.ensemble:
delta_loss = searcher.old_loss - searcher.new_loss
delta_time = searcher.old_loss_time + \
searcher.new_loss_time - searcher.old_train_time
speed = delta_loss / float(delta_time)
try:
expected_time = max(gap / speed, searcher.train_time)
except ZeroDivisionError:
warnings.warn(
"ZeroDivisionError: need to debug. "
"speed: {0}, old_loss: {1}, new_loss: {2}".format(
speed, searcher.old_loss, searcher.new_loss))
expected_time = 0.0
expected_time = 2 * max(expected_time, eti_searcher)
else:
expected_time = eti_searcher
if expected_time == 0:
expected_time = 1e-10
inv.append(1 / expected_time)
else:
expected_time = self.eti[i]
inv.append(0)
if expected_time < min_expected_time:
min_expected_time = expected_time
selected = estimator
if len(self.searchers) < len(estimator_list) or not selected:
if selected not in self.searchers:
# print('select',selected,'eti',min_expected_time)
return selected
s = sum(inv)
p = np.random.random()
q = 0
for i in range(len(inv)):
if inv[i]:
q += inv[i] / s
if p < q:
return estimator_list[i]

31
flaml/config.py Normal file

@@ -0,0 +1,31 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
N_SPLITS = 5
RANDOM_SEED = 1
SPLIT_RATIO = 0.1
HISTORY_SIZE = 10000000
MEM_THRES = 4*(1024**3)
SMALL_LARGE_THRES = 10000000
MIN_SAMPLE_TRAIN = 10000
MIN_SAMPLE_VAL = 10000
CV_HOLDOUT_THRESHOLD = 100000
BASE_Const = 2
BASE_LOWER_BOUND = 2**(0.01)
ETI_INI = {
'lgbm':1,
'xgboost':1.6,
'xgboost_nb':1.6,
'rf':2,
'lrl1':160,
'lrl2':25,
'linear_svc':16,
'kneighbor':30,
'catboost':15,
'extra_tree':1.9,
'nn':50,
}

256
flaml/data.py Normal file

@@ -0,0 +1,256 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
import numpy as np
from scipy.sparse import vstack, issparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from .training_log import training_log_reader
def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
'''Load dataset from open ML.
If the file is not cached locally, download it from open ML.
Args:
dataset_id: An integer of the dataset id in openml
data_dir: A string of the path to store and load the data
random_state: An integer of the random seed for splitting data
Returns:
X_train: A 2d numpy array of training data
X_test: A 2d numpy array of test data
y_train: A 1d numpy array of labels for training data
y_test: A 1d numpy array of labels for test data
'''
import os
import openml
import pickle
from sklearn.model_selection import train_test_split
filename = 'openml_ds' + str(dataset_id) + '.pkl'
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print('load dataset from', filepath)
with open(filepath, 'rb') as f:
dataset = pickle.load(f)
else:
print('download dataset from openml')
dataset = openml.datasets.get_dataset(dataset_id)
if not os.path.exists(data_dir):
os.makedirs(data_dir)
with open(filepath, 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print('Dataset name:', dataset.name)
X, y, *__ = dataset.get_data(
target=dataset.default_target_attribute, dataset_format='array')
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=random_state)
print(
'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
X_train.shape, y_train.shape, X_test.shape, y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def load_openml_task(task_id, data_dir):
'''Load task from open ML.
Use the first fold of the task.
If the file is not cached locally, download it from open ML.
Args:
task_id: An integer of the task id in openml
data_dir: A string of the path to store and load the data
Returns:
X_train: A 2d numpy array of training data
X_test: A 2d numpy array of test data
y_train: A 1d numpy array of labels for training data
y_test: A 1d numpy array of labels for test data
'''
import os
import openml
import pickle
task = openml.tasks.get_task(task_id)
filename = 'openml_task' + str(task_id) + '.pkl'
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print('load dataset from', filepath)
with open(filepath, 'rb') as f:
dataset = pickle.load(f)
else:
print('download dataset from openml')
dataset = task.get_dataset()
with open(filepath, 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
X, y, _, _ = dataset.get_data(task.target_name, dataset_format='array')
train_indices, test_indices = task.get_train_test_split_indices(
repeat=0,
fold=0,
sample=0,
)
X_train = X[train_indices]
y_train = y[train_indices]
X_test = X[test_indices]
y_test = y[test_indices]
print(
'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
X_train.shape, y_train.shape, X_test.shape, y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def get_output_from_log(filename, time_budget):
'''Get output from log file
Args:
filename: A string of the log file name
time_budget: A float of the time budget in seconds
Returns:
training_time_list: A list of the finished time of each logged iter
best_error_list:
A list of the best validation error after each logged iter
error_list: A list of the validation error of each logged iter
config_list:
A list of the estimator, sample size and config of each logged iter
logged_metric_list: A list of the logged metric of each logged iter
'''
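# Illustrative usage sketch ('test/iris.log' as in the README example):
#   time_history, best_error_history, error_history, config_history, metric_history = \
#       get_output_from_log(filename='test/iris.log', time_budget=60)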
import ast
best_config = None
best_learner = None
best_val_loss = float('+inf')
training_duration = 0.0
training_time_list = []
config_list = []
best_error_list = []
error_list = []
logged_metric_list = []
best_config_list = []
with training_log_reader(filename) as reader:
for record in reader.records():
time_used = record.total_search_time
training_duration = time_used
val_loss = record.validation_loss
config = record.config
learner = record.learner.split('_')[0]
sample_size = record.sample_size
train_loss = record.logged_metric
if time_used < time_budget:
if val_loss < best_val_loss:
best_val_loss = val_loss
best_config = config
best_learner = learner
best_config_list.append(best_config)
training_time_list.append(training_duration)
best_error_list.append(best_val_loss)
logged_metric_list.append(train_loss)
error_list.append(val_loss)
config_list.append({"Current Learner": learner,
"Current Sample": sample_size,
"Current Hyper-parameters": record.config,
"Best Learner": best_learner,
"Best Hyper-parameters": best_config})
return (training_time_list, best_error_list, error_list, config_list,
logged_metric_list)
def concat(X1, X2):
'''concatenate two matrices vertically
'''
if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
if isinstance(X1, pd.DataFrame):
cat_columns = X1.select_dtypes(
include='category').columns
df = pd.concat([X1, X2], sort=False)
df.reset_index(drop=True, inplace=True)
if isinstance(X1, pd.DataFrame) and len(cat_columns):
df[cat_columns] = df[cat_columns].astype('category')
return df
if issparse(X1):
return vstack((X1, X2))
else:
return np.concatenate([X1, X2])
class DataTransformer:
'''transform X, y
'''
def fit_transform(self, X, y, objective):
if isinstance(X, pd.DataFrame):
X = X.copy()
n = X.shape[0]
cat_columns, num_columns = [], []
for column in X.columns:
if X[column].dtype.name in ('object', 'category'):
if X[column].nunique() == 1 or X[column].nunique(
dropna=True) == n - X[column].isnull().sum():
X.drop(columns=column, inplace=True)
elif X[column].dtype.name == 'category':
current_categories = X[column].cat.categories
if '__NAN__' not in current_categories:
X[column] = X[column].cat.add_categories(
'__NAN__').fillna('__NAN__')
cat_columns.append(column)
else:
X[column].fillna('__NAN__', inplace=True)
cat_columns.append(column)
else:
# print(X[column].dtype.name)
if X[column].nunique(dropna=True) < 2:
X.drop(columns=column, inplace=True)
else:
X[column].fillna(np.nan, inplace=True)
num_columns.append(column)
X = X[cat_columns + num_columns]
if cat_columns:
X[cat_columns] = X[cat_columns].astype('category')
if num_columns:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
self.transformer = ColumnTransformer([(
'continuous',
SimpleImputer(missing_values=np.nan, strategy='median'),
num_columns)])
X[num_columns] = self.transformer.fit_transform(X)
self.cat_columns, self.num_columns = cat_columns, num_columns
if objective == 'regression':
self.label_transformer = None
else:
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
y = self.label_transformer.fit_transform(y)
return X, y
def transform(self, X):
if isinstance(X, pd.DataFrame):
cat_columns, num_columns = self.cat_columns, self.num_columns
X = X[cat_columns + num_columns].copy()
for column in cat_columns:
# print(column, X[column].dtype.name)
if X[column].dtype.name == 'object':
X[column].fillna('__NAN__', inplace=True)
elif X[column].dtype.name == 'category':
current_categories = X[column].cat.categories
if '__NAN__' not in current_categories:
X[column] = X[column].cat.add_categories(
'__NAN__').fillna('__NAN__')
if cat_columns:
X[cat_columns] = X[cat_columns].astype('category')
if num_columns:
X[num_columns].fillna(np.nan, inplace=True)
X[num_columns] = self.transformer.transform(X)
return X

241
flaml/ml.py Normal file

@@ -0,0 +1,241 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
from .model import *
import time
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
def get_estimator_class(objective_name, estimator_name):
''' when adding a new learner, need to add an elif branch '''
if 'xgboost' in estimator_name:
if 'regression' in objective_name:
estimator_class = XGBoostEstimator
else:
estimator_class = XGBoostSklearnEstimator
elif 'rf' in estimator_name:
estimator_class = RandomForestEstimator
elif 'lgbm' in estimator_name:
estimator_class = LGBMEstimator
elif 'lrl1' in estimator_name:
estimator_class = LRL1Classifier
elif 'lrl2' in estimator_name:
estimator_class = LRL2Classifier
elif 'catboost' in estimator_name:
estimator_class = CatBoostEstimator
elif 'extra_tree' in estimator_name:
estimator_class = ExtraTreeEstimator
elif 'kneighbor' in estimator_name:
estimator_class = KNeighborsEstimator
else:
raise ValueError(estimator_name + ' is not a built-in learner. '
'Please use AutoML.add_learner() to add a customized learner.')
return estimator_class
def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None):
'''Loss using the specified metric
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss',
'f1', 'ap'
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
y_true: A 1d numpy array of the true labels
labels: A 1d numpy array of the unique labels
Returns:
score: A float number of the loss, the lower the better
'''
metric_name = metric_name.lower()
if 'r2' in metric_name:
score = 1.0 - r2_score(y_true, y_predict)
elif metric_name == 'rmse':
score = np.sqrt(mean_squared_error(y_true, y_predict))
elif metric_name == 'mae':
score = mean_absolute_error(y_true, y_predict)
elif metric_name == 'mse':
score = mean_squared_error(y_true, y_predict)
elif metric_name == 'accuracy':
score = 1.0 - accuracy_score(y_true, y_predict)
elif 'roc_auc' in metric_name:
score = 1.0 - roc_auc_score(y_true, y_predict)
elif 'log_loss' in metric_name:
score = log_loss(y_true, y_predict, labels=labels)
elif 'f1' in metric_name:
score = 1 - f1_score(y_true, y_predict)
elif 'ap' in metric_name:
score = 1 - average_precision_score(y_true, y_predict)
else:
raise ValueError(metric_name+' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
return score
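# Illustrative usage sketch: for 'accuracy' the returned loss is 1 - accuracy,
# so lower is better:
#   err = sklearn_metric_loss_score('accuracy', y_predict, y_true)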
def get_y_pred(estimator, X, eval_metric, obj):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[:,
1] if y_pred_classes.ndim>1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc']:
y_pred = estimator.predict_proba(X)
else:
y_pred = estimator.predict(X)
return y_pred
def get_test_loss(estimator, X_train, y_train, X_test, y_test, eval_metric, obj,
labels=None, budget=None, train_loss=False):
start = time.time()
train_time = estimator.fit(X_train, y_train, budget)
if isinstance(eval_metric, str):
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels)
if train_loss != False:
test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y,
y_train, labels)
else: # customized metric function
test_loss, train_loss = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train)
train_time = time.time()-start
return test_loss, train_time, train_loss
def train_model(estimator, X_train, y_train, budget):
train_time = estimator.fit(X_train, y_train, budget)
return train_time
def evaluate_model(estimator, X_train, y_train, X_val, y_val, budget, kf,
objective_name, eval_method, eval_metric, best_val_loss, train_loss=False):
if 'holdout' in eval_method:
val_loss, train_loss, train_time = evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val, budget,
objective_name, eval_metric, best_val_loss, train_loss=train_loss)
else:
val_loss, train_loss, train_time = evaluate_model_CV(
estimator, X_train, y_train, budget, kf, objective_name,
eval_metric, best_val_loss, train_loss=train_loss)
return val_loss, train_loss, train_time
def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, budget,
objective_name, eval_metric, best_val_loss, train_loss=False):
val_loss, train_time, train_loss = get_test_loss(
estimator, X_train, y_train, X_val, y_val, eval_metric, objective_name,
budget = budget, train_loss=train_loss)
return val_loss, train_loss, train_time
def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
objective_name, eval_metric, best_val_loss, train_loss=False):
start_time = time.time()
total_val_loss = total_train_loss = 0
train_time = 0
valid_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if objective_name=='regression':
labels = None
else:
labels = np.unique(y_train_all)
if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
else:
kf = kf.split(X_train_split)
rng = np.random.RandomState(2020)
val_loss_list = []
budget_per_train = budget / (n+1)
for train_index, val_index in kf:
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
train_index], X_train_split.iloc[val_index]
else:
X_train, X_val = X_train_split[
train_index], X_train_split[val_index]
if isinstance(y_train_all, pd.Series):
y_train, y_val = y_train_split.iloc[
train_index], y_train_split.iloc[val_index]
else:
y_train, y_val = y_train_split[
train_index], y_train_split[val_index]
estimator.cleanup()
val_loss_i, train_time_i, train_loss_i = get_test_loss(
estimator, X_train, y_train, X_val, y_val, eval_metric,
objective_name, labels, budget_per_train, train_loss=train_loss)
valid_fold_num += 1
total_val_loss += val_loss_i
if train_loss != False:
if total_train_loss != 0: total_train_loss += train_loss_i
else: total_train_loss = train_loss_i
train_time += train_time_i
if valid_fold_num == n:
val_loss_list.append(total_val_loss/valid_fold_num)
total_val_loss = valid_fold_num = 0
elif time.time() - start_time >= budget:
val_loss_list.append(total_val_loss/valid_fold_num)
break
val_loss = np.max(val_loss_list)
if train_loss != False: train_loss = total_train_loss/n
budget -= time.time() - start_time
if val_loss < best_val_loss and budget > budget_per_train:
estimator.cleanup()
train_time_full = estimator.fit(X_train_all, y_train_all, budget)
train_time += train_time_full
return val_loss, train_loss, train_time
def compute_estimator(X_train, y_train, X_val, y_val, budget, kf,
config_dic, objective_name, estimator_name, eval_method, eval_metric,
best_val_loss = np.Inf, n_jobs=1, estimator_class=None, train_loss=False):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(
objective_name, estimator_name)
estimator = estimator_class(
**config_dic, objective_name = objective_name, n_jobs=n_jobs)
val_loss, train_loss, train_time = evaluate_model(
estimator, X_train, y_train, X_val, y_val, budget, kf, objective_name,
eval_method, eval_metric, best_val_loss, train_loss=train_loss)
all_time = time.time() - start_time
return estimator, val_loss, train_loss, train_time, all_time
def train_estimator(X_train, y_train, config_dic, objective_name,
estimator_name, n_jobs=1, estimator_class=None, budget=None):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(objective_name,
estimator_name)
estimator = estimator_class(**config_dic, objective_name = objective_name,
n_jobs=n_jobs)
if X_train is not None:
train_time = train_model(estimator, X_train, y_train, budget)
else:
estimator = estimator.estimator_class(**estimator.params)
train_time = time.time() - start_time
return estimator, train_time
def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = 'binary:logistic'
else:
objective_name = 'multi:softmax'
return objective_name
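# --- Illustrative sketch (not part of the original commit) ---
# A customized eval_metric passed to get_test_loss above is called as
#   eval_metric(X_test, y_test, estimator, labels, X_train, y_train)
# and is expected to return a (test_loss, train_loss) pair. A minimal
# compatible metric; the function name is hypothetical and it assumes the
# estimator wrapper exposes a sklearn-style predict():
def example_squared_error_metric(X_test, y_test, estimator, labels,
                                 X_train, y_train):
    import numpy as np
    test_loss = float(np.mean((estimator.predict(X_test) - y_test) ** 2))
    train_loss = float(np.mean((estimator.predict(X_train) - y_train) ** 2))
    return test_loss, train_loss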

515
flaml/model.py Normal file

@ -0,0 +1,515 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
import scipy.sparse
import pandas as pd
class BaseEstimator:
'''The abstract class for all learners
Typical example:
XGBoostEstimator: for regression
XGBoostSklearnEstimator: for classification
LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
for both regression and classification
'''
def __init__(self, objective_name = 'binary:logistic',
**params):
'''Constructor
Args:
objective_name: A string of the objective name, one of
'binary:logistic', 'multi:softmax', 'regression'
n_jobs: An integer of the number of parallel threads
params: A dictionary of the hyperparameter names and values
'''
self.params = params
self.estimator_class = None
self.objective_name = objective_name
if '_estimator_type' in params:
self._estimator_type = params['_estimator_type']
else:
self._estimator_type = "regressor" if objective_name=='regression' \
else "classifier"
def get_params(self, deep=False):
params = self.params.copy()
params["objective_name"] = self.objective_name
if hasattr(self, '_estimator_type'):
params['_estimator_type'] = self._estimator_type
return params
@property
def classes_(self):
return self.model.classes_
def preprocess(self, X):
return X
def _fit(self, X_train, y_train):
        current_time = time.time()
X_train = self.preprocess(X_train)
model = self.estimator_class(**self.params)
model.fit(X_train, y_train)
        train_time = time.time() - current_time
self.model = model
return train_time
def fit(self, X_train, y_train, budget=None):
'''Train the model from given training data
Args:
X_train: A numpy array of training data in shape n*m
y_train: A numpy array of labels in shape n*1
budget: A float of the time budget in seconds
Returns:
train_time: A float of the training time in seconds
'''
return self._fit(X_train, y_train)
def predict(self, X_test):
'''Predict label from features
Args:
X_test: A numpy array of featurized instances, shape n*m
Returns:
A numpy array of shape n*1.
            Each element is the label for an instance
'''
X_test = self.preprocess(X_test)
return self.model.predict(X_test)
def predict_proba(self, X_test):
'''Predict the probability of each class from features
Only works for classification problems
Args:
X_test: A numpy array of featurized instances, shape n*m
Returns:
A numpy array of shape n*c. c is the # classes
Each element at (i,j) is the probability for instance i to be in
class j
'''
if 'regression' in self.objective_name:
            print('Regression tasks do not support predict_proba')
raise ValueError
else:
X_test = self.preprocess(X_test)
return self.model.predict_proba(X_test)
def cleanup(self): pass
class SKLearnEstimator(BaseEstimator):
def preprocess(self, X):
if isinstance(X, pd.DataFrame):
X = X.copy()
cat_columns = X.select_dtypes(include=['category']).columns
X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
return X
class LGBMEstimator(BaseEstimator):
def __init__(self, objective_name='binary:logistic', n_jobs=1,
n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1,
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
colsample_bytree=1.0, log_max_bin=8, **params):
super().__init__(objective_name, **params)
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
if 'regression' in objective_name:
final_objective_name = 'regression'
elif 'binary' in objective_name:
final_objective_name = 'binary'
elif 'multi' in objective_name:
final_objective_name = 'multiclass'
else:
final_objective_name = 'regression'
self.params = {
"n_estimators": int(round(n_estimators)),
"num_leaves": params[
'num_leaves'] if 'num_leaves' in params else int(
round(max_leaves)),
'objective': params[
"objective"] if "objective" in params else final_objective_name,
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_weight': float(min_child_weight),
'colsample_bytree':float(colsample_bytree),
'subsample': float(subsample),
}
self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
1<<int(round(log_max_bin)))-1
if 'regression' in objective_name:
self.estimator_class = LGBMRegressor
else:
self.estimator_class = LGBMClassifier
self.time_per_iter = None
self.train_size = 0
def preprocess(self, X):
if not isinstance(X, pd.DataFrame) and scipy.sparse.issparse(
X) and np.issubdtype(X.dtype, np.integer):
X = X.astype(float)
return X
def fit(self, X_train, y_train, budget=None):
start_time = time.time()
n_iter = self.params["n_estimators"]
if (not self.time_per_iter or
abs(self.train_size-X_train.shape[0])>4) and budget is not None:
self.params["n_estimators"] = 1
self.t1 = self._fit(X_train, y_train)
if self.t1 >= budget:
self.params["n_estimators"] = n_iter
return self.t1
self.params["n_estimators"] = 4
self.t2 = self._fit(X_train, y_train)
self.time_per_iter = (self.t2 - self.t1)/(
self.params["n_estimators"]-1) if self.t2 > self.t1 \
else self.t1 if self.t1 else 0.001
self.train_size = X_train.shape[0]
if self.t1+self.t2>=budget or n_iter==self.params["n_estimators"]:
self.params["n_estimators"] = n_iter
return time.time() - start_time
if budget is not None:
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
start_time-self.t1)/self.time_per_iter+1))
if self.params["n_estimators"] > 0:
self._fit(X_train, y_train)
self.params["n_estimators"] = n_iter
train_time = time.time() - start_time
return train_time
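    # Illustrative worked example (not part of the original code) for the
    # budget-aware scheduling in fit() above: suppose the 1-tree probe takes
    # t1 = 0.2s and the 4-tree probe takes t2 = 0.5s, so
    # time_per_iter = (0.5 - 0.2) / 3 = 0.1s per tree. With budget = 3.4s and
    # roughly 0.7s already spent on the probes, the final fit is capped at
    # min(n_iter, int((3.4 - 0.7 - 0.2) / 0.1 + 1)) = min(n_iter, 26)
    # estimators, i.e. the largest ensemble expected to finish in the time
    # left (the extra t1 subtracted appears to be kept as slack).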
class XGBoostEstimator(SKLearnEstimator):
''' not using sklearn API, used for regression '''
def __init__(self, objective_name='regression', all_thread=False, n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
colsample_bytree=1.0, tree_method='auto', **params):
super().__init__(objective_name, **params)
self.n_estimators = int(round(n_estimators))
self.max_leaves = int(round(max_leaves))
self.grids = []
self.params = {
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
'grow_policy': params[
"grow_policy"] if "grow_policy" in params else 'lossguide',
'tree_method':tree_method,
'verbosity': 0,
'nthread':n_jobs,
'learning_rate': float(learning_rate),
'subsample': float(subsample),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_weight': float(min_child_weight),
'booster': params['booster'] if 'booster' in params else 'gbtree',
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree':float(colsample_bytree),
}
if all_thread:
del self.params['nthread']
def get_params(self, deep=False):
params = super().get_params()
params["n_jobs"] = params['nthread']
return params
def fit(self, X_train, y_train, budget=None):
        current_time = time.time()
if not scipy.sparse.issparse(X_train):
self.params['tree_method'] = 'hist'
X_train = self.preprocess(X_train)
dtrain = xgb.DMatrix(X_train, label=y_train)
if self.max_leaves>0:
xgb_model = xgb.train(self.params, dtrain, self.n_estimators)
del dtrain
            train_time = time.time() - current_time
self.model = xgb_model
return train_time
else:
return None
def predict(self, X_test):
if not scipy.sparse.issparse(X_test):
X_test = self.preprocess(X_test)
dtest = xgb.DMatrix(X_test)
return super().predict(dtest)
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
''' using sklearn API, used for classification '''
def __init__(self, objective_name='binary:logistic', n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0,
min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
**params):
super().__init__(objective_name, **params)
self.params = {
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
'grow_policy': params[
"grow_policy"] if "grow_policy" in params else 'lossguide',
'tree_method':tree_method,
'verbosity': 0,
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'subsample': float(subsample),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_weight': float(min_child_weight),
'booster': params['booster'] if 'booster' in params else 'gbtree',
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
}
if 'regression' in objective_name:
self.estimator_class = XGBRegressor
else:
self.estimator_class = XGBClassifier
self.time_per_iter = None
self.train_size = 0
def fit(self, X_train, y_train, budget=None):
if scipy.sparse.issparse(X_train):
self.params['tree_method'] = 'auto'
return super().fit(X_train, y_train, budget)
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
def __init__(self, objective_name = 'binary:logistic', n_jobs = 1,
n_estimators = 4, max_leaves = 4, max_features = 1.0,
min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params):
super().__init__(objective_name, **params)
self.params = {
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
}
if 'regression' in objective_name:
self.estimator_class = RandomForestRegressor
else:
self.estimator_class = RandomForestClassifier
self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini'
self.time_per_iter = None
self.train_size = 0
def get_params(self, deep=False):
params = super().get_params()
params["criterion"] = 1 if params["criterion"]=='gini' else 2
return params
class ExtraTreeEstimator(RandomForestEstimator):
def __init__(self, objective_name = 'binary:logistic', n_jobs = 1,
n_estimators = 4, max_leaves = 4, max_features = 1.0,
min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params):
super().__init__(objective_name, **params)
self.params = {
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
}
if 'regression' in objective_name:
from sklearn.ensemble import ExtraTreesRegressor
self.estimator_class = ExtraTreesRegressor
else:
from sklearn.ensemble import ExtraTreesClassifier
self.estimator_class = ExtraTreesClassifier
self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini'
self.time_per_iter = None
self.train_size = 0
class LRL1Classifier(SKLearnEstimator):
def __init__(self, tol=0.0001, C=1.0,
objective_name='binary:logistic', n_jobs=1, **params):
super().__init__(objective_name, **params)
self.params = {
'penalty': 'l1',
'tol': float(tol),
'C': float(C),
'solver': 'saga',
'n_jobs': n_jobs,
}
if 'regression' in objective_name:
self.estimator_class = None
print('Does not support regression task')
raise NotImplementedError
else:
self.estimator_class = LogisticRegression
class LRL2Classifier(SKLearnEstimator):
def __init__(self, tol=0.0001, C=1.0,
objective_name='binary:logistic', n_jobs=1, **params):
super().__init__(objective_name, **params)
self.params = {
'penalty': 'l2',
'tol': float(tol),
'C': float(C),
'solver': 'lbfgs',
'n_jobs': n_jobs,
}
if 'regression' in objective_name:
self.estimator_class = None
print('Does not support regression task')
raise NotImplementedError
else:
self.estimator_class = LogisticRegression
class CatBoostEstimator(BaseEstimator):
time_per_iter = None
train_size = 0
def __init__(self, objective_name = 'binary:logistic', n_jobs=1,
n_estimators=8192, exp_max_depth=64, learning_rate=0.1, rounds=4,
l2_leaf_reg=3, **params):
super().__init__(objective_name, **params)
self.params = {
"early_stopping_rounds": int(round(rounds)),
"n_estimators": n_estimators,
'learning_rate': learning_rate,
'thread_count': n_jobs,
'verbose': False,
'random_seed': params[
"random_seed"] if "random_seed" in params else 10242048,
}
# print(n_estimators)
if 'regression' in objective_name:
from catboost import CatBoostRegressor
self.estimator_class = CatBoostRegressor
else:
from catboost import CatBoostClassifier
self.estimator_class = CatBoostClassifier
def get_params(self, deep=False):
params = super().get_params()
params['n_jobs'] = params['thread_count']
params['rounds'] = params['early_stopping_rounds']
return params
def fit(self, X_train, y_train, budget=None):
start_time = time.time()
n_iter = self.params["n_estimators"]
if isinstance(X_train, pd.DataFrame):
cat_features = list(X_train.select_dtypes(
include='category').columns)
else:
cat_features = []
if (not CatBoostEstimator.time_per_iter or
abs(CatBoostEstimator.train_size-len(y_train))>4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator.model = self.estimator_class(**self.params)
CatBoostEstimator.model.fit(X_train, y_train,
cat_features=cat_features)
CatBoostEstimator.t1 = time.time() - start_time
if CatBoostEstimator.t1 >= budget:
self.params["n_estimators"] = n_iter
self.model = CatBoostEstimator.model
return CatBoostEstimator.t1
self.params["n_estimators"] = 4
CatBoostEstimator.model = self.estimator_class(**self.params)
CatBoostEstimator.model.fit(X_train, y_train,
cat_features=cat_features)
CatBoostEstimator.time_per_iter = (time.time() - start_time -
CatBoostEstimator.t1)/(self.params["n_estimators"]-1)
if CatBoostEstimator.time_per_iter <= 0:
CatBoostEstimator.time_per_iter = CatBoostEstimator.t1
CatBoostEstimator.train_size = len(y_train)
if time.time()-start_time>=budget or n_iter==self.params[
"n_estimators"]:
self.params["n_estimators"] = n_iter
self.model = CatBoostEstimator.model
return time.time()-start_time
if budget:
train_times = 1
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
start_time-CatBoostEstimator.t1)/train_times/
CatBoostEstimator.time_per_iter+1))
self.model = CatBoostEstimator.model
if self.params["n_estimators"] > 0:
l = max(int(len(y_train)*0.9), len(y_train)-1000)
X_tr, y_tr = X_train[:l], y_train[:l]
from catboost import Pool
model = self.estimator_class(**self.params)
model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool(
data=X_train[l:], label=y_train[l:], cat_features=cat_features))
# print(self.params["n_estimators"], model.get_best_iteration())
self.model = model
self.params["n_estimators"] = n_iter
train_time = time.time() - start_time
# print(budget, train_time)
return train_time
class KNeighborsEstimator(BaseEstimator):
def __init__(self, objective_name='binary:logistic', n_jobs=1,
n_neighbors=5, **params):
super().__init__(objective_name, **params)
self.params= {
'n_neighbors': int(round(n_neighbors)),
'weights': 'distance',
'n_jobs': n_jobs,
}
if 'regression' in objective_name:
from sklearn.neighbors import KNeighborsRegressor
self.estimator_class = KNeighborsRegressor
else:
from sklearn.neighbors import KNeighborsClassifier
self.estimator_class = KNeighborsClassifier
def preprocess(self, X):
if isinstance(X, pd.DataFrame):
cat_columns = X.select_dtypes(['category']).columns
# print(X.dtypes)
# print(cat_columns)
if X.shape[1] == len(cat_columns):
raise ValueError(
"kneighbor requires at least one numeric feature")
X = X.drop(cat_columns, axis=1)
return X
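# --- Illustrative usage sketch (not part of the original commit) ---
# Training one of the wrappers above directly with a time budget; the
# hyperparameter names mirror the constructor arguments and the dataset
# choice is arbitrary.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    est = LGBMEstimator(objective_name='multi:softmax', n_jobs=1,
                        n_estimators=16, max_leaves=4, learning_rate=0.1)
    train_time = est.fit(X, y, budget=10)   # returns seconds spent fitting
    print(train_time, est.predict(X)[:5], est.predict_proba(X).shape)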

675
flaml/search.py Normal file

@ -0,0 +1,675 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
from functools import partial
from .ml import train_estimator
import time
import math
import numpy as np
from .space import config_space, estimator_size, get_config_values, \
generate_config_ini, generate_config_max, generate_config_min
from .config import SPLIT_RATIO, MIN_SAMPLE_TRAIN, \
HISTORY_SIZE, MEM_THRES, BASE_Const, BASE_LOWER_BOUND
from random import gauss
def rand_vector_unit_sphere(dims):
vec = [gauss(0, 1) for i in range(dims)]
mag = sum(x**2 for x in vec) ** .5
return [x / mag for x in vec]
def rand_vector_gaussian(dims):
vec = [gauss(0, 1) for i in range(dims)]
return vec
class ParamSearch:
'''
the class for searching params for 1 learner
'''
def __init__(self, estimator, data_size,
compute_with_config, train_with_config, save_info_helper=None,
init_sample_size=MIN_SAMPLE_TRAIN, objective_name='regression',
log_type='better', config_space_info=None, size_estimator=None,
split_ratio=SPLIT_RATIO, base_change='sqrtK', use_dual_dir=True,
move_type='geo'):
self.log_type = log_type
self.base_change = base_change
if init_sample_size > data_size:
init_sample_size = data_size
self.next_sample_size = {}
self.prev_sample_size = {}
s = init_sample_size
self.prev_sample_size[s] = s
self.estimator_configspace = config_space_info or config_space(
estimator, data_size, objective_name)
self.get_size_for_config = size_estimator or (
lambda x: estimator_size(x, estimator))
config_min_dic_primary, config_min_dic_more, config_min_dic = \
generate_config_min(estimator, self.estimator_configspace, None)
self.min_config_primary = np.array(
list(config_min_dic_primary.values()))
self.min_config_more = np.array(list(config_min_dic_more.values()))
self.min_config = np.array(list(config_min_dic.values()))
# init configurations for different sample size
config_init_dic_primary, config_init_dic_more, _, config_type_dic = \
generate_config_ini(estimator, self.estimator_configspace)
self.init_config_dic_primary = {s: config_init_dic_primary}
self.init_config_dic_more = {s: config_init_dic_more}
self.init_config_dic_type_dic = {'primary': {
s: config_init_dic_primary}, 'more': {s: config_init_dic_more}}
self.init_config_dic = {
**self.init_config_dic_type_dic['primary'],
**self.init_config_dic_type_dic['more']
}
self.config_type_dic = config_type_dic
# max configurations for different sample size
config_max_dic_primary, config_max_dic_more, config_max_dic = \
generate_config_max(
estimator, self.estimator_configspace, int(s))
self.max_config_dic_primary = {s: np.array(
list(config_max_dic_primary.values()))}
self.max_config_dic_more = {s: np.array(
list(config_max_dic_more.values()))}
self.max_config_dic = {s: np.array(list(config_max_dic.values()))}
self.dims = (len(self.min_config_primary), len(self.min_config_more))
# print(self.dims)
if self.dims[1] > 0 and self.dims[0] > 0:
self.base_upper_bound = {
s:
max(
max(
(self.max_config_dic_primary[s][i] / self.min_config_primary[i])
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
),
max(
(self.max_config_dic_more[s][i] / self.min_config_more[i])
** math.sqrt(self.dims[1]) for i in range(self.dims[1]))
)
}
elif self.dims[0] > 0:
self.base_upper_bound = {
s:
max(
(self.max_config_dic_primary[s][i] / self.min_config_primary[i])
** (math.sqrt(self.dims[0])) for i in range(self.dims[0])
)
}
else:
self.base_upper_bound = {
s:
max(
(self.max_config_dic_more[s][i] / self.min_config_more[i])
** (math.sqrt(self.dims[1])) for i in range(self.dims[1])
)
}
# create sample size sequence
while s < data_size:
s2 = self.next_sample_size[s] = s * 2 if s * 2 <= data_size else data_size
self.prev_sample_size[s2] = s
s = s2
config_max_dic_primary, config_max_dic_more, config_max_dic = \
generate_config_max(
estimator, self.estimator_configspace, int(s))
self.max_config_dic_primary[s] = np.array(
list(config_max_dic_primary.values()))
self.max_config_dic_more[s] = np.array(
list(config_max_dic_more.values()))
self.max_config_dic[s] = np.array(list(config_max_dic.values()))
if self.dims[1] > 0 and self.dims[0] > 0:
self.base_upper_bound[s] = max(
max(
(self.max_config_dic_primary[s][i]
/ self.min_config_primary[i])
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
),
max(
(self.max_config_dic_more[s][i]
/ self.min_config_more[i])
** math.sqrt(self.dims[1]) for i in range(self.dims[1])
)
)
elif self.dims[0] > 0:
self.base_upper_bound[s] = max(
(self.max_config_dic_primary[s][i]
/ self.min_config_primary[i])
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
)
else:
self.base_upper_bound[s] = max(
(self.max_config_dic_more[s][i] / self.min_config_more[i])
** math.sqrt(self.dims[1]) for i in range(self.dims[1])
)
self.init_sample_size = init_sample_size
self.data_size = data_size
self.sample_size_full = int(self.data_size / (1.0 - split_ratio))
self.compute_with_config = compute_with_config
self.estimator = estimator
# for logging
self.save_helper = save_info_helper
self.estimator_type_list = ['primary', 'more']
self.dim = self.dims[0] if self.dims[0] > 0 else self.dims[1]
self.b = BASE_Const**(math.sqrt(self.dim))
self.base_ini = self.b
self.total_dim = sum(self.dims)
self.epo = 2**(self.dim - 1)
# keys are [sample size, config], values are (loss, train_time)
self.config_tried = {}
self.train_with_config = train_with_config
self.current_config_loss = None
self.use_dual_dir = use_dual_dir
self.move_type = move_type
def evaluate_config(self, config, sample_size, move='_pos'):
'''
evaluate a configuration, update search state,
and return whether the state is changed
'''
if self.time_from_start >= self.time_budget or move != '_ini' and \
self.train_time > self.time_budget - self.time_from_start:
return False
model, val_loss, new_train_time, from_history, train_loss = \
self.evaluate_proposed_config(config, sample_size, move)
# update current config
self.update_current_config(config, val_loss, sample_size)
# update best model statistics, including statistics about loss and time
improved = self.update_search_state_best(
config, sample_size, model, val_loss, new_train_time, from_history)
self.time_from_start = time.time() - self.start_time
if self.save_helper is not None:
if from_history:
move = move + '_from_hist'
self.save_helper.append(self.model_count,
train_loss,
new_train_time,
self.time_from_start,
val_loss,
config,
self.best_loss,
self.best_config[0],
self.estimator,
sample_size)
return improved
def get_hist_config_sig(self, sample_size, config):
config_values = get_config_values(config, self.config_type_dic)
config_sig = str(sample_size) + '_' + str(config_values)
return config_sig
def evaluate_proposed_config(self, config, sample_size, move):
self.model_count += 1
config_sig = self.get_hist_config_sig(sample_size, config)
d = self.total_dim
history_size_per_d = len(self.config_tried) / float(d)
if config_sig in self.config_tried:
val_loss, new_train_time = self.config_tried[config_sig]
# print(config_sig,'found in history')
model = train_loss = None
from_history = True
else:
model, val_loss, train_loss, new_train_time, _ = \
self.compute_with_config(self.estimator, config, sample_size)
from_history = False
if history_size_per_d < HISTORY_SIZE:
self.config_tried[config_sig] = (val_loss, new_train_time)
if self.first_move:
self.init_config_dic[sample_size] = config
move = '_ini'
self.base = self.base_ini
self.num_noimprovement = 0
move = str(self.estimator) + move
return model, val_loss, new_train_time, from_history, train_loss
def update_current_config(self, config, val_loss, sample_size):
if self.first_move or val_loss < self.current_config_loss:
self.first_move = False
            # update current config and corresponding sample_size
self.sample_size = sample_size
self.config = config
self.config_primary = {x: config[x]
for x in self.config_primary.keys()}
try:
self.config_more = {x: config[x]
for x in self.config_more.keys()}
except:
self.config_more = {}
self.current_config_loss = val_loss
def update_reset_best_config_loss(self, sample_size, config, val_loss):
if sample_size == self.data_size:
if self.best_config_loss_dic_full_reset[1] is None:
self.best_config_loss_dic_full_reset = [
config, val_loss, self.model_count]
else:
full_reset_best_loss = self.best_config_loss_dic_full_reset[1]
if val_loss < full_reset_best_loss:
                    self.best_config_loss_dic_full_reset = [
                        config, val_loss, self.model_count]
def update_search_state_best(self, config, sample_size, model, val_loss,
new_train_time, from_history):
        # update the loss statistics for a particular sample size
if sample_size not in self.best_config_loss_samplesize_dic:
self.best_config_loss_samplesize_dic[sample_size] = [
config, val_loss, self.model_count]
else:
s_best_loss = self.best_config_loss_samplesize_dic[sample_size][1]
if val_loss < s_best_loss:
self.best_config_loss_samplesize_dic[sample_size] = [
config, val_loss, self.model_count]
self.update_reset_best_config_loss(sample_size, config, val_loss)
# update best model statistics, including statistics about loss and time
if val_loss < self.new_loss:
self.old_loss = self.new_loss if self.new_loss < float(
'inf') else 2 * val_loss
self.new_loss = val_loss
self.old_loss_time = self.new_loss_time
self.old_train_time = self.train_time
self.new_loss_time = self.train_time = new_train_time
if val_loss < self.best_loss:
self.best_config = [self.config, self.model_count]
if not from_history:
self.trained_estimator = model
# print(model)
else:
print(val_loss, self.best_loss)
self.best_loss = val_loss
self.time_best_found = self.time_from_start
return True
else:
if not from_history:
self.new_loss_time += new_train_time
return False
def get_proposal(self, current_config, rand_vector_func, base, move_type):
rand_vector = rand_vector_func(len(current_config))
rand_vector = [i for i in rand_vector]
rand_vector_neg = [-i for i in rand_vector]
move_vector = {}
move_vector_neg = {}
index_ = 0
for k, v in current_config.items():
if 'geo' in move_type:
# get the move vector using the proposed random vector
move_vector[k] = v * (base**(rand_vector[index_]))
move_vector_neg[k] = v * (base**(rand_vector_neg[index_]))
else:
move_vector[k] = v + (base * (rand_vector[index_]))
move_vector_neg[k] = v + (base * (rand_vector_neg[index_]))
index_ += 1
# as long as one of the proposed model (+ or -) is within the mem_limit
# we will proceed
if not self.use_dual_dir:
move_vector_neg = None
return move_vector, move_vector_neg
def get_config_from_move_vector(self, v, estimator_type):
        if v is not None:
if 'all' in estimator_type:
v = v
elif 'primary' in estimator_type:
v = {**v, **self.config_more}
else:
v = {**self.config_primary, **v}
bounded_v = self.get_v_within_min_max(v)
else:
bounded_v = None
return bounded_v
def dual_direction_sample(self, base, current_search_config,
estimator_type='primary', rand_vector_func=rand_vector_unit_sphere,
mem_thres=MEM_THRES, move_type='geo'):
current_config = current_search_config
if len(current_config) == 0:
return None, None
bounded_v_list = [None, None]
while not bounded_v_list[0] and not bounded_v_list[
1] and self.time_from_start < self.time_budget:
move_vector, move_vector_neg = self.get_proposal(
current_config, rand_vector_func,
base, move_type)
bounded_v_list = [move_vector, move_vector_neg]
for i, v in enumerate(bounded_v_list):
bounded_v = self.get_config_from_move_vector(v, estimator_type)
proposed_model_size = self.get_size_for_config(bounded_v)
proposed_model_size = 0 if not isinstance(
proposed_model_size, float) else proposed_model_size
if proposed_model_size > mem_thres:
# print(bounded_v, proposed_model_size, mem_thres)
bounded_v = None
bounded_v_list[i] = bounded_v
self.time_from_start = time.time() - self.start_time
return bounded_v_list
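    # Illustrative note (not part of the original source): dual_direction_sample
    # draws one random unit vector r and proposes two symmetric candidates;
    # with move_type 'geo' and step size `base`, each searched value v moves to
    # v * base**r_i in one candidate and v * base**(-r_i) in the other, so a
    # direction that hurts can still help when reversed. Each candidate is
    # clipped to the per-sample-size bounds, and candidates whose estimated
    # model size exceeds mem_thres are discarded (set to None).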
def get_v_within_min_max(self, v):
index_ = 0
bounded_v = {}
for key, value in v.items():
new_value = min(max(
value, self.min_config[index_]), self.max_config_dic[
self.sample_size][index_])
bounded_v[key] = new_value
index_ += 1
return bounded_v
def expected_time_improvement_search(self):
return max(self.old_loss_time - self.old_train_time + self.train_time,
self.new_loss_time)
def increase_sample_size(self):
'''
whether it's time to increase sample size
'''
expected_time_improvement_sample = 2 * self.train_time
self.increase = self.sample_size < self.data_size and (
self.estimator_type == 0 or self.dims[0] == 0) and (
not self.improved
or expected_time_improvement_sample
< self.expected_time_improvement_search()
)
return self.increase
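    # Illustrative note (not part of the original source): sample sizes follow
    # the doubling schedule built in __init__ (s -> min(2 * s, data_size)).
    # The size is increased only when the current sample is smaller than the
    # full data, the search is back on the primary (complexity-related) block
    # (or there is none), and either the last move did not improve or
    # retraining on the doubled sample (estimated as 2 * train_time) looks
    # cheaper than the estimated time to find another improvement at the
    # current size.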
def search_begin(self, time_budget, start_time=None):
self.time_budget = time_budget
if not start_time:
self.start_time = time.time()
else:
self.start_time = start_time
# the time to train the last selected config
self.old_train_time = self.train_time = 0
self.time_from_start = 0
# search states
self.first_move = True
self.improved = True
self.estimator_type = 0 if self.dims[0] > 0 else 1
self.old_loss = self.new_loss = self.best_loss = float('+inf')
# new_loss_time is the time from the beginning of training self.config to
# now,
# old_loss_time is the time from the beginning of training the old
# self.config to the beginning of training self.config
self.old_loss_time = self.new_loss_time = 0
self.trained_estimator = None
self.model_count = 0
self.K = 0
self.old_modelcount = 0
# self.config has two parts: config_primary contain the configs
# that are related with model complexity, config_more contains the
# configs that is not related with model complexity
self.config_primary = self.init_config_dic_primary[self.init_sample_size]
self.config_more = self.init_config_dic_more[self.init_sample_size]
self.config = {**self.config_primary, **self.config_more}
self.best_config = [None, None]
# key: sample size, value: [best_config, best_loss, model_count] under
# sample size in the key
self.best_config_loss_samplesize_dic = {
self.init_sample_size: [self.config, self.old_loss, self.model_count]}
        # [best_config, best_loss, model_count] found on the full data size
        # since the last reset
self.best_config_loss_dic_full_reset = [None, None, None]
self.sample_size = self.init_sample_size
self.base_change_bound = 1
self.base_change_count = 0
self.evaluate_config(self.config, self.sample_size, '_ini')
self.increase = False
def train_config(self, config, sample_size):
'''
train a configuration
'''
# print('Evalute Config')
if self.time_from_start >= self.time_budget:
return False
config_sig = self.get_hist_config_sig(sample_size, config)
if not config_sig in self.config_tried:
_, new_train_time = self.train_with_config(
self.estimator, config, sample_size)
train_loss, val_loss, move = None, self.new_loss, str(
self.estimator) + '_trainAll'
self.time_from_start = time.time() - self.start_time
if self.save_helper is not None:
self.save_helper.append(self.model_count,
train_loss,
new_train_time,
self.time_from_start,
val_loss,
config,
self.best_loss,
self.best_config,
move,
sample_size)
self.config_tried[config_sig] = (val_loss, new_train_time)
def try_increase_sample_size(self):
# print( self.estimator, self.sample_size)
if self.sample_size in self.next_sample_size:
if self.increase_sample_size():
self.first_move = True
self.improved = True
self.estimator_type = 0 if self.dims[0] > 0 else 1
self.evaluate_config(
self.config, self.next_sample_size[self.sample_size])
if not self.old_modelcount and self.sample_size == self.data_size:
self.old_modelcount = self.model_count
def setup_current_search_config(self):
estimator_type = self.estimator_type_list[self.estimator_type]
if 'all' in estimator_type:
current_search_config = self.config
elif 'primary' in estimator_type:
current_search_config = self.config_primary
else:
current_search_config = self.config_more
# print(self.config_more)
return estimator_type, current_search_config
def search1step(self, global_best_loss=float('+inf'),
retrain_full=True, mem_thres=MEM_THRES, reset_type='init_gaussian'):
# try to increase sample size
self.try_increase_sample_size()
# decide current_search_config according to estimator_type
estimator_type, current_search_config = \
self.setup_current_search_config()
time_left = self.time_budget - self.time_from_start
if time_left < self.train_time:
return False
if retrain_full and self.train_time < time_left < 2 * self.train_time \
and self.best_loss <= global_best_loss:
self.train_config(self.best_config[0], self.sample_size_full)
move_vector, move_vector_neg = self.dual_direction_sample(
self.base, current_search_config, estimator_type,
rand_vector_unit_sphere, mem_thres, self.move_type)
if move_vector is None:
if move_vector_neg is None:
self.improved = False
else:
self.improved = self.evaluate_config(
move_vector_neg, self.sample_size, '_neg' + str(
estimator_type))
else:
self.improved = self.evaluate_config(
move_vector, self.sample_size, '_pos' + str(estimator_type))
if not self.improved:
if move_vector_neg is None:
pass
else:
self.improved = self.evaluate_config(
move_vector_neg, self.sample_size, '_neg' + str(
estimator_type))
self.update_noimprovement_stat(
global_best_loss, retrain_full, reset_type)
return self.improved
def update_noimprovement_stat(self, global_best_loss, retrain_full,
reset_type):
if self.improved:
self.num_noimprovement = 0
else:
self.estimator_type = 1 - self.estimator_type
if self.dims[self.estimator_type] == 0:
self.estimator_type = 1 - self.estimator_type
if self.estimator_type == 1 or self.dims[1] == 0:
self.noimprovement(global_best_loss, retrain_full, reset_type)
def noimprovement(self, global_best_loss, retrain_full, reset_type='org'):
if self.sample_size == self.data_size:
# Do not wait until full sample size to update num_noimprovement?
self.num_noimprovement += 1
if self.num_noimprovement >= self.epo:
self.num_noimprovement = 0
# print(self.num_noimprovement, self.epo)
if self.base_change == 'squareroot':
self.base = math.sqrt(self.base)
else:
if self.K == 0: # first time
oldK = self.best_config_loss_dic_full_reset[2] - \
self.old_modelcount
else:
oldK = self.K
self.K = self.model_count + 1 - self.old_modelcount
if self.base_change == 'K':
self.base **= oldK / self.K
else:
self.base **= math.sqrt(oldK / self.K)
if self.dims[1] > 0 and self.dims[0] > 0:
base_lower_bound = min(
min(
(1.0 + self.estimator_configspace[i].min_change
/ self.config_primary[i])
** math.sqrt(self.dims[0])
for i in self.config_primary.keys()
),
min(
(1.0 + self.estimator_configspace[i].min_change
/ self.config_more[i])
** math.sqrt(self.dims[1])
for i in self.config_more.keys()
)
)
elif self.dims[0] > 0:
base_lower_bound = min(
(1.0 + self.estimator_configspace[i].min_change
/ self.config_primary[i])
** math.sqrt(self.dims[0])
for i in self.config_primary.keys()
)
else:
base_lower_bound = min(
(1.0 + self.estimator_configspace[i].min_change
/ self.config_more[i])
** math.sqrt(self.dims[1])
for i in self.config_more.keys()
)
if np.isinf(base_lower_bound):
base_lower_bound = BASE_LOWER_BOUND
self.base_change_count += 1
if self.base <= base_lower_bound or \
self.base_change_count == self.base_change_bound:
if retrain_full and self.sample_size == self.data_size:
if self.best_loss <= global_best_loss:
                        # Only train on full data when the current estimator
# is the best estimator
# print('best estimator and train on full data')
self.train_config(
self.best_config[0], self.sample_size_full)
# remaining time is more than enough for another trial
if self.time_budget - self.time_from_start > self.train_time:
self.base_change_bound <<= 1
self.base_change_count = 0
self.K = 0
self.old_modelcount = self.model_count
self.best_config_loss_dic_full_reset = [None, None,
None]
self.first_move = True
self.improved = True
self.base_ini = min(
self.base_ini * 2, self.base_upper_bound[
self.sample_size])
self.estimator_type = 0 if self.dims[0] > 0 else 1
reset_config, reset_sample_size = self.get_reset_config(
self.init_sample_size, reset_type)
self.sample_size = reset_sample_size
# print('reset sample size', reset_sample_size)
self.evaluate_config(reset_config, self.sample_size,
'_ini')
def get_reset_config(self, sample_size, reset_type):
init_config = self.init_config_dic[self.sample_size]
reset_sample_size = sample_size
if 'org' in reset_type:
reset_config = init_config
else:
if 'init_gaussian' in reset_type:
reset_config = init_config
reset_sample_size = self.get_reset_sample_size(reset_config)
config_values = get_config_values(
reset_config, self.config_type_dic)
config_sig = str(reset_sample_size) + '_' + str(config_values)
count = 0
while config_sig in self.config_tried and \
self.time_from_start < self.time_budget and count < 1000:
# TODO: check exhaustiveness? use time as condition?
count += 1
move, move_neg = self.dual_direction_sample(
base=self.b, current_search_config=init_config,
estimator_type='all',
rand_vector_func=rand_vector_gaussian,
move_type=self.move_type)
                    if move:
                        reset_config = move
elif move_neg:
reset_config = move_neg
else:
continue
reset_sample_size = self.get_reset_sample_size(
reset_config)
config_values = get_config_values(
reset_config, self.config_type_dic)
config_sig = str(reset_sample_size) + \
'_' + str(config_values)
self.time_from_start = time.time() - self.start_time
else:
raise NotImplementedError
return reset_config, reset_sample_size
def get_reset_sample_size(self, reset_config):
if not reset_config:
print('reset_config is none')
reset_config_size = self.get_size_for_config(reset_config)
candidate_sample_size_list = []
for sample_size, config_and_bestloss in \
self.best_config_loss_samplesize_dic.items():
s_best_config = config_and_bestloss[0]
if not s_best_config:
print('best config is none', sample_size)
s_best_config_model_size = self.get_size_for_config(s_best_config)
if s_best_config_model_size >= reset_config_size:
candidate_sample_size_list.append(sample_size)
if len(candidate_sample_size_list) != 0:
return min(candidate_sample_size_list)
else:
return self.data_size
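# --- Illustrative sketch (not part of the original commit) ---
# rand_vector_unit_sphere above draws an isotropic direction with unit
# Euclidean norm; pairing it with a 'geo' style step (as in
# ParamSearch.get_proposal) scales every value by base**r_i:
if __name__ == '__main__':
    r = rand_vector_unit_sphere(3)
    print(sum(x * x for x in r) ** 0.5)   # ~1.0
    base = 2.0
    config = {'n_estimators': 4, 'max_leaves': 4, 'learning_rate': 0.1}
    step_pos = {k: v * base ** ri for (k, v), ri in zip(config.items(), r)}
    step_neg = {k: v * base ** -ri for (k, v), ri in zip(config.items(), r)}
    print(step_pos, step_neg)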

249
flaml/space.py Normal file

@ -0,0 +1,249 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
class ConfigSearchInfo:
    '''The class of the search space of a hyperparameter:
Attributes:
name: A string of the name of the hyperparameter
type: data type of the hyperparameter
lower: A number of the lower bound of the value
upper: A number of the upper bound of the value
init: A number of the initial value. For hyperparameters related to
complexity, the init value needs to correspond to the lowest
complexity
        change_type: A string of the change type, 'linear' or 'log'
min_change: A number of the minimal change required. Could be inf if
no such requirement
'''
def __init__(self, name, type, lower, upper, init, change_type = 'log',
complexity_related = True, min_change = None):
self.name = name
self.type = type
self.lower = lower
self.upper = upper
self.init = init
self.change_type = change_type
self.complexity_related = complexity_related
# default setting of min_change: if type is int, min_change
# should be 1, otherwise +inf
if min_change is None:
if self.type == int:
                self.min_change = 1.0  # minimum change required
else:
self.min_change = float('+inf')
else:
self.min_change = min_change
def config_space(estimator, data_size, objective_name = "regression"):
CS = {}
n_estimators_upper = min(32768,int(data_size))
max_leaves_upper = min(32768,int(data_size))
# exp_max_depth_upper = min(32768,data_size)
if 'xgboost' in estimator:
CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators',
type = int, lower = 4, init = 4, upper = n_estimators_upper,
change_type = 'log')
CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int,
lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log')
CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight',
type = float, lower = 0.001, init = 20.0, upper = 20.0,
change_type = 'log')
CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate',
type = float, lower = 0.01, init = 0.1, upper = 1.0,
change_type = 'log')
CS['subsample'] = ConfigSearchInfo(name = 'subsample', type = float,
lower = 0.6, init = 1.0, upper = 1.0, change_type = 'linear')
CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float,
lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log',
complexity_related = True)
CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float,
lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log')
CS['colsample_bylevel'] = ConfigSearchInfo(name = 'colsample_bylevel',
type = float, lower = 0.6, init = 1.0, upper = 1.0,
change_type = 'linear')
CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree',
type = float, lower = 0.7, init = 1.0, upper = 1.0,
change_type = 'linear')
elif estimator in ('rf', 'extra_tree'):
n_estimators_upper = min(2048, n_estimators_upper)
# max_leaves_upper = min(2048, max_leaves_upper)
CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators',
type = int, lower = 4, init = 4, upper = n_estimators_upper,
change_type = 'log')
if objective_name != 'regression':
CS['criterion'] = ConfigSearchInfo(name = 'criterion',
type = int, lower = 1, init = 1, upper = 2,
change_type = 'log')
# CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type =int,
# lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log',
# complexity_related = True)
CS['max_features'] = ConfigSearchInfo(name = 'max_features', type = float,
lower = 0.1, init = 1.0, upper = 1.0, change_type = 'log')
# CS['min_samples_split'] = ConfigSearchInfo(name = 'min_samples_split',
# type = int, lower = 2, init = 2, upper = 20, change_type = 'log',
# complexity_related = True)
# CS['min_samples_leaf'] = ConfigSearchInfo(name = 'min_samples_leaf',
# type = int, lower = 1, init = 1, upper = 20, change_type = 'log',
# complexity_related = True)
elif 'lgbm' in estimator:
CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int,
lower = 4, init = 4, upper = n_estimators_upper, change_type = 'log')
CS['max_leaves'] = ConfigSearchInfo(name = 'max_leaves', type = int,
lower = 4, init = 4, upper = max_leaves_upper, change_type = 'log')
CS['min_child_weight'] = ConfigSearchInfo(name = 'min_child_weight',
type = float, lower = 0.001, init = 20, upper = 20.0,
change_type = 'log')
CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate',
type = float, lower = 0.01, init = 0.1, upper = 1.0,
change_type = 'log')
CS['subsample'] = ConfigSearchInfo(name = 'subsample', type = float,
lower = 0.6, init = 1.0, upper = 1.0, change_type = 'log',
complexity_related = True)
CS['log_max_bin'] = ConfigSearchInfo(name = 'log_max_bin', type = int,
lower = 3, init = 8, upper = 10, change_type = 'log',
complexity_related = True)
CS['reg_alpha'] = ConfigSearchInfo(name = 'reg_alpha', type = float,
lower = 1e-10, init = 1e-10, upper = 1.0, change_type = 'log',
complexity_related = True)
CS['reg_lambda'] = ConfigSearchInfo(name = 'reg_lambda', type = float,
lower = 1e-10, init = 1.0, upper = 1.0, change_type = 'log')
CS['colsample_bytree'] = ConfigSearchInfo(name = 'colsample_bytree',
type = float, lower = 0.7, init = 1.0, upper = 1.0,
change_type = 'log')
elif 'lr' in estimator:
CS['C'] = ConfigSearchInfo(name = 'C', type =float, lower = 0.03125,
init = 1.0, upper = 32768.0, change_type = 'log',
complexity_related = True)
elif 'catboost' in estimator:
# CS['n_estimators'] = ConfigSearchInfo(name = 'n_estimators', type = int,
# lower = 4, init = 64, upper = n_estimators_upper, change_type = 'log',
# complexity_related = True)
early_stopping_rounds = max(min(round(1500000/data_size),150), 10)
CS['rounds'] = ConfigSearchInfo(name = 'rounds', type = int,
lower = 10, init = 10,
upper = early_stopping_rounds, change_type = 'log')
# CS['exp_max_depth'] = ConfigSearchInfo(name = 'exp_max_depth', type = int,
# lower = 32, init = 64, upper = 256, change_type = 'log',
# complexity_related = True)
CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate',
type = float, lower = 0.005, init = 0.1, upper = .2,
change_type = 'log')
# CS['l2_leaf_reg'] = ConfigSearchInfo(name = 'l2_leaf_reg',
# type = float, lower = 1, init = 3, upper = 5,
# change_type = 'log')
elif 'nn' == estimator:
CS['learning_rate'] = ConfigSearchInfo(name = 'learning_rate',
type = float, lower = 1e-4, init = 3e-4, upper = 3e-2,
change_type = 'log')
CS['weight_decay'] = ConfigSearchInfo(name = 'weight_decay',
type = float, lower = 1e-12, init = 1e-6, upper = .1,
change_type = 'log')
CS['dropout_prob'] = ConfigSearchInfo(name = 'dropout_prob',
type = float, lower = 1.0, init = 1.1, upper = 1.5,
change_type = 'log')
elif 'kneighbor' in estimator:
n_neighbors_upper = min(512,int(data_size/2))
CS['n_neighbors'] = ConfigSearchInfo(name = 'n_neighbors', type = int,
lower = 1, init = 5, upper = n_neighbors_upper, change_type = 'log')
else:
raise NotImplementedError
return CS
def estimator_size(config, estimator):
if estimator in ['xgboost', 'lgbm', 'rf', 'extra_tree']:
try:
max_leaves = int(round(config['max_leaves']))
n_estimators = int(round(config['n_estimators']))
model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)*
n_estimators*8)
except:
model_size = 0
return model_size
elif 'catboost' in estimator:
# if config is None: raise Exception("config is none")
n_estimators = int(round(config.get('n_estimators',8192)))
max_leaves = int(round(config.get('exp_max_depth',64)))
model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1)*
n_estimators*8)
return model_size
else:
model_size = 1.0
# raise NotImplementedError
return model_size
def generate_config_ini(estimator, estimator_configspace):
config_dic = {}
config_dic_more = {}
config_type_dic = {}
for _, config in estimator_configspace.items():
name, init = config.name, config.init
type_, complexity_related = config.type, config.complexity_related
config_type_dic[name] = type_
if complexity_related:
config_dic[name] = init
else:
config_dic_more[name] = init
return config_dic, config_dic_more, {**config_dic, **config_dic_more}, \
config_type_dic
def generate_config_min(estimator,estimator_configspace, max_config_size):
config_dic = {}
config_dic_more = {}
for _, config in estimator_configspace.items():
name, lower = config.name, config.lower
complexity_related = config.complexity_related
if complexity_related:
config_dic[name] = lower
else:
config_dic_more[name] = lower
return config_dic, config_dic_more, {**config_dic, **config_dic_more}
def generate_config_max(estimator, estimator_configspace, max_config_size):
config_dic = {}
config_dic_more = {}
for _, config in estimator_configspace.items():
name, upper = config.name, config.upper
complexity_related = config.complexity_related
if complexity_related:
if name in ('n_estimators', 'max_leaves'):
config_dic[name] = min(upper, max_config_size)
else:
config_dic[name] = upper
else:
config_dic_more[name] = upper
return config_dic, config_dic_more, {**config_dic, **config_dic_more}
def get_config_values(config_dic, config_type_dic):
value_list = []
for k in config_dic.keys():
org_v = config_dic[k]
if config_type_dic[k] == int:
v = int(round(org_v))
value_list.append(v)
else:
value_list.append(org_v)
return value_list
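# --- Illustrative sketch (not part of the original commit) ---
# For tree ensembles, estimator_size above approximates memory as
# (max_leaves*3 + (max_leaves-1)*4 + 1) * n_estimators * 8 bytes; e.g.
# max_leaves=4, n_estimators=4 gives (12 + 12 + 1) * 4 * 8 = 800 bytes.
if __name__ == '__main__':
    CS = config_space('lgbm', data_size=10000,
                      objective_name='binary:logistic')
    ini, ini_more, ini_all, types = generate_config_ini('lgbm', CS)
    print(ini_all)                          # lowest-complexity initial config
    print(estimator_size(ini_all, 'lgbm'))  # 800.0 for the initial config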

168
flaml/training_log.py Normal file

@ -0,0 +1,168 @@
'''!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
import json
from typing import IO
from contextlib import contextmanager
import warnings
class TrainingLogRecord(object):
def __init__(self,
record_id: int,
iter_per_learner: int,
logged_metric: float,
trial_time: float,
total_search_time: float,
validation_loss,
config,
best_validation_loss,
best_config,
learner,
sample_size):
self.record_id = record_id
self.iter_per_learner = iter_per_learner
self.logged_metric = logged_metric
self.trial_time = trial_time
self.total_search_time = total_search_time
self.validation_loss = validation_loss
self.config = config
self.best_validation_loss = best_validation_loss
self.best_config = best_config
self.learner = learner
self.sample_size = sample_size
def dump(self, fp: IO[str]):
d = vars(self)
return json.dump(d, fp)
@classmethod
def load(cls, json_str: str):
d = json.loads(json_str)
return cls(**d)
class TrainingLogCheckPoint(TrainingLogRecord):
def __init__(self, curr_best_record_id: int):
self.curr_best_record_id = curr_best_record_id
class TrainingLogWriter(object):
def __init__(self, output_filename: str):
self.output_filename = output_filename
self.file = None
self.current_best_loss_record_id = None
self.current_best_loss = float('+inf')
self.current_sample_size = None
self.current_record_id = 0
def open(self):
self.file = open(self.output_filename, 'w')
def append(self,
it_counter: int,
train_loss: float,
trial_time: float,
total_search_time: float,
validation_loss,
config,
best_validation_loss,
best_config,
learner,
sample_size):
if self.file is None:
raise IOError("Call open() to open the outpute file first.")
if validation_loss is None:
raise ValueError('TEST LOSS NONE ERROR!!!')
record = TrainingLogRecord(self.current_record_id,
it_counter,
train_loss,
trial_time,
total_search_time,
validation_loss,
config,
best_validation_loss,
best_config,
learner,
sample_size)
if validation_loss < self.current_best_loss or \
validation_loss == self.current_best_loss and \
sample_size > self.current_sample_size:
self.current_best_loss = validation_loss
self.current_sample_size = sample_size
self.current_best_loss_record_id = self.current_record_id
self.current_record_id += 1
record.dump(self.file)
self.file.write('\n')
self.file.flush()
def checkpoint(self):
if self.file is None:
raise IOError("Call open() to open the outpute file first.")
if self.current_best_loss_record_id is None:
warnings.warn("checkpoint() called before any record is written, "
"skipped.")
return
record = TrainingLogCheckPoint(self.current_best_loss_record_id)
record.dump(self.file)
self.file.write('\n')
self.file.flush()
def close(self):
self.file.close()
class TrainingLogReader(object):
def __init__(self, filename: str):
self.filename = filename
self.file = None
def open(self):
self.file = open(self.filename)
def records(self):
if self.file is None:
raise IOError("Call open() before reading log file.")
for line in self.file:
data = json.loads(line)
if len(data) == 1:
# Skip checkpoints.
continue
yield TrainingLogRecord(**data)
def close(self):
self.file.close()
def get_record(self, record_id) -> TrainingLogRecord:
if self.file is None:
raise IOError("Call open() before reading log file.")
for rec in self.records():
if rec.record_id == record_id:
return rec
raise ValueError(f"Cannot find record with id {record_id}.")
@contextmanager
def training_log_writer(filename: str):
try:
w = TrainingLogWriter(filename)
w.open()
yield w
finally:
w.close()
@contextmanager
def training_log_reader(filename: str):
try:
r = TrainingLogReader(filename)
r.open()
yield r
finally:
r.close()
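# --- Illustrative sketch (not part of the original commit) ---
# Round trip through the helpers above: append one record plus a checkpoint,
# then read the records back (checkpoint lines are skipped by the reader).
# The file name and field values below are arbitrary.
if __name__ == '__main__':
    with training_log_writer('example_flaml.log') as w:
        w.append(1, 0.30, 0.5, 0.5, 0.25, {'learning_rate': 0.1},
                 0.25, {'learning_rate': 0.1}, 'lgbm', 1000)
        w.checkpoint()
    with training_log_reader('example_flaml.log') as r:
        for rec in r.records():
            print(rec.record_id, rec.validation_loss, rec.learner)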

1
flaml/version.py Normal file

@ -0,0 +1 @@
__version__="0.1.0"

611
notebook/flaml_demo.ipynb Normal file

File diff suppressed because one or more lines are too long

4
settings.json Normal file

@ -0,0 +1,4 @@
{
"keep_max_logfiles": 30,
"logging_level": "info"
}

56
setup.py Normal file

@ -0,0 +1,56 @@
import setuptools
import os
here = os.path.abspath(os.path.dirname(__file__))
with open("README.md", "r") as fh:
long_description = fh.read()
# Get the code version
version = {}
with open(os.path.join(here, "flaml/version.py")) as fp:
exec(fp.read(), version)
__version__ = version["__version__"]
install_requires = [
"NumPy>=1.16.2",
"lightgbm>=2.3.1",
"xgboost>=0.90",
"scipy>=1.4.1",
"catboost>=0.23",
"scikit-learn>=0.23",
]
setuptools.setup(
name="FLAML",
version=__version__,
author="Microsoft Corporation",
author_email="hpo@microsoft.com",
description="A fast and lightweight autoML system",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/microsoft/FLAML",
packages=["flaml"],
install_requires=install_requires,
extras_require={
"notebook": [
"openml==0.10.2",
"jupyter",
"matplotlib==3.2.0",
"rgf-python",
],
"test": [
"flake8>=3.8.4",
"pytest>=6.1.1",
"coverage>=5.3",
],
},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires=">=3.6",
)

0
test/__init__.py Normal file

235
test/test_automl.py Normal file

@ -0,0 +1,235 @@
import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import load_boston, load_iris
from flaml import AutoML, get_output_from_log
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
from sklearn.metrics import log_loss
y_pred = estimator.predict_proba(X_test)
test_loss = log_loss(y_test, y_pred, labels=labels)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels)
alpha = 0.5
return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss]
class TestAutoML(unittest.TestCase):
def test_dataframe(self):
self.test_classification(True)
def test_custom_metric(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
'eval_method': 'holdout',
"metric": custom_metric,
"task": 'classification',
"log_file_name": "test/iris_custom.log",
"log_training_metric": True,
'log_type': 'all',
"model_history": True
}
X_train, y_train = load_iris(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.classes_)
print(automl_experiment.predict_proba(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
automl_experiment = AutoML()
estimator = automl_experiment.get_estimator_from_log(
automl_settings["log_file_name"], record_id=0,
objective='multi')
print(estimator)
time_history, best_valid_loss_history, valid_loss_history, \
config_history, train_loss_history = get_output_from_log(
filename=automl_settings['log_file_name'], time_budget=6)
print(train_loss_history)
def test_classification(self, as_frame=False):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 4,
"metric": 'accuracy',
"task": 'classification',
"log_file_name": "test/iris.log",
"log_training_metric": True,
"model_history": True
}
X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.classes_)
print(automl_experiment.predict_proba(X_train)[:5])
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
del automl_settings["metric"]
del automl_settings["model_history"]
del automl_settings["log_training_metric"]
automl_experiment = AutoML()
duration = automl_experiment.retrain_from_log(
log_file_name=automl_settings["log_file_name"],
X_train=X_train, y_train=y_train,
train_full=True, record_id=0)
print(duration)
print(automl_experiment.model)
print(automl_experiment.predict_proba(X_train)[:5])
def test_regression(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'mse',
"task": 'regression',
"log_file_name": "test/boston.log",
"log_training_metric": True,
"model_history": True
}
X_train, y_train = load_boston(return_X_y=True)
n = len(y_train)
automl_experiment.fit(X_train=X_train[:n >> 1], y_train=y_train[:n >> 1],
X_val=X_train[n >> 1:], y_val=y_train[n >> 1:],
**automl_settings)
assert automl_experiment.y_val.shape[0] == n - (n >> 1)
assert automl_experiment.eval_method == 'holdout'
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(get_output_from_log(automl_settings["log_file_name"], 1))
def test_sparse_matrix_classification(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'auto',
"task": 'classification',
"log_file_name": "test/sparse_classification.log",
"split_type": "uniform",
"model_history": True
}
X_train = scipy.sparse.random(1554, 21, dtype=int)
y_train = np.random.randint(3, size=1554)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.classes_)
print(automl_experiment.predict_proba(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
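
    # Regression on very sparse random data with a user-provided validation set.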
def test_sparse_matrix_regression(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'mae',
"task": 'regression',
"log_file_name": "test/sparse_regression.log",
"model_history": True
}
X_train = scipy.sparse.random(300, 900, density=0.0001)
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment.fit(X_train=X_train, y_train=y_train,
X_val=X_val, y_val=y_val,
**automl_settings)
assert automl_experiment.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
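
    # Restrict the search to XGBoost and fit a large sparse identity matrix,
    # optimizing average precision ('ap').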
def test_sparse_matrix_xgboost(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'ap',
"task": 'classification',
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["xgboost"],
"log_type": "all",
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
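
    # Restrict the search to L1- and L2-regularized logistic regression
    # ('lrl1', 'lrl2') on sparse data, optimizing F1.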
def test_sparse_matrix_lr(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'f1',
"task": 'classification',
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["lrl1", "lrl2"],
"log_type": "all",
}
X_train = scipy.sparse.random(3000, 900, density=0.1)
y_train = np.random.randint(2, size=3000)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
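
    # Regression on sparse data using cross-validation instead of a holdout set.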
def test_sparse_matrix_regression_cv(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
            "eval_method": 'cv',
"task": 'regression',
"log_file_name": "test/sparse_regression.log",
"model_history": True
}
X_train = scipy.sparse.random(100, 100)
y_train = np.random.uniform(size=100)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
if __name__ == "__main__":
unittest.main()

45
test/test_split.py Normal file

@@ -0,0 +1,45 @@
import unittest
from sklearn.datasets import fetch_openml
from flaml.automl import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
dataset = "Airlines"
def _test(split_type):
automl = AutoML()
automl_settings = {
"time_budget": 2,
# "metric": 'accuracy',
"task": 'classification',
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"log_training_metric": True,
"split_type": split_type,
}
X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
pred = automl.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)
def test_stratified():
_test(split_type="stratified")
def test_uniform():
_test(split_type="uniform")
if __name__ == "__main__":
    # unittest.main() would not discover these module-level pytest-style
    # functions, so call them directly when the file is run as a script.
    test_stratified()
    test_uniform()

14
test/test_version.py Normal file

@@ -0,0 +1,14 @@
import unittest
import flaml
class TestVersion(unittest.TestCase):
def test_version(self):
self.assertTrue(hasattr(flaml, '__version__'))
self.assertTrue(len(flaml.__version__) > 0)
if __name__ == "__main__":
unittest.main()