Mirror of https://github.com/microsoft/FLAML.git

v0.1.0

Commit 492990655d
.coveragerc
@@ -0,0 +1,5 @@
[run]
branch = True
source = flaml
omit =
    *tests*
.flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, C901
max-line-length = 127
max-complexity = 10
select = B,C,E,F,W,T4,B9
.github/workflows/python-package.yml
@@ -0,0 +1,59 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package

on:
  push:
    branches: ['*']
  pull_request:
    branches: ['*']

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
        python-version: [3.6, 3.7, 3.8]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: If mac, install libomp to facilitate lgbm install
      if: matrix.os == 'macos-latest'
      run: |
        brew install libomp
        export CC=/usr/bin/clang
        export CXX=/usr/bin/clang++
        export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
        export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
        export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
        export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
    - name: Install packages and dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest coverage
        pip install -e .
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest test
    - name: Coverage
      run: |
        coverage run -a -m pytest test
        coverage xml
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v1
      with:
        file: ./coverage.xml
        flags: unittests
.gitignore
@@ -0,0 +1,150 @@
# Project
/.vs
.vscode

# Log files
*.log

# Python virtualenv
.venv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
/catboost_info
notebook/*.pkl
CODE_OF_CONDUCT.md
@@ -0,0 +1,9 @@
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
@@ -0,0 +1,123 @@
# FLAML - Fast and Lightweight AutoML

FLAML is a Python library designed to automatically produce accurate machine
learning models with low computational cost. It frees users from selecting
learners and hyperparameters for each learner, and it is fast and economical.
Its simple and lightweight design makes it easy to extend, for example by
adding customized learners or metrics. FLAML is powered by a new, cost-effective
hyperparameter optimization and learner selection method invented by
Microsoft Research.
FLAML is easy to use:

1. With three lines of code, you can start using this economical and fast
AutoML engine as a scikit-learn style estimator.
```python
from flaml import AutoML
automl = AutoML()
automl.fit(X_train, y_train, task="classification")
```

2. You can restrict the learners and use FLAML as a fast hyperparameter tuning
tool for XGBoost, LightGBM, Random Forest, etc., or for a customized learner.
```python
automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"])
```

3. You can embed FLAML in self-tuning software for just-in-time tuning with
low latency & resource consumption.
```python
automl.fit(X_train, y_train, task="regression", time_budget=60)
```

## Installation

FLAML requires **Python version >= 3.6**. It can be installed from pip:

```bash
pip install flaml
```

To run the [`notebook example`](https://github.com/microsoft/FLAML/tree/main/notebook),
install flaml with the [notebook] option:

```bash
pip install flaml[notebook]
```
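
(In shells such as zsh, where square brackets are glob characters, you may need to quote the argument: `pip install "flaml[notebook]"`.)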

## Examples

A basic classification example:

```python
from flaml import AutoML
from sklearn.datasets import load_iris
# Initialize the FLAML learner.
automl = AutoML()
# Provide configurations.
automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": "test/iris.log",
}
X_train, y_train = load_iris(return_X_y=True)
# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings)
# Predict
print(automl.predict_proba(X_train))
# Export the best model.
print(automl.model)
```

A basic regression example:

```python
from flaml import AutoML
from sklearn.datasets import load_boston
# Initialize the FLAML learner.
automl = AutoML()
# Provide configurations.
automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'r2',
    "task": 'regression',
    "log_file_name": "test/boston.log",
}
X_train, y_train = load_boston(return_X_y=True)
# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings)
# Predict
print(automl.predict(X_train))
# Export the best model.
print(automl.model)
```
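
FLAML also accepts a customized metric function in place of a metric name. Below is a minimal sketch following the signature documented in `AutoML.fit`; the error computation inside is illustrative only:

```python
from sklearn.metrics import mean_squared_error

def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    # Return the value to minimize and a tuple of metrics to log.
    test_loss = mean_squared_error(y_test, estimator.predict(X_test))
    train_loss = mean_squared_error(y_train, estimator.predict(X_train))
    return test_loss, (test_loss, train_loss)

automl.fit(X_train=X_train, y_train=y_train, task='regression',
           metric=custom_metric, time_budget=10)
```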

More examples: see the [notebook](https://github.com/microsoft/FLAML/tree/main/notebook/flaml_demo.ipynb).

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

## Authors

* Chi Wang
* Qingyun Wu
* Erkang Zhu

Contributors: Markus Weimer, Silu Huang, Haozhe Zhang, Alex Deng.

## License

[MIT License](LICENSE)
SECURITY.md
@@ -0,0 +1,41 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->
flaml/__init__.py
@@ -0,0 +1,70 @@
from flaml.automl import AutoML
from flaml.model import BaseEstimator
from flaml.data import get_output_from_log

from flaml.version import __version__

import logging
from os.path import join, exists
import datetime as dt
from os import listdir, remove, mkdir
import pathlib
import json

root = pathlib.Path(__file__).parent.parent.absolute()
jsonfilepath = join(root, "settings.json")

with open(jsonfilepath) as f:
    settings = json.load(f)

logging_level = settings["logging_level"]

if logging_level == "info":
    logging_level = logging.INFO
elif logging_level == "debug":
    logging_level = logging.DEBUG
elif logging_level == "error":
    logging_level = logging.ERROR
elif logging_level == "warning":
    logging_level = logging.WARNING
elif logging_level == "critical":
    logging_level = logging.CRITICAL
else:
    logging_level = logging.NOTSET

keep_max_logfiles = settings["keep_max_logfiles"]

log_dir = join(root, "logs")

if not exists(log_dir):
    mkdir(log_dir)

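# Log rotation: keep only the newest `keep_max_logfiles` log files;
# each file name starts with an integer timestamp.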
del_logs = sorted(
    [int(x.split("_")[0]) for x in listdir(log_dir) if ".log" in x],
    reverse=True)[keep_max_logfiles:]

for log_id in del_logs:
    try:
        remove(join(log_dir, str(log_id) + "_flaml.log"))
    except Exception:
        continue

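# Name the new log file by the number of seconds elapsed since a fixed
# reference date (2020-04-01), so file names sort chronologically.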
b = dt.datetime.now()
a = dt.datetime(2020, 4, 1, 0, 0, 0)
secs = int((b - a).total_seconds())
name = str(secs)

logger = logging.getLogger(__name__)
logger.setLevel(logging_level)
fh = logging.FileHandler(join(log_dir, name + "_" + __name__ + ".log"))
fh.setLevel(logging_level)
ch = logging.StreamHandler()
ch.setLevel(logging_level)
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
formatter = logging.Formatter(
    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
    '%m-%d %H:%M:%S')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)
logger.propagate = True
flaml/automl.py
@@ -0,0 +1,897 @@
'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the
 * project root for license information.
'''
import time
import warnings
from functools import partial
import ast
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
    RepeatedKFold
from sklearn.utils import shuffle
import pandas as pd

from .ml import compute_estimator, train_estimator, get_classification_objective
from .config import MIN_SAMPLE_TRAIN, MEM_THRES, ETI_INI, \
    SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS
from .data import concat
from .search import ParamSearch
from .training_log import training_log_reader, training_log_writer

import logging
logger = logging.getLogger(__name__)


class AutoML:
    '''The AutoML class

    Attributes:
        model: An object with predict() and predict_proba() method (for
            classification), storing the best trained model.
        model_history: A dictionary of iter->model, storing the models when
            the best model is updated each time.
        config_history: A dictionary of iter->(estimator, config, time),
            storing the best estimator, config, and the time when the best
            model is updated each time.
        classes_: A list of n_classes elements for class labels.
        best_iteration: An integer of the iteration number where the best
            config is found.
        best_estimator: A string indicating the best estimator found.
        best_config: A dictionary of the best configuration.
        best_config_train_time: A float of the seconds taken by training the
            best config.

    Typical usage example:

        automl = AutoML()
        automl_settings = {
            "time_budget": 60,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": 'test/mylog.log',
        }
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    '''

    def __init__(self):
        self._eti_ini = ETI_INI
        self._custom_learners = {}
        self._config_space_info = {}
        self._custom_size_estimate = {}
        self._track_iter = 0

    @property
    def model_history(self):
        return self._model_history

    @property
    def config_history(self):
        return self._config_history

    @property
    def model(self):
        if self._trained_estimator:
            return self._trained_estimator.model
        else:
            return None

    @property
    def best_estimator(self):
        return self._best_estimator

    @property
    def best_iteration(self):
        return self._best_iteration

    @property
    def best_config(self):
        return self._selected.best_config[0]

    @property
    def best_loss(self):
        return self._best_loss

    @property
    def best_config_train_time(self):
        return self.best_train_time

    @property
    def classes_(self):
        if self.label_transformer:
            return self.label_transformer.classes_.tolist()
        if self._trained_estimator:
            return self._trained_estimator.model.classes_.tolist()
        return None

    def predict(self, X_test):
        '''Predict label from features.

        Args:
            X_test: A numpy array of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*1 -- each element is a predicted class
            label for an instance.
        '''
        X_test = self.preprocess(X_test)
        y_pred = self._trained_estimator.predict(X_test)
        if y_pred.ndim > 1:
            y_pred = y_pred.flatten()
        if self.label_transformer:
            return self.label_transformer.inverse_transform(pd.Series(
                y_pred))
        else:
            return y_pred

    def predict_proba(self, X_test):
        '''Predict the probability of each class from features; only works for
        classification problems.

        Args:
            X_test: A numpy array of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*c. c is the # classes. Each element at
            (i, j) is the probability for instance i to be in class j.
        '''
        X_test = self.preprocess(X_test)
        proba = self._trained_estimator.predict_proba(X_test)
        return proba

    def preprocess(self, X):
        if scipy.sparse.issparse(X):
            X = X.tocsr()
        if self.transformer:
            X = self.transformer.transform(X)
        return X

    def _validate_data(self, X_train_all, y_train_all, dataframe, label,
                       X_val=None, y_val=None):
        if X_train_all is not None and y_train_all is not None:
            if not (isinstance(X_train_all, np.ndarray)
                    or scipy.sparse.issparse(X_train_all)
                    or isinstance(X_train_all, pd.DataFrame)
                    ):
                raise ValueError(
                    "X_train_all must be a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_train_all, np.ndarray)
                    or isinstance(y_train_all, pd.Series)):
                raise ValueError(
                    "y_train_all must be a numpy array or a pandas series.")
            if X_train_all.size == 0 or y_train_all.size == 0:
                raise ValueError("Input data must not be empty.")
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            if X_train_all.shape[0] != y_train_all.shape[0]:
                raise ValueError(
                    "# rows in X_train must match length of y_train.")
            self.df = isinstance(X_train_all, pd.DataFrame)
            self.nrow, self.ndim = X_train_all.shape
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            if not isinstance(dataframe, pd.DataFrame):
                raise ValueError("dataframe must be a pandas DataFrame.")
            if label not in dataframe.columns:
                raise ValueError("label must be a column name in dataframe.")
            self.df = True
            self.dataframe, self.label = dataframe, label
            X = dataframe.drop(columns=label)
            self.nrow, self.ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError(
                "either X_train_all+y_train_all or dataframe+label need to be provided.")
        if scipy.sparse.issparse(X_train_all):
            self.transformer = self.label_transformer = False
            self.X_train_all, self.y_train_all = X, y
        else:
            from .data import DataTransformer
            self.transformer = DataTransformer()
            self.X_train_all, self.y_train_all = self.transformer.fit_transform(
                X, y, self.task)
            self.label_transformer = self.transformer.label_transformer

        if X_val is not None and y_val is not None:
            if not (isinstance(X_val, np.ndarray)
                    or scipy.sparse.issparse(X_val)
                    or isinstance(X_val, pd.DataFrame)
                    ):
                raise ValueError(
                    "X_val must be None, a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_val, np.ndarray)
                    or isinstance(y_val, pd.Series)):
                raise ValueError(
                    "y_val must be None, a numpy array or a pandas series.")
            if X_val.size == 0 or y_val.size == 0:
                raise ValueError(
                    "Validation data are expected to be nonempty. "
                    "Use None for X_val and y_val if no validation data.")
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            if X_val.shape[0] != y_val.shape[0]:
                raise ValueError(
                    "# rows in X_val must match length of y_val.")
            if self.transformer:
                self.X_val = self.transformer.transform(X_val)
            else:
                self.X_val = X_val
            if self.label_transformer:
                self.y_val = self.label_transformer.transform(y_val)
            else:
                self.y_val = y_val
        else:
            self.X_val = self.y_val = None

    def _prepare_data(self,
                      eval_method,
                      split_ratio,
                      n_splits):
        X_val, y_val = self.X_val, self.y_val
        if scipy.sparse.issparse(X_val):
            X_val = X_val.tocsr()
        X_train_all, y_train_all = self.X_train_all, self.y_train_all
        if scipy.sparse.issparse(X_train_all):
            X_train_all = X_train_all.tocsr()

        if self.task != 'regression':
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
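            # (e.g., a class with 7 instances is re-appended on each pass of
            # the loop below -- 7 -> 14 -> 21 -- until it reaches the threshold)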
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if self.df:
                        X_train_all = concat(X_train_all,
                                             X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all,
                                             X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(y_train_all,
                                             y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all,
                                                      y_train_all[:n][rare_index]])
                    count += rare_count
                logger.debug(
                    f"class {label} augmented from {rare_count} to {count}")
        X_train_all, y_train_all = shuffle(
            X_train_all, y_train_all, random_state=202020)
        if self.df:
            X_train_all.reset_index(drop=True, inplace=True)
            if isinstance(y_train_all, pd.Series):
                y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        if X_val is None:
            if self.task != 'regression' and eval_method == 'holdout':
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if self.df else X_train_all[
                    first]
                X_rest = X_train_all.iloc[rest] if self.df else X_train_all[rest]
                y_rest = y_train_all.iloc[rest] if isinstance(
                    y_train_all, pd.Series) else y_train_all[rest]
                stratify = y_rest if self.split_type == 'stratified' else None
                X_train, X_val, y_train, y_val = train_test_split(
                    X_rest,
                    y_rest,
                    test_size=split_ratio,
                    stratify=stratify,
                    random_state=1)
                X_train = concat(X_first, X_train)
                y_train = concat(label_set,
                                 y_train) if self.df else np.concatenate([label_set, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(label_set,
                               y_val) if self.df else np.concatenate([label_set, y_val])
                _, y_train_counts_elements = np.unique(y_train,
                                                       return_counts=True)
                _, y_val_counts_elements = np.unique(y_val,
                                                     return_counts=True)
                logger.debug(
                    f"""{self.split_type} split for y_train \
{y_train_counts_elements}, \
y_val {y_val_counts_elements}""")
            elif eval_method == 'holdout' and self.task == 'regression':
                X_train, X_val, y_train, y_val = train_test_split(
                    X_train_all,
                    y_train_all,
                    test_size=split_ratio,
                    random_state=1)
        self.data_size = X_train.shape[0]
        self.X_train, self.y_train, self.X_val, self.y_val = (
            X_train, y_train, X_val, y_val)
        if self.split_type == "stratified":
            logger.info("Using StratifiedKFold")
            self.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1,
                                              random_state=202020)
        else:
            logger.info("Using RepeatedKFold")
            self.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1,
                                    random_state=202020)

    def prepare_sample_train_data(self, sample_size):
        full_size = len(self.y_train)
        if sample_size <= full_size:
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            sampled_y_train = self.y_train[:sample_size]
        else:
            sampled_X_train = concat(self.X_train, self.X_val)
            sampled_y_train = np.concatenate([self.y_train, self.y_val])
        return sampled_X_train, sampled_y_train

    def _compute_with_config_base(self,
                                  metric,
                                  compute_train_loss,
                                  estimator,
                                  config,
                                  sample_size):
        sampled_X_train, sampled_y_train = self.prepare_sample_train_data(
            sample_size)
        time_left = self.time_budget - self.time_from_start
        budget = time_left if sample_size == self.data_size else \
            time_left / 2 * sample_size / self.data_size
        return compute_estimator(sampled_X_train,
                                 sampled_y_train,
                                 self.X_val,
                                 self.y_val,
                                 budget,
                                 self.kf,
                                 config,
                                 self.task,
                                 estimator,
                                 self.eval_method,
                                 metric,
                                 self._best_loss,
                                 self.n_jobs,
                                 self._custom_learners.get(estimator),
                                 compute_train_loss)

    def _train_with_config(self, estimator, config, sample_size):
        sampled_X_train, sampled_y_train = self.prepare_sample_train_data(
            sample_size)
        budget = None if self.time_budget is None else (self.time_budget
                                                        - self.time_from_start)
        model, train_time = train_estimator(
            sampled_X_train,
            sampled_y_train,
            config,
            self.task,
            estimator,
            self.n_jobs,
            self._custom_learners.get(estimator),
            budget)
        return model, train_time

    def add_learner(self,
                    learner_name,
                    learner_class,
                    size_estimate=lambda config: 'unknown',
                    cost_relative2lgbm=1):
        '''Add a customized learner.

        Args:
            learner_name: A string of the learner's name.
            learner_class: A subclass of BaseEstimator.
            size_estimate: A function from a config to its memory size in float.
            cost_relative2lgbm: A float number for the training cost ratio with
                respect to lightgbm (when both use the initial config).
        '''
        self._custom_learners[learner_name] = learner_class
        self._eti_ini[learner_name] = cost_relative2lgbm
        self._config_space_info[learner_name] = \
            learner_class.params_configsearch_info
        self._custom_size_estimate[learner_name] = size_estimate

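    # A sketch of registering a customized learner (the `MyEstimator` subclass
    # of BaseEstimator is hypothetical, not defined in this file):
    #     automl = AutoML()
    #     automl.add_learner(learner_name='my_learner',
    #                        learner_class=MyEstimator,
    #                        cost_relative2lgbm=1.5)
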
    def get_estimator_from_log(self, log_file_name, record_id, objective):
        '''Get the estimator from log file.

        Args:
            log_file_name: A string of the log file name.
            record_id: An integer of the record ID in the file,
                0 corresponds to the first trial.
            objective: A string of the objective name,
                'binary', 'multi', or 'regression'.

        Returns:
            An estimator object for the given configuration.
        '''

        with training_log_reader(log_file_name) as reader:
            record = reader.get_record(record_id)
            estimator = record.learner
            config = record.config

        estimator, _ = train_estimator(
            None, None, config, objective, estimator,
            estimator_class=self._custom_learners.get(estimator)
        )
        return estimator

    def retrain_from_log(self,
                         log_file_name,
                         X_train=None,
                         y_train=None,
                         dataframe=None,
                         label=None,
                         time_budget=0,
                         task='classification',
                         eval_method='auto',
                         split_ratio=SPLIT_RATIO,
                         n_splits=N_SPLITS,
                         split_type="stratified",
                         n_jobs=1,
                         train_best=True,
                         train_full=False,
                         record_id=-1):
        '''Retrain from log file.

        Args:
            time_budget: A float number of the time budget in seconds.
            log_file_name: A string of the log file name.
            X_train: A numpy array of training data in shape n*m.
            y_train: A numpy array of labels in shape n*1.
            task: A string of the task type, e.g.,
                'classification', 'regression'.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            n_jobs: An integer of the number of threads for training.
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget.
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored.
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1`, which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
        '''
        self.task = task
        self._validate_data(X_train, y_train, dataframe, label)

        logger.info('log file name {}'.format(log_file_name))

        best_config = None
        best_val_loss = float('+inf')
        best_estimator = None
        sample_size = None
        time_used = 0.0
        training_duration = 0
        best = None
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.total_search_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            size = record.sample_size
                            if size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            size = record.sample_size
                            best_val_loss = val_loss
                            sample_size = size
                if not training_duration:
                    from .model import BaseEstimator
                    self._trained_estimator = BaseEstimator()
                    self._trained_estimator.model = None
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self.y_train_all) if train_full \
            else best.sample_size

        logger.info(
            'estimator = {}, config = {}, #training instances = {}'.format(
                best_estimator, best_config, sample_size))
        # Partially copied from the fit() function.
        # Initialize some attributes required for retrain_from_log.
        np.random.seed(0)
        self.task = task
        if self.task == 'classification':
            self.task = get_classification_objective(
                len(np.unique(self.y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self.split_type = split_type
        else:
            self.split_type = "uniform"
        if record_id >= 0:
            eval_method = 'cv'
        elif eval_method == 'auto':
            eval_method = self._decide_eval_method(time_budget)
        self.modelcount = 0
        self._prepare_data(eval_method, split_ratio, n_splits)
        self.time_budget = None
        self.n_jobs = n_jobs
        self._trained_estimator = self._train_with_config(
            best_estimator, best_config, sample_size)[0]
        return training_duration

    def _decide_eval_method(self, time_budget):
        if self.X_val is not None:
            return 'holdout'
        nrow, dim = self.nrow, self.ndim
        if nrow * dim / 0.9 < SMALL_LARGE_THRES * (
                time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD:
            # time allows or sampling can be used and cv is necessary
            return 'cv'
        else:
            return 'holdout'

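    # Worked example for _decide_eval_method above: with nrow=10000, dim=10,
    # and time_budget=60, nrow * dim / 0.9 ~= 111,111 is below
    # SMALL_LARGE_THRES * (60 / 3600) ~= 166,667 and nrow < CV_HOLDOUT_THRESHOLD
    # (100,000), so 'cv' is chosen; larger data or a smaller budget yields 'holdout'.
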
    def fit(self,
            X_train=None,
            y_train=None,
            dataframe=None,
            label=None,
            metric='auto',
            task='classification',
            n_jobs=-1,
            log_file_name='default.log',
            estimator_list='auto',
            time_budget=60,
            max_iter=1000000,
            sample=True,
            ensemble=False,
            eval_method='auto',
            log_type='better',
            model_history=False,
            split_ratio=SPLIT_RATIO,
            n_splits=N_SPLITS,
            log_training_metric=False,
            mem_thres=MEM_THRES,
            X_val=None,
            y_val=None,
            retrain_full=True,
            split_type="stratified",
            learner_selector='sample',
            ):
        '''Find a model for a given task.

        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape n*m.
            y_train: A numpy array or a pandas series of labels in shape n*1.
            dataframe: A dataframe of training data including the label column.
            label: A str of the label column name.
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                if not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'f1', 'log_loss', 'mae', 'mse', 'r2'.
                If passing a customized metric function, the function needs to
                have the following signature

                def metric(X_test, y_test, estimator, labels, X_train, y_train):
                    return metric_to_minimize, metrics_to_log

                which returns a float number as the minimization objective,
                and a tuple of floats as the metrics to log.
            task: A string of the task type, e.g.,
                'classification', 'regression'.
            n_jobs: An integer of the number of threads for training.
            log_file_name: A string of the log file name.
            estimator_list: A list of strings for estimator names, or 'auto',
                e.g., ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'].
            time_budget: A float number of the time budget in seconds.
            max_iter: An integer of the maximal number of iterations.
            sample: A boolean of whether to sample the training data during
                search.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            log_type: A string of the log type, one of ['better', 'all', 'new'];
                'better' only logs configs with better loss than previous iters,
                'all' logs all the tried configs,
                'new' only logs non-redundant configs.
            model_history: A boolean of whether to keep the history of best
                models in the history property. Make sure memory is large
                enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes.
            X_val: None | a numpy array or a pandas dataframe of validation data.
            y_val: None | a numpy array or a pandas series of validation labels.
        '''
        self.task = task
        self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
        self.start_time_flag = time.time()
        np.random.seed(0)
        self.learner_selector = learner_selector

        if self.task == 'classification':
            self.task = get_classification_objective(
                len(np.unique(self.y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self.split_type = split_type
        else:
            self.split_type = "uniform"

        if 'auto' == estimator_list:
            estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
            if 'regression' != self.task:
                estimator_list += ['lrl1', ]
        logger.info(
            "List of ML learners in AutoML Run: {}".format(estimator_list))

        if eval_method == 'auto' or self.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self.eval_method = eval_method
        logger.info("Evaluation method: {}".format(eval_method))

        self.retrain_full = retrain_full and (eval_method == 'holdout'
                                              and self.X_val is None)
        self.sample = sample and (eval_method != 'cv')
        if 'auto' == metric:
            if 'binary' in task:
                metric = 'roc_auc'
            elif 'multi' in task:
                metric = 'log_loss'
            else:
                metric = 'r2'
        if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']:
            error_metric = f"1-{metric}"
        elif isinstance(metric, str):
            error_metric = metric
        else:
            error_metric = 'customized metric'
        logger.info(f'Minimizing error metric: {error_metric}')

        with training_log_writer(log_file_name) as save_helper:
            self.save_helper = save_helper
            self._prepare_data(eval_method, split_ratio, n_splits)
            self._compute_with_config = partial(AutoML._compute_with_config_base,
                                                self,
                                                metric,
                                                log_training_metric)
            self.time_budget = time_budget
            self.estimator_list = estimator_list
            self.ensemble = ensemble
            self.max_iter = max_iter
            self.mem_thres = mem_thres
            self.log_type = log_type
            self.split_ratio = split_ratio
            self.save_model_history = model_history
            self.n_jobs = n_jobs
            self.search()
            logger.info("fit succeeded")

    def search(self):
        self.searchers = {}
        # initialize the searchers
        self.eti = []
        self._best_loss = float('+inf')
        self.best_train_time = 0
        self.time_from_start = 0
        self.estimator_index = -1
        self._best_iteration = 0
        self._model_history = {}
        self._config_history = {}
        self.max_iter_per_learner = 10000  # TODO
        self.iter_per_learner = dict([(e, 0) for e in self.estimator_list])
        self.fullsize = False
        self._trained_estimator = None
        if self.ensemble:
            self.best_model = {}
        for self._track_iter in range(self.max_iter):
            if self.estimator_index == -1:
                estimator = self.estimator_list[0]
            else:
                estimator = self._select_estimator(self.estimator_list)
                if not estimator:
                    break
            logger.info(f"iteration {self._track_iter}"
                        f" current learner {estimator}")
            if estimator in self.searchers:
                model = self.searchers[estimator].trained_estimator
                improved = self.searchers[estimator].search1step(
                    global_best_loss=self._best_loss,
                    retrain_full=self.retrain_full,
                    mem_thres=self.mem_thres)
            else:
                model = improved = None
                self.searchers[estimator] = ParamSearch(
                    estimator,
                    self.data_size,
                    self._compute_with_config,
                    self._train_with_config,
                    self.save_helper,
                    MIN_SAMPLE_TRAIN if self.sample else self.data_size,
                    self.task,
                    self.log_type,
                    self._config_space_info.get(estimator),
                    self._custom_size_estimate.get(estimator),
                    self.split_ratio)
                self.searchers[estimator].search_begin(self.time_budget,
                                                       self.start_time_flag)
                if self.estimator_index == -1:
                    eti_base = self._eti_ini[estimator]
                    self.eti.append(
                        self.searchers[estimator]
                        .expected_time_improvement_search())
                    for e in self.estimator_list[1:]:
                        self.eti.append(
                            self._eti_ini[e] / eti_base * self.eti[0])
                    self.estimator_index = 0
            self.time_from_start = time.time() - self.start_time_flag
            # logger.info(f"{self.searchers[estimator].sample_size}, {data_size}")
            if self.searchers[estimator].sample_size == self.data_size:
                self.iter_per_learner[estimator] += 1
                if not self.fullsize:
                    self.fullsize = True
            if self.searchers[estimator].best_loss < self._best_loss:
                self._best_loss = self.searchers[estimator].best_loss
                self._best_estimator = estimator
                self.best_train_time = self.searchers[estimator].train_time
                self._config_history[self._track_iter] = (
                    estimator,
                    self.searchers[estimator].best_config[0],
                    self.time_from_start)
                if self.save_model_history:
                    self._model_history[self._track_iter] = self.searchers[
                        estimator].trained_estimator.model
                elif self._trained_estimator:
                    del self._trained_estimator
                    self._trained_estimator = None
                self._trained_estimator = self.searchers[
                    estimator].trained_estimator
                self._best_iteration = self._track_iter
            if model and improved and not self.save_model_history:
                model.cleanup()

            logger.info(
                " at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
                    self.time_from_start,
                    estimator,
                    self.searchers[estimator].best_loss,
                    self._best_estimator,
                    self._best_loss))

            if self.time_from_start >= self.time_budget:
                break
            if self.ensemble:
                time_left = self.time_budget - self.time_from_start
                time_ensemble = self.searchers[self._best_estimator].train_time
                if time_left < time_ensemble < 2 * time_left:
                    break
            if self.searchers[
                    estimator].train_time > self.time_budget - self.time_from_start:
                self.iter_per_learner[estimator] = self.max_iter_per_learner

        # Add a checkpoint for the current best config to the log.
        self.save_helper.checkpoint()

        if self.searchers:
            self._selected = self.searchers[self._best_estimator]
            self._trained_estimator = self._selected.trained_estimator
            self.modelcount = sum(self.searchers[estimator].model_count
                                  for estimator in self.searchers)
            logger.info(self._trained_estimator.model)
            if self.ensemble:
                searchers = list(self.searchers.items())
                searchers.sort(key=lambda x: x[1].best_loss)
                estimators = [(x[0], x[1].trained_estimator) for x in searchers[
                    :2]]
                estimators += [(x[0], x[1].trained_estimator) for x in searchers[
                    2:] if x[1].best_loss < 4 * self._selected.best_loss]
                logger.info(estimators)
                if self.task != "regression":
                    from sklearn.ensemble import StackingClassifier as Stacker
                    for e in estimators:
                        e[1]._estimator_type = 'classifier'
                else:
                    from sklearn.ensemble import StackingRegressor as Stacker
                best_m = self._trained_estimator
                stacker = Stacker(estimators, best_m, n_jobs=self.n_jobs,
                                  passthrough=True)
                stacker.fit(self.X_train_all, self.y_train_all)
                self._trained_estimator = stacker
                self._trained_estimator.model = stacker
        else:
            self._selected = self._trained_estimator = None
            self.modelcount = 0

    def __del__(self):
        if hasattr(self, '_trained_estimator') and self._trained_estimator \
                and hasattr(self._trained_estimator, 'cleanup'):
            self._trained_estimator.cleanup()
            del self._trained_estimator

    def _select_estimator(self, estimator_list):
        time_left = self.time_budget - self.time_from_start
        if self.best_train_time < time_left < 2 * self.best_train_time:
            best_searcher = self.searchers[self._best_estimator]
            config_sig = best_searcher.get_hist_config_sig(
                best_searcher.sample_size_full,
                best_searcher.best_config[0])
            if config_sig not in best_searcher.config_tried:
                # train the best config on the full data
                return self._best_estimator
        if self.learner_selector == 'roundrobin':
            self.estimator_index += 1
            if self.estimator_index == len(estimator_list):
                self.estimator_index = 0
            return estimator_list[self.estimator_index]
        min_expected_time, selected = np.inf, None
        inv = []
        for i, estimator in enumerate(estimator_list):
            if estimator in self.searchers:
                searcher = self.searchers[estimator]
                if self.iter_per_learner[estimator] >= self.max_iter_per_learner:
                    inv.append(0)
                    continue
                eti_searcher = min(2 * searcher.train_time,
                                   searcher.expected_time_improvement_search())
                gap = searcher.best_loss - self._best_loss
                if gap > 0 and not self.ensemble:
                    delta_loss = searcher.old_loss - searcher.new_loss
                    delta_time = searcher.old_loss_time + \
                        searcher.new_loss_time - searcher.old_train_time
                    speed = delta_loss / float(delta_time)
                    try:
                        expected_time = max(gap / speed, searcher.train_time)
                    except ZeroDivisionError:
                        warnings.warn("ZeroDivisionError: need to debug, "
                                      "speed: {0}, "
                                      "old_loss: {1}, "
                                      "new_loss: {2}"
                                      .format(speed,
                                              searcher.old_loss,
                                              searcher.new_loss))
                        expected_time = 0.0
                    expected_time = 2 * max(expected_time, eti_searcher)
                else:
                    expected_time = eti_searcher
                if expected_time == 0:
                    expected_time = 1e-10
                inv.append(1 / expected_time)
            else:
                expected_time = self.eti[i]
                inv.append(0)
            if expected_time < min_expected_time:
                min_expected_time = expected_time
                selected = estimator
        if len(self.searchers) < len(estimator_list) or not selected:
            if selected not in self.searchers:
                # print('select', selected, 'eti', min_expected_time)
                return selected
        s = sum(inv)
        p = np.random.random()
        q = 0
        for i in range(len(inv)):
            if inv[i]:
                q += inv[i] / s
                if p < q:
                    return estimator_list[i]
flaml/config.py
@@ -0,0 +1,31 @@
'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

N_SPLITS = 5
RANDOM_SEED = 1
SPLIT_RATIO = 0.1
HISTORY_SIZE = 10000000
MEM_THRES = 4 * (1024 ** 3)
SMALL_LARGE_THRES = 10000000
MIN_SAMPLE_TRAIN = 10000
MIN_SAMPLE_VAL = 10000
CV_HOLDOUT_THRESHOLD = 100000

BASE_Const = 2
BASE_LOWER_BOUND = 2 ** (0.01)

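# Initial relative training cost of each learner with respect to lgbm
# (cf. the cost_relative2lgbm argument of AutoML.add_learner).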
ETI_INI = {
    'lgbm': 1,
    'xgboost': 1.6,
    'xgboost_nb': 1.6,
    'rf': 2,
    'lrl1': 160,
    'lrl2': 25,
    'linear_svc': 16,
    'kneighbor': 30,
    'catboost': 15,
    'extra_tree': 1.9,
}
flaml/data.py
@@ -0,0 +1,256 @@
'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

import numpy as np
from scipy.sparse import vstack, issparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from .training_log import training_log_reader


def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
    '''Load dataset from OpenML.

    If the file is not cached locally, download it from OpenML.

    Args:
        dataset_id: An integer of the dataset id in openml.
        data_dir: A string of the path to store and load the data.
        random_state: An integer of the random seed for splitting data.

    Returns:
        X_train: A 2d numpy array of training data.
        X_test: A 2d numpy array of test data.
        y_train: A 1d numpy array of labels for training data.
        y_test: A 1d numpy array of labels for test data.
    '''
    import os
    import openml
    import pickle
    from sklearn.model_selection import train_test_split

    filename = 'openml_ds' + str(dataset_id) + '.pkl'
    filepath = os.path.join(data_dir, filename)
    if os.path.isfile(filepath):
        print('load dataset from', filepath)
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
    else:
        print('download dataset from openml')
        dataset = openml.datasets.get_dataset(dataset_id)
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        with open(filepath, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    print('Dataset name:', dataset.name)
    X, y, *__ = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format='array')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    print(
        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
        )
    )
    return X_train, X_test, y_train, y_test

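# A usage sketch (the dataset id and directory are hypothetical):
#     X_train, X_test, y_train, y_test = load_openml_dataset(
#         dataset_id=1169, data_dir='./data/')
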
def load_openml_task(task_id, data_dir):
|
||||
'''Load task from open ML.
|
||||
|
||||
Use the first fold of the task.
|
||||
If the file is not cached locally, download it from open ML.
|
||||
|
||||
Args:
|
||||
task_id: An integer of the task id in openml
|
||||
data_dir: A string of the path to store and load the data
|
||||
|
||||
Returns:
|
||||
X_train: A 2d numpy array of training data
|
||||
X_test: A 2d numpy array of test data
|
||||
y_train: A 1d numpy arrya of labels for training data
|
||||
y_test: A 1d numpy arrya of labels for test data
|
||||
'''
|
||||
import os
|
||||
import openml
|
||||
import pickle
|
||||
task = openml.tasks.get_task(task_id)
|
||||
filename = 'openml_task' + str(task_id) + '.pkl'
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
if os.path.isfile(filepath):
|
||||
print('load dataset from', filepath)
|
||||
with open(filepath, 'rb') as f:
|
||||
dataset = pickle.load(f)
|
||||
else:
|
||||
print('download dataset from openml')
|
||||
dataset = task.get_dataset()
|
||||
with open(filepath, 'wb') as f:
|
||||
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
|
||||
X, y, _, _ = dataset.get_data(task.target_name, dataset_format='array')
|
||||
train_indices, test_indices = task.get_train_test_split_indices(
|
||||
repeat=0,
|
||||
fold=0,
|
||||
sample=0,
|
||||
)
|
||||
X_train = X[train_indices]
|
||||
y_train = y[train_indices]
|
||||
X_test = X[test_indices]
|
||||
y_test = y[test_indices]
|
||||
print(
|
||||
'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
|
||||
X_train.shape, y_train.shape, X_test.shape, y_test.shape,
|
||||
)
|
||||
)
|
||||
return X_train, X_test, y_train, y_test
|
||||
|
||||
|
||||
def get_output_from_log(filename, time_budget):
|
||||
'''Get output from log file
|
||||
|
||||
Args:
|
||||
filename: A string of the log file name
|
||||
time_budget: A float of the time budget in seconds
|
||||
|
||||
Returns:
|
||||
training_time_list: A list of the finished time of each logged iter
|
||||
best_error_list:
|
||||
A list of the best validation error after each logged iter
|
||||
error_list: A list of the validation error of each logged iter
|
||||
config_list:
|
||||
A list of the estimator, sample size and config of each logged iter
|
||||
logged_metric_list: A list of the logged metric of each logged iter
|
||||
'''
|
||||
import ast
|
||||
|
||||
best_config = None
|
||||
best_learner = None
|
||||
best_val_loss = float('+inf')
|
||||
training_duration = 0.0
|
||||
|
||||
training_time_list = []
|
||||
config_list = []
|
||||
best_error_list = []
|
||||
error_list = []
|
||||
logged_metric_list = []
|
||||
best_config_list = []
|
||||
with training_log_reader(filename) as reader:
|
||||
for record in reader.records():
|
||||
time_used = record.total_search_time
|
||||
training_duration = time_used
|
||||
val_loss = record.validation_loss
|
||||
config = record.config
|
||||
learner = record.learner.split('_')[0]
|
||||
sample_size = record.sample_size
|
||||
train_loss = record.logged_metric
|
||||
|
||||
if time_used < time_budget:
|
||||
if val_loss < best_val_loss:
|
||||
best_val_loss = val_loss
|
||||
best_config = config
|
||||
best_learner = learner
|
||||
best_config_list.append(best_config)
|
||||
training_time_list.append(training_duration)
|
||||
best_error_list.append(best_val_loss)
|
||||
logged_metric_list.append(train_loss)
|
||||
error_list.append(val_loss)
|
||||
config_list.append({"Current Learner": learner,
|
||||
"Current Sample": sample_size,
|
||||
"Current Hyper-parameters": record.config,
|
||||
"Best Learner": best_learner,
|
||||
"Best Hyper-parameters": best_config})
|
||||
|
||||
return (training_time_list, best_error_list, error_list, config_list,
|
||||
logged_metric_list)
|
||||
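# Illustrative usage only: a minimal sketch of plotting the best validation
# error over time from a training log, assuming matplotlib is installed and
# 'flaml.log' was produced by an earlier run with a 60s budget.
def _example_plot_learning_curve():
    import matplotlib.pyplot as plt
    time_list, best_error_list, _, _, _ = get_output_from_log(
        filename='flaml.log', time_budget=60)
    plt.step(time_list, best_error_list, where='post')
    plt.xlabel('wall clock time (s)')
    plt.ylabel('best validation error')
    plt.show()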
|
||||
|
||||
def concat(X1, X2):
|
||||
'''concatenate two matrices vertically
|
||||
'''
|
||||
if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
|
||||
if isinstance(X1, pd.DataFrame):
|
||||
cat_columns = X1.select_dtypes(
|
||||
include='category').columns
|
||||
df = pd.concat([X1, X2], sort=False)
|
||||
df.reset_index(drop=True, inplace=True)
|
||||
if isinstance(X1, pd.DataFrame) and len(cat_columns):
|
||||
df[cat_columns] = df[cat_columns].astype('category')
|
||||
return df
|
||||
if issparse(X1):
|
||||
return vstack((X1, X2))
|
||||
else:
|
||||
return np.concatenate([X1, X2])
|
||||
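# Illustrative usage only: concat() dispatches on the input type; a sketch
# with plain numpy arrays and with scipy sparse matrices.
def _example_concat():
    import numpy as np
    from scipy.sparse import csr_matrix
    dense = concat(np.ones((2, 3)), np.zeros((1, 3)))        # (3, 3) ndarray
    sparse = concat(csr_matrix((2, 3)), csr_matrix((1, 3)))  # (3, 3) sparse
    return dense, sparse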
|
||||
|
||||
class DataTransformer:
|
||||
'''transform X, y
|
||||
'''
|
||||
|
||||
def fit_transform(self, X, y, objective):
|
||||
if isinstance(X, pd.DataFrame):
|
||||
X = X.copy()
|
||||
n = X.shape[0]
|
||||
cat_columns, num_columns = [], []
|
||||
for column in X.columns:
|
||||
if X[column].dtype.name in ('object', 'category'):
|
||||
if X[column].nunique() == 1 or X[column].nunique(
|
||||
dropna=True) == n - X[column].isnull().sum():
|
||||
X.drop(columns=column, inplace=True)
|
||||
elif X[column].dtype.name == 'category':
|
||||
current_categories = X[column].cat.categories
|
||||
if '__NAN__' not in current_categories:
|
||||
X[column] = X[column].cat.add_categories(
|
||||
'__NAN__').fillna('__NAN__')
|
||||
cat_columns.append(column)
|
||||
else:
|
||||
X[column].fillna('__NAN__', inplace=True)
|
||||
cat_columns.append(column)
|
||||
else:
|
||||
# print(X[column].dtype.name)
|
||||
if X[column].nunique(dropna=True) < 2:
|
||||
X.drop(columns=column, inplace=True)
|
||||
else:
|
||||
X[column].fillna(np.nan, inplace=True)
|
||||
num_columns.append(column)
|
||||
X = X[cat_columns + num_columns]
|
||||
if cat_columns:
|
||||
X[cat_columns] = X[cat_columns].astype('category')
|
||||
if num_columns:
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.compose import ColumnTransformer
|
||||
self.transformer = ColumnTransformer([(
|
||||
'continuous',
|
||||
SimpleImputer(missing_values=np.nan, strategy='median'),
|
||||
num_columns)])
|
||||
X[num_columns] = self.transformer.fit_transform(X)
|
||||
self.cat_columns, self.num_columns = cat_columns, num_columns
|
||||
|
||||
if objective == 'regression':
|
||||
self.label_transformer = None
|
||||
else:
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
self.label_transformer = LabelEncoder()
|
||||
y = self.label_transformer.fit_transform(y)
|
||||
return X, y
|
||||
|
||||
def transform(self, X):
|
||||
if isinstance(X, pd.DataFrame):
|
||||
cat_columns, num_columns = self.cat_columns, self.num_columns
|
||||
X = X[cat_columns + num_columns].copy()
|
||||
for column in cat_columns:
|
||||
# print(column, X[column].dtype.name)
|
||||
if X[column].dtype.name == 'object':
|
||||
X[column].fillna('__NAN__', inplace=True)
|
||||
elif X[column].dtype.name == 'category':
|
||||
current_categories = X[column].cat.categories
|
||||
if '__NAN__' not in current_categories:
|
||||
X[column] = X[column].cat.add_categories(
|
||||
'__NAN__').fillna('__NAN__')
|
||||
if cat_columns:
|
||||
X[cat_columns] = X[cat_columns].astype('category')
|
||||
if num_columns:
|
||||
X[num_columns].fillna(np.nan, inplace=True)
|
||||
X[num_columns] = self.transformer.transform(X)
|
||||
return X
|
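# Illustrative usage only: a minimal sketch of the fit/transform contract of
# DataTransformer. The column names and values below are made up.
def _example_data_transformer():
    dt = DataTransformer()
    X = pd.DataFrame({'color': ['red', 'blue', 'red', None],
                      'size': [1.0, None, 3.0, 4.0]})
    y = pd.Series(['a', 'b', 'a', 'b'])
    X1, y1 = dt.fit_transform(X, y, objective='binary:logistic')
    # the same encoding and imputation are replayed on new data
    X2 = dt.transform(X.head(2))
    return X1, y1, X2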
|
@ -0,0 +1,241 @@
|
|||
'''!
|
||||
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License.
|
||||
'''
|
||||
|
||||
from .model import *
|
||||
import time
|
||||
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
|
||||
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
|
||||
f1_score
|
||||
import numpy as np
|
||||
from sklearn.model_selection import RepeatedStratifiedKFold
|
||||
|
||||
|
||||
def get_estimator_class(objective_name, estimator_name):
|
||||
''' when adding a new learner, need to add an elif branch '''
|
||||
|
||||
|
||||
if 'xgboost' in estimator_name:
|
||||
if 'regression' in objective_name:
|
||||
estimator_class = XGBoostEstimator
|
||||
else:
|
||||
estimator_class = XGBoostSklearnEstimator
|
||||
elif 'rf' in estimator_name:
|
||||
estimator_class = RandomForestEstimator
|
||||
elif 'lgbm' in estimator_name:
|
||||
estimator_class = LGBMEstimator
|
||||
elif 'lrl1' in estimator_name:
|
||||
estimator_class = LRL1Classifier
|
||||
elif 'lrl2' in estimator_name:
|
||||
estimator_class = LRL2Classifier
|
||||
elif 'catboost' in estimator_name:
|
||||
estimator_class = CatBoostEstimator
|
||||
elif 'extra_tree' in estimator_name:
|
||||
estimator_class = ExtraTreeEstimator
|
||||
elif 'kneighbor' in estimator_name:
|
||||
estimator_class = KNeighborsEstimator
|
||||
else:
|
||||
raise ValueError(estimator_name + ' is not a built-in learner. '
|
||||
'Please use AutoML.add_learner() to add a customized learner.')
|
||||
return estimator_class
|
||||
|
||||
|
||||
def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None):
|
||||
'''Loss using the specified metric
|
||||
|
||||
Args:
|
||||
metric_name: A string of the metric name, one of
|
||||
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss',
|
||||
'f1', 'ap'
|
||||
y_predict: A 1d or 2d numpy array of the predictions which can be
|
||||
used to calculate the metric. E.g., 2d for log_loss and 1d
|
||||
for others.
|
||||
y_true: A 1d numpy array of the true labels
|
||||
labels: A 1d numpy array of the unique labels
|
||||
|
||||
Returns:
|
||||
score: A float number of the loss, the lower the better
|
||||
'''
|
||||
metric_name = metric_name.lower()
|
||||
if 'r2' in metric_name:
|
||||
score = 1.0 - r2_score(y_true, y_predict)
|
||||
elif metric_name == 'rmse':
|
||||
score = np.sqrt(mean_squared_error(y_true, y_predict))
|
||||
elif metric_name == 'mae':
|
||||
score = mean_absolute_error(y_true, y_predict)
|
||||
elif metric_name == 'mse':
|
||||
score = mean_squared_error(y_true, y_predict)
|
||||
elif metric_name == 'accuracy':
|
||||
score = 1.0 - accuracy_score(y_true, y_predict)
|
||||
elif 'roc_auc' in metric_name:
|
||||
score = 1.0 - roc_auc_score(y_true, y_predict)
|
||||
elif 'log_loss' in metric_name:
|
||||
score = log_loss(y_true, y_predict, labels=labels)
|
||||
elif 'f1' in metric_name:
|
||||
score = 1 - f1_score(y_true, y_predict)
|
||||
elif 'ap' in metric_name:
|
||||
score = 1 - average_precision_score(y_true, y_predict)
|
||||
else:
|
||||
raise ValueError(metric_name + ' is not a built-in metric; '
|
||||
'currently built-in metrics are: '
|
||||
'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. '
|
||||
'Please pass a customized metric function to AutoML.fit(metric=func).')
|
||||
return score
|
||||
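# Illustrative usage only: all built-in metrics are expressed as losses
# (lower is better), e.g. 'accuracy' returns 1 - accuracy.
def _example_metric_loss():
    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array([0, 1, 0, 0])
    loss = sklearn_metric_loss_score('accuracy', y_pred, y_true)
    assert abs(loss - 0.25) < 1e-9  # 3 of 4 correct -> loss 0.25
    return loss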
|
||||
|
||||
def get_y_pred(estimator, X, eval_metric, obj):
|
||||
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
|
||||
y_pred_classes = estimator.predict_proba(X)
|
||||
y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
|
||||
elif eval_metric in ['log_loss', 'roc_auc']:
|
||||
y_pred = estimator.predict_proba(X)
|
||||
else:
|
||||
y_pred = estimator.predict(X)
|
||||
return y_pred
|
||||
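# Illustrative usage only: for 'roc_auc'/'ap' on binary tasks the positive-
# class probability is used; a sketch with a plain sklearn classifier.
def _example_get_y_pred():
    from sklearn.linear_model import LogisticRegression
    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    clf = LogisticRegression().fit(X, y)
    scores = get_y_pred(clf, X, eval_metric='roc_auc', obj='binary:logistic')
    assert scores.ndim == 1  # probabilities of the positive class only
    return scores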
|
||||
|
||||
def get_test_loss(estimator, X_train, y_train, X_test, y_test, eval_metric, obj,
|
||||
labels=None, budget=None, train_loss=False):
|
||||
start = time.time()
|
||||
train_time = estimator.fit(X_train, y_train, budget)
|
||||
if isinstance(eval_metric, str):
|
||||
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
|
||||
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
|
||||
labels)
|
||||
if train_loss is not False:
|
||||
test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
|
||||
train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y,
|
||||
y_train, labels)
|
||||
else: # customized metric function
|
||||
test_loss, train_loss = eval_metric(
|
||||
X_test, y_test, estimator, labels, X_train, y_train)
|
||||
train_time = time.time()-start
|
||||
return test_loss, train_time, train_loss
|
||||
|
||||
|
||||
def train_model(estimator, X_train, y_train, budget):
|
||||
train_time = estimator.fit(X_train, y_train, budget)
|
||||
return train_time
|
||||
|
||||
|
||||
def evaluate_model(estimator, X_train, y_train, X_val, y_val, budget, kf,
|
||||
objective_name, eval_method, eval_metric, best_val_loss, train_loss=False):
|
||||
if 'holdout' in eval_method:
|
||||
val_loss, train_loss, train_time = evaluate_model_holdout(
|
||||
estimator, X_train, y_train, X_val, y_val, budget,
|
||||
objective_name, eval_metric, best_val_loss, train_loss=train_loss)
|
||||
else:
|
||||
val_loss, train_loss, train_time = evaluate_model_CV(
|
||||
estimator, X_train, y_train, budget, kf, objective_name,
|
||||
eval_metric, best_val_loss, train_loss=train_loss)
|
||||
return val_loss, train_loss, train_time
|
||||
|
||||
|
||||
def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, budget,
|
||||
objective_name, eval_metric, best_val_loss, train_loss=False):
|
||||
val_loss, train_time, train_loss = get_test_loss(
|
||||
estimator, X_train, y_train, X_val, y_val, eval_metric, objective_name,
|
||||
budget=budget, train_loss=train_loss)
|
||||
return val_loss, train_loss, train_time
|
||||
|
||||
|
||||
def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
|
||||
objective_name, eval_metric, best_val_loss, train_loss=False):
|
||||
start_time = time.time()
|
||||
total_val_loss = total_train_loss = 0
|
||||
train_time = 0
|
||||
valid_fold_num = 0
|
||||
n = kf.get_n_splits()
|
||||
X_train_split, y_train_split = X_train_all, y_train_all
|
||||
if objective_name == 'regression':
|
||||
labels = None
|
||||
else:
|
||||
labels = np.unique(y_train_all)
|
||||
|
||||
if isinstance(kf, RepeatedStratifiedKFold):
|
||||
kf = kf.split(X_train_split, y_train_split)
|
||||
else:
|
||||
kf = kf.split(X_train_split)
|
||||
rng = np.random.RandomState(2020)
|
||||
val_loss_list = []
|
||||
budget_per_train = budget / (n+1)
|
||||
for train_index, val_index in kf:
|
||||
train_index = rng.permutation(train_index)
|
||||
if isinstance(X_train_all, pd.DataFrame):
|
||||
X_train, X_val = X_train_split.iloc[
|
||||
train_index], X_train_split.iloc[val_index]
|
||||
else:
|
||||
X_train, X_val = X_train_split[
|
||||
train_index], X_train_split[val_index]
|
||||
if isinstance(y_train_all, pd.Series):
|
||||
y_train, y_val = y_train_split.iloc[
|
||||
train_index], y_train_split.iloc[val_index]
|
||||
else:
|
||||
y_train, y_val = y_train_split[
|
||||
train_index], y_train_split[val_index]
|
||||
estimator.cleanup()
|
||||
val_loss_i, train_time_i, train_loss_i = get_test_loss(
|
||||
estimator, X_train, y_train, X_val, y_val, eval_metric,
|
||||
objective_name, labels, budget_per_train, train_loss=train_loss)
|
||||
valid_fold_num += 1
|
||||
total_val_loss += val_loss_i
|
||||
if train_loss is not False:
|
||||
total_train_loss += train_loss_i
|
||||
train_time += train_time_i
|
||||
if valid_fold_num == n:
|
||||
val_loss_list.append(total_val_loss/valid_fold_num)
|
||||
total_val_loss = valid_fold_num = 0
|
||||
elif time.time() - start_time >= budget:
|
||||
val_loss_list.append(total_val_loss/valid_fold_num)
|
||||
break
|
||||
val_loss = np.max(val_loss_list)
|
||||
if train_loss is not False: train_loss = total_train_loss / n
|
||||
budget -= time.time() - start_time
|
||||
if val_loss < best_val_loss and budget > budget_per_train:
|
||||
estimator.cleanup()
|
||||
train_time_full = estimator.fit(X_train_all, y_train_all, budget)
|
||||
train_time += train_time_full
|
||||
return val_loss, train_loss, train_time
|
||||
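# Illustrative usage only: a minimal sketch of cross-validated evaluation of
# a built-in learner; the KFold splitter and budget values are arbitrary.
def _example_evaluate_cv():
    from sklearn.model_selection import KFold
    rng = np.random.RandomState(0)
    X, y = rng.rand(60, 3), rng.randint(0, 2, 60)
    est = get_estimator_class('binary:logistic', 'lgbm')(
        objective_name='binary:logistic')
    val_loss, train_loss, train_time = evaluate_model_CV(
        est, X, y, budget=10, kf=KFold(n_splits=3),
        objective_name='binary:logistic', eval_metric='log_loss',
        best_val_loss=np.inf)
    return val_loss, train_time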
|
||||
|
||||
def compute_estimator(X_train, y_train, X_val, y_val, budget, kf,
|
||||
config_dic, objective_name, estimator_name, eval_method, eval_metric,
|
||||
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, train_loss=False):
|
||||
start_time = time.time()
|
||||
estimator_class = estimator_class or get_estimator_class(
|
||||
objective_name, estimator_name)
|
||||
estimator = estimator_class(
|
||||
**config_dic, objective_name=objective_name, n_jobs=n_jobs)
|
||||
val_loss, train_loss, train_time = evaluate_model(
|
||||
estimator, X_train, y_train, X_val, y_val, budget, kf, objective_name,
|
||||
eval_method, eval_metric, best_val_loss, train_loss=train_loss)
|
||||
all_time = time.time() - start_time
|
||||
return estimator, val_loss, train_loss, train_time, all_time
|
||||
|
||||
|
||||
def train_estimator(X_train, y_train, config_dic, objective_name,
|
||||
estimator_name, n_jobs=1, estimator_class=None, budget=None):
|
||||
start_time = time.time()
|
||||
estimator_class = estimator_class or get_estimator_class(objective_name,
|
||||
estimator_name)
|
||||
estimator = estimator_class(**config_dic, objective_name=objective_name,
|
||||
n_jobs=n_jobs)
|
||||
if X_train is not None:
|
||||
train_time = train_model(estimator, X_train, y_train, budget)
|
||||
else:
|
||||
estimator = estimator.estimator_class(**estimator.params)
|
||||
train_time = time.time() - start_time
|
||||
return estimator, train_time
|
||||
|
||||
|
||||
def get_classification_objective(num_labels: int) -> str:
|
||||
if num_labels == 2:
|
||||
objective_name = 'binary:logistic'
|
||||
else:
|
||||
objective_name = 'multi:softmax'
|
||||
return objective_name
|
||||
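# Illustrative check only: the objective string is chosen from the number of
# distinct labels.
def _example_objective_name():
    assert get_classification_objective(2) == 'binary:logistic'
    assert get_classification_objective(5) == 'multi:softmax'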
|
||||
|
|
@ -0,0 +1,515 @@
|
|||
'''!
|
||||
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License.
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from xgboost import XGBClassifier, XGBRegressor
|
||||
import time
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from lightgbm import LGBMClassifier, LGBMRegressor
|
||||
import scipy.sparse
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class BaseEstimator:
|
||||
'''The abstract class for all learners
|
||||
|
||||
Typical example:
|
||||
XGBoostEstimator: for regression
|
||||
XGBoostSklearnEstimator: for classification
|
||||
LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
|
||||
for both regression and classification
|
||||
'''
|
||||
|
||||
def __init__(self, objective_name='binary:logistic',
|
||||
**params):
|
||||
'''Constructor
|
||||
|
||||
Args:
|
||||
objective_name: A string of the objective name, one of
|
||||
'binary:logistic', 'multi:softmax', 'regression'
|
||||
n_jobs: An integer of the number of parallel threads
|
||||
params: A dictionary of the hyperparameter names and values
|
||||
'''
|
||||
self.params = params
|
||||
self.estimator_class = None
|
||||
self.objective_name = objective_name
|
||||
if '_estimator_type' in params:
|
||||
self._estimator_type = params['_estimator_type']
|
||||
else:
|
||||
self._estimator_type = "regressor" if objective_name=='regression' \
|
||||
else "classifier"
|
||||
|
||||
def get_params(self, deep=False):
|
||||
params = self.params.copy()
|
||||
params["objective_name"] = self.objective_name
|
||||
if hasattr(self, '_estimator_type'):
|
||||
params['_estimator_type'] = self._estimator_type
|
||||
return params
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self.model.classes_
|
||||
|
||||
def preprocess(self, X):
|
||||
return X
|
||||
|
||||
def _fit(self, X_train, y_train):
|
||||
|
||||
current_time = time.time()
|
||||
X_train = self.preprocess(X_train)
|
||||
model = self.estimator_class(**self.params)
|
||||
model.fit(X_train, y_train)
|
||||
train_time = time.time() - current_time
|
||||
self.model = model
|
||||
return train_time
|
||||
|
||||
def fit(self, X_train, y_train, budget=None):
|
||||
'''Train the model from given training data
|
||||
|
||||
Args:
|
||||
X_train: A numpy array of training data in shape n*m
|
||||
y_train: A numpy array of labels in shape n*1
|
||||
budget: A float of the time budget in seconds
|
||||
|
||||
Returns:
|
||||
train_time: A float of the training time in seconds
|
||||
'''
|
||||
return self._fit(X_train, y_train)
|
||||
|
||||
def predict(self, X_test):
|
||||
'''Predict label from features
|
||||
|
||||
Args:
|
||||
X_test: A numpy array of featurized instances, shape n*m
|
||||
|
||||
Returns:
|
||||
A numpy array of shape n*1.
|
||||
Each element is the label for an instance
|
||||
'''
|
||||
X_test = self.preprocess(X_test)
|
||||
return self.model.predict(X_test)
|
||||
|
||||
def predict_proba(self, X_test):
|
||||
'''Predict the probability of each class from features
|
||||
|
||||
Only works for classification problems
|
||||
|
||||
Args:
|
||||
model: An object of trained model with method predict_proba()
|
||||
X_test: A numpy array of featurized instances, shape n*m
|
||||
|
||||
Returns:
|
||||
A numpy array of shape n*c. c is the # classes
|
||||
Each element at (i,j) is the probability for instance i to be in
|
||||
class j
|
||||
'''
|
||||
if 'regression' in self.objective_name:
|
||||
raise ValueError(
|
||||
'Regression tasks do not support predict_proba')
|
||||
else:
|
||||
X_test = self.preprocess(X_test)
|
||||
return self.model.predict_proba(X_test)
|
||||
|
||||
def cleanup(self): pass
|
||||
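# Illustrative usage only: a minimal sketch of a custom learner built on
# BaseEstimator. The choice of sklearn Ridge and its single hyperparameter
# are made up; the built-in learners below follow the same pattern.
class _ExampleRidgeEstimator(BaseEstimator):

    def __init__(self, objective_name='regression', alpha=1.0, **params):
        super().__init__(objective_name, **params)
        from sklearn.linear_model import Ridge
        self.params = {'alpha': float(alpha)}
        self.estimator_class = Ridge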
|
||||
|
||||
class SKLearnEstimator(BaseEstimator):
|
||||
|
||||
|
||||
def preprocess(self, X):
|
||||
if isinstance(X, pd.DataFrame):
|
||||
X = X.copy()
|
||||
cat_columns = X.select_dtypes(include=['category']).columns
|
||||
X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
|
||||
return X
|
||||
|
||||
|
||||
class LGBMEstimator(BaseEstimator):
|
||||
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1,
|
||||
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
|
||||
colsample_bytree=1.0, log_max_bin=8, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
# Default: 'regression' for LGBMRegressor,
|
||||
# 'binary' or 'multiclass' for LGBMClassifier
|
||||
if 'regression' in objective_name:
|
||||
final_objective_name = 'regression'
|
||||
elif 'binary' in objective_name:
|
||||
final_objective_name = 'binary'
|
||||
elif 'multi' in objective_name:
|
||||
final_objective_name = 'multiclass'
|
||||
else:
|
||||
final_objective_name = 'regression'
|
||||
self.params = {
|
||||
"n_estimators": int(round(n_estimators)),
|
||||
"num_leaves": params[
|
||||
'num_leaves'] if 'num_leaves' in params else int(
|
||||
round(max_leaves)),
|
||||
'objective': params[
|
||||
"objective"] if "objective" in params else final_objective_name,
|
||||
'n_jobs': n_jobs,
|
||||
'learning_rate': float(learning_rate),
|
||||
'reg_alpha': float(reg_alpha),
|
||||
'reg_lambda': float(reg_lambda),
|
||||
'min_child_weight': float(min_child_weight),
|
||||
'colsample_bytree': float(colsample_bytree),
|
||||
'subsample': float(subsample),
|
||||
}
|
||||
self.params['max_bin'] = params.get(
|
||||
'max_bin', (1 << int(round(log_max_bin))) - 1)
|
||||
if 'regression' in objective_name:
|
||||
self.estimator_class = LGBMRegressor
|
||||
else:
|
||||
self.estimator_class = LGBMClassifier
|
||||
self.time_per_iter = None
|
||||
self.train_size = 0
|
||||
|
||||
def preprocess(self, X):
|
||||
if not isinstance(X, pd.DataFrame) and scipy.sparse.issparse(X) \
|
||||
and np.issubdtype(X.dtype, np.integer):
|
||||
X = X.astype(float)
|
||||
return X
|
||||
|
||||
def fit(self, X_train, y_train, budget=None):
|
||||
start_time = time.time()
|
||||
n_iter = self.params["n_estimators"]
|
||||
if (not self.time_per_iter or
|
||||
abs(self.train_size - X_train.shape[0]) > 4) and budget is not None:
|
||||
self.params["n_estimators"] = 1
|
||||
self.t1 = self._fit(X_train, y_train)
|
||||
if self.t1 >= budget:
|
||||
self.params["n_estimators"] = n_iter
|
||||
return self.t1
|
||||
self.params["n_estimators"] = 4
|
||||
self.t2 = self._fit(X_train, y_train)
|
||||
self.time_per_iter = (self.t2 - self.t1) / (
|
||||
self.params["n_estimators"] - 1) if self.t2 > self.t1 \
|
||||
else self.t1 if self.t1 else 0.001
|
||||
self.train_size = X_train.shape[0]
|
||||
if self.t1+self.t2>=budget or n_iter==self.params["n_estimators"]:
|
||||
self.params["n_estimators"] = n_iter
|
||||
return time.time() - start_time
|
||||
if budget is not None:
|
||||
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
|
||||
start_time-self.t1)/self.time_per_iter+1))
|
||||
if self.params["n_estimators"] > 0:
|
||||
self._fit(X_train, y_train)
|
||||
self.params["n_estimators"] = n_iter
|
||||
train_time = time.time() - start_time
|
||||
return train_time
|
||||
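# Illustrative usage only: LGBMEstimator.fit() first times 1-tree and 4-tree
# fits to estimate the per-iteration cost, then caps n_estimators so the
# final fit stays within `budget` seconds. A sketch on synthetic data:
def _example_lgbm_budgeted_fit():
    rng = np.random.RandomState(0)
    X, y = rng.rand(200, 4), rng.randint(0, 2, 200)
    est = LGBMEstimator(objective_name='binary:logistic', n_estimators=100)
    train_time = est.fit(X, y, budget=1.0)  # seconds
    return est.predict(X), train_time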
|
||||
|
||||
class XGBoostEstimator(SKLearnEstimator):
|
||||
''' not using sklearn API, used for regression '''
|
||||
|
||||
|
||||
def __init__(self, objective_name='regression', all_thread=False, n_jobs=1,
|
||||
n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
|
||||
learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
|
||||
colsample_bytree=1.0, tree_method='auto', **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.n_estimators = int(round(n_estimators))
|
||||
self.max_leaves = int(round(max_leaves))
|
||||
self.grids = []
|
||||
self.params = {
|
||||
'max_leaves': int(round(max_leaves)),
|
||||
'max_depth': 0,
|
||||
'grow_policy': params.get('grow_policy', 'lossguide'),
|
||||
'tree_method': tree_method,
|
||||
'verbosity': 0,
|
||||
'nthread': n_jobs,
|
||||
'learning_rate': float(learning_rate),
|
||||
'subsample': float(subsample),
|
||||
'reg_alpha': float(reg_alpha),
|
||||
'reg_lambda': float(reg_lambda),
|
||||
'min_child_weight': float(min_child_weight),
|
||||
'booster': params.get('booster', 'gbtree'),
|
||||
'colsample_bylevel': float(colsample_bylevel),
|
||||
'colsample_bytree': float(colsample_bytree),
|
||||
}
|
||||
if all_thread:
|
||||
del self.params['nthread']
|
||||
|
||||
def get_params(self, deep=False):
|
||||
params = super().get_params()
|
||||
params["n_jobs"] = params['nthread']
|
||||
return params
|
||||
|
||||
def fit(self, X_train, y_train, budget=None):
|
||||
current_time = time.time()
|
||||
if not scipy.sparse.issparse(X_train):
|
||||
self.params['tree_method'] = 'hist'
|
||||
X_train = self.preprocess(X_train)
|
||||
dtrain = xgb.DMatrix(X_train, label=y_train)
|
||||
if self.max_leaves > 0:
|
||||
xgb_model = xgb.train(self.params, dtrain, self.n_estimators)
|
||||
del dtrain
|
||||
train_time = time.time() - current_time
|
||||
self.model = xgb_model
|
||||
return train_time
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict(self, X_test):
|
||||
if not scipy.sparse.issparse(X_test):
|
||||
X_test = self.preprocess(X_test)
|
||||
dtest = xgb.DMatrix(X_test)
|
||||
return super().predict(dtest)
|
||||
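# Illustrative usage only: XGBoostEstimator wraps the native xgb.train API,
# so fit() and predict() build DMatrix objects internally; a sketch on
# synthetic regression data with arbitrary parameter values.
def _example_xgboost_native():
    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 3), rng.rand(100)
    est = XGBoostEstimator(objective_name='regression', n_estimators=8)
    est.fit(X, y)
    return est.predict(X)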
|
||||
|
||||
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
|
||||
''' using sklearn API, used for classification '''
|
||||
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_estimators=4, max_leaves=4, subsample=1.0,
|
||||
min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
|
||||
colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
|
||||
**params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
"n_estimators": int(round(n_estimators)),
|
||||
'max_leaves': int(round(max_leaves)),
|
||||
'max_depth': 0,
|
||||
'grow_policy': params.get('grow_policy', 'lossguide'),
|
||||
'tree_method': tree_method,
|
||||
'verbosity': 0,
|
||||
'n_jobs': n_jobs,
|
||||
'learning_rate': float(learning_rate),
|
||||
'subsample': float(subsample),
|
||||
'reg_alpha': float(reg_alpha),
|
||||
'reg_lambda': float(reg_lambda),
|
||||
'min_child_weight': float(min_child_weight),
|
||||
'booster': params.get('booster', 'gbtree'),
|
||||
'colsample_bylevel': float(colsample_bylevel),
|
||||
'colsample_bytree': float(colsample_bytree),
|
||||
}
|
||||
|
||||
if 'regression' in objective_name:
|
||||
self.estimator_class = XGBRegressor
|
||||
else:
|
||||
self.estimator_class = XGBClassifier
|
||||
self.time_per_iter = None
|
||||
self.train_size = 0
|
||||
|
||||
def fit(self, X_train, y_train, budget=None):
|
||||
if scipy.sparse.issparse(X_train):
|
||||
self.params['tree_method'] = 'auto'
|
||||
return super().fit(X_train, y_train, budget)
|
||||
|
||||
|
||||
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
|
||||
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_estimators=4, max_leaves=4, max_features=1.0,
|
||||
min_samples_split=2, min_samples_leaf=1, criterion=1, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
"n_estimators": int(round(n_estimators)),
|
||||
"n_jobs": n_jobs,
|
||||
'max_features': float(max_features),
|
||||
}
|
||||
if 'regression' in objective_name:
|
||||
self.estimator_class = RandomForestRegressor
|
||||
else:
|
||||
self.estimator_class = RandomForestClassifier
|
||||
self.params['criterion'] = 'entropy' if criterion > 1.5 else 'gini'
|
||||
self.time_per_iter = None
|
||||
self.train_size = 0
|
||||
|
||||
def get_params(self, deep=False):
|
||||
params = super().get_params()
|
||||
params["criterion"] = 1 if params["criterion"]=='gini' else 2
|
||||
return params
|
||||
|
||||
|
||||
class ExtraTreeEstimator(RandomForestEstimator):
|
||||
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_estimators=4, max_leaves=4, max_features=1.0,
|
||||
min_samples_split=2, min_samples_leaf=1, criterion=1, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
"n_estimators": int(round(n_estimators)),
|
||||
"n_jobs": n_jobs,
|
||||
'max_features': float(max_features),
|
||||
}
|
||||
if 'regression' in objective_name:
|
||||
from sklearn.ensemble import ExtraTreesRegressor
|
||||
self.estimator_class = ExtraTreesRegressor
|
||||
else:
|
||||
from sklearn.ensemble import ExtraTreesClassifier
|
||||
self.estimator_class = ExtraTreesClassifier
|
||||
self.params['criterion'] = 'entropy' if criterion > 1.5 else 'gini'
|
||||
self.time_per_iter = None
|
||||
self.train_size = 0
|
||||
|
||||
|
||||
class LRL1Classifier(SKLearnEstimator):
|
||||
|
||||
|
||||
def __init__(self, tol=0.0001, C=1.0,
|
||||
objective_name='binary:logistic', n_jobs=1, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
'penalty': 'l1',
|
||||
'tol': float(tol),
|
||||
'C': float(C),
|
||||
'solver': 'saga',
|
||||
'n_jobs': n_jobs,
|
||||
}
|
||||
if 'regression' in objective_name:
|
||||
self.estimator_class = None
|
||||
print('Does not support regression task')
|
||||
raise NotImplementedError
|
||||
else:
|
||||
self.estimator_class = LogisticRegression
|
||||
|
||||
|
||||
class LRL2Classifier(SKLearnEstimator):
|
||||
|
||||
|
||||
def __init__(self, tol=0.0001, C=1.0,
|
||||
objective_name='binary:logistic', n_jobs=1, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
'penalty': 'l2',
|
||||
'tol': float(tol),
|
||||
'C': float(C),
|
||||
'solver': 'lbfgs',
|
||||
'n_jobs': n_jobs,
|
||||
}
|
||||
if 'regression' in objective_name:
|
||||
self.estimator_class = None
|
||||
print('Does not support regression task')
|
||||
raise NotImplementedError
|
||||
else:
|
||||
self.estimator_class = LogisticRegression
|
||||
|
||||
|
||||
class CatBoostEstimator(BaseEstimator):
|
||||
|
||||
|
||||
time_per_iter = None
|
||||
train_size = 0
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_estimators=8192, exp_max_depth=64, learning_rate=0.1, rounds=4,
|
||||
l2_leaf_reg=3, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
"early_stopping_rounds": int(round(rounds)),
|
||||
"n_estimators": n_estimators,
|
||||
'learning_rate': learning_rate,
|
||||
'thread_count': n_jobs,
|
||||
'verbose': False,
|
||||
'random_seed': params.get('random_seed', 10242048),
|
||||
}
|
||||
# print(n_estimators)
|
||||
if 'regression' in objective_name:
|
||||
from catboost import CatBoostRegressor
|
||||
self.estimator_class = CatBoostRegressor
|
||||
else:
|
||||
from catboost import CatBoostClassifier
|
||||
self.estimator_class = CatBoostClassifier
|
||||
|
||||
def get_params(self, deep=False):
|
||||
params = super().get_params()
|
||||
params['n_jobs'] = params['thread_count']
|
||||
params['rounds'] = params['early_stopping_rounds']
|
||||
return params
|
||||
|
||||
def fit(self, X_train, y_train, budget=None):
|
||||
start_time = time.time()
|
||||
n_iter = self.params["n_estimators"]
|
||||
if isinstance(X_train, pd.DataFrame):
|
||||
cat_features = list(X_train.select_dtypes(
|
||||
include='category').columns)
|
||||
else:
|
||||
cat_features = []
|
||||
if (not CatBoostEstimator.time_per_iter or
|
||||
abs(CatBoostEstimator.train_size - len(y_train)) > 4) and budget:
|
||||
# measure the time per iteration
|
||||
self.params["n_estimators"] = 1
|
||||
CatBoostEstimator.model = self.estimator_class(**self.params)
|
||||
CatBoostEstimator.model.fit(X_train, y_train,
|
||||
cat_features=cat_features)
|
||||
CatBoostEstimator.t1 = time.time() - start_time
|
||||
if CatBoostEstimator.t1 >= budget:
|
||||
self.params["n_estimators"] = n_iter
|
||||
self.model = CatBoostEstimator.model
|
||||
return CatBoostEstimator.t1
|
||||
self.params["n_estimators"] = 4
|
||||
CatBoostEstimator.model = self.estimator_class(**self.params)
|
||||
CatBoostEstimator.model.fit(X_train, y_train,
|
||||
cat_features=cat_features)
|
||||
CatBoostEstimator.time_per_iter = (time.time() - start_time -
|
||||
CatBoostEstimator.t1) / (self.params["n_estimators"] - 1)
|
||||
if CatBoostEstimator.time_per_iter <= 0:
|
||||
CatBoostEstimator.time_per_iter = CatBoostEstimator.t1
|
||||
CatBoostEstimator.train_size = len(y_train)
|
||||
if time.time() - start_time >= budget or n_iter == self.params[
|
||||
"n_estimators"]:
|
||||
self.params["n_estimators"] = n_iter
|
||||
self.model = CatBoostEstimator.model
|
||||
return time.time() - start_time
|
||||
if budget:
|
||||
train_times = 1
|
||||
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
|
||||
start_time-CatBoostEstimator.t1)/train_times/
|
||||
CatBoostEstimator.time_per_iter+1))
|
||||
self.model = CatBoostEstimator.model
|
||||
if self.params["n_estimators"] > 0:
|
||||
n_train = max(int(len(y_train) * 0.9), len(y_train) - 1000)
|
||||
X_tr, y_tr = X_train[:n_train], y_train[:n_train]
|
||||
from catboost import Pool
|
||||
model = self.estimator_class(**self.params)
|
||||
model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool(
|
||||
data=X_train[n_train:], label=y_train[n_train:], cat_features=cat_features))
|
||||
# print(self.params["n_estimators"], model.get_best_iteration())
|
||||
self.model = model
|
||||
self.params["n_estimators"] = n_iter
|
||||
train_time = time.time() - start_time
|
||||
# print(budget, train_time)
|
||||
return train_time
|
||||
|
||||
|
||||
class KNeighborsEstimator(BaseEstimator):
|
||||
|
||||
|
||||
def __init__(self, objective_name='binary:logistic', n_jobs=1,
|
||||
n_neighbors=5, **params):
|
||||
super().__init__(objective_name, **params)
|
||||
self.params = {
|
||||
'n_neighbors': int(round(n_neighbors)),
|
||||
'weights': 'distance',
|
||||
'n_jobs': n_jobs,
|
||||
}
|
||||
if 'regression' in objective_name:
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
self.estimator_class = KNeighborsRegressor
|
||||
else:
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
self.estimator_class = KNeighborsClassifier
|
||||
|
||||
def preprocess(self, X):
|
||||
if isinstance(X, pd.DataFrame):
|
||||
cat_columns = X.select_dtypes(['category']).columns
|
||||
# print(X.dtypes)
|
||||
# print(cat_columns)
|
||||
if X.shape[1] == len(cat_columns):
|
||||
raise ValueError(
|
||||
"kneighbor requires at least one numeric feature")
|
||||
X = X.drop(cat_columns, axis=1)
|
||||
return X
|
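# Illustrative usage only: KNeighborsEstimator drops categorical columns in
# preprocess(), so at least one numeric feature must remain; a sketch.
def _example_kneighbors_preprocess():
    X = pd.DataFrame({'c': pd.Categorical(['a', 'b', 'a']),
                      'x': [0.1, 0.2, 0.3]})
    est = KNeighborsEstimator(objective_name='binary:logistic')
    return est.preprocess(X)  # only the numeric column 'x' remains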
|
@ -0,0 +1,675 @@
|
|||
'''!
|
||||
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License.
|
||||
'''
|
||||
|
||||
from functools import partial
|
||||
from .ml import train_estimator
|
||||
import time
|
||||
import math
|
||||
import numpy as np
|
||||
from .space import config_space, estimator_size, get_config_values, \
|
||||
generate_config_ini, generate_config_max, generate_config_min
|
||||
from .config import SPLIT_RATIO, MIN_SAMPLE_TRAIN, \
|
||||
HISTORY_SIZE, MEM_THRES, BASE_Const, BASE_LOWER_BOUND
|
||||
from random import gauss
|
||||
|
||||
|
||||
def rand_vector_unit_sphere(dims):
|
||||
vec = [gauss(0, 1) for _ in range(dims)]
|
||||
mag = sum(x**2 for x in vec) ** .5
|
||||
return [x / mag for x in vec]
|
||||
|
||||
|
||||
def rand_vector_gaussian(dims):
|
||||
vec = [gauss(0, 1) for _ in range(dims)]
|
||||
return vec
|
||||
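# Illustrative check only: vectors from rand_vector_unit_sphere() have unit
# length, so `base ** component` moves each coordinate by a bounded factor.
def _example_unit_sphere():
    vec = rand_vector_unit_sphere(5)
    norm = sum(x ** 2 for x in vec) ** .5
    assert abs(norm - 1.0) < 1e-9
    return vec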
|
||||
|
||||
class ParamSearch:
|
||||
'''
|
||||
The class for searching hyperparameters for one learner.
|
||||
'''
|
||||
|
||||
def __init__(self, estimator, data_size,
|
||||
compute_with_config, train_with_config, save_info_helper=None,
|
||||
init_sample_size=MIN_SAMPLE_TRAIN, objective_name='regression',
|
||||
log_type='better', config_space_info=None, size_estimator=None,
|
||||
split_ratio=SPLIT_RATIO, base_change='sqrtK', use_dual_dir=True,
|
||||
move_type='geo'):
|
||||
self.log_type = log_type
|
||||
self.base_change = base_change
|
||||
if init_sample_size > data_size:
|
||||
init_sample_size = data_size
|
||||
self.next_sample_size = {}
|
||||
self.prev_sample_size = {}
|
||||
s = init_sample_size
|
||||
self.prev_sample_size[s] = s
|
||||
self.estimator_configspace = config_space_info or config_space(
|
||||
estimator, data_size, objective_name)
|
||||
self.get_size_for_config = size_estimator or (
|
||||
lambda x: estimator_size(x, estimator))
|
||||
config_min_dic_primary, config_min_dic_more, config_min_dic = \
|
||||
generate_config_min(estimator, self.estimator_configspace, None)
|
||||
self.min_config_primary = np.array(
|
||||
list(config_min_dic_primary.values()))
|
||||
self.min_config_more = np.array(list(config_min_dic_more.values()))
|
||||
self.min_config = np.array(list(config_min_dic.values()))
|
||||
# init configurations for different sample size
|
||||
config_init_dic_primary, config_init_dic_more, _, config_type_dic = \
|
||||
generate_config_ini(estimator, self.estimator_configspace)
|
||||
self.init_config_dic_primary = {s: config_init_dic_primary}
|
||||
self.init_config_dic_more = {s: config_init_dic_more}
|
||||
self.init_config_dic_type_dic = {'primary': {
|
||||
s: config_init_dic_primary}, 'more': {s: config_init_dic_more}}
|
||||
self.init_config_dic = {
|
||||
**self.init_config_dic_type_dic['primary'],
|
||||
**self.init_config_dic_type_dic['more']
|
||||
}
|
||||
self.config_type_dic = config_type_dic
|
||||
# max configurations for different sample size
|
||||
config_max_dic_primary, config_max_dic_more, config_max_dic = \
|
||||
generate_config_max(
|
||||
estimator, self.estimator_configspace, int(s))
|
||||
self.max_config_dic_primary = {s: np.array(
|
||||
list(config_max_dic_primary.values()))}
|
||||
self.max_config_dic_more = {s: np.array(
|
||||
list(config_max_dic_more.values()))}
|
||||
self.max_config_dic = {s: np.array(list(config_max_dic.values()))}
|
||||
self.dims = (len(self.min_config_primary), len(self.min_config_more))
|
||||
# print(self.dims)
|
||||
if self.dims[1] > 0 and self.dims[0] > 0:
|
||||
self.base_upper_bound = {
|
||||
s:
|
||||
max(
|
||||
max(
|
||||
(self.max_config_dic_primary[s][i] / self.min_config_primary[i])
|
||||
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
|
||||
),
|
||||
max(
|
||||
(self.max_config_dic_more[s][i] / self.min_config_more[i])
|
||||
** math.sqrt(self.dims[1]) for i in range(self.dims[1]))
|
||||
)
|
||||
}
|
||||
elif self.dims[0] > 0:
|
||||
self.base_upper_bound = {
|
||||
s:
|
||||
max(
|
||||
(self.max_config_dic_primary[s][i] / self.min_config_primary[i])
|
||||
** (math.sqrt(self.dims[0])) for i in range(self.dims[0])
|
||||
)
|
||||
}
|
||||
else:
|
||||
self.base_upper_bound = {
|
||||
s:
|
||||
max(
|
||||
(self.max_config_dic_more[s][i] / self.min_config_more[i])
|
||||
** (math.sqrt(self.dims[1])) for i in range(self.dims[1])
|
||||
)
|
||||
}
|
||||
|
||||
# create sample size sequence
|
||||
while s < data_size:
|
||||
s2 = self.next_sample_size[s] = s * 2 if s * 2 <= data_size else data_size
|
||||
self.prev_sample_size[s2] = s
|
||||
s = s2
|
||||
|
||||
config_max_dic_primary, config_max_dic_more, config_max_dic = \
|
||||
generate_config_max(
|
||||
estimator, self.estimator_configspace, int(s))
|
||||
self.max_config_dic_primary[s] = np.array(
|
||||
list(config_max_dic_primary.values()))
|
||||
self.max_config_dic_more[s] = np.array(
|
||||
list(config_max_dic_more.values()))
|
||||
self.max_config_dic[s] = np.array(list(config_max_dic.values()))
|
||||
if self.dims[1] > 0 and self.dims[0] > 0:
|
||||
self.base_upper_bound[s] = max(
|
||||
max(
|
||||
(self.max_config_dic_primary[s][i]
|
||||
/ self.min_config_primary[i])
|
||||
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
|
||||
),
|
||||
max(
|
||||
(self.max_config_dic_more[s][i]
|
||||
/ self.min_config_more[i])
|
||||
** math.sqrt(self.dims[1]) for i in range(self.dims[1])
|
||||
)
|
||||
)
|
||||
elif self.dims[0] > 0:
|
||||
self.base_upper_bound[s] = max(
|
||||
(self.max_config_dic_primary[s][i]
|
||||
/ self.min_config_primary[i])
|
||||
** math.sqrt(self.dims[0]) for i in range(self.dims[0])
|
||||
)
|
||||
else:
|
||||
self.base_upper_bound[s] = max(
|
||||
(self.max_config_dic_more[s][i] / self.min_config_more[i])
|
||||
** math.sqrt(self.dims[1]) for i in range(self.dims[1])
|
||||
)
|
||||
|
||||
self.init_sample_size = init_sample_size
|
||||
self.data_size = data_size
|
||||
self.sample_size_full = int(self.data_size / (1.0 - split_ratio))
|
||||
|
||||
self.compute_with_config = compute_with_config
|
||||
self.estimator = estimator
|
||||
|
||||
# for logging
|
||||
self.save_helper = save_info_helper
|
||||
self.estimator_type_list = ['primary', 'more']
|
||||
self.dim = self.dims[0] if self.dims[0] > 0 else self.dims[1]
|
||||
self.b = BASE_Const**(math.sqrt(self.dim))
|
||||
self.base_ini = self.b
|
||||
self.total_dim = sum(self.dims)
|
||||
|
||||
self.epo = 2**(self.dim - 1)
|
||||
# keys are [sample size, config], values are (loss, train_time)
|
||||
self.config_tried = {}
|
||||
self.train_with_config = train_with_config
|
||||
|
||||
self.current_config_loss = None
|
||||
self.use_dual_dir = use_dual_dir
|
||||
self.move_type = move_type
|
||||
|
||||
def evaluate_config(self, config, sample_size, move='_pos'):
|
||||
'''
|
||||
evaluate a configuration, update search state,
|
||||
and return whether the state is changed
|
||||
'''
|
||||
if self.time_from_start >= self.time_budget or (move != '_ini' and
|
||||
self.train_time > self.time_budget - self.time_from_start):
|
||||
return False
|
||||
|
||||
model, val_loss, new_train_time, from_history, train_loss = \
|
||||
self.evaluate_proposed_config(config, sample_size, move)
|
||||
# update current config
|
||||
self.update_current_config(config, val_loss, sample_size)
|
||||
# update best model statistics, including statistics about loss and time
|
||||
improved = self.update_search_state_best(
|
||||
config, sample_size, model, val_loss, new_train_time, from_history)
|
||||
self.time_from_start = time.time() - self.start_time
|
||||
if self.save_helper is not None:
|
||||
if from_history:
|
||||
move = move + '_from_hist'
|
||||
self.save_helper.append(self.model_count,
|
||||
train_loss,
|
||||
new_train_time,
|
||||
self.time_from_start,
|
||||
val_loss,
|
||||
config,
|
||||
self.best_loss,
|
||||
self.best_config[0],
|
||||
self.estimator,
|
||||
sample_size)
|
||||
return improved
|
||||
|
||||
def get_hist_config_sig(self, sample_size, config):
|
||||
config_values = get_config_values(config, self.config_type_dic)
|
||||
config_sig = str(sample_size) + '_' + str(config_values)
|
||||
return config_sig
|
||||
|
||||
def evaluate_proposed_config(self, config, sample_size, move):
|
||||
self.model_count += 1
|
||||
config_sig = self.get_hist_config_sig(sample_size, config)
|
||||
d = self.total_dim
|
||||
history_size_per_d = len(self.config_tried) / float(d)
|
||||
if config_sig in self.config_tried:
|
||||
val_loss, new_train_time = self.config_tried[config_sig]
|
||||
# print(config_sig,'found in history')
|
||||
model = train_loss = None
|
||||
from_history = True
|
||||
else:
|
||||
model, val_loss, train_loss, new_train_time, _ = \
|
||||
self.compute_with_config(self.estimator, config, sample_size)
|
||||
from_history = False
|
||||
if history_size_per_d < HISTORY_SIZE:
|
||||
self.config_tried[config_sig] = (val_loss, new_train_time)
|
||||
|
||||
if self.first_move:
|
||||
self.init_config_dic[sample_size] = config
|
||||
move = '_ini'
|
||||
self.base = self.base_ini
|
||||
self.num_noimprovement = 0
|
||||
move = str(self.estimator) + move
|
||||
return model, val_loss, new_train_time, from_history, train_loss
|
||||
|
||||
def update_current_config(self, config, val_loss, sample_size):
|
||||
if self.first_move or val_loss < self.current_config_loss:
|
||||
self.first_move = False
|
||||
# update current config and corresponding sample_size
|
||||
self.sample_size = sample_size
|
||||
self.config = config
|
||||
self.config_primary = {x: config[x]
|
||||
for x in self.config_primary.keys()}
|
||||
try:
|
||||
self.config_more = {x: config[x]
|
||||
for x in self.config_more.keys()}
|
||||
except KeyError:
|
||||
self.config_more = {}
|
||||
self.current_config_loss = val_loss
|
||||
|
||||
def update_reset_best_config_loss(self, sample_size, config, val_loss):
|
||||
if sample_size == self.data_size:
|
||||
if self.best_config_loss_dic_full_reset[1] is None:
|
||||
self.best_config_loss_dic_full_reset = [
|
||||
config, val_loss, self.model_count]
|
||||
else:
|
||||
full_reset_best_loss = self.best_config_loss_dic_full_reset[1]
|
||||
if val_loss < full_reset_best_loss:
|
||||
self.best_config_loss_dic_full_reset = [
|
||||
config, val_loss, self.model_count]
|
||||
|
||||
def update_search_state_best(self, config, sample_size, model, val_loss,
|
||||
new_train_time, from_history):
|
||||
# update the loss statistics for a particular sample size
|
||||
if sample_size not in self.best_config_loss_samplesize_dic:
|
||||
self.best_config_loss_samplesize_dic[sample_size] = [
|
||||
config, val_loss, self.model_count]
|
||||
else:
|
||||
s_best_loss = self.best_config_loss_samplesize_dic[sample_size][1]
|
||||
if val_loss < s_best_loss:
|
||||
self.best_config_loss_samplesize_dic[sample_size] = [
|
||||
config, val_loss, self.model_count]
|
||||
|
||||
self.update_reset_best_config_loss(sample_size, config, val_loss)
|
||||
|
||||
# update best model statistics, including statistics about loss and time
|
||||
if val_loss < self.new_loss:
|
||||
self.old_loss = self.new_loss if self.new_loss < float(
|
||||
'inf') else 2 * val_loss
|
||||
self.new_loss = val_loss
|
||||
self.old_loss_time = self.new_loss_time
|
||||
self.old_train_time = self.train_time
|
||||
self.new_loss_time = self.train_time = new_train_time
|
||||
if val_loss < self.best_loss:
|
||||
self.best_config = [self.config, self.model_count]
|
||||
if not from_history:
|
||||
self.trained_estimator = model
|
||||
# print(model)
|
||||
else:
|
||||
pass  # debug: print(val_loss, self.best_loss)
|
||||
self.best_loss = val_loss
|
||||
self.time_best_found = self.time_from_start
|
||||
return True
|
||||
else:
|
||||
if not from_history:
|
||||
self.new_loss_time += new_train_time
|
||||
return False
|
||||
|
||||
def get_proposal(self, current_config, rand_vector_func, base, move_type):
|
||||
rand_vector = rand_vector_func(len(current_config))
|
||||
rand_vector_neg = [-i for i in rand_vector]
|
||||
|
||||
move_vector = {}
|
||||
move_vector_neg = {}
|
||||
|
||||
index_ = 0
|
||||
for k, v in current_config.items():
|
||||
if 'geo' in move_type:
|
||||
# get the move vector using the proposed random vector
|
||||
move_vector[k] = v * (base**(rand_vector[index_]))
|
||||
move_vector_neg[k] = v * (base**(rand_vector_neg[index_]))
|
||||
else:
|
||||
move_vector[k] = v + (base * (rand_vector[index_]))
|
||||
move_vector_neg[k] = v + (base * (rand_vector_neg[index_]))
|
||||
index_ += 1
|
||||
|
||||
# as long as one of the proposed moves (+ or -) is within the mem_limit,
|
||||
# we will proceed
|
||||
if not self.use_dual_dir:
|
||||
move_vector_neg = None
|
||||
return move_vector, move_vector_neg
|
||||
|
||||
def get_config_from_move_vector(self, v, estimator_type):
|
||||
if v is not None:
|
||||
if 'all' in estimator_type:
|
||||
pass
|
||||
elif 'primary' in estimator_type:
|
||||
v = {**v, **self.config_more}
|
||||
else:
|
||||
v = {**self.config_primary, **v}
|
||||
|
||||
bounded_v = self.get_v_within_min_max(v)
|
||||
else:
|
||||
bounded_v = None
|
||||
return bounded_v
|
||||
|
||||
def dual_direction_sample(self, base, current_search_config,
|
||||
estimator_type='primary', rand_vector_func=rand_vector_unit_sphere,
|
||||
mem_thres=MEM_THRES, move_type='geo'):
|
||||
current_config = current_search_config
|
||||
if len(current_config) == 0:
|
||||
return None, None
|
||||
bounded_v_list = [None, None]
|
||||
while not bounded_v_list[0] and not bounded_v_list[1] \
|
||||
and self.time_from_start < self.time_budget:
|
||||
move_vector, move_vector_neg = self.get_proposal(
|
||||
current_config, rand_vector_func,
|
||||
base, move_type)
|
||||
bounded_v_list = [move_vector, move_vector_neg]
|
||||
for i, v in enumerate(bounded_v_list):
|
||||
bounded_v = self.get_config_from_move_vector(v, estimator_type)
|
||||
proposed_model_size = self.get_size_for_config(bounded_v)
|
||||
proposed_model_size = 0 if not isinstance(
|
||||
proposed_model_size, float) else proposed_model_size
|
||||
if proposed_model_size > mem_thres:
|
||||
# print(bounded_v, proposed_model_size, mem_thres)
|
||||
bounded_v = None
|
||||
bounded_v_list[i] = bounded_v
|
||||
self.time_from_start = time.time() - self.start_time
|
||||
return bounded_v_list
|
||||
|
||||
def get_v_within_min_max(self, v):
|
||||
index_ = 0
|
||||
bounded_v = {}
|
||||
for key, value in v.items():
|
||||
new_value = min(max(
|
||||
value, self.min_config[index_]), self.max_config_dic[
|
||||
self.sample_size][index_])
|
||||
bounded_v[key] = new_value
|
||||
index_ += 1
|
||||
return bounded_v
|
||||
|
||||
def expected_time_improvement_search(self):
|
||||
return max(self.old_loss_time - self.old_train_time + self.train_time,
|
||||
self.new_loss_time)
|
||||
|
||||
def increase_sample_size(self):
|
||||
'''
|
||||
whether it's time to increase sample size
|
||||
'''
|
||||
expected_time_improvement_sample = 2 * self.train_time
|
||||
self.increase = self.sample_size < self.data_size and (
|
||||
self.estimator_type == 0 or self.dims[0] == 0) and (
|
||||
not self.improved
|
||||
or expected_time_improvement_sample
|
||||
< self.expected_time_improvement_search()
|
||||
)
|
||||
return self.increase
|
||||
|
||||
def search_begin(self, time_budget, start_time=None):
|
||||
self.time_budget = time_budget
|
||||
if not start_time:
|
||||
self.start_time = time.time()
|
||||
else:
|
||||
self.start_time = start_time
|
||||
# the time to train the last selected config
|
||||
self.old_train_time = self.train_time = 0
|
||||
self.time_from_start = 0
|
||||
# search states
|
||||
self.first_move = True
|
||||
self.improved = True
|
||||
self.estimator_type = 0 if self.dims[0] > 0 else 1
|
||||
|
||||
self.old_loss = self.new_loss = self.best_loss = float('+inf')
|
||||
# new_loss_time is the time from the beginning of training self.config to
|
||||
# now,
|
||||
# old_loss_time is the time from the beginning of training the old
|
||||
# self.config to the beginning of training self.config
|
||||
self.old_loss_time = self.new_loss_time = 0
|
||||
|
||||
self.trained_estimator = None
|
||||
self.model_count = 0
|
||||
self.K = 0
|
||||
self.old_modelcount = 0
|
||||
|
||||
# self.config has two parts: config_primary contain the configs
|
||||
# that are related with model complexity, config_more contains the
|
||||
# configs that is not related with model complexity
|
||||
self.config_primary = self.init_config_dic_primary[self.init_sample_size]
|
||||
self.config_more = self.init_config_dic_more[self.init_sample_size]
|
||||
self.config = {**self.config_primary, **self.config_more}
|
||||
self.best_config = [None, None]
|
||||
# key: sample size, value: [best_config, best_loss, model_count] under
|
||||
# sample size in the key
|
||||
self.best_config_loss_samplesize_dic = {
|
||||
self.init_sample_size: [self.config, self.old_loss, self.model_count]}
|
||||
# key: sample size, value: [best_config, best_loss, model_count] under
|
||||
# sample size in the key
|
||||
self.best_config_loss_dic_full_reset = [None, None, None]
|
||||
self.sample_size = self.init_sample_size
|
||||
self.base_change_bound = 1
|
||||
self.base_change_count = 0
|
||||
self.evaluate_config(self.config, self.sample_size, '_ini')
|
||||
self.increase = False
|
||||
|
||||
def train_config(self, config, sample_size):
|
||||
'''
|
||||
train a configuration
|
||||
'''
|
||||
# print('Evaluate Config')
|
||||
if self.time_from_start >= self.time_budget:
|
||||
return False
|
||||
config_sig = self.get_hist_config_sig(sample_size, config)
|
||||
if config_sig not in self.config_tried:
|
||||
_, new_train_time = self.train_with_config(
|
||||
self.estimator, config, sample_size)
|
||||
train_loss, val_loss, move = None, self.new_loss, str(
|
||||
self.estimator) + '_trainAll'
|
||||
self.time_from_start = time.time() - self.start_time
|
||||
if self.save_helper is not None:
|
||||
self.save_helper.append(self.model_count,
|
||||
train_loss,
|
||||
new_train_time,
|
||||
self.time_from_start,
|
||||
val_loss,
|
||||
config,
|
||||
self.best_loss,
|
||||
self.best_config,
|
||||
move,
|
||||
sample_size)
|
||||
self.config_tried[config_sig] = (val_loss, new_train_time)
|
||||
|
||||
def try_increase_sample_size(self):
|
||||
# print( self.estimator, self.sample_size)
|
||||
if self.sample_size in self.next_sample_size:
|
||||
if self.increase_sample_size():
|
||||
self.first_move = True
|
||||
self.improved = True
|
||||
self.estimator_type = 0 if self.dims[0] > 0 else 1
|
||||
self.evaluate_config(
|
||||
self.config, self.next_sample_size[self.sample_size])
|
||||
if not self.old_modelcount and self.sample_size == self.data_size:
|
||||
self.old_modelcount = self.model_count
|
||||
|
||||
def setup_current_search_config(self):
|
||||
estimator_type = self.estimator_type_list[self.estimator_type]
|
||||
if 'all' in estimator_type:
|
||||
current_search_config = self.config
|
||||
elif 'primary' in estimator_type:
|
||||
current_search_config = self.config_primary
|
||||
else:
|
||||
current_search_config = self.config_more
|
||||
# print(self.config_more)
|
||||
return estimator_type, current_search_config
|
||||
|
||||
def search1step(self, global_best_loss=float('+inf'),
|
||||
retrain_full=True, mem_thres=MEM_THRES, reset_type='init_gaussian'):
|
||||
# try to increase sample size
|
||||
self.try_increase_sample_size()
|
||||
# decide current_search_config according to estimator_type
|
||||
estimator_type, current_search_config = \
|
||||
self.setup_current_search_config()
|
||||
time_left = self.time_budget - self.time_from_start
|
||||
if time_left < self.train_time:
|
||||
return False
|
||||
if retrain_full and self.train_time < time_left < 2 * self.train_time \
|
||||
and self.best_loss <= global_best_loss:
|
||||
self.train_config(self.best_config[0], self.sample_size_full)
|
||||
|
||||
move_vector, move_vector_neg = self.dual_direction_sample(
|
||||
self.base, current_search_config, estimator_type,
|
||||
rand_vector_unit_sphere, mem_thres, self.move_type)
|
||||
if move_vector is None:
|
||||
if move_vector_neg is None:
|
||||
self.improved = False
|
||||
else:
|
||||
self.improved = self.evaluate_config(
|
||||
move_vector_neg, self.sample_size, '_neg' + str(
|
||||
estimator_type))
|
||||
else:
|
||||
self.improved = self.evaluate_config(
|
||||
move_vector, self.sample_size, '_pos' + str(estimator_type))
|
||||
if not self.improved:
|
||||
if move_vector_neg is None:
|
||||
pass
|
||||
else:
|
||||
self.improved = self.evaluate_config(
|
||||
move_vector_neg, self.sample_size, '_neg' + str(
|
||||
estimator_type))
|
||||
self.update_noimprovement_stat(
|
||||
global_best_loss, retrain_full, reset_type)
|
||||
return self.improved
|
||||
|
||||
def update_noimprovement_stat(self, global_best_loss, retrain_full,
|
||||
reset_type):
|
||||
if self.improved:
|
||||
self.num_noimprovement = 0
|
||||
else:
|
||||
self.estimator_type = 1 - self.estimator_type
|
||||
if self.dims[self.estimator_type] == 0:
|
||||
self.estimator_type = 1 - self.estimator_type
|
||||
if self.estimator_type == 1 or self.dims[1] == 0:
|
||||
self.noimprovement(global_best_loss, retrain_full, reset_type)
|
||||
|
||||
def noimprovement(self, global_best_loss, retrain_full, reset_type='org'):
|
||||
if self.sample_size == self.data_size:
|
||||
# Do not wait until full sample size to update num_noimprovement?
|
||||
self.num_noimprovement += 1
|
||||
if self.num_noimprovement >= self.epo:
|
||||
self.num_noimprovement = 0
|
||||
# print(self.num_noimprovement, self.epo)
|
||||
if self.base_change == 'squareroot':
|
||||
self.base = math.sqrt(self.base)
|
||||
else:
|
||||
if self.K == 0: # first time
|
||||
oldK = self.best_config_loss_dic_full_reset[2] - \
|
||||
self.old_modelcount
|
||||
else:
|
||||
oldK = self.K
|
||||
self.K = self.model_count + 1 - self.old_modelcount
|
||||
if self.base_change == 'K':
|
||||
self.base **= oldK / self.K
|
||||
else:
|
||||
self.base **= math.sqrt(oldK / self.K)
|
||||
if self.dims[1] > 0 and self.dims[0] > 0:
|
||||
base_lower_bound = min(
|
||||
min(
|
||||
(1.0 + self.estimator_configspace[i].min_change
|
||||
/ self.config_primary[i])
|
||||
** math.sqrt(self.dims[0])
|
||||
for i in self.config_primary.keys()
|
||||
),
|
||||
min(
|
||||
(1.0 + self.estimator_configspace[i].min_change
|
||||
/ self.config_more[i])
|
||||
** math.sqrt(self.dims[1])
|
||||
for i in self.config_more.keys()
|
||||
)
|
||||
)
|
||||
elif self.dims[0] > 0:
|
||||
base_lower_bound = min(
|
||||
(1.0 + self.estimator_configspace[i].min_change
|
||||
/ self.config_primary[i])
|
||||
** math.sqrt(self.dims[0])
|
||||
for i in self.config_primary.keys()
|
||||
)
|
||||
else:
|
||||
base_lower_bound = min(
|
||||
(1.0 + self.estimator_configspace[i].min_change
|
||||
/ self.config_more[i])
|
||||
** math.sqrt(self.dims[1])
|
||||
for i in self.config_more.keys()
|
||||
)
|
||||
if np.isinf(base_lower_bound):
|
||||
base_lower_bound = BASE_LOWER_BOUND
|
||||
self.base_change_count += 1
|
||||
if self.base <= base_lower_bound or \
|
||||
self.base_change_count == self.base_change_bound:
|
||||
if retrain_full and self.sample_size == self.data_size:
|
||||
if self.best_loss <= global_best_loss:
|
||||
# Only train on full data when the current estimator
|
||||
# is the best estimator
|
||||
# print('best estimator and train on full data')
|
||||
self.train_config(
|
||||
self.best_config[0], self.sample_size_full)
|
||||
# remaining time is more than enough for another trial
|
||||
if self.time_budget - self.time_from_start > self.train_time:
|
||||
self.base_change_bound <<= 1
|
||||
self.base_change_count = 0
|
||||
self.K = 0
|
||||
self.old_modelcount = self.model_count
|
||||
self.best_config_loss_dic_full_reset = [None, None,
|
||||
None]
|
||||
self.first_move = True
|
||||
self.improved = True
|
||||
self.base_ini = min(
|
||||
self.base_ini * 2, self.base_upper_bound[
|
||||
self.sample_size])
|
||||
self.estimator_type = 0 if self.dims[0] > 0 else 1
|
||||
reset_config, reset_sample_size = self.get_reset_config(
|
||||
self.init_sample_size, reset_type)
|
||||
self.sample_size = reset_sample_size
|
||||
# print('reset sample size', reset_sample_size)
|
||||
self.evaluate_config(reset_config, self.sample_size,
|
||||
'_ini')
|
||||
|
||||
def get_reset_config(self, sample_size, reset_type):
|
||||
init_config = self.init_config_dic[self.sample_size]
|
||||
reset_sample_size = sample_size
|
||||
if 'org' in reset_type:
|
||||
reset_config = init_config
|
||||
else:
|
||||
if 'init_gaussian' in reset_type:
|
||||
reset_config = init_config
|
||||
reset_sample_size = self.get_reset_sample_size(reset_config)
|
||||
config_values = get_config_values(
|
||||
reset_config, self.config_type_dic)
|
||||
config_sig = str(reset_sample_size) + '_' + str(config_values)
|
||||
count = 0
|
||||
while config_sig in self.config_tried and \
|
||||
self.time_from_start < self.time_budget and count < 1000:
|
||||
# TODO: check exhaustiveness? use time as condition?
|
||||
count += 1
|
||||
move, move_neg = self.dual_direction_sample(
|
||||
base=self.b, current_search_config=init_config,
|
||||
estimator_type='all',
|
||||
rand_vector_func=rand_vector_gaussian,
|
||||
move_type=self.move_type)
|
||||
if move:
|
||||
reset_config = move_neg
|
||||
elif move_neg:
|
||||
reset_config = move_neg
|
||||
else:
|
||||
continue
|
||||
reset_sample_size = self.get_reset_sample_size(
|
||||
reset_config)
|
||||
config_values = get_config_values(
|
||||
reset_config, self.config_type_dic)
|
||||
config_sig = str(reset_sample_size) + \
|
||||
'_' + str(config_values)
|
||||
self.time_from_start = time.time() - self.start_time
|
||||
else:
|
||||
raise NotImplementedError
|
||||
return reset_config, reset_sample_size
|
||||
|
||||
def get_reset_sample_size(self, reset_config):
|
||||
if not reset_config:
|
||||
print('reset_config is none')
|
||||
reset_config_size = self.get_size_for_config(reset_config)
|
||||
|
||||
candidate_sample_size_list = []
|
||||
for sample_size, config_and_bestloss in \
|
||||
self.best_config_loss_samplesize_dic.items():
|
||||
s_best_config = config_and_bestloss[0]
|
||||
if not s_best_config:
|
||||
print('best config is none', sample_size)
|
||||
s_best_config_model_size = self.get_size_for_config(s_best_config)
|
||||
if s_best_config_model_size >= reset_config_size:
|
||||
candidate_sample_size_list.append(sample_size)
|
||||
|
||||
if len(candidate_sample_size_list) != 0:
|
||||
return min(candidate_sample_size_list)
|
||||
else:
|
||||
return self.data_size
|
|
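A minimal standalone sketch (not part of the commit, with invented constants) of the step-size schedule implemented in noimprovement above: under base_change == 'squareroot' the search radius shrinks to its square root after every epo non-improving trials; the 'K' policy raises it to the power oldK / K; and the default policy uses exponent sqrt(oldK / K), where K apparently counts models trained since the last reset. The base keeps contracting until it crosses base_lower_bound or base_change_bound changes accumulate, which triggers the restart branch.

import math

# Illustrative constants, not taken from the commit.
base_sqrt = base_k = base_default = 2.0
oldK, K = 10, 25

for step in range(1, 4):
    base_sqrt = math.sqrt(base_sqrt)        # 'squareroot' policy
    base_k **= oldK / K                     # 'K' policy
    base_default **= math.sqrt(oldK / K)    # default policy
    print(step, round(base_sqrt, 3), round(base_k, 3),
          round(base_default, 3))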
@@ -0,0 +1,249 @@
'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''


class ConfigSearchInfo:
    '''The class of the search space of a hyperparameter.

    Attributes:
        name: A string of the name of the hyperparameter
        type: data type of the hyperparameter
        lower: A number of the lower bound of the value
        upper: A number of the upper bound of the value
        init: A number of the initial value. For hyperparameters related to
            complexity, the init value needs to correspond to the lowest
            complexity
        change_type: A string of the change type, 'linear' or 'log'
        min_change: A number of the minimal change required. Could be inf if
            no such requirement
    '''

    def __init__(self, name, type, lower, upper, init, change_type='log',
                 complexity_related=True, min_change=None):
        self.name = name
        self.type = type
        self.lower = lower
        self.upper = upper
        self.init = init
        self.change_type = change_type
        self.complexity_related = complexity_related
        # default setting of min_change: 1 if type is int, otherwise +inf
        if min_change is None:
            if self.type == int:
                self.min_change = 1.0  # minimum change required
            else:
                self.min_change = float('+inf')
        else:
            self.min_change = min_change
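For illustration only (not part of the commit), a ConfigSearchInfo can be constructed directly; note how min_change defaults to 1.0 for int hyperparameters and to +inf for float ones:

# Hypothetical entries; the values mirror typical bounds used below.
leaves = ConfigSearchInfo(name='max_leaves', type=int,
                          lower=4, upper=32768, init=4)
lr = ConfigSearchInfo(name='learning_rate', type=float,
                      lower=0.01, upper=1.0, init=0.1)
print(leaves.min_change)  # 1.0: integers must change by at least 1
print(lr.min_change)      # inf: no minimal change required for floats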
def config_space(estimator, data_size, objective_name="regression"):
    CS = {}
    n_estimators_upper = min(32768, int(data_size))
    max_leaves_upper = min(32768, int(data_size))
    # exp_max_depth_upper = min(32768, data_size)
    if 'xgboost' in estimator:
        CS['n_estimators'] = ConfigSearchInfo(
            name='n_estimators', type=int, lower=4, init=4,
            upper=n_estimators_upper, change_type='log')
        CS['max_leaves'] = ConfigSearchInfo(
            name='max_leaves', type=int, lower=4, init=4,
            upper=max_leaves_upper, change_type='log')
        CS['min_child_weight'] = ConfigSearchInfo(
            name='min_child_weight', type=float, lower=0.001, init=20.0,
            upper=20.0, change_type='log')
        CS['learning_rate'] = ConfigSearchInfo(
            name='learning_rate', type=float, lower=0.01, init=0.1,
            upper=1.0, change_type='log')
        CS['subsample'] = ConfigSearchInfo(
            name='subsample', type=float, lower=0.6, init=1.0, upper=1.0,
            change_type='linear')
        CS['reg_alpha'] = ConfigSearchInfo(
            name='reg_alpha', type=float, lower=1e-10, init=1e-10,
            upper=1.0, change_type='log', complexity_related=True)
        CS['reg_lambda'] = ConfigSearchInfo(
            name='reg_lambda', type=float, lower=1e-10, init=1.0,
            upper=1.0, change_type='log')
        CS['colsample_bylevel'] = ConfigSearchInfo(
            name='colsample_bylevel', type=float, lower=0.6, init=1.0,
            upper=1.0, change_type='linear')
        CS['colsample_bytree'] = ConfigSearchInfo(
            name='colsample_bytree', type=float, lower=0.7, init=1.0,
            upper=1.0, change_type='linear')
    elif estimator in ('rf', 'extra_tree'):
        n_estimators_upper = min(2048, n_estimators_upper)
        # max_leaves_upper = min(2048, max_leaves_upper)
        CS['n_estimators'] = ConfigSearchInfo(
            name='n_estimators', type=int, lower=4, init=4,
            upper=n_estimators_upper, change_type='log')
        if objective_name != 'regression':
            CS['criterion'] = ConfigSearchInfo(
                name='criterion', type=int, lower=1, init=1, upper=2,
                change_type='log')
        # CS['max_leaves'] = ConfigSearchInfo(
        #     name='max_leaves', type=int, lower=4, init=4,
        #     upper=max_leaves_upper, change_type='log',
        #     complexity_related=True)
        CS['max_features'] = ConfigSearchInfo(
            name='max_features', type=float, lower=0.1, init=1.0,
            upper=1.0, change_type='log')
        # CS['min_samples_split'] = ConfigSearchInfo(
        #     name='min_samples_split', type=int, lower=2, init=2, upper=20,
        #     change_type='log', complexity_related=True)
        # CS['min_samples_leaf'] = ConfigSearchInfo(
        #     name='min_samples_leaf', type=int, lower=1, init=1, upper=20,
        #     change_type='log', complexity_related=True)
    elif 'lgbm' in estimator:
        CS['n_estimators'] = ConfigSearchInfo(
            name='n_estimators', type=int, lower=4, init=4,
            upper=n_estimators_upper, change_type='log')
        CS['max_leaves'] = ConfigSearchInfo(
            name='max_leaves', type=int, lower=4, init=4,
            upper=max_leaves_upper, change_type='log')
        CS['min_child_weight'] = ConfigSearchInfo(
            name='min_child_weight', type=float, lower=0.001, init=20,
            upper=20.0, change_type='log')
        CS['learning_rate'] = ConfigSearchInfo(
            name='learning_rate', type=float, lower=0.01, init=0.1,
            upper=1.0, change_type='log')
        CS['subsample'] = ConfigSearchInfo(
            name='subsample', type=float, lower=0.6, init=1.0, upper=1.0,
            change_type='log', complexity_related=True)
        CS['log_max_bin'] = ConfigSearchInfo(
            name='log_max_bin', type=int, lower=3, init=8, upper=10,
            change_type='log', complexity_related=True)
        CS['reg_alpha'] = ConfigSearchInfo(
            name='reg_alpha', type=float, lower=1e-10, init=1e-10,
            upper=1.0, change_type='log', complexity_related=True)
        CS['reg_lambda'] = ConfigSearchInfo(
            name='reg_lambda', type=float, lower=1e-10, init=1.0,
            upper=1.0, change_type='log')
        CS['colsample_bytree'] = ConfigSearchInfo(
            name='colsample_bytree', type=float, lower=0.7, init=1.0,
            upper=1.0, change_type='log')
    elif 'lr' in estimator:
        CS['C'] = ConfigSearchInfo(
            name='C', type=float, lower=0.03125, init=1.0, upper=32768.0,
            change_type='log', complexity_related=True)
    elif 'catboost' in estimator:
        # CS['n_estimators'] = ConfigSearchInfo(
        #     name='n_estimators', type=int, lower=4, init=64,
        #     upper=n_estimators_upper, change_type='log',
        #     complexity_related=True)
        early_stopping_rounds = max(min(round(1500000 / data_size), 150), 10)
        CS['rounds'] = ConfigSearchInfo(
            name='rounds', type=int, lower=10, init=10,
            upper=early_stopping_rounds, change_type='log')
        # CS['exp_max_depth'] = ConfigSearchInfo(
        #     name='exp_max_depth', type=int, lower=32, init=64, upper=256,
        #     change_type='log', complexity_related=True)
        CS['learning_rate'] = ConfigSearchInfo(
            name='learning_rate', type=float, lower=0.005, init=0.1,
            upper=0.2, change_type='log')
        # CS['l2_leaf_reg'] = ConfigSearchInfo(
        #     name='l2_leaf_reg', type=float, lower=1, init=3, upper=5,
        #     change_type='log')
    elif 'nn' == estimator:
        CS['learning_rate'] = ConfigSearchInfo(
            name='learning_rate', type=float, lower=1e-4, init=3e-4,
            upper=3e-2, change_type='log')
        CS['weight_decay'] = ConfigSearchInfo(
            name='weight_decay', type=float, lower=1e-12, init=1e-6,
            upper=0.1, change_type='log')
        CS['dropout_prob'] = ConfigSearchInfo(
            name='dropout_prob', type=float, lower=1.0, init=1.1,
            upper=1.5, change_type='log')
    elif 'kneighbor' in estimator:
        n_neighbors_upper = min(512, int(data_size / 2))
        CS['n_neighbors'] = ConfigSearchInfo(
            name='n_neighbors', type=int, lower=1, init=5,
            upper=n_neighbors_upper, change_type='log')
    else:
        raise NotImplementedError

    return CS
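A hedged usage sketch (not in the commit): for a hypothetical dataset with 10,000 rows, config_space returns the per-estimator search space as a dict of ConfigSearchInfo objects keyed by hyperparameter name, with the tree-size upper bounds capped by the data size:

# Illustrative only.
cs = config_space('lgbm', data_size=10000)
print(sorted(cs))
# n_estimators_upper = max_leaves_upper = min(32768, 10000) = 10000
print(cs['n_estimators'].lower, cs['n_estimators'].upper)  # 4 10000
print(cs['learning_rate'].change_type)  # 'log'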
def estimator_size(config, estimator):
    if estimator in ['xgboost', 'lgbm', 'rf', 'extra_tree']:
        try:
            max_leaves = int(round(config['max_leaves']))
            n_estimators = int(round(config['n_estimators']))
            model_size = float((max_leaves * 3 + (max_leaves - 1) * 4 + 1)
                               * n_estimators * 8)
        except KeyError:
            # e.g., the rf/extra_tree search space has no max_leaves entry
            model_size = 0
        return model_size
    elif 'catboost' in estimator:
        n_estimators = int(round(config.get('n_estimators', 8192)))
        max_leaves = int(round(config.get('exp_max_depth', 64)))
        model_size = float((max_leaves * 3 + (max_leaves - 1) * 4 + 1)
                           * n_estimators * 8)
        return model_size
    else:
        # no size estimate for other estimators
        return 1.0
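As a worked example of the size formula above, which reads as a rough byte count for the tree ensemble (three values per leaf, four pointers per internal node, one root record, eight bytes each; that interpretation is an editorial guess): with max_leaves = 4 and n_estimators = 4, (4*3 + 3*4 + 1) * 4 * 8 = 25 * 32 = 800.

# Illustrative check; config keys match those produced by config_space.
print(estimator_size({'max_leaves': 4, 'n_estimators': 4}, 'lgbm'))  # 800.0
print(estimator_size({'n_estimators': 4}, 'rf'))  # 0: no max_leaves key
print(estimator_size({}, 'kneighbor'))  # 1.0 fallback for other learners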
def generate_config_ini(estimator, estimator_configspace):
    config_dic = {}
    config_dic_more = {}
    config_type_dic = {}
    for _, config in estimator_configspace.items():
        name, init = config.name, config.init
        type_, complexity_related = config.type, config.complexity_related
        config_type_dic[name] = type_
        if complexity_related:
            config_dic[name] = init
        else:
            config_dic_more[name] = init
    return config_dic, config_dic_more, {**config_dic, **config_dic_more}, \
        config_type_dic


def generate_config_min(estimator, estimator_configspace, max_config_size):
    config_dic = {}
    config_dic_more = {}
    for _, config in estimator_configspace.items():
        name, lower = config.name, config.lower
        complexity_related = config.complexity_related
        if complexity_related:
            config_dic[name] = lower
        else:
            config_dic_more[name] = lower

    return config_dic, config_dic_more, {**config_dic, **config_dic_more}


def generate_config_max(estimator, estimator_configspace, max_config_size):
    config_dic = {}
    config_dic_more = {}
    for _, config in estimator_configspace.items():
        name, upper = config.name, config.upper
        complexity_related = config.complexity_related
        if complexity_related:
            if name in ('n_estimators', 'max_leaves'):
                config_dic[name] = min(upper, max_config_size)
            else:
                config_dic[name] = upper
        else:
            config_dic_more[name] = upper
    return config_dic, config_dic_more, {**config_dic, **config_dic_more}


def get_config_values(config_dic, config_type_dic):
    value_list = []
    for k in config_dic.keys():
        org_v = config_dic[k]
        if config_type_dic[k] == int:
            v = int(round(org_v))
            value_list.append(v)
        else:
            value_list.append(org_v)
    return value_list
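A hedged sketch (not in the commit) tying the helpers above together: the generators split a space into complexity-related and remaining hyperparameters and emit the initial, minimal, and capped-maximal configurations, while get_config_values rounds int-typed entries into a comparable value signature:

# Illustrative only, reusing config_space from above.
cs = config_space('lgbm', data_size=10000)
ini, ini_more, ini_all, type_dic = generate_config_ini('lgbm', cs)
_, _, min_all = generate_config_min('lgbm', cs, max_config_size=1024)
_, _, max_all = generate_config_max('lgbm', cs, max_config_size=1024)
print(ini_all['n_estimators'], min_all['n_estimators'],
      max_all['n_estimators'])  # 4 4 1024 (upper bound capped at 1024)
print(get_config_values(ini_all, type_dic))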
@@ -0,0 +1,168 @@
'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

import json
from typing import IO
from contextlib import contextmanager
import warnings


class TrainingLogRecord(object):

    def __init__(self,
                 record_id: int,
                 iter_per_learner: int,
                 logged_metric: float,
                 trial_time: float,
                 total_search_time: float,
                 validation_loss,
                 config,
                 best_validation_loss,
                 best_config,
                 learner,
                 sample_size):
        self.record_id = record_id
        self.iter_per_learner = iter_per_learner
        self.logged_metric = logged_metric
        self.trial_time = trial_time
        self.total_search_time = total_search_time
        self.validation_loss = validation_loss
        self.config = config
        self.best_validation_loss = best_validation_loss
        self.best_config = best_config
        self.learner = learner
        self.sample_size = sample_size

    def dump(self, fp: IO[str]):
        d = vars(self)
        return json.dump(d, fp)

    @classmethod
    def load(cls, json_str: str):
        d = json.loads(json_str)
        return cls(**d)


class TrainingLogCheckPoint(TrainingLogRecord):

    def __init__(self, curr_best_record_id: int):
        self.curr_best_record_id = curr_best_record_id


class TrainingLogWriter(object):

    def __init__(self, output_filename: str):
        self.output_filename = output_filename
        self.file = None
        self.current_best_loss_record_id = None
        self.current_best_loss = float('+inf')
        self.current_sample_size = None
        self.current_record_id = 0

    def open(self):
        self.file = open(self.output_filename, 'w')

    def append(self,
               it_counter: int,
               train_loss: float,
               trial_time: float,
               total_search_time: float,
               validation_loss,
               config,
               best_validation_loss,
               best_config,
               learner,
               sample_size):
        if self.file is None:
            raise IOError("Call open() to open the output file first.")
        if validation_loss is None:
            raise ValueError("validation_loss is None.")
        record = TrainingLogRecord(self.current_record_id,
                                   it_counter,
                                   train_loss,
                                   trial_time,
                                   total_search_time,
                                   validation_loss,
                                   config,
                                   best_validation_loss,
                                   best_config,
                                   learner,
                                   sample_size)
        if validation_loss < self.current_best_loss or \
                (validation_loss == self.current_best_loss
                 and sample_size > self.current_sample_size):
            self.current_best_loss = validation_loss
            self.current_sample_size = sample_size
            self.current_best_loss_record_id = self.current_record_id
        self.current_record_id += 1
        record.dump(self.file)
        self.file.write('\n')
        self.file.flush()

    def checkpoint(self):
        if self.file is None:
            raise IOError("Call open() to open the output file first.")
        if self.current_best_loss_record_id is None:
            warnings.warn("checkpoint() called before any record is written, "
                          "skipped.")
            return
        record = TrainingLogCheckPoint(self.current_best_loss_record_id)
        record.dump(self.file)
        self.file.write('\n')
        self.file.flush()

    def close(self):
        self.file.close()


class TrainingLogReader(object):

    def __init__(self, filename: str):
        self.filename = filename
        self.file = None

    def open(self):
        self.file = open(self.filename)

    def records(self):
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        for line in self.file:
            data = json.loads(line)
            if len(data) == 1:
                # Skip checkpoints.
                continue
            yield TrainingLogRecord(**data)

    def close(self):
        self.file.close()

    def get_record(self, record_id) -> TrainingLogRecord:
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        for rec in self.records():
            if rec.record_id == record_id:
                return rec
        raise ValueError(f"Cannot find record with id {record_id}.")


@contextmanager
def training_log_writer(filename: str):
    try:
        w = TrainingLogWriter(filename)
        w.open()
        yield w
    finally:
        w.close()


@contextmanager
def training_log_reader(filename: str):
    try:
        r = TrainingLogReader(filename)
        r.open()
        yield r
    finally:
        r.close()
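A hedged end-to-end sketch of the logging API above (the path and field values are invented): append writes one JSON record per line, checkpoint appends a single-field record pointing at the current best, and records() skips those checkpoints when replaying the log:

# Illustrative only; field names follow TrainingLogWriter.append.
with training_log_writer('example_training.log') as w:
    w.append(it_counter=1, train_loss=0.30, trial_time=0.5,
             total_search_time=0.5, validation_loss=0.25,
             config={'learning_rate': 0.1}, best_validation_loss=0.25,
             best_config={'learning_rate': 0.1}, learner='lgbm',
             sample_size=1000)
    w.checkpoint()

with training_log_reader('example_training.log') as r:
    for rec in r.records():
        print(rec.record_id, rec.learner, rec.validation_loss)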
@@ -0,0 +1 @@
__version__ = "0.1.0"
File diff suppressed because one or more lines are too long
@@ -0,0 +1,4 @@
{
    "keep_max_logfiles": 30,
    "logging_level": "info"
}
@@ -0,0 +1,56 @@
import setuptools
import os

here = os.path.abspath(os.path.dirname(__file__))

with open("README.md", "r") as fh:
    long_description = fh.read()


# Get the code version
version = {}
with open(os.path.join(here, "flaml/version.py")) as fp:
    exec(fp.read(), version)
__version__ = version["__version__"]

install_requires = [
    "NumPy>=1.16.2",
    "lightgbm>=2.3.1",
    "xgboost>=0.90",
    "scipy>=1.4.1",
    "catboost>=0.23",
    "scikit-learn>=0.23",
]


setuptools.setup(
    name="FLAML",
    version=__version__,
    author="Microsoft Corporation",
    author_email="hpo@microsoft.com",
    description="A fast and lightweight autoML system",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/microsoft/FLAML",
    packages=["flaml"],
    install_requires=install_requires,
    extras_require={
        "notebook": [
            "openml==0.10.2",
            "jupyter",
            "matplotlib==3.2.0",
            "rgf-python",
        ],
        "test": [
            "flake8>=3.8.4",
            "pytest>=6.1.1",
            "coverage>=5.3",
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
@@ -0,0 +1,235 @@
import unittest

import numpy as np
import scipy.sparse
from sklearn.datasets import load_boston, load_iris

from flaml import AutoML, get_output_from_log


def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    from sklearn.metrics import log_loss
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels)
    alpha = 0.5
    return test_loss * (1 + alpha) - alpha * train_loss, \
        [test_loss, train_loss]
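The function above doubles as a usage example for the custom-metric hook: it receives the fitted estimator together with held-out and training data, and apparently returns the loss to minimize plus a list of metrics to log. A hedged variant with the same signature, gap_penalized_metric (a hypothetical name), which penalizes the train/test gap, might look like:

def gap_penalized_metric(X_test, y_test, estimator, labels,
                         X_train, y_train):
    # Illustrative only; same return contract as custom_metric above.
    from sklearn.metrics import log_loss
    test_loss = log_loss(y_test, estimator.predict_proba(X_test),
                         labels=labels)
    train_loss = log_loss(y_train, estimator.predict_proba(X_train),
                          labels=labels)
    return test_loss + abs(test_loss - train_loss), [test_loss, train_loss]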
class TestAutoML(unittest.TestCase):

    def test_dataframe(self):
        self.test_classification(True)

    def test_custom_metric(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "eval_method": 'holdout',
            "metric": custom_metric,
            "task": 'classification',
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": 'all',
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0,
            objective='multi')
        print(estimator)
        time_history, best_valid_loss_history, valid_loss_history, \
            config_history, train_loss_history = get_output_from_log(
                filename=automl_settings['log_file_name'], time_budget=6)
        print(train_loss_history)

    def test_classification(self, as_frame=False):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train, y_train=y_train,
            train_full=True, record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])

    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mse',
            "task": 'regression',
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = len(y_train)
        automl_experiment.fit(X_train=X_train[:n >> 1],
                              y_train=y_train[:n >> 1],
                              X_val=X_train[n >> 1:],
                              y_val=y_train[n >> 1:],
                              **automl_settings)
        assert automl_experiment.y_val.shape[0] == n - (n >> 1)
        assert automl_experiment.eval_method == 'holdout'
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))

    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'auto',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "model_history": True
        }
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              X_val=X_val, y_val=y_val,
                              **automl_settings)
        assert automl_experiment.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

    def test_sparse_matrix_xgboost(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'ap',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_lr(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'f1',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression_cv(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "eval_method": 'cv',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(100, 100)
        y_train = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)


if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,45 @@
import unittest

from sklearn.datasets import fetch_openml
from flaml.automl import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


dataset = "Airlines"


def _test(split_type):
    automl = AutoML()

    automl_settings = {
        "time_budget": 2,
        # "metric": 'accuracy',
        "task": 'classification',
        "log_file_name": "test/{}.log".format(dataset),
        "model_history": True,
        "log_training_metric": True,
        "split_type": split_type,
    }

    X, y = fetch_openml(name=dataset, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    pred = automl.predict(X_test)
    acc = accuracy_score(y_test, pred)

    print(acc)


def test_stratified():
    _test(split_type="stratified")


def test_uniform():
    _test(split_type="uniform")


if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,14 @@
import unittest
import flaml


class TestVersion(unittest.TestCase):

    def test_version(self):
        self.assertTrue(hasattr(flaml, '__version__'))
        self.assertTrue(len(flaml.__version__) > 0)


if __name__ == "__main__":
    unittest.main()