v0.1.0

2020-12-04 09:40:27 -08:00 · 2020-12-04 09:40:27 -08:00 · 492990655d
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,5 @@
 [run]
 branch = True
 source = flaml
 omit =
  *tests*
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,5 @@
 [flake8]
 ignore = E203, E266, E501, W503, F403, F401, C901
 max-line-length = 127
 max-complexity = 10
 select = B,C,E,F,W,T4,B9
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@ -0,0 +1,59 @@
 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 name: Python package
 on:
  push:
    branches: ['*']
  pull_request:
    branches: ['*']
 jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
        python-version: [3.6, 3.7, 3.8]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    # macOS only: install the OpenMP runtime (libomp) via Homebrew.
    # NOTE(review): each `run` step is executed in its own shell, so the
    # `export`s below presumably do not reach the later "Install packages and
    # dependencies" step -- confirm; append them to $GITHUB_ENV if the build
    # actually needs these compiler flags.
    - name: If mac, install libomp to facilitate lgbm install
      if: matrix.os == 'macOS-latest'
      run: |
        brew install libomp
        export CC=/usr/bin/clang
        export CXX=/usr/bin/clang++
        export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
        export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
        export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
        export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
    - name: Install packages and dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest coverage
        pip install -e .
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest test
    - name: Coverage
      run: |
        coverage run -a -m pytest test
        coverage xml
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v1
      with:
        file: ./coverage.xml
        flags: unittests
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,150 @@
 # Project
 /.vs
 .vscode
 # Log files
 *.log
 # Python virtualenv
 .venv
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 /catboost_info
 notebook/*.pkl
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,9 @@
 # Microsoft Open Source Code of Conduct
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 Resources:
 - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--- a/21
+++ b/21
@ -0,0 +1,21 @@
    MIT License
    Copyright (c) Microsoft Corporation.
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,123 @@
 # FLAML - Fast and Lightweight AutoML
 FLAML is a Python library designed to automatically produce accurate machine
 learning models with low computational cost. It frees users from selecting
 learners and hyperparameters for each learner. It is fast and cheap.
 The simple and lightweight design makes it easy to extend, such as
 adding customized learners or metrics. FLAML is powered by a new, cost-effective
 hyperparameter optimization and learner selection method invented by
 Microsoft Research.
 FLAML is easy to use:
 1. With three lines of code, you can start using this economical and fast
 AutoML engine as a scikit-learn style estimator.
 ```python
 from flaml import AutoML
 automl = AutoML()
 automl.fit(X_train, y_train, task="classification")
 ```
 2. You can restrict the learners and use FLAML as a fast hyperparameter tuning
 tool for XGBoost, LightGBM, Random Forest etc. or a customized learner.
 ```python
 automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"])
 ```
 3. You can embed FLAML in self-tuning software for just-in-time tuning with
 low latency & resource consumption.
 ```python
 automl.fit(X_train, y_train, task="regression", time_budget=60)
 ```
 ## Installation
 FLAML requires **Python version >= 3.6**. It can be installed from pip:
 ```bash
 pip install flaml
 ```
 To run the [`notebook example`](https://github.com/microsoft/FLAML/tree/main/notebook),
 install flaml with the [notebook] option:
 ```bash
 pip install flaml[notebook]
 ```
 ## Examples
 A basic classification example.
 ```python
 from flaml import AutoML
 from sklearn.datasets import load_iris
 # Initialize the FLAML learner.
 automl = AutoML()
 # Provide configurations.
 automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": "test/iris.log",
 }
 X_train, y_train = load_iris(return_X_y=True)
 # Train with labeled input data.
 automl.fit(X_train=X_train, y_train=y_train,
                        **automl_settings)
 # Predict
 print(automl.predict_proba(X_train))
 # Export the best model.
 print(automl.model)
 ```
 A basic regression example.
 ```python
 from flaml import AutoML
 from sklearn.datasets import load_boston
 # Initialize the FLAML learner.
 automl = AutoML()
 # Provide configurations.
 automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'r2',
    "task": 'regression',
    "log_file_name": "test/boston.log",
 }
 X_train, y_train = load_boston(return_X_y=True)
 # Train with labeled input data.
 automl.fit(X_train=X_train, y_train=y_train,
                        **automl_settings)
 # Predict
 print(automl.predict(X_train))
 # Export the best model.
 print(automl.model)
 ```
 More examples: see the [notebook](https://github.com/microsoft/FLAML/tree/main/notebook/flaml_demo.ipynb)
 ## Contributing
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
 Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
 the rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>.
 When you submit a pull request, a CLA bot will automatically determine whether you need to provide
 a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
 provided by the bot. You will only need to do this once across all repos using our CLA.
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
 ## Authors
 * Chi Wang
 * Qingyun Wu
 * Erkang Zhu
 Contributors: Markus Weimer, Silu Huang, Haozhe Zhang, Alex Deng.
 ## License
 [MIT License](LICENSE)
--- a/SECURITY.md
+++ b/SECURITY.md
@ -0,0 +1,41 @@
 <!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
 ## Security
 Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 ## Reporting Security Issues
 **Please do not report security vulnerabilities through public GitHub issues.**
 Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
 If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
 You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
 Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
  * Full paths of source file(s) related to the manifestation of the issue
  * The location of the affected source code (tag/branch/commit or direct URL)
  * Any special configuration required to reproduce the issue
  * Step-by-step instructions to reproduce the issue
  * Proof-of-concept or exploit code (if possible)
  * Impact of the issue, including how an attacker might exploit the issue
 This information will help us triage your report more quickly.
 If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
 ## Preferred Languages
 We prefer all communications to be in English.
 ## Policy
 Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
 <!-- END MICROSOFT SECURITY.MD BLOCK -->
--- a/flaml/init.py
+++ b/flaml/init.py
@ -0,0 +1,70 @@
 from flaml.automl import AutoML
 from flaml.model import BaseEstimator
 from flaml.data import get_output_from_log
 from flaml.version import __version__
 import logging
 from os.path import join, exists
 import datetime as dt
 from os import listdir, remove, mkdir
 import pathlib
 import json
 # Package bootstrap: read settings.json next to the package directory, prune
 # old log files and attach a console handler and a file handler to the
 # package logger.
 root = pathlib.Path(__file__).parent.parent.absolute()
 jsonfilepath = join(root, "settings.json")
 with open(jsonfilepath) as f:
    settings = json.load(f)
 # Map the textual level from settings.json to a logging constant; any
 # unrecognized value falls back to NOTSET.
 _level_by_name = {
    "info": logging.INFO,
    "debug": logging.DEBUG,
    "error": logging.ERROR,
    "warning": logging.WARNING,
    "critical": logging.CRITICAL,
 }
 logging_level = settings["logging_level"]
 if isinstance(logging_level, str):
    logging_level = _level_by_name.get(logging_level, logging.NOTSET)
 else:
    logging_level = logging.NOTSET
 keep_max_logfiles = settings["keep_max_logfiles"]
 log_dir = join(root, "logs")
 try:
    mkdir(log_dir)
 except FileExistsError:
    # Already there (possibly created concurrently by another process).
    pass
 # Log files are named "<seconds>_flaml.log"; collect the numeric prefixes.
 # Files whose prefix is not an integer are not ours and must not make the
 # import of the package fail.
 _log_stamps = []
 for _file_name in listdir(log_dir):
    if ".log" not in _file_name:
        continue
    try:
        _log_stamps.append(int(_file_name.split("_")[0]))
    except ValueError:
        continue
 # Keep the keep_max_logfiles newest stamps, delete the files of the rest.
 del_logs = sorted(_log_stamps, reverse=True)[keep_max_logfiles:]
 for _stamp in del_logs:
    try:
        remove(join(log_dir, str(_stamp) + "_flaml.log"))
    except OSError:
        # Best effort: the file may be missing or locked; skip it.
        continue
 # The new log file is named after the seconds elapsed since 2020-04-01.
 b = dt.datetime.now()
 a = dt.datetime(2020, 4, 1, 0, 0, 0)
 secs = int((b-a).total_seconds())
 name = str(secs)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging_level)
 fh = logging.FileHandler(join(log_dir, name + "_" + __name__ + ".log"))
 fh.setLevel(logging_level)
 ch = logging.StreamHandler()
 ch.setLevel(logging_level)
 formatter = logging.Formatter(
    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
    '%m-%d %H:%M:%S')
 ch.setFormatter(formatter)
 fh.setFormatter(formatter)
 logger.addHandler(ch)
 logger.addHandler(fh)
 logger.propagate = True
--- a/flaml/automl.py
+++ b/flaml/automl.py
@ -0,0 +1,897 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the
 * project root for license information.
 '''
 import time
 import warnings
 from functools import partial
 import ast
 import numpy as np
 import scipy.sparse
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
    RepeatedKFold
 from sklearn.utils import shuffle
 import pandas as pd
 from .ml import compute_estimator, train_estimator, get_classification_objective
 from .config import MIN_SAMPLE_TRAIN, MEM_THRES, ETI_INI, \
    SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS
 from .data import concat
 from .search import ParamSearch
 from .training_log import training_log_reader, training_log_writer
 import logging
 logger = logging.getLogger(__name__)
 class AutoML:
    '''The AutoML class
    Attributes:
        model: An object with predict() and predict_proba() method (for
            classification), storing the best trained model.
        model_history: A dictionary of iter->model, storing the models when
            the best model is updated each time
        config_history: A dictionary of iter->(estimator, config, time), 
            storing the best estimator, config, and the time when the best
            model is updated each time
        classes_: A list of n_classes elements for class labels
        best_iteration: An integer of the iteration number where the best
            config is found
        best_estimator: A string indicating the best estimator found.
        best_config: A dictionary of the best configuration.
        best_config_train_time: A float of the seconds taken by training the
            best config 
    Typical usage example:
        automl = AutoML()
        automl_settings = {
            "time_budget": 60,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": 'test/mylog.log',
        }
        automl.fit(X_train = X_train, y_train = y_train,
            **automl_settings)
    '''
    def __init__(self):
        '''Create an AutoML object with empty custom-learner registries.'''
        # Copy the module-level defaults: add_learner() writes into
        # self._eti_ini, and aliasing ETI_INI itself would leak one
        # instance's custom learners into every other instance.
        self._eti_ini = dict(ETI_INI)
        self._custom_learners = {}
        self._config_space_info = {}
        self._custom_size_estimate = {}
        self._track_iter = 0
    @property
    def model_history(self):
        return self._model_history
    @property
    def config_history(self):
        return self._config_history
    @property
    def model(self):
        if self._trained_estimator:
            return self._trained_estimator.model
        else:
            return None
    @property
    def best_estimator(self):
        return self._best_estimator
    @property
    def best_iteration(self):
        return self._best_iteration
    @property
    def best_config(self):
        return self._selected.best_config[0]
    @property
    def best_loss(self):
        return self._best_loss
    @property
    def best_config_train_time(self):
        return self.best_train_time
    @property
    def classes_(self):
        if self.label_transformer:
            return self.label_transformer.classes_.tolist()
        if self._trained_estimator:
            return self._trained_estimator.model.classes_.tolist()
        return None
    def predict(self, X_test):
        '''Predict label from features.
        Args:
            X_test: A numpy array of featurized instances, shape n*m.
        Returns:
            A numpy array of shape n*1 -- each element is a predicted class
            label for an instance.
        '''
        X_test = self.preprocess(X_test)
        y_pred = self._trained_estimator.predict(X_test)
        if y_pred.ndim > 1:
            y_pred = y_pred.flatten()
        if self.label_transformer:
            return self.label_transformer.inverse_transform(pd.Series(
                y_pred))
        else:
            return y_pred
    def predict_proba(self, X_test):
        '''Predict the probability of each class from features, only works for
        classification problems.
        Args:
            X_test: A numpy array of featurized instances, shape n*m.
        Returns:
            A numpy array of shape n*c. c is the # classes. Each element at
            (i,j) is the probability for instance i to be in class j.
        '''
        X_test = self.preprocess(X_test)
        proba = self._trained_estimator.predict_proba(X_test)
        return proba
    def preprocess(self, X):
        if scipy.sparse.issparse(X):
            X = X.tocsr()
        if self.transformer:
            X = self.transformer.transform(X)
        return X
    def _validate_data(self, X_train_all, y_train_all, dataframe, label,
                       X_val=None, y_val=None):
        if X_train_all is not None and y_train_all is not None:
            if not (isinstance(X_train_all, np.ndarray)
                    or scipy.sparse.issparse(X_train_all)
                    or isinstance(X_train_all, pd.DataFrame)
                    ):
                raise ValueError(
                    "X_train_all must be a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_train_all, np.ndarray)
                    or isinstance(y_train_all, pd.Series)):
                raise ValueError(
                    "y_train_all must be a numpy array or a pandas series.")
            if X_train_all.size == 0 or y_train_all.size == 0:
                raise ValueError("Input data must not be empty.")
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            if X_train_all.shape[0] != y_train_all.shape[0]:
                raise ValueError(
                    "# rows in X_train must match length of y_train.")
            self.df = isinstance(X_train_all, pd.DataFrame)
            self.nrow, self.ndim = X_train_all.shape
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            if not isinstance(dataframe, pd.DataFrame):
                raise ValueError("dataframe must be a pandas DataFrame")
            if not label in dataframe.columns:
                raise ValueError("label must a column name in dataframe")
            self.df = True
            self.dataframe, self.label = dataframe, label
            X = dataframe.drop(columns=label)
            self.nrow, self.ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError(
                "either X_train_all+y_train_all or dataframe+label need to be provided.")
        if scipy.sparse.issparse(X_train_all):
            self.transformer = self.label_transformer = False
            self.X_train_all, self.y_train_all = X, y
        else:
            from .data import DataTransformer
            self.transformer = DataTransformer()
            self.X_train_all, self.y_train_all = self.transformer.fit_transform(
                X, y, self.task)
            self.label_transformer = self.transformer.label_transformer
        if X_val is not None and y_val is not None:
            if not (isinstance(X_val, np.ndarray)
                    or scipy.sparse.issparse(X_val)
                    or isinstance(X_val, pd.DataFrame)
                    ):
                raise ValueError(
                    "X_val must be None, a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_val, np.ndarray)
                    or isinstance(y_val, pd.Series)):
                raise ValueError(
                    "y_val must be None, a numpy array or a pandas series.")
            if X_val.size == 0 or y_val.size == 0:
                raise ValueError(
                    "Validation data are expected to be nonempty. "
                    "Use None for X_val and y_val if no validation data.")
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            if X_val.shape[0] != y_val.shape[0]:
                raise ValueError(
                    "# rows in X_val must match length of y_val.")
            if self.transformer:
                self.X_val = self.transformer.transform(X_val)
            else:
                self.X_val = X_val
            if self.label_transformer:
                self.y_val = self.label_transformer.transform(y_val)
            else:
                self.y_val = y_val
        else:
            self.X_val = self.y_val = None
    def _prepare_data(self,
                      eval_method,
                      split_ratio,
                      n_splits):
        '''Prepare the training/validation data and the CV splitter.

        Args:
            eval_method: A string; when it is 'holdout' and no validation
                data was provided, a validation set is split off the
                training data.
            split_ratio: The test_size passed to train_test_split for the
                holdout split.
            n_splits: The number of folds of the splitter stored in self.kf.

        Sets self.X_train, self.y_train, self.X_val, self.y_val,
        self.data_size and self.kf.
        '''
        X_val, y_val = self.X_val, self.y_val
        if scipy.sparse.issparse(X_val):
            X_val = X_val.tocsr()
        X_train_all, y_train_all = self.X_train_all, self.y_train_all
        if scipy.sparse.issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        if self.task != 'regression':
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes: every label with fewer than
            # rare_threshold rows gets its rows appended again and again
            # until it reaches the threshold
            rare_threshold = 20
            rare = counts < rare_threshold
            for label, rare_count in zip(label_set[rare], counts[rare]):
                count = rare_count
                rare_index = y_train_all == label
                # the mask covers the first n rows only, i.e. the data as it
                # was before this label's copies were appended
                n = len(y_train_all)
                while count < rare_threshold:
                    if self.df:
                        X_train_all = concat(X_train_all,
                                             X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all,
                                             X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(y_train_all,
                                             y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all,
                                                      y_train_all[:n][rare_index]])
                    count += rare_count
                logger.debug(
                    f"class {label} augmented from {rare_count} to {count}")
        X_train_all, y_train_all = shuffle(
            X_train_all, y_train_all, random_state=202020)
        if self.df:
            X_train_all.reset_index(drop=True, inplace=True)
            if isinstance(y_train_all, pd.Series):
                y_train_all.reset_index(drop=True, inplace=True)
        X_train, y_train = X_train_all, y_train_all
        if X_val is None:
            if self.task != 'regression' and eval_method == 'holdout':
                # `first` holds the row of the first occurrence of each label;
                # these rows are put into both splits so that every label is
                # present in the training and in the validation data
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                for position in first:
                    rest.extend(range(last, position))
                    last = position + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if self.df else X_train_all[
                    first]
                # np.unique returns the labels in sorted order whereas
                # `first` is now sorted by row position, so the labels of the
                # X_first rows are read back from y_train_all to keep
                # features and labels aligned
                y_first = np.asarray(
                    y_train_all.iloc[first] if isinstance(
                        y_train_all, pd.Series) else y_train_all[first])
                X_rest = X_train_all.iloc[rest] if self.df else X_train_all[rest]
                y_rest = y_train_all.iloc[rest] if isinstance(
                    y_train_all, pd.Series) else y_train_all[rest]
                stratify = y_rest if self.split_type == 'stratified' else None
                X_train, X_val, y_train, y_val = train_test_split(
                    X_rest,
                    y_rest,
                    test_size=split_ratio,
                    stratify=stratify,
                    random_state=1)
                X_train = concat(X_first, X_train)
                y_train = concat(y_first,
                                 y_train) if self.df else np.concatenate([y_first, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(y_first,
                               y_val) if self.df else np.concatenate([y_first, y_val])
                _, y_train_counts_elements = np.unique(y_train,
                                                        return_counts=True)
                _, y_val_counts_elements = np.unique(y_val,
                                                        return_counts=True)
                logger.debug(
                    f"""{self.split_type} split for y_train \
                        {y_train_counts_elements}, \
                        y_val {y_val_counts_elements}""")
            elif eval_method == 'holdout' and self.task == 'regression':
                X_train, X_val, y_train, y_val = train_test_split(
                    X_train_all,
                    y_train_all,
                    test_size=split_ratio,
                    random_state=1)
        self.data_size = X_train.shape[0]
        self.X_train, self.y_train, self.X_val, self.y_val = (
            X_train, y_train, X_val, y_val)
        if self.split_type == "stratified":
            logger.info("Using StratifiedKFold")
            self.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1,
                                              random_state=202020)
        else:
            logger.info("Using RepeatedKFold")
            self.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1,
                                    random_state=202020)
    def prepare_sample_train_data(self, sample_size):
        full_size = len(self.y_train)
        if sample_size <= full_size:
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            sampled_y_train = self.y_train[:sample_size]
        else:
            sampled_X_train = concat(self.X_train, self.X_val)
            sampled_y_train = np.concatenate([self.y_train, self.y_val])
        return sampled_X_train, sampled_y_train
    def _compute_with_config_base(self,
                                  metric,
                                  compute_train_loss,
                                  estimator,
                                  config,
                                  sample_size):
        '''Evaluate one config of an estimator on a training sample.

        The time budget handed to compute_estimator() is the time left when
        the full training data is used (sample_size == self.data_size);
        otherwise it is half of the time left, scaled by
        sample_size / self.data_size.

        Returns:
            Whatever compute_estimator() returns.
        '''
        X_sample, y_sample = self.prepare_sample_train_data(sample_size)
        remaining = self.time_budget - self.time_from_start
        if sample_size == self.data_size:
            budget = remaining
        else:
            budget = remaining / 2 * sample_size / self.data_size
        custom_class = self._custom_learners.get(estimator)
        return compute_estimator(
            X_sample, y_sample, self.X_val, self.y_val, budget, self.kf,
            config, self.task, estimator, self.eval_method, metric,
            self._best_loss, self.n_jobs, custom_class, compute_train_loss)
    def _train_with_config(self, estimator, config, sample_size):
        '''Train an estimator with the given config on a training sample.

        The budget passed to train_estimator() is the remaining time
        (self.time_budget - self.time_from_start), or None when no time
        budget is set.

        Returns:
            A tuple (model, train_time) as returned by train_estimator().
        '''
        X_sample, y_sample = self.prepare_sample_train_data(sample_size)
        if self.time_budget is None:
            budget = None
        else:
            budget = self.time_budget - self.time_from_start
        custom_class = self._custom_learners.get(estimator)
        fitted, elapsed = train_estimator(
            X_sample, y_sample, config, self.task, estimator, self.n_jobs,
            custom_class, budget)
        return fitted, elapsed
    def add_learner(self,
                    learner_name,
                    learner_class,
                    size_estimate=lambda config: 'unknown',
                    cost_relative2lgbm=1):
        '''Register a customized learner under the given name
        Args:
            learner_name: A string of the learner's name
            learner_class: A subclass of BaseEstimator; its
                params_configsearch_info attribute is recorded as the
                search space information of the learner
            size_estimate: A function from a config to its memory size in float
            cost_relative2lgbm: A float number for the training cost ratio with
                respect to lightgbm (when both use the initial config)
        '''
        self._custom_learners[learner_name] = learner_class
        self._eti_ini[learner_name] = cost_relative2lgbm
        search_space_info = learner_class.params_configsearch_info
        self._config_space_info[learner_name] = search_space_info
        self._custom_size_estimate[learner_name] = size_estimate
    def get_estimator_from_log(self, log_file_name, record_id, objective):
        '''Build the estimator described by one record of a log file
        Args:
            log_file_name: A string of the log file name
            record_id: An integer of the record ID in the file,
                0 corresponds to the first trial
            objective: A string of the objective name,
                'binary', 'multi', or 'regression'
        Returns:
            An estimator object for the given configuration
        '''
        with training_log_reader(log_file_name) as reader:
            logged = reader.get_record(record_id)
            learner_name, learner_config = logged.learner, logged.config
        custom_class = self._custom_learners.get(learner_name)
        # no training data is passed (both are None); only the first element
        # of train_estimator's result is kept
        built, _ = train_estimator(
            None, None, learner_config, objective, learner_name,
            estimator_class=custom_class)
        return built
    def retrain_from_log(self,
                         log_file_name,
                         X_train=None,
                         y_train=None,
                         dataframe=None,
                         label=None,
                         time_budget=0,
                         task='classification',
                         eval_method='auto',
                         split_ratio=SPLIT_RATIO,
                         n_splits=N_SPLITS,
                         split_type="stratified",
                         n_jobs=1,
                         train_best=True,
                         train_full=False,
                         record_id=-1):
        '''Retrain from log file
        Args:
            log_file_name: A string of the log file name
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            dataframe: A dataframe of training data including label column
            label: A str of the label column name
            time_budget: A float number of the time budget in seconds
            task: A string of the task type, e.g.,
                'classification', 'regression'
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout']
            split_ratio: A float of the validation data percentage for holdout
            n_splits: An integer of the number of folds for cross-validation
            split_type: A string of the splitting type for classification,
                one of ['stratified', 'uniform']; any other task uses
                'uniform'
            n_jobs: An integer of the number of threads for training
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1` which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
        Returns:
            The total search time of the last log record that fits in the
            time budget (0 when `record_id >= 0` or when no record fits, in
            which case the trained estimator holds no model). None is
            returned when no record could be selected.
        '''
        self.task = task
        self._validate_data(X_train, y_train, dataframe, label)
        logger.info('log file name {}'.format(log_file_name))
        best_val_loss = float('+inf')
        sample_size = None
        training_duration = 0
        best = None
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.total_search_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            # tie on the loss: prefer the larger sample.
                            # sample_size is still None when the very first
                            # record ties with the initial +inf, and None
                            # cannot be ordered against a number
                            size = record.sample_size
                            if sample_size is None or size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            best_val_loss = val_loss
                            sample_size = record.sample_size
                if not training_duration:
                    # nothing was logged within the budget: expose an
                    # estimator without a model instead of training
                    from .model import BaseEstimator
                    self._trained_estimator = BaseEstimator()
                    self._trained_estimator.model = None
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self.y_train_all) if train_full \
            else best.sample_size
        logger.info(
            'estimator = {}, config = {}, #training instances = {}'.format(
                best_estimator, best_config, sample_size))
        # Partially copied from fit() function
        # Initialize some attributes required for retrain_from_log
        np.random.seed(0)
        self.task = task
        if self.task == 'classification':
            self.task = get_classification_objective(
                len(np.unique(self.y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self.split_type = split_type
        else:
            self.split_type = "uniform"
        if record_id >= 0:
            eval_method = 'cv'
        elif eval_method == 'auto':
            eval_method = self._decide_eval_method(time_budget)
        self.modelcount = 0
        self._prepare_data(eval_method, split_ratio, n_splits)
        # no time limit is applied to the retraining itself
        self.time_budget = None
        self.n_jobs = n_jobs
        self._trained_estimator = self._train_with_config(
            best_estimator, best_config, sample_size)[0]
        return training_duration
    def _decide_eval_method(self, time_budget):
        '''Choose between 'cv' and 'holdout' for the given time budget
        Args:
            time_budget: A float number of the time budget in seconds
        Returns:
            'holdout' when validation data is set on the instance;
            otherwise 'cv' if the data is small enough for the budget and
            has fewer than CV_HOLDOUT_THRESHOLD rows, else 'holdout'
        '''
        if self.X_val is not None:
            # user-provided validation data always means holdout
            return 'holdout'
        n_rows, n_cols = self.nrow, self.ndim
        # the budget is expressed in hours when scaling the size threshold
        fits_budget = n_rows * n_cols / 0.9 < SMALL_LARGE_THRES * (
            time_budget / 3600)
        if fits_budget and n_rows < CV_HOLDOUT_THRESHOLD:
            # time allows or sampling can be used and cv is necessary
            return 'cv'
        return 'holdout'
    def fit(self,
            X_train=None,
            y_train=None,
            dataframe=None,
            label=None,
            metric='auto',
            task='classification',
            n_jobs=-1,
            log_file_name='default.log',
            estimator_list='auto',
            time_budget=60,
            max_iter=1000000,
            sample=True,
            ensemble=False,
            eval_method='auto',
            log_type='better',
            model_history=False,
            split_ratio=SPLIT_RATIO,
            n_splits=N_SPLITS,
            log_training_metric=False,
            mem_thres=MEM_THRES,
            X_val=None,
            y_val=None,
            retrain_full=True,
            split_type="stratified",
            learner_selector='sample',
            ):
        '''Find a model for a given task
        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape n*m
            y_train: A numpy array or a pandas series of labels in shape n*1
            dataframe: A dataframe of training data including label column
            label: A str of the label column name
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy','roc_auc','f1','log_loss','mae','mse','r2'
                if passing a customized metric function, the function needs to
                have the following signature
                def metric(X_test, y_test, estimator, labels, X_train, y_train):
                    return metric_to_minimize, metrics_to_log
                which returns a float number as the minimization objective,
                and a tuple of floats as the metrics to log.
                With 'auto', 'roc_auc' is used for binary classification,
                'log_loss' for multi-class classification and 'r2' otherwise
            task: A string of the task type, e.g.,
                'classification', 'regression'
            n_jobs: An integer of the number of threads for training
            log_file_name: A string of the log file name
            estimator_list: A list of strings for estimator names, or 'auto'
                e.g., ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
            time_budget: A float number of the time budget in seconds
            max_iter: An integer of the maximal number of iterations
            sample: A boolean of whether to sample the training data during
                search; sampling is turned off when eval_method is 'cv'
            ensemble: A boolean of whether to stack the learners into an
                ensemble after the search
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout']
            split_ratio: A float of the validation data percentage for holdout
            n_splits: An integer of the number of folds for cross-validation
            log_type: A string of the log type, one of ['better', 'all', 'new']
                'better' only logs configs with better loss than previous iters
                'all' logs all the tried configs
                'new' only logs non-redundant configs
            model_history: A boolean of whether to keep the history of best
                models in the history property. Make sure memory is large
                enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes
            X_val: None | a numpy array or a pandas dataframe of validation data
            y_val: None | a numpy array or a pandas series of validation labels
            retrain_full: A boolean forwarded to the searchers as
                retrain_full; it only takes effect when the evaluation
                method is 'holdout' and no X_val is provided
            split_type: A string of the splitting type for classification,
                one of ['stratified', 'uniform']; any other task uses
                'uniform'
            learner_selector: A string of the strategy for picking the next
                learner; 'roundrobin' cycles through estimator_list, any
                other value (default 'sample') selects by estimated cost
        '''
        self.task = task
        self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
        self.start_time_flag = time.time()
        np.random.seed(0)
        self.learner_selector = learner_selector
        if self.task == 'classification':
            # replace the generic task name by the concrete objective
            self.task = get_classification_objective(
                len(np.unique(self.y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self.split_type = split_type
        else:
            self.split_type = "uniform"
        if 'auto' == estimator_list:
            estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
            if 'regression' != self.task:
                estimator_list += ['lrl1', ]
        logger.info(
            "List of ML learners in AutoML Run: {}".format(estimator_list))
        if eval_method == 'auto' or self.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self.eval_method = eval_method
        logger.info("Evaluation method: {}".format(eval_method))
        self.retrain_full = retrain_full and (eval_method == 'holdout'
                                              and self.X_val is None)
        self.sample = sample and (eval_method != 'cv')
        if 'auto' == metric:
            # look at the resolved objective in self.task: the raw `task`
            # argument is 'classification' for both binary and multi-class
            # problems and would always fall through to 'r2'
            if 'binary' in self.task:
                metric = 'roc_auc'
            elif 'multi' in self.task:
                metric = 'log_loss'
            else:
                metric = 'r2'
        if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap']:
            # these scores are maximized, so the logged error is 1-score
            error_metric = f"1-{metric}"
        elif isinstance(metric, str):
            error_metric = metric
        else:
            error_metric = 'customized metric'
        logger.info(f'Minimizing error metric: {error_metric}')
        with training_log_writer(log_file_name) as save_helper:
            self.save_helper = save_helper
            self._prepare_data(eval_method, split_ratio, n_splits)
            self._compute_with_config = partial(AutoML._compute_with_config_base,
                                                self,
                                                metric,
                                                log_training_metric)
            self.time_budget = time_budget
            self.estimator_list = estimator_list
            self.ensemble = ensemble
            self.max_iter = max_iter
            self.mem_thres = mem_thres
            self.log_type = log_type
            self.split_ratio = split_ratio
            self.save_model_history = model_history
            self.n_jobs = n_jobs
            self.search()
            logger.info("fit succeeded")
    def search(self):
        '''Run the search loop over the learners in self.estimator_list
        Each iteration picks one learner, creates its searcher on first use
        or advances it by one step, and keeps track of the best loss,
        estimator and configuration found so far. The loop ends after
        max_iter iterations, when no learner is selected, or when the time
        budget is used up. Afterwards the best trained estimator is stored
        in self._trained_estimator; with self.ensemble set, a stacking
        ensemble of the learners is fitted on all the training data and
        stored instead.
        '''
        self.searchers = {}
        # initialize the searchers
        self.eti = []
        self._best_loss = float('+inf')
        self.best_train_time = 0
        self.time_from_start = 0
        self.estimator_index = -1
        self._best_iteration = 0
        self._model_history = {}
        self._config_history = {}
        self.max_iter_per_learner = 10000  # TODO
        self.iter_per_learner = {e: 0 for e in self.estimator_list}
        self.fullsize = False
        self._trained_estimator = None
        if self.ensemble:
            self.best_model = {}
        for self._track_iter in range(self.max_iter):
            if self.estimator_index == -1:
                # very first iteration: start with the first learner
                estimator = self.estimator_list[0]
            else:
                estimator = self._select_estimator(self.estimator_list)
                if not estimator:
                    break
            logger.info(f"iteration {self._track_iter}"
                        f"  current learner {estimator}")
            if estimator in self.searchers:
                # remember the previous model so it can be cleaned up below
                model = self.searchers[estimator].trained_estimator
                improved = self.searchers[estimator].search1step(
                    global_best_loss=self._best_loss,
                    retrain_full=self.retrain_full,
                    mem_thres=self.mem_thres)
            else:
                model = improved = None
                self.searchers[estimator] = ParamSearch(
                    estimator,
                    self.data_size,
                    self._compute_with_config,
                    self._train_with_config,
                    self.save_helper,
                    MIN_SAMPLE_TRAIN if self.sample else self.data_size,
                    self.task,
                    self.log_type,
                    self._config_space_info.get(estimator),
                    self._custom_size_estimate.get(estimator),
                    self.split_ratio)
                self.searchers[estimator].search_begin(self.time_budget,
                                                       self.start_time_flag)
                if self.estimator_index == -1:
                    # seed the expected time of the untried learners from
                    # the first learner's estimate, scaled by the initial
                    # cost ratios in self._eti_ini
                    eti_base = self._eti_ini[estimator]
                    self.eti.append(
                        self.searchers[estimator]
                            .expected_time_improvement_search())
                    for e in self.estimator_list[1:]:
                        self.eti.append(
                            self._eti_ini[e] / eti_base * self.eti[0])
                    self.estimator_index = 0
            self.time_from_start = time.time() - self.start_time_flag
            # logger.info(f"{self.searchers[estimator].sample_size}, {data_size}")
            if self.searchers[estimator].sample_size == self.data_size:
                self.iter_per_learner[estimator] += 1
                if not self.fullsize:
                    self.fullsize = True
            if self.searchers[estimator].best_loss < self._best_loss:
                # new global best: record it
                self._best_loss = self.searchers[estimator].best_loss
                self._best_estimator = estimator
                self.best_train_time = self.searchers[estimator].train_time
                self._config_history[self._track_iter] = (
                    estimator,
                    self.searchers[estimator].best_config[0],
                    self.time_from_start)
                if self.save_model_history:
                    self._model_history[self._track_iter] = self.searchers[
                        estimator].trained_estimator.model
                elif self._trained_estimator:
                    del self._trained_estimator
                    self._trained_estimator = None
                self._trained_estimator = self.searchers[
                    estimator].trained_estimator
                self._best_iteration = self._track_iter
            if model and improved and not self.save_model_history:
                model.cleanup()
            logger.info(
                " at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
                    self.time_from_start,
                    estimator,
                    self.searchers[estimator].best_loss,
                    self._best_estimator,
                    self._best_loss))
            if self.time_from_start >= self.time_budget:
                break
            if self.ensemble:
                # remaining time, computed as in _select_estimator; the
                # reversed difference is never positive at this point, so
                # the early stop below could never trigger
                time_left = self.time_budget - self.time_from_start
                time_ensemble = self.searchers[self._best_estimator].train_time
                if time_left < time_ensemble < 2 * time_left:
                    break
            if self.searchers[
                    estimator].train_time > self.time_budget - self.time_from_start:
                # one more training would not fit: retire this learner
                self.iter_per_learner[estimator] = self.max_iter_per_learner
        # Add a checkpoint for the current best config to the log.
        self.save_helper.checkpoint()
        if self.searchers:
            self._selected = self.searchers[self._best_estimator]
            self._trained_estimator = self._selected.trained_estimator
            self.modelcount = sum(self.searchers[estimator].model_count
                                  for estimator in self.searchers)
            logger.info(self._trained_estimator.model)
            if self.ensemble:
                searchers = list(self.searchers.items())
                searchers.sort(key=lambda x: x[1].best_loss)
                # always stack the two best learners; the others only when
                # their loss is below 4 times the best loss
                estimators = [(x[0], x[1].trained_estimator) for x in searchers[
                    :2]]
                estimators += [(x[0], x[1].trained_estimator) for x in searchers[
                    2:] if x[1].best_loss < 4 * self._selected.best_loss]
                logger.info(estimators)
                if self.task != "regression":
                    from sklearn.ensemble import StackingClassifier as Stacker
                    for e in estimators:
                        e[1]._estimator_type = 'classifier'
                else:
                    from sklearn.ensemble import StackingRegressor as Stacker
                best_m = self._trained_estimator
                stacker = Stacker(estimators, best_m, n_jobs=self.n_jobs,
                                  passthrough=True)
                stacker.fit(self.X_train_all, self.y_train_all)
                self._trained_estimator = stacker
                self._trained_estimator.model = stacker
        else:
            self._selected = self._trained_estimator = None
            self.modelcount = 0
    def __del__(self):
        '''Clean up and drop the trained estimator when the object dies
        Nothing happens unless a truthy _trained_estimator with a cleanup
        attribute is present.
        '''
        trained = getattr(self, '_trained_estimator', None)
        if not trained:
            return
        if not hasattr(trained, 'cleanup'):
            return
        trained.cleanup()
        del self._trained_estimator
    def _select_estimator(self, estimator_list):
        '''Choose the learner to try in the next search iteration
        The choice is made in this order:
        1. the current best learner, when the remaining time is between one
           and two times its training time and its best config has not been
           tried on the full sample yet;
        2. the next learner in the list, when learner_selector is
           'roundrobin';
        3. the untried learner with the smallest expected time, if it beats
           all the tried ones;
        4. a tried learner drawn at random with probability proportional to
           the inverse of its expected time.
        Args:
            estimator_list: A list of strings for estimator names
        Returns:
            A string of the selected estimator name, or None when no learner
            can be selected
        '''
        time_left = self.time_budget - self.time_from_start
        if self.best_train_time < time_left < 2 * self.best_train_time:
            best_searcher = self.searchers[self._best_estimator]
            config_sig = best_searcher.get_hist_config_sig(
                best_searcher.sample_size_full,
                best_searcher.best_config[0])
            if config_sig not in best_searcher.config_tried:
                # trainAll
                return self._best_estimator
        if self.learner_selector == 'roundrobin':
            self.estimator_index += 1
            if self.estimator_index == len(estimator_list):
                self.estimator_index = 0
            return estimator_list[self.estimator_index]
        min_expected_time, selected = np.inf, None
        # inv[i] is the sampling weight of estimator_list[i];
        # 0 excludes the learner from the random draw
        inv = []
        for i, estimator in enumerate(estimator_list):
            if estimator in self.searchers:
                searcher = self.searchers[estimator]
                if self.iter_per_learner[estimator] >= self.max_iter_per_learner:
                    inv.append(0)
                    continue
                eti_searcher = min(2 * searcher.train_time,
                                   searcher.expected_time_improvement_search())
                gap = searcher.best_loss - self._best_loss
                if gap > 0 and not self.ensemble:
                    # estimate the time needed to close the gap to the
                    # global best from the recent rate of improvement
                    delta_loss = searcher.old_loss - searcher.new_loss
                    delta_time = searcher.old_loss_time + \
                        searcher.new_loss_time - searcher.old_train_time
                    speed = delta_loss / float(delta_time)
                    try:
                        expected_time = max(gap / speed, searcher.train_time)
                    except ZeroDivisionError:
                        # a single message string: a second positional
                        # argument would be taken as the warning category
                        warnings.warn(
                            "ZeroDivisionError: need to debug "
                            "speed: {0}, "
                            "old_loss: {1}, "
                            "new_loss: {2}"
                            .format(speed,
                                    searcher.old_loss,
                                    searcher.new_loss))
                        expected_time = 0.0
                    expected_time = 2 * max(expected_time, eti_searcher)
                else:
                    expected_time = eti_searcher
                if expected_time == 0:
                    expected_time = 1e-10
                inv.append(1 / expected_time)
            else:
                # untried learner: use the initial estimate
                expected_time = self.eti[i]
                inv.append(0)
            if expected_time < min_expected_time:
                min_expected_time = expected_time
                selected = estimator
        if len(self.searchers) < len(estimator_list) or not selected:
            if selected not in self.searchers:
                # print('select',selected,'eti',min_expected_time)
                return selected
        s = sum(inv)
        p = np.random.random()
        q = 0
        for i, weight in enumerate(inv):
            if weight:
                q += weight / s
                if p < q:
                    return estimator_list[i]
        # no learner has a positive weight (or rounding left p >= q)
        return None
--- a/flaml/config.py
+++ b/flaml/config.py
@ -0,0 +1,31 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 # number of folds for cross-validation
 # (presumably the default of `n_splits` in AutoML.fit -- confirm the import)
 N_SPLITS = 5
 RANDOM_SEED = 1
 # validation data percentage for holdout
 # (presumably the default of `split_ratio` in AutoML.fit)
 SPLIT_RATIO = 0.1
 HISTORY_SIZE = 10_000_000
 # 4 GiB; presumably the default memory size constraint in bytes
 MEM_THRES = 4 * 1024 ** 3
 SMALL_LARGE_THRES = 10_000_000
 MIN_SAMPLE_TRAIN = 10_000
 MIN_SAMPLE_VAL = 10_000
 CV_HOLDOUT_THRESHOLD = 100_000
 BASE_Const = 2
 BASE_LOWER_BOUND = 2 ** 0.01
 # one number per learner name; presumably the initial training cost
 # relative to 'lgbm' -- TODO confirm against the code that reads it
 ETI_INI = {
    'lgbm': 1,
    'xgboost': 1.6,
    'xgboost_nb': 1.6,
    'rf': 2,
    'lrl1': 160,
    'lrl2': 25,
    'linear_svc': 16,
    'kneighbor': 30,
    'catboost': 15,
    'extra_tree': 1.9,
    'nn': 50,
 }
--- a/flaml/data.py
+++ b/flaml/data.py
@ -0,0 +1,256 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 import numpy as np
 from scipy.sparse import vstack, issparse
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 from .training_log import training_log_reader
 def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
    '''Load dataset from open ML.
    If the file is not cached locally, download it from open ML.
    Args:
        dataset_id: An integer of the dataset id in openml
        data_dir: A string of the path to store and load the data;
            when None, the dataset is downloaded without being cached
        random_state: An integer of the random seed for splitting data
    Returns:
        X_train: A 2d numpy array of training data
        X_test:  A 2d numpy array of test data
        y_train: A 1d numpy array of labels for training data
        y_test:  A 1d numpy array of labels for test data
    '''
    import os
    import openml
    import pickle
    from sklearn.model_selection import train_test_split
    filename = 'openml_ds' + str(dataset_id) + '.pkl'
    # data_dir defaults to None, which os.path.join cannot handle
    filepath = None if data_dir is None else os.path.join(data_dir, filename)
    if filepath is not None and os.path.isfile(filepath):
        print('load dataset from', filepath)
        # NOTE: unpickling can run arbitrary code; only point data_dir at
        # cache files written by this function
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
    else:
        print('download dataset from openml')
        dataset = openml.datasets.get_dataset(dataset_id)
        if filepath is not None:
            if data_dir:
                os.makedirs(data_dir, exist_ok=True)
            with open(filepath, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    print('Dataset name:', dataset.name)
    # only the first two returned values (data and target) are used
    X, y, *__ = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format='array')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    print(
        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
        )
    )
    return X_train, X_test, y_train, y_test
 def load_openml_task(task_id, data_dir):
    '''Load task from open ML.
    Use the first fold of the task.
    If the file is not cached locally, download it from open ML.
    Args:
        task_id: An integer of the task id in openml
        data_dir: A string of the path to store and load the data;
            the directory is created if it does not exist
    Returns:
        X_train: A 2d numpy array of training data
        X_test:  A 2d numpy array of test data
        y_train: A 1d numpy array of labels for training data
        y_test:  A 1d numpy array of labels for test data
    '''
    import os
    import openml
    import pickle
    # the task itself is always fetched: it provides the target name and
    # the train/test split indices; only the dataset is cached
    task = openml.tasks.get_task(task_id)
    filename = 'openml_task' + str(task_id) + '.pkl'
    filepath = os.path.join(data_dir, filename)
    if os.path.isfile(filepath):
        print('load dataset from', filepath)
        # NOTE: unpickling can run arbitrary code; only point data_dir at
        # cache files written by this function
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
    else:
        print('download dataset from openml')
        dataset = task.get_dataset()
        # same as load_openml_dataset: make sure the cache directory exists
        if data_dir:
            os.makedirs(data_dir, exist_ok=True)
        with open(filepath, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    X, y, _, _ = dataset.get_data(task.target_name, dataset_format='array')
    train_indices, test_indices = task.get_train_test_split_indices(
        repeat=0,
        fold=0,
        sample=0,
    )
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    print(
        'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
        )
    )
    return X_train, X_test, y_train, y_test
 def get_output_from_log(filename, time_budget):
    '''Get output from log file
    Only the records whose total search time is below time_budget are
    reported.
    Args:
        filename: A string of the log file name
        time_budget: A float of the time budget in seconds
    Returns:
        training_time_list: A list of the finished time of each logged iter
        best_error_list:
            A list of the best validation error after each logged iter
        error_list: A list of the validation error of each logged iter
        config_list:
            A list of the estimator, sample size and config of each logged iter
        logged_metric_list: A list of the logged metric of each logged iter
    '''
    best_config = None
    best_learner = None
    best_val_loss = float('+inf')
    training_time_list = []
    config_list = []
    best_error_list = []
    error_list = []
    logged_metric_list = []
    with training_log_reader(filename) as reader:
        for record in reader.records():
            time_used = record.total_search_time
            val_loss = record.validation_loss
            config = record.config
            # keep only the part of the learner name before the first '_'
            learner = record.learner.split('_')[0]
            sample_size = record.sample_size
            train_loss = record.logged_metric
            if time_used < time_budget:
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_config = config
                    best_learner = learner
                training_time_list.append(time_used)
                best_error_list.append(best_val_loss)
                logged_metric_list.append(train_loss)
                error_list.append(val_loss)
                config_list.append({"Current Learner": learner,
                                    "Current Sample": sample_size,
                                    "Current Hyper-parameters": record.config,
                                    "Best Learner": best_learner,
                                    "Best Hyper-parameters": best_config})
    return (training_time_list, best_error_list, error_list, config_list,
            logged_metric_list)
 def concat(X1, X2):
    '''stack two datasets vertically
    pandas inputs get a fresh 0..n-1 index and the category columns of X1
    are cast back to category dtype; sparse inputs are stacked with scipy
    vstack and anything else with numpy concatenate
    '''
    if isinstance(X1, (pd.DataFrame, pd.Series)):
        is_frame = isinstance(X1, pd.DataFrame)
        if is_frame:
            cat_columns = X1.select_dtypes(include='category').columns
        stacked = pd.concat([X1, X2], sort=False)
        stacked.reset_index(drop=True, inplace=True)
        if is_frame and len(cat_columns):
            stacked[cat_columns] = stacked[cat_columns].astype('category')
        return stacked
    if issparse(X1):
        return vstack((X1, X2))
    return np.concatenate([X1, X2])
 class DataTransformer:
    '''transform X, y
    fit_transform learns the preprocessing from the training data and
    transform applies it to new data. Only pandas dataframes are
    preprocessed; any other type of X is passed through unchanged.
    '''
    def fit_transform(self, X, y, objective):
        '''Fit the preprocessing on X, y and return the transformed data
        For a dataframe X (a copy is modified, not the input):
        object/category columns that are constant or whose non-null
        values are all distinct are dropped, the others get their missing
        values replaced by '__NAN__' and become category columns;
        other columns with fewer than 2 distinct values are dropped and
        the remaining ones are imputed with the median.
        Args:
            X: The training data
            y: The labels
            objective: A string of the objective name; labels are encoded
                with a LabelEncoder unless it is 'regression'
        Returns:
            A tuple of the transformed X and y
        '''
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            n = X.shape[0]
            cat_columns, num_columns = [], []
            for column in X.columns:
                if X[column].dtype.name in ('object', 'category'):
                    if X[column].nunique() == 1 or X[column].nunique(
                            dropna=True) == n - X[column].isnull().sum():
                        X.drop(columns=column, inplace=True)
                    elif X[column].dtype.name == 'category':
                        current_categories = X[column].cat.categories
                        if '__NAN__' not in current_categories:
                            X[column] = X[column].cat.add_categories(
                                '__NAN__').fillna('__NAN__')
                        cat_columns.append(column)
                    else:
                        # assign the result back: an in-place fillna on a
                        # column selection is not guaranteed to reach X
                        X[column] = X[column].fillna('__NAN__')
                        cat_columns.append(column)
                else:
                    # print(X[column].dtype.name)
                    if X[column].nunique(dropna=True) < 2:
                        X.drop(columns=column, inplace=True)
                    else:
                        X[column] = X[column].fillna(np.nan)
                        num_columns.append(column)
            X = X[cat_columns + num_columns]
            if cat_columns:
                X[cat_columns] = X[cat_columns].astype('category')
            if num_columns:
                from sklearn.impute import SimpleImputer
                from sklearn.compose import ColumnTransformer
                self.transformer = ColumnTransformer([(
                    'continuous',
                    SimpleImputer(missing_values=np.nan, strategy='median'),
                    num_columns)])
                X[num_columns] = self.transformer.fit_transform(X)
            self.cat_columns, self.num_columns = cat_columns, num_columns
        if objective == 'regression':
            self.label_transformer = None
        else:
            from sklearn.preprocessing import LabelEncoder
            self.label_transformer = LabelEncoder()
            y = self.label_transformer.fit_transform(y)
        return X, y
    def transform(self, X):
        '''Apply the preprocessing learned by fit_transform to new data
        Args:
            X: The data to transform; a dataframe is reduced to the columns
                kept by fit_transform and processed on a copy
        Returns:
            The transformed X
        '''
        if isinstance(X, pd.DataFrame):
            cat_columns, num_columns = self.cat_columns, self.num_columns
            X = X[cat_columns + num_columns].copy()
            for column in cat_columns:
                # print(column, X[column].dtype.name)
                if X[column].dtype.name == 'object':
                    X[column] = X[column].fillna('__NAN__')
                elif X[column].dtype.name == 'category':
                    current_categories = X[column].cat.categories
                    if '__NAN__' not in current_categories:
                        X[column] = X[column].cat.add_categories(
                            '__NAN__').fillna('__NAN__')
            if cat_columns:
                X[cat_columns] = X[cat_columns].astype('category')
            if num_columns:
                # X[num_columns] is a new dataframe, so an in-place fillna
                # on it would be lost; assign the result back instead
                X[num_columns] = X[num_columns].fillna(np.nan)
                X[num_columns] = self.transformer.transform(X)
        return X
--- a/flaml/ml.py
+++ b/flaml/ml.py
@ -0,0 +1,241 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 from .model import *
 import time
 from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
        f1_score
 import numpy as np
 from sklearn.model_selection import RepeatedStratifiedKFold
 def get_estimator_class(objective_name, estimator_name):
    '''Map a learner name to the estimator class implementing it

    The name is matched by substring, in the order of the checks below.
    When adding a new learner, add a new check before the final raise.

    Args:
        objective_name: A string of the objective name; only used to choose
            between the two xgboost wrappers
        estimator_name: A string containing the learner keyword

    Returns:
        The estimator class of the learner

    Raises:
        ValueError: estimator_name matches no built-in learner
    '''
    if 'xgboost' in estimator_name:
        return XGBoostEstimator if 'regression' in objective_name \
            else XGBoostSklearnEstimator
    if 'rf' in estimator_name:
        return RandomForestEstimator
    if 'lgbm' in estimator_name:
        return LGBMEstimator
    if 'lrl1' in estimator_name:
        return LRL1Classifier
    if 'lrl2' in estimator_name:
        return LRL2Classifier
    if 'catboost' in estimator_name:
        return CatBoostEstimator
    if 'extra_tree' in estimator_name:
        return ExtraTreeEstimator
    if 'kneighbor' in estimator_name:
        return KNeighborsEstimator
    raise ValueError(estimator_name + ' is not a built-in learner. '
        'Please use AutoML.add_learner() to add a customized learner.')
 def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None):
    '''Compute a loss (lower is better) with one of the built-in metrics

    Score-like metrics (r2, accuracy, roc_auc, f1, ap) are turned into
    losses by subtracting them from 1.

    Args:
        metric_name: A string of the metric name (case-insensitive), one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss',
            'f1', 'ap'
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
        y_true: A 1d numpy array of the true labels
        labels: A 1d numpy array of the unique labels; only passed on to
            log_loss

    Returns:
        score: A float number of the loss, the lower the better

    Raises:
        ValueError: metric_name is not a built-in metric
    '''
    name = metric_name.lower()
    # the order of the checks matters because several use substring matching
    if 'r2' in name:
        return 1.0 - r2_score(y_true, y_predict)
    if name == 'rmse':
        return np.sqrt(mean_squared_error(y_true, y_predict))
    if name == 'mae':
        return mean_absolute_error(y_true, y_predict)
    if name == 'mse':
        return mean_squared_error(y_true, y_predict)
    if name == 'accuracy':
        return 1.0 - accuracy_score(y_true, y_predict)
    if 'roc_auc' in name:
        return 1.0 - roc_auc_score(y_true, y_predict)
    if 'log_loss' in name:
        return log_loss(y_true, y_predict, labels=labels)
    if 'f1' in name:
        return 1 - f1_score(y_true, y_predict)
    if 'ap' in name:
        return 1 - average_precision_score(y_true, y_predict)
    raise ValueError(name+' is not a built-in metric, '
    'currently built-in metrics are: '
    'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. '
    'please pass a customized metric function to AutoML.fit(metric=func)')
 def get_y_pred(estimator, X, eval_metric, obj):
    '''Produce the kind of prediction the metric needs

    Args:
        estimator: A fitted learner exposing predict() and predict_proba()
        X: The features to predict on
        eval_metric: The metric the prediction will be scored with
        obj: A string of the objective name

    Returns:
        For 'roc_auc'/'ap' on a binary objective: the second column of
        predict_proba() (or its output as-is when it is 1d);
        for 'log_loss', or 'roc_auc' on a non-binary objective: the full
        predict_proba() output; otherwise the output of predict()
    '''
    wants_positive_proba = eval_metric in ['roc_auc', 'ap'] and 'binary' in obj
    if wants_positive_proba:
        proba = estimator.predict_proba(X)
        return proba[:, 1] if proba.ndim>1 else proba
    if eval_metric in ['log_loss', 'roc_auc']:
        return estimator.predict_proba(X)
    return estimator.predict(X)
 def get_test_loss(estimator, X_train, y_train, X_test, y_test, eval_metric, obj,
 labels=None, budget=None, train_loss=False):
    '''Fit the estimator on the training data and score it on the test data

    Args:
        estimator: A learner exposing fit(X, y, budget)
        X_train, y_train: The training data
        X_test, y_test: The data the test loss is computed on
        eval_metric: A string of a built-in metric name, or a customized
            function called as
            eval_metric(X_test, y_test, estimator, labels, X_train, y_train)
            and returning (test_loss, train_loss)
        obj: A string of the objective name
        labels: The unique labels, forwarded to the metric
        budget: The time budget forwarded to estimator.fit
        train_loss: For a built-in metric, the training loss is computed
            unless this compares equal to False

    Returns:
        test_loss, train_time, train_loss, where train_time is the wall
        clock time of fitting plus evaluation, and train_loss is the
        unchanged argument when no training loss was computed
    '''
    began = time.time()
    estimator.fit(X_train, y_train, budget)
    if not isinstance(eval_metric, str):
        # customized metric function
        test_loss, train_loss = eval_metric(
            X_test, y_test, estimator, labels, X_train, y_train)
    else:
        predictions = get_y_pred(estimator, X_test, eval_metric, obj)
        test_loss = sklearn_metric_loss_score(
            eval_metric, predictions, y_test, labels)
        if train_loss != False:
            predictions = get_y_pred(estimator, X_train, eval_metric, obj)
            train_loss = sklearn_metric_loss_score(
                eval_metric, predictions, y_train, labels)
    elapsed = time.time()-began
    return test_loss, elapsed, train_loss
 def train_model(estimator, X_train, y_train, budget):
    '''Fit the estimator within the budget and return what its fit() returns'''
    return estimator.fit(X_train, y_train, budget)
 def evaluate_model(estimator, X_train, y_train, X_val, y_val, budget, kf,
 objective_name, eval_method, eval_metric, best_val_loss, train_loss=False):
    '''Evaluate an estimator by holdout or by cross validation

    Holdout is used when eval_method contains 'holdout'; any other value
    selects cross validation with the splitter kf.

    Returns:
        val_loss, train_loss, train_time as produced by the chosen
        evaluation routine
    '''
    if 'holdout' not in eval_method:
        outcome = evaluate_model_CV(
            estimator, X_train, y_train, budget, kf, objective_name, 
            eval_metric, best_val_loss, train_loss=train_loss)
    else:
        outcome = evaluate_model_holdout(
            estimator, X_train, y_train, X_val, y_val, budget, 
            objective_name, eval_metric, best_val_loss, train_loss=train_loss)
    val_loss, train_loss, train_time = outcome
    return val_loss, train_loss, train_time
 def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val, budget,
 objective_name, eval_metric, best_val_loss, train_loss=False):
    '''Fit on the training data and score on the held-out validation data

    best_val_loss is accepted for signature parity with the cross
    validation routine and is not used here.

    Returns:
        val_loss, train_loss, train_time
    '''
    measured = get_test_loss(
        estimator, X_train, y_train, X_val, y_val, eval_metric, objective_name,
        budget=budget, train_loss=train_loss)
    # get_test_loss reports (loss, time, train loss); reorder for the caller
    val_loss, train_time, train_loss = measured
    return val_loss, train_loss, train_time
 def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
 objective_name, eval_metric, best_val_loss, train_loss=False):
    '''Evaluate an estimator by cross validation within a time budget

    Each fold is trained with a budget of budget/(n+1), n being
    kf.get_n_splits(). The loop stops early once the elapsed time reaches
    the budget. If the validation loss beats best_val_loss and more than
    one per-fold budget remains, the estimator is refitted on all the data.

    Args:
        estimator: A learner exposing fit(X, y, budget) and cleanup()
        X_train_all: The features, a pandas DataFrame or an indexable array
        y_train_all: The labels, a pandas Series or an indexable array
        budget: A float of the time budget in seconds
        kf: A splitter exposing get_n_splits() and split()
        objective_name: A string of the objective name
        eval_metric: A built-in metric name or a customized metric function
        best_val_loss: The best validation loss found so far
        train_loss: Whether to report the training loss; it is reported
            unless this compares equal to False

    Returns:
        val_loss, train_loss, train_time, where train_loss is averaged over
        the folds that were actually evaluated (or is the unchanged
        argument when not requested)
    '''
    start_time = time.time()
    total_val_loss = total_train_loss = 0
    train_time = 0
    valid_fold_num = 0
    # number of folds that contributed to total_train_loss
    folds_evaluated = 0
    n = kf.get_n_splits()
    if objective_name=='regression':
        labels = None
    else:
        labels = np.unique(y_train_all)
    if isinstance(kf, RepeatedStratifiedKFold):
        # stratified splitting needs the labels
        splits = kf.split(X_train_all, y_train_all)
    else:
        splits = kf.split(X_train_all)
    rng = np.random.RandomState(2020)
    val_loss_list = []
    budget_per_train = budget / (n+1)
    for train_index, val_index in splits:
        train_index = rng.permutation(train_index)
        if isinstance(X_train_all, pd.DataFrame):
            X_train, X_val = X_train_all.iloc[
                train_index], X_train_all.iloc[val_index]
        else:
            X_train, X_val = X_train_all[
                train_index], X_train_all[val_index]
        if isinstance(y_train_all, pd.Series):
            y_train, y_val = y_train_all.iloc[
                train_index], y_train_all.iloc[val_index]
        else:
            y_train, y_val = y_train_all[
                train_index], y_train_all[val_index]
        estimator.cleanup()
        val_loss_i, train_time_i, train_loss_i = get_test_loss(
            estimator, X_train, y_train, X_val, y_val, eval_metric, 
            objective_name, labels, budget_per_train, train_loss=train_loss)
        valid_fold_num += 1
        folds_evaluated += 1
        total_val_loss += val_loss_i
        if train_loss != False:
            # the first value replaces the initial 0 instead of being added
            if total_train_loss != 0:
                total_train_loss += train_loss_i
            else:
                total_train_loss = train_loss_i
        train_time += train_time_i
        if valid_fold_num == n:
            val_loss_list.append(total_val_loss/valid_fold_num)
            total_val_loss = valid_fold_num = 0
        elif time.time() - start_time >= budget:
            # out of time: report the average of the folds finished so far
            val_loss_list.append(total_val_loss/valid_fold_num)
            break
    val_loss = np.max(val_loss_list)
    if train_loss != False:
        # average over the folds that ran, not over n: the loop may have
        # stopped early on the budget
        train_loss = total_train_loss/folds_evaluated
    budget -= time.time() - start_time
    if val_loss < best_val_loss and budget > budget_per_train:
        estimator.cleanup()
        train_time_full = estimator.fit(X_train_all, y_train_all, budget)
        train_time += train_time_full
    return val_loss, train_loss, train_time
 def compute_estimator(X_train, y_train, X_val, y_val, budget, kf,
 config_dic, objective_name, estimator_name, eval_method, eval_metric, 
 best_val_loss = np.inf, n_jobs=1, estimator_class=None, train_loss=False):
    '''Build an estimator from a configuration and evaluate it

    Args:
        X_train, y_train: The training data
        X_val, y_val: The validation data, passed on to evaluate_model
        budget: A float of the time budget in seconds
        kf: The cross validation splitter, passed on to evaluate_model
        config_dic: A dictionary of the hyperparameters passed to the
            estimator constructor
        objective_name: A string of the objective name
        estimator_name: A string of the learner name, used to look up the
            estimator class when estimator_class is not given
        eval_method: A string of the evaluation method
        eval_metric: A built-in metric name or a customized metric function
        best_val_loss: The best validation loss found so far
        n_jobs: An integer of the number of parallel threads
        estimator_class: The estimator class to instantiate, or None to
            look it up from estimator_name
        train_loss: Whether to report the training loss

    Returns:
        estimator, val_loss, train_loss, train_time, all_time, where
        all_time is the wall clock time spent in this function
    '''
    start_time = time.time()
    estimator_class = estimator_class or get_estimator_class(
        objective_name, estimator_name)
    estimator = estimator_class(
        **config_dic, objective_name = objective_name, n_jobs=n_jobs)
    val_loss, train_loss, train_time = evaluate_model(
        estimator, X_train, y_train, X_val, y_val, budget, kf, objective_name, 
        eval_method, eval_metric, best_val_loss, train_loss=train_loss)
    all_time = time.time() - start_time
    return estimator, val_loss, train_loss, train_time, all_time
 def train_estimator(X_train, y_train, config_dic, objective_name,
 estimator_name, n_jobs=1, estimator_class=None, budget=None):
    '''Build an estimator from a configuration and train it

    Args:
        X_train, y_train: The training data; when X_train is None nothing
            is trained
        config_dic: A dictionary of the hyperparameters passed to the
            estimator constructor
        objective_name: A string of the objective name
        estimator_name: A string of the learner name, used to look up the
            estimator class when estimator_class is not given
        n_jobs: An integer of the number of parallel threads
        estimator_class: The estimator class to instantiate
        budget: The time budget passed on to training

    Returns:
        estimator, train_time. With training data, estimator is the fitted
        wrapper; without, it is an unfitted instance of the wrapper's
        underlying estimator_class built from the wrapper's params.
        train_time is the wall clock time spent in this function
    '''
    began = time.time()
    if not estimator_class:
        estimator_class = get_estimator_class(objective_name, estimator_name)
    estimator = estimator_class(
        **config_dic, objective_name=objective_name, n_jobs=n_jobs)
    if X_train is None:
        estimator = estimator.estimator_class(**estimator.params)
    else:
        train_model(estimator, X_train, y_train, budget)
    return estimator, time.time() - began
 def get_classification_objective(num_labels: int) -> str:
    '''Return 'binary:logistic' for exactly two labels, else 'multi:softmax' '''
    return 'binary:logistic' if num_labels == 2 else 'multi:softmax'
--- a/flaml/model.py
+++ b/flaml/model.py
@ -0,0 +1,515 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 import numpy as np
 import xgboost as xgb
 from xgboost import XGBClassifier, XGBRegressor
 import time
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
 from lightgbm import LGBMClassifier, LGBMRegressor
 import scipy.sparse
 import pandas as pd
 class BaseEstimator:
    '''The abstract class for all learners

    A subclass sets self.estimator_class to the underlying model class and
    self.params to the keyword arguments that class is built with.

    Typical example:
        XGBoostEstimator: for regression
        XGBoostSklearnEstimator: for classification
        LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: 
            for both regression and classification        
    '''

    def __init__(self, objective_name = 'binary:logistic', 
        **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name, one of
                'binary:logistic', 'multi:softmax', 'regression'
            params: A dictionary of the hyperparameter names and values;
                an '_estimator_type' entry overrides the estimator type
                derived from objective_name
        '''
        self.params = params
        self.estimator_class = None
        self.objective_name = objective_name
        derived_type = "regressor" if objective_name=='regression' \
            else "classifier"
        self._estimator_type = params.get('_estimator_type', derived_type)

    def get_params(self, deep=False):
        '''Return a copy of the hyperparameters plus the objective name
        and, when set, the estimator type'''
        result = {**self.params, "objective_name": self.objective_name}
        if hasattr(self, '_estimator_type'):
            result['_estimator_type'] = self._estimator_type
        return result

    @property
    def classes_(self):
        '''The classes_ attribute of the trained model'''
        return self.model.classes_

    def preprocess(self, X):
        '''Hook for feature preprocessing; the base class returns X as is'''
        return X

    def _fit(self, X_train, y_train):    
        '''Build a fresh model from self.params, fit it on the preprocessed
        data, store it in self.model and return the elapsed seconds'''
        began = time.time()
        features = self.preprocess(X_train)
        fitted = self.estimator_class(**self.params)
        fitted.fit(features, y_train)
        elapsed = time.time() - began
        self.model = fitted
        return elapsed

    def fit(self, X_train, y_train, budget=None):    
        '''Train the model from given training data

        Args:
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            budget: A float of the time budget in seconds; not used by
                the base implementation

        Returns:
            train_time: A float of the training time in seconds
        '''
        return self._fit(X_train, y_train)

    def predict(self, X_test):
        '''Predict label from features

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*1. 
            Each element is the label for a instance
        '''      
        return self.model.predict(self.preprocess(X_test))

    def predict_proba(self, X_test):
        '''Predict the probability of each class from features

        Only works for classification problems

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*c. c is the # classes
            Each element at (i,j) is the probability for instance i to be in
                class j

        Raises:
            ValueError: the objective is a regression objective
        '''
        if 'regression' in self.objective_name:
            print('Regression tasks do not support predict_prob')
            raise ValueError
        return self.model.predict_proba(self.preprocess(X_test))

    def cleanup(self):
        '''Hook for releasing resources between fits; no-op by default'''
        pass
 class SKLearnEstimator(BaseEstimator):
    '''Base for learners that need categorical columns as integer codes'''

    def preprocess(self, X):
        '''Return a copy of a DataFrame with every 'category' column
        replaced by its integer category codes; other inputs are
        returned unchanged'''
        if not isinstance(X, pd.DataFrame):
            return X
        encoded = X.copy()
        categorical = encoded.select_dtypes(include=['category']).columns
        encoded[categorical] = encoded[categorical].apply(
            lambda series: series.cat.codes)
        return encoded
 class LGBMEstimator(BaseEstimator):
    '''LightGBM learner with a time-budget-aware fit()

    fit() estimates the time per boosting iteration with two small probe
    fits and caps the number of estimators so training fits in the budget.
    '''
    def __init__(self, objective_name='binary:logistic', n_jobs=1,
     n_estimators=2, max_leaves=2, min_child_weight=1e-3, learning_rate=0.1, 
     subsample=1.0, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, 
     colsample_bytree=1.0, log_max_bin=8, **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads
            n_estimators, max_leaves: Rounded to integers; max_leaves is
                stored as 'num_leaves' unless params has 'num_leaves'
            min_child_weight, learning_rate, subsample, reg_lambda,
                reg_alpha, colsample_bytree: Converted to float
            colsample_bylevel: Accepted but not stored in self.params
            log_max_bin: max_bin is set to 2**round(log_max_bin) - 1
                unless params has 'max_bin'
            params: Extra settings; 'num_leaves', 'objective' and
                'max_bin' override the derived values
        '''
        super().__init__(objective_name, **params)
        # Map the objective name to the LightGBM objective:
        # 'regression' for LGBMRegressor,
        # 'binary' or 'multiclass' for LGBMClassifier
        if 'regression' in objective_name:
            final_objective_name = 'regression'
        elif 'binary' in objective_name:
            final_objective_name = 'binary'
        elif 'multi' in objective_name:
            final_objective_name = 'multiclass'
        else:
            final_objective_name = 'regression'
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "num_leaves":  params[
                'num_leaves'] if 'num_leaves' in params else int(
                    round(max_leaves)),
            'objective': params[
                "objective"] if "objective" in params else final_objective_name,
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'colsample_bytree':float(colsample_bytree),
            'subsample': float(subsample),
        }
        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
            1<<int(round(log_max_bin)))-1
        if 'regression' in objective_name:
            self.estimator_class = LGBMRegressor
        else:
            self.estimator_class = LGBMClassifier
        # estimated seconds per boosting iteration; None until measured
        self.time_per_iter = None
        # number of training rows the estimate was measured on
        self.train_size = 0
    def preprocess(self, X):
        '''Cast an integer-typed sparse matrix to float; return any other
        input unchanged'''
        if not isinstance(X, pd.DataFrame) and scipy.sparse.issparse(
            X) and np.issubdtype(X.dtype, np.integer):
            X = X.astype(float)
        return X
    def fit(self, X_train, y_train, budget=None):
        '''Train the model, limiting the number of estimators to the budget

        Args:
            X_train: The training features; must expose shape[0]
            y_train: The training labels
            budget: A float of the time budget in seconds, or None for no
                limit

        Returns:
            train_time: A float of the elapsed time in seconds
        '''
        start_time = time.time()
        # remember the configured value: it is overwritten while probing
        # and restored before every return
        n_iter = self.params["n_estimators"]
        # probe when there is no time estimate yet, or the training size
        # changed by more than 4 rows since the estimate was taken
        if (not self.time_per_iter or
         abs(self.train_size-X_train.shape[0])>4) and budget is not None:
            # first probe: a single estimator
            self.params["n_estimators"] = 1
            self.t1 = self._fit(X_train, y_train)
            if self.t1 >= budget: 
                # budget already spent; keep the 1-estimator model
                self.params["n_estimators"] = n_iter
                return self.t1
            # second probe: 4 estimators
            self.params["n_estimators"] = 4
            self.t2 = self._fit(X_train, y_train)
            # extra time per estimator between the two probes; falls back
            # to t1, then to 0.001, when the difference is not positive
            self.time_per_iter = (self.t2 - self.t1)/(
                self.params["n_estimators"]-1) if self.t2 > self.t1 \
                else self.t1 if self.t1 else 0.001
            self.train_size = X_train.shape[0]
            # stop with the 4-estimator model when the probes used up the
            # budget or 4 estimators is exactly what was requested
            if self.t1+self.t2>=budget or n_iter==self.params["n_estimators"]:
                self.params["n_estimators"] = n_iter
                return time.time() - start_time
        if budget is not None:
            # cap the number of estimators by the remaining time
            self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
                start_time-self.t1)/self.time_per_iter+1))
        # with no time left the cap can be <= 0; the last probe model is kept
        if self.params["n_estimators"] > 0:
            self._fit(X_train, y_train)
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time
 class XGBoostEstimator(SKLearnEstimator):
    '''XGBoost learner using the native (non-sklearn) training API,
    used for regression'''

    def __init__(self, objective_name='regression', all_thread=False, n_jobs=1,
        n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1, 
        learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
        colsample_bytree=1.0, tree_method='auto', **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            all_thread: When true, 'nthread' is left out of the booster
                parameters
            n_jobs: An integer stored as the 'nthread' booster parameter
            n_estimators: Rounded to an integer; the number of boosting
                rounds passed to xgb.train
            max_leaves: Rounded to an integer
            params: Extra settings; 'grow_policy' and 'booster' override
                the defaults 'lossguide' and 'gbtree'
        '''
        super().__init__(objective_name, **params)
        self.n_estimators = int(round(n_estimators))
        self.max_leaves = int(round(max_leaves))
        self.grids = []
        self.params = {
            'max_leaves': int(round(max_leaves)),
            'max_depth': 0,
            'grow_policy': params[
                "grow_policy"] if "grow_policy" in params else 'lossguide',
            'tree_method':tree_method,
            'verbosity': 0,
            'nthread':n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params['booster'] if 'booster' in params else 'gbtree',
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree':float(colsample_bytree),
            }
        if all_thread:
            del self.params['nthread']

    def get_params(self, deep=False):
        '''Return the parameters, with 'n_jobs' mirroring 'nthread' when
        the latter is present'''
        params = super().get_params()
        # 'nthread' is removed from self.params when all_thread is set, so
        # it must not be read unconditionally
        if 'nthread' in params:
            params["n_jobs"] = params['nthread']
        return params

    def fit(self, X_train, y_train, budget=None):    
        '''Train a booster with xgb.train

        Dense input switches tree_method to 'hist' and is preprocessed
        first; sparse input is passed to the DMatrix as is. budget is not
        used.

        Returns:
            The training time in seconds, or None (without training) when
            max_leaves is not positive
        '''
        start_time = time.time()        
        if not scipy.sparse.issparse(X_train):
            self.params['tree_method'] = 'hist'
            X_train = self.preprocess(X_train)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        if self.max_leaves>0:
            xgb_model = xgb.train(self.params,  dtrain, self.n_estimators)
            del dtrain
            train_time = time.time() - start_time
            self.model = xgb_model
            return train_time
        else:
            return None

    def predict(self, X_test):
        '''Predict with the trained booster; dense input is preprocessed
        before being wrapped in a DMatrix'''
        if not scipy.sparse.issparse(X_test):
            X_test = self.preprocess(X_test)
        dtest = xgb.DMatrix(X_test)
        return super().predict(dtest)
 class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    ''' using sklearn API, used for classification '''

    def __init__(self, objective_name='binary:logistic', n_jobs=1,  
        n_estimators=4, max_leaves=4, subsample=1.0, 
        min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
        colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist', 
        **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads
            n_estimators, max_leaves: Rounded to integers
            params: Extra settings; 'grow_policy' and 'booster' override
                the defaults 'lossguide' and 'gbtree'
        '''
        super().__init__(objective_name, **params)
        # the parameter dictionary set up by the parent constructor is
        # replaced with the xgboost one
        self.params = dict(
            n_estimators=int(round(n_estimators)),
            max_leaves=int(round(max_leaves)),
            max_depth=0,
            grow_policy=params.get("grow_policy", 'lossguide'),
            tree_method=tree_method,
            verbosity=0,
            n_jobs=n_jobs,
            learning_rate=float(learning_rate),
            subsample=float(subsample),
            reg_alpha=float(reg_alpha),
            reg_lambda=float(reg_lambda),
            min_child_weight=float(min_child_weight),
            booster=params.get('booster', 'gbtree'),
            colsample_bylevel=float(colsample_bylevel),
            colsample_bytree=float(colsample_bytree),
        )
        is_regression = 'regression' in objective_name
        self.estimator_class = XGBRegressor if is_regression else XGBClassifier
        self.time_per_iter = None
        self.train_size = 0

    def fit(self, X_train, y_train, budget=None):    
        '''Switch tree_method to 'auto' for sparse input, then run the
        inherited fit()'''
        sparse_input = scipy.sparse.issparse(X_train)
        if sparse_input:
            self.params['tree_method'] = 'auto'
        return super().fit(X_train, y_train, budget)
 class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
    '''Random forest learner for regression and classification

    Categorical columns are integer-encoded by SKLearnEstimator.preprocess
    and the budget-aware fit() is inherited from LGBMEstimator.
    '''

    def __init__(self, objective_name = 'binary:logistic', n_jobs = 1,
      n_estimators = 4, max_leaves = 4, max_features = 1.0, 
      min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads
            n_estimators: Rounded to an integer
            max_features: Converted to float
            criterion: A number; above 1.5 selects 'entropy', otherwise
                'gini'. Only used for classification
            max_leaves, min_samples_split, min_samples_leaf: Accepted but
                not stored in self.params
        '''
        super().__init__(objective_name, **params)
        self.params = {
        "n_estimators": int(round(n_estimators)),
        "n_jobs": n_jobs,
        'max_features': float(max_features),
        }
        if 'regression' in objective_name:
            self.estimator_class = RandomForestRegressor
        else:
            self.estimator_class = RandomForestClassifier
            self.params['criterion'] = 'entropy' if criterion>1.5 else 'gini'
        self.time_per_iter = None
        self.train_size = 0

    def get_params(self, deep=False):
        '''Return the parameters with 'criterion' mapped back to its
        numeric encoding (1 for 'gini', 2 otherwise)'''
        params = super().get_params()
        # 'criterion' is only set for classification objectives
        if 'criterion' in params:
            params["criterion"] = 1 if params["criterion"]=='gini' else 2
        return params
 class ExtraTreeEstimator(RandomForestEstimator):
    '''Extremely randomized trees learner; shares the interface of
    RandomForestEstimator'''

    def __init__(self, objective_name = 'binary:logistic', n_jobs = 1,
      n_estimators = 4, max_leaves = 4, max_features = 1.0, 
      min_samples_split = 2, min_samples_leaf = 1, criterion = 1, **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads
            n_estimators: Rounded to an integer
            max_features: Converted to float
            criterion: A number; above 1.5 selects 'entropy', otherwise
                'gini'. Only used for classification
        '''
        super().__init__(objective_name, **params)
        self.params = dict(
            n_estimators=int(round(n_estimators)),
            n_jobs=n_jobs,
            max_features=float(max_features),
        )
        if 'regression' not in objective_name:
            from sklearn.ensemble import ExtraTreesClassifier
            self.estimator_class = ExtraTreesClassifier
            self.params['criterion'] = 'gini' if not criterion>1.5 \
                else 'entropy'
        else:
            from sklearn.ensemble import ExtraTreesRegressor
            self.estimator_class = ExtraTreesRegressor
        self.time_per_iter = None
        self.train_size = 0
 class LRL1Classifier(SKLearnEstimator):
    '''L1-penalized logistic regression (saga solver); classification only'''

    def __init__(self, tol=0.0001, C=1.0, 
        objective_name='binary:logistic', n_jobs=1, **params):
        '''Constructor

        Args:
            tol, C: Converted to float
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads

        Raises:
            NotImplementedError: the objective is a regression objective
        '''
        super().__init__(objective_name, **params)
        self.params = dict(
            penalty='l1', tol=float(tol), C=float(C), solver='saga',
            n_jobs=n_jobs)
        if 'regression' not in objective_name:
            self.estimator_class = LogisticRegression
            return
        self.estimator_class = None
        print('Does not support regression task')
        raise NotImplementedError
 class LRL2Classifier(SKLearnEstimator):
    '''L2-penalized logistic regression (lbfgs solver); classification only'''

    def __init__(self, tol=0.0001, C=1.0, 
        objective_name='binary:logistic', n_jobs=1, **params):
        '''Constructor

        Args:
            tol, C: Converted to float
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads

        Raises:
            NotImplementedError: the objective is a regression objective
        '''
        super().__init__(objective_name, **params)
        self.params = dict(
            penalty='l2', tol=float(tol), C=float(C), solver='lbfgs',
            n_jobs=n_jobs)
        if 'regression' not in objective_name:
            self.estimator_class = LogisticRegression
            return
        self.estimator_class = None
        print('Does not support regression task')
        raise NotImplementedError
 class CatBoostEstimator(BaseEstimator):
    '''CatBoost learner with a time-budget-aware fit()

    The timing estimate (time_per_iter, train_size, t1) and the probe
    model are stored on the class, so they are shared by all instances.
    '''
    # estimated seconds per boosting iteration; None until measured
    time_per_iter = None
    # number of training rows the estimate was measured on
    train_size = 0
    def __init__(self, objective_name = 'binary:logistic', n_jobs=1,
    n_estimators=8192, exp_max_depth=64, learning_rate=0.1, rounds=4, 
    l2_leaf_reg=3, **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer stored as 'thread_count'
            n_estimators: The maximal number of boosting iterations
            learning_rate: The learning rate
            rounds: Rounded to an integer and stored as
                'early_stopping_rounds'
            exp_max_depth, l2_leaf_reg: Accepted but not stored in
                self.params
            params: Extra settings; 'random_seed' overrides the default
                seed 10242048
        '''
        super().__init__(objective_name, **params)
        self.params = {
            "early_stopping_rounds": int(round(rounds)),
            "n_estimators": n_estimators, 
            'learning_rate': learning_rate,
            'thread_count': n_jobs,
            'verbose': False,
            'random_seed': params[
                "random_seed"] if "random_seed" in params else 10242048,
        }
        # catboost is imported lazily, only when this learner is built
        if 'regression' in objective_name:
            from catboost import CatBoostRegressor
            self.estimator_class = CatBoostRegressor
        else:
            from catboost import CatBoostClassifier
            self.estimator_class = CatBoostClassifier
    def get_params(self, deep=False):
        '''Return the parameters, adding 'n_jobs' and 'rounds' as aliases
        of 'thread_count' and 'early_stopping_rounds' '''
        params = super().get_params()
        params['n_jobs'] = params['thread_count']
        params['rounds'] = params['early_stopping_rounds']
        return params
    def fit(self, X_train, y_train, budget=None):
        '''Train the model, limiting the number of iterations to the budget

        Args:
            X_train: The training features; 'category' columns of a
                DataFrame are passed to catboost as cat_features
            y_train: The training labels
            budget: A float of the time budget in seconds; a falsy value
                disables probing and capping

        Returns:
            train_time: A float of the elapsed time in seconds
        '''
        start_time = time.time()
        # remember the configured value: it is overwritten while probing
        # and restored before every return
        n_iter = self.params["n_estimators"]
        if isinstance(X_train, pd.DataFrame):
            cat_features = list(X_train.select_dtypes(
                include='category').columns)
        else:
            cat_features = []
        # probe when there is no class-level time estimate yet, or the
        # training size changed by more than 4 rows since it was taken
        if (not CatBoostEstimator.time_per_iter or
         abs(CatBoostEstimator.train_size-len(y_train))>4) and budget:
            # measure the time per iteration
            self.params["n_estimators"] = 1
            CatBoostEstimator.model = self.estimator_class(**self.params)
            CatBoostEstimator.model.fit(X_train, y_train,
             cat_features=cat_features)
            CatBoostEstimator.t1 = time.time() - start_time
            if CatBoostEstimator.t1 >= budget: 
                # budget already spent; keep the 1-iteration model
                self.params["n_estimators"] = n_iter
                self.model = CatBoostEstimator.model
                return CatBoostEstimator.t1
            # second probe: 4 iterations
            self.params["n_estimators"] = 4
            CatBoostEstimator.model = self.estimator_class(**self.params)
            CatBoostEstimator.model.fit(X_train, y_train,
             cat_features=cat_features)
            # duration of the second probe divided by 3
            CatBoostEstimator.time_per_iter = (time.time() - start_time -
             CatBoostEstimator.t1)/(self.params["n_estimators"]-1)
            if CatBoostEstimator.time_per_iter <= 0: 
                CatBoostEstimator.time_per_iter = CatBoostEstimator.t1
            CatBoostEstimator.train_size = len(y_train)
            # stop with the 4-iteration model when the probes used up the
            # budget or 4 iterations is exactly what was requested
            if time.time()-start_time>=budget or n_iter==self.params[
                "n_estimators"]: 
                self.params["n_estimators"] = n_iter
                self.model = CatBoostEstimator.model
                return time.time()-start_time
        if budget:
            train_times = 1 
            # cap the number of iterations by the remaining time
            self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
                start_time-CatBoostEstimator.t1)/train_times/
                CatBoostEstimator.time_per_iter+1))
            # fall back to the shared probe model in case no final fit runs
            self.model = CatBoostEstimator.model
        if self.params["n_estimators"] > 0:
            # the trailing rows (at most 10% and at most 1000) are held out
            # as the eval_set; the rest is used for training
            l = max(int(len(y_train)*0.9), len(y_train)-1000)
            X_tr, y_tr = X_train[:l], y_train[:l]
            from catboost import Pool
            model = self.estimator_class(**self.params)
            model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool(
                data=X_train[l:], label=y_train[l:], cat_features=cat_features))
            self.model = model
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time
 class KNeighborsEstimator(BaseEstimator):
    '''k-nearest-neighbors learner using distance-weighted neighbors'''

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
     n_neighbors=5, **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name
            n_jobs: An integer of the number of parallel threads
            n_neighbors: Rounded to an integer
        '''
        super().__init__(objective_name, **params)
        self.params = dict(
            n_neighbors=int(round(n_neighbors)),
            weights='distance',
            n_jobs=n_jobs,
        )
        if 'regression' not in objective_name:
            from sklearn.neighbors import KNeighborsClassifier
            self.estimator_class = KNeighborsClassifier
        else:
            from sklearn.neighbors import KNeighborsRegressor
            self.estimator_class = KNeighborsRegressor

    def preprocess(self, X):
        '''Drop the 'category' columns of a DataFrame; other inputs are
        returned unchanged

        Raises:
            ValueError: every column of the DataFrame is categorical
        '''
        if not isinstance(X, pd.DataFrame):
            return X
        categorical = X.select_dtypes(['category']).columns
        if len(categorical) == X.shape[1]:
            raise ValueError(
                "kneighbor requires at least one numeric feature")
        return X.drop(categorical, axis=1)
--- a/flaml/search.py
+++ b/flaml/search.py
@ -0,0 +1,675 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 from functools import partial
 from .ml import train_estimator
 import time
 import math
 import numpy as np
 from .space import config_space, estimator_size, get_config_values, \
    generate_config_ini, generate_config_max, generate_config_min
 from .config import SPLIT_RATIO, MIN_SAMPLE_TRAIN, \
    HISTORY_SIZE, MEM_THRES, BASE_Const, BASE_LOWER_BOUND
 from random import gauss
 def rand_vector_unit_sphere(dims):
    '''Return a random direction: `dims` Gaussian draws scaled to unit length.'''
    samples = []
    for _ in range(dims):
        samples.append(gauss(0, 1))
    norm = sum(x**2 for x in samples) ** .5
    return [component / norm for component in samples]
 def rand_vector_gaussian(dims):
    '''Return a list of `dims` independent standard-normal draws.'''
    samples = []
    for _ in range(dims):
        samples.append(gauss(0, 1))
    return samples
 class ParamSearch:
    '''
    the class for searching params for 1 learner
    '''
    def __init__(self, estimator, data_size,
                 compute_with_config, train_with_config, save_info_helper=None,
                 init_sample_size=MIN_SAMPLE_TRAIN, objective_name='regression',
                 log_type='better', config_space_info=None, size_estimator=None,
                 split_ratio=SPLIT_RATIO, base_change='sqrtK', use_dual_dir=True,
                 move_type='geo'):
        '''Build the search space, the sample size schedule and bookkeeping.

        Args:
            estimator: learner identifier, forwarded to the space helpers
            data_size: largest sample size the search may use
            compute_with_config: callable(estimator, config, sample_size)
                returning (model, val_loss, train_loss, train_time, _)
            train_with_config: callable(estimator, config, sample_size)
                returning a pair whose second item is the training time
            save_info_helper: optional logger exposing `append(...)`
            init_sample_size: first sample size, capped at data_size
            objective_name: forwarded to `config_space`
            log_type: stored as-is
            config_space_info: optional pre-built search space; when falsy
                the space is created with `config_space`
            size_estimator: optional callable(config) -> model size; when
                falsy `estimator_size` is used
            split_ratio: used to derive `sample_size_full`
            base_change: step-size update rule ('squareroot', 'K' or other)
            use_dual_dir: whether the opposite direction is also proposed
            move_type: 'geo' in the string selects multiplicative moves
        '''
        self.log_type = log_type
        self.base_change = base_change
        if init_sample_size > data_size:
            init_sample_size = data_size
        self.next_sample_size = {}
        self.prev_sample_size = {}
        s = init_sample_size
        self.prev_sample_size[s] = s
        self.estimator_configspace = config_space_info or config_space(
            estimator, data_size, objective_name)
        self.get_size_for_config = size_estimator or (
            lambda x: estimator_size(x, estimator))
        # lower bounds of the configuration (independent of sample size)
        config_min_dic_primary, config_min_dic_more, config_min_dic = \
            generate_config_min(estimator, self.estimator_configspace, None)
        self.min_config_primary = np.array(
            list(config_min_dic_primary.values()))
        self.min_config_more = np.array(list(config_min_dic_more.values()))
        self.min_config = np.array(list(config_min_dic.values()))
        self.dims = (len(self.min_config_primary), len(self.min_config_more))
        # init configurations for different sample size
        config_init_dic_primary, config_init_dic_more, _, config_type_dic = \
            generate_config_ini(estimator, self.estimator_configspace)
        self.init_config_dic_primary = {s: config_init_dic_primary}
        self.init_config_dic_more = {s: config_init_dic_more}
        self.init_config_dic_type_dic = {'primary': {
            s: config_init_dic_primary}, 'more': {s: config_init_dic_more}}
        # Full initial config per sample size. The two parts must be merged
        # under the key `s`; merging the outer dicts would let the 'more'
        # part overwrite the 'primary' part because both use the same key.
        self.init_config_dic = {
            s: {**config_init_dic_primary, **config_init_dic_more}
        }
        self.config_type_dic = config_type_dic
        # max configurations and step-size upper bound per sample size
        self.max_config_dic_primary = {}
        self.max_config_dic_more = {}
        self.max_config_dic = {}
        self.base_upper_bound = {}
        self._register_sample_size(estimator, s)
        # create sample size sequence: double until data_size is reached
        while s < data_size:
            s2 = self.next_sample_size[s] = min(s * 2, data_size)
            self.prev_sample_size[s2] = s
            s = s2
            self._register_sample_size(estimator, s)
        self.init_sample_size = init_sample_size
        self.data_size = data_size
        self.sample_size_full = int(self.data_size / (1.0 - split_ratio))
        self.compute_with_config = compute_with_config
        self.estimator = estimator
        # for logging
        self.save_helper = save_info_helper
        self.estimator_type_list = ['primary', 'more']
        self.dim = self.dims[0] if self.dims[0] > 0 else self.dims[1]
        self.b = BASE_Const**(math.sqrt(self.dim))
        self.base_ini = self.b
        self.total_dim = sum(self.dims)
        self.epo = 2**(self.dim - 1)
        # keys are [sample size, config], values are (loss, train_time)
        self.config_tried = {}
        self.train_with_config = train_with_config
        self.current_config_loss = None
        self.use_dual_dir = use_dual_dir
        self.move_type = move_type

    def _register_sample_size(self, estimator, s):
        '''Record the max configs and the step-size upper bound for size s.'''
        config_max_dic_primary, config_max_dic_more, config_max_dic = \
            generate_config_max(
                estimator, self.estimator_configspace, int(s))
        self.max_config_dic_primary[s] = np.array(
            list(config_max_dic_primary.values()))
        self.max_config_dic_more[s] = np.array(
            list(config_max_dic_more.values()))
        self.max_config_dic[s] = np.array(list(config_max_dic.values()))
        self.base_upper_bound[s] = self._compute_base_upper_bound(s)

    def _compute_base_upper_bound(self, s):
        '''Largest (max/min)**sqrt(dim) ratio over the active config parts.'''
        def ratio_bound(max_config, min_config, dim):
            exponent = math.sqrt(dim)
            return max((max_config[i] / min_config[i]) ** exponent
                       for i in range(dim))

        n_primary, n_more = self.dims
        if n_primary > 0 and n_more > 0:
            return max(
                ratio_bound(self.max_config_dic_primary[s],
                            self.min_config_primary, n_primary),
                ratio_bound(self.max_config_dic_more[s],
                            self.min_config_more, n_more))
        if n_primary > 0:
            return ratio_bound(self.max_config_dic_primary[s],
                               self.min_config_primary, n_primary)
        return ratio_bound(self.max_config_dic_more[s],
                           self.min_config_more, n_more)
    def evaluate_config(self, config, sample_size, move='_pos'):
        '''
        evaluate a configuration, update search state, 
        and return whether the state is changed
        '''
        if self.time_from_start >= self.time_budget or move != '_ini' and \
                self.train_time > self.time_budget - self.time_from_start:
            return False
        model, val_loss, new_train_time, from_history, train_loss = \
            self.evaluate_proposed_config(config, sample_size, move)
        # update current config
        self.update_current_config(config, val_loss, sample_size)
        # update best model statistics, including statistics about loss and time
        improved = self.update_search_state_best(
            config, sample_size, model, val_loss, new_train_time, from_history)
        self.time_from_start = time.time() - self.start_time
        if self.save_helper is not None:
            if from_history:
                move = move + '_from_hist'
            self.save_helper.append(self.model_count,
                                    train_loss,
                                    new_train_time,
                                    self.time_from_start,
                                    val_loss,
                                    config,
                                    self.best_loss,
                                    self.best_config[0],
                                    self.estimator,
                                    sample_size)
        return improved
    def get_hist_config_sig(self, sample_size, config):
        '''Return the history key "<sample_size>_<config values>".'''
        values = get_config_values(config, self.config_type_dic)
        return '_'.join((str(sample_size), str(values)))
    def evaluate_proposed_config(self, config, sample_size, move):
        '''Get the loss of a configuration, from history when available.

        Returns:
            (model, val_loss, train_time, from_history, train_loss); model
            and train_loss are None when the result comes from history.
        '''
        self.model_count += 1
        signature = self.get_hist_config_sig(sample_size, config)
        history_size_per_d = len(self.config_tried) / float(self.total_dim)
        from_history = signature in self.config_tried
        if from_history:
            val_loss, new_train_time = self.config_tried[signature]
            model = train_loss = None
        else:
            model, val_loss, train_loss, new_train_time, _ = \
                self.compute_with_config(self.estimator, config, sample_size)
            # the history is only allowed to grow up to HISTORY_SIZE
            # entries per dimension
            if history_size_per_d < HISTORY_SIZE:
                self.config_tried[signature] = (val_loss, new_train_time)
        if self.first_move:
            # a first move (re)starts the local search from this config
            self.init_config_dic[sample_size] = config
            move = '_ini'
            self.base = self.base_ini
            self.num_noimprovement = 0
        move = str(self.estimator) + move
        return model, val_loss, new_train_time, from_history, train_loss
    def update_current_config(self, config, val_loss, sample_size):
        if self.first_move or val_loss < self.current_config_loss:
            self.first_move = False
            # update current config and coressponding sample_size
            self.sample_size = sample_size
            self.config = config
            self.config_primary = {x: config[x]
                                   for x in self.config_primary.keys()}
            try:
                self.config_more = {x: config[x]
                                    for x in self.config_more.keys()}
            except:
                self.config_more = {}
            self.current_config_loss = val_loss
    def update_reset_best_config_loss(self, sample_size, config, val_loss):
        if sample_size == self.data_size:
            if self.best_config_loss_dic_full_reset[1] is None:
                self.best_config_loss_dic_full_reset = [
                    config, val_loss, self.model_count]
            else:
                full_reset_best_loss = self.best_config_loss_dic_full_reset[1]
                if val_loss < full_reset_best_loss:
                    self.best_config_loss_dic_full_reset = [
                        config, full_reset_best_loss, self.model_count]
    def update_search_state_best(self, config, sample_size, model, val_loss,
                                 new_train_time, from_history):
        # upate the loss statistics for a particular sample size
        if sample_size not in self.best_config_loss_samplesize_dic:
            self.best_config_loss_samplesize_dic[sample_size] = [
                config, val_loss, self.model_count]
        else:
            s_best_loss = self.best_config_loss_samplesize_dic[sample_size][1]
            if val_loss < s_best_loss:
                self.best_config_loss_samplesize_dic[sample_size] = [
                    config, val_loss, self.model_count]
        self.update_reset_best_config_loss(sample_size, config, val_loss)
        # update best model statistics, including statistics about loss and time
        if val_loss < self.new_loss:
            self.old_loss = self.new_loss if self.new_loss < float(
                'inf') else 2 * val_loss
            self.new_loss = val_loss
            self.old_loss_time = self.new_loss_time
            self.old_train_time = self.train_time
            self.new_loss_time = self.train_time = new_train_time
            if val_loss < self.best_loss:
                self.best_config = [self.config, self.model_count]
                if not from_history:
                    self.trained_estimator = model
                    # print(model)
                else:
                    print(val_loss, self.best_loss)
                self.best_loss = val_loss
                self.time_best_found = self.time_from_start
            return True
        else:
            if not from_history:
                self.new_loss_time += new_train_time
            return False
    def get_proposal(self, current_config, rand_vector_func, base, move_type):
        rand_vector = rand_vector_func(len(current_config))
        rand_vector = [i for i in rand_vector]
        rand_vector_neg = [-i for i in rand_vector]
        move_vector = {}
        move_vector_neg = {}
        index_ = 0
        for k, v in current_config.items():
            if 'geo' in move_type:
                # get the move vector using the proposed random vector
                move_vector[k] = v * (base**(rand_vector[index_]))
                move_vector_neg[k] = v * (base**(rand_vector_neg[index_]))
            else:
                move_vector[k] = v + (base * (rand_vector[index_]))
                move_vector_neg[k] = v + (base * (rand_vector_neg[index_]))
            index_ += 1
        # as long as one of the proposed model (+ or -) is within the mem_limit
        # we will proceed
        if not self.use_dual_dir:
            move_vector_neg = None
        return move_vector, move_vector_neg
    def get_config_from_move_vector(self, v, estimator_type):
        if v != None:
            if 'all' in estimator_type:
                v = v
            elif 'primary' in estimator_type:
                v = {**v, **self.config_more}
            else:
                v = {**self.config_primary, **v}
            bounded_v = self.get_v_within_min_max(v)
        else:
            bounded_v = None
        return bounded_v
    def dual_direction_sample(self, base, current_search_config,
                              estimator_type='primary', rand_vector_func=rand_vector_unit_sphere,
                              mem_thres=MEM_THRES, move_type='geo'):
        '''Sample a pair of opposite moves that respect the memory threshold.

        Proposals are redrawn until at least one direction yields a
        non-empty configuration whose estimated size is within `mem_thres`,
        or the time budget runs out.

        Returns:
            A two-item sequence [config_pos, config_neg]; an item is None
            when that direction is unavailable or too large. (None, None)
            is returned for an empty `current_search_config`.
        '''
        current_config = current_search_config
        if len(current_config) == 0:
            return None, None
        bounded_v_list = [None, None]
        while not bounded_v_list[0] and not bounded_v_list[
                1] and self.time_from_start < self.time_budget:
            move_vector, move_vector_neg = self.get_proposal(
                current_config, rand_vector_func,
                base, move_type)
            bounded_v_list = [move_vector, move_vector_neg]
            for i, v in enumerate(bounded_v_list):
                bounded_v = self.get_config_from_move_vector(v, estimator_type)
                # a missing direction (None) has no size to estimate; do not
                # hand None to the size estimator
                if bounded_v is not None:
                    proposed_model_size = self.get_size_for_config(bounded_v)
                    # non-float estimates are treated as size 0
                    if not isinstance(proposed_model_size, float):
                        proposed_model_size = 0
                    if proposed_model_size > mem_thres:
                        bounded_v = None
                bounded_v_list[i] = bounded_v
            self.time_from_start = time.time() - self.start_time
        return bounded_v_list
    def get_v_within_min_max(self, v):
        index_ = 0
        bounded_v = {}
        for key, value in v.items():
            new_value = min(max(
                value, self.min_config[index_]), self.max_config_dic[
                    self.sample_size][index_])
            bounded_v[key] = new_value
            index_ += 1
        return bounded_v
    def expected_time_improvement_search(self):
        return max(self.old_loss_time - self.old_train_time + self.train_time,
                   self.new_loss_time)
    def increase_sample_size(self):
        '''
        whether it's time to increase sample size
        '''
        expected_time_improvement_sample = 2 * self.train_time
        self.increase = self.sample_size < self.data_size and (
            self.estimator_type == 0 or self.dims[0] == 0) and (
                not self.improved
            or expected_time_improvement_sample
            < self.expected_time_improvement_search()
        )
        return self.increase
    def search_begin(self, time_budget, start_time=None):
        self.time_budget = time_budget
        if not start_time:
            self.start_time = time.time()
        else:
            self.start_time = start_time
        # the time to train the last selected config
        self.old_train_time = self.train_time = 0
        self.time_from_start = 0
        # search states
        self.first_move = True
        self.improved = True
        self.estimator_type = 0 if self.dims[0] > 0 else 1
        self.old_loss = self.new_loss = self.best_loss = float('+inf')
        # new_loss_time is the time from the beginning of training self.config to
        # now,
        # old_loss_time is the time from the beginning of training the old
        # self.config to the beginning of training self.config
        self.old_loss_time = self.new_loss_time = 0
        self.trained_estimator = None
        self.model_count = 0
        self.K = 0
        self.old_modelcount = 0
        # self.config has two parts: config_primary contain the configs
        # that are related with model complexity, config_more contains the
        # configs that is not related with model complexity
        self.config_primary = self.init_config_dic_primary[self.init_sample_size]
        self.config_more = self.init_config_dic_more[self.init_sample_size]
        self.config = {**self.config_primary, **self.config_more}
        self.best_config = [None, None]
        # key: sample size, value: [best_config, best_loss, model_count] under
        # sample size in the key
        self.best_config_loss_samplesize_dic = {
            self.init_sample_size: [self.config, self.old_loss, self.model_count]}
        # key: sample size, value: [best_config, best_loss, model_count] under
        # sample size in the key
        self.best_config_loss_dic_full_reset = [None, None, None]
        self.sample_size = self.init_sample_size
        self.base_change_bound = 1
        self.base_change_count = 0
        self.evaluate_config(self.config, self.sample_size, '_ini')
        self.increase = False
    def train_config(self, config, sample_size):
        '''
        train a configuration
        '''
        # print('Evalute Config')
        if self.time_from_start >= self.time_budget:
            return False
        config_sig = self.get_hist_config_sig(sample_size, config)
        if not config_sig in self.config_tried:
            _, new_train_time = self.train_with_config(
                self.estimator, config, sample_size)
            train_loss, val_loss, move = None, self.new_loss, str(
                self.estimator) + '_trainAll'
            self.time_from_start = time.time() - self.start_time
            if self.save_helper is not None:
                self.save_helper.append(self.model_count,
                                        train_loss,
                                        new_train_time,
                                        self.time_from_start,
                                        val_loss,
                                        config,
                                        self.best_loss,
                                        self.best_config,
                                        move,
                                        sample_size)
            self.config_tried[config_sig] = (val_loss, new_train_time)
    def try_increase_sample_size(self):
        # print( self.estimator, self.sample_size)
        if self.sample_size in self.next_sample_size:
            if self.increase_sample_size():
                self.first_move = True
                self.improved = True
                self.estimator_type = 0 if self.dims[0] > 0 else 1
                self.evaluate_config(
                    self.config, self.next_sample_size[self.sample_size])
        if not self.old_modelcount and self.sample_size == self.data_size:
            self.old_modelcount = self.model_count
    def setup_current_search_config(self):
        estimator_type = self.estimator_type_list[self.estimator_type]
        if 'all' in estimator_type:
            current_search_config = self.config
        elif 'primary' in estimator_type:
            current_search_config = self.config_primary
        else:
            current_search_config = self.config_more
            # print(self.config_more)
        return estimator_type, current_search_config
    def search1step(self, global_best_loss=float('+inf'),
                    retrain_full=True, mem_thres=MEM_THRES, reset_type='init_gaussian'):
        '''Perform one search step and return whether the loss improved.

        Returns False right away when the remaining time is shorter than
        the last training time.
        '''
        # try to increase sample size
        self.try_increase_sample_size()
        # decide current_search_config according to estimator_type
        estimator_type, current_search_config = \
            self.setup_current_search_config()
        time_left = self.time_budget - self.time_from_start
        if time_left < self.train_time:
            return False
        # time for one more training but not two: retrain the best config
        # on the full sample if this learner holds the global best loss
        if retrain_full and self.train_time < time_left < 2 * self.train_time \
                and self.best_loss <= global_best_loss:
            self.train_config(self.best_config[0], self.sample_size_full)
        move_vector, move_vector_neg = self.dual_direction_sample(
            self.base, current_search_config, estimator_type,
            rand_vector_unit_sphere, mem_thres, self.move_type)
        part = str(estimator_type)
        improved = False
        if move_vector is not None:
            improved = self.evaluate_config(
                move_vector, self.sample_size, '_pos' + part)
        # the opposite direction is tried only when the first one did not help
        if not improved and move_vector_neg is not None:
            improved = self.evaluate_config(
                move_vector_neg, self.sample_size, '_neg' + part)
        self.improved = improved
        self.update_noimprovement_stat(
            global_best_loss, retrain_full, reset_type)
        return self.improved
    def update_noimprovement_stat(self, global_best_loss, retrain_full,
                                  reset_type):
        if self.improved:
            self.num_noimprovement = 0
        else:
            self.estimator_type = 1 - self.estimator_type
            if self.dims[self.estimator_type] == 0:
                self.estimator_type = 1 - self.estimator_type
            if self.estimator_type == 1 or self.dims[1] == 0:
                self.noimprovement(global_best_loss, retrain_full, reset_type)
    def noimprovement(self, global_best_loss, retrain_full, reset_type='org'):
        '''Handle a step without improvement at the full sample size.

        Counts consecutive non-improving steps; once the count reaches
        `self.epo` the step size `self.base` is shrunk. When the step size
        falls to its lower bound, or the number of shrinks reaches
        `self.base_change_bound`, the best config may be retrained on the
        full sample and, if time allows, the search is restarted from a
        reset configuration.

        Args:
            global_best_loss: compared against this learner's best loss to
                decide whether to retrain on the full sample
            retrain_full: whether retraining on the full sample is allowed
            reset_type: forwarded to `get_reset_config`
        '''
        if self.sample_size == self.data_size:
            # Do not wait until full sample size to update num_noimprovement?
            self.num_noimprovement += 1
            if self.num_noimprovement >= self.epo:
                self.num_noimprovement = 0
                # print(self.num_noimprovement, self.epo)
                # shrink the step size according to the base_change rule
                if self.base_change == 'squareroot':
                    self.base = math.sqrt(self.base)
                else:
                    if self.K == 0:  # first time
                        # model count of the best full-size result, relative
                        # to the model count recorded in old_modelcount
                        oldK = self.best_config_loss_dic_full_reset[2] - \
                            self.old_modelcount
                    else:
                        oldK = self.K
                    self.K = self.model_count + 1 - self.old_modelcount
                    if self.base_change == 'K':
                        self.base **= oldK / self.K
                    else:
                        self.base **= math.sqrt(oldK / self.K)
                # lower bound of the step size, derived from the minimal
                # change of each hyperparameter relative to its current value
                if self.dims[1] > 0 and self.dims[0] > 0:
                    base_lower_bound = min(
                        min(
                            (1.0 + self.estimator_configspace[i].min_change
                             / self.config_primary[i])
                            ** math.sqrt(self.dims[0])
                            for i in self.config_primary.keys()
                        ),
                        min(
                            (1.0 + self.estimator_configspace[i].min_change
                             / self.config_more[i])
                            ** math.sqrt(self.dims[1])
                            for i in self.config_more.keys()
                        )
                    )
                elif self.dims[0] > 0:
                    base_lower_bound = min(
                        (1.0 + self.estimator_configspace[i].min_change
                         / self.config_primary[i])
                        ** math.sqrt(self.dims[0])
                        for i in self.config_primary.keys()
                    )
                else:
                    base_lower_bound = min(
                        (1.0 + self.estimator_configspace[i].min_change
                         / self.config_more[i])
                        ** math.sqrt(self.dims[1])
                        for i in self.config_more.keys()
                    )
                # an infinite bound (every min_change is inf) falls back to
                # the configured constant
                if np.isinf(base_lower_bound):
                    base_lower_bound = BASE_LOWER_BOUND
                self.base_change_count += 1
                if self.base <= base_lower_bound or \
                        self.base_change_count == self.base_change_bound:
                    if retrain_full and self.sample_size == self.data_size:
                        if self.best_loss <= global_best_loss:
                            # Only train on full data when the current estimator
                            #  is the best estimator
                            # print('best estimator and train on full data')
                            self.train_config(
                                self.best_config[0], self.sample_size_full)
                    # remaining time is more than enough for another trial
                    if self.time_budget - self.time_from_start > self.train_time:
                        # restart: double the allowed number of shrinks and
                        # the initial step size (capped by its upper bound),
                        # clear the per-restart statistics
                        self.base_change_bound <<= 1
                        self.base_change_count = 0
                        self.K = 0
                        self.old_modelcount = self.model_count
                        self.best_config_loss_dic_full_reset = [None, None,
                                                                None]
                        self.first_move = True
                        self.improved = True
                        self.base_ini = min(
                            self.base_ini * 2, self.base_upper_bound[
                                self.sample_size])
                        self.estimator_type = 0 if self.dims[0] > 0 else 1
                        reset_config, reset_sample_size = self.get_reset_config(
                            self.init_sample_size, reset_type)
                        self.sample_size = reset_sample_size
                        # print('reset sample size', reset_sample_size)
                        self.evaluate_config(reset_config, self.sample_size,
                                             '_ini')
    def get_reset_config(self, sample_size, reset_type):
        init_config = self.init_config_dic[self.sample_size]
        reset_sample_size = sample_size
        if 'org' in reset_type:
            reset_config = init_config
        else:
            if 'init_gaussian' in reset_type:
                reset_config = init_config
                reset_sample_size = self.get_reset_sample_size(reset_config)
                config_values = get_config_values(
                    reset_config, self.config_type_dic)
                config_sig = str(reset_sample_size) + '_' + str(config_values)
                count = 0
                while config_sig in self.config_tried and \
                        self.time_from_start < self.time_budget and count < 1000:
                    # TODO: check exhaustiveness? use time as condition?
                    count += 1
                    move, move_neg = self.dual_direction_sample(
                        base=self.b, current_search_config=init_config,
                        estimator_type='all',
                        rand_vector_func=rand_vector_gaussian,
                        move_type=self.move_type)
                    if move:
                        reset_config = move_neg
                    elif move_neg:
                        reset_config = move_neg
                    else:
                        continue
                    reset_sample_size = self.get_reset_sample_size(
                        reset_config)
                    config_values = get_config_values(
                        reset_config, self.config_type_dic)
                    config_sig = str(reset_sample_size) + \
                        '_' + str(config_values)
                    self.time_from_start = time.time() - self.start_time
            else:
                raise NotImplementedError
        return reset_config, reset_sample_size
    def get_reset_sample_size(self, reset_config):
        if not reset_config:
            print('reset_config is none')
        reset_config_size = self.get_size_for_config(reset_config)
        candidate_sample_size_list = []
        for sample_size, config_and_bestloss in \
                self.best_config_loss_samplesize_dic.items():
            s_best_config = config_and_bestloss[0]
            if not s_best_config:
                print('best config is none', sample_size)
            s_best_config_model_size = self.get_size_for_config(s_best_config)
            if s_best_config_model_size >= reset_config_size:
                candidate_sample_size_list.append(sample_size)
        if len(candidate_sample_size_list) != 0:
            return min(candidate_sample_size_list)
        else:
            return self.data_size
--- a/flaml/space.py
+++ b/flaml/space.py
@ -0,0 +1,249 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 class ConfigSearchInfo:
    '''Search-space description of a single hyperparameter.

    Attributes:
        name: A string of the name of the hyperparameter
        type: data type of the hyperparameter
        lower: A number of the lower bound of the value
        upper: A number of the upper bound of the value
        init: A number of the initial value. For hyperparameters related to
            complexity, the init value needs to correspond to the lowest
            complexity
        change_type: A string of the change type, 'linear' or 'log'
        complexity_related: A flag stored as given; True by default
        min_change: A number of the minimal change required. When not
            given it is 1.0 for int-typed hyperparameters and +inf
            otherwise
    '''
    def __init__(self, name, type, lower, upper, init, change_type='log',
                 complexity_related=True, min_change=None):
        self.name, self.type = name, type
        self.lower, self.upper, self.init = lower, upper, init
        self.change_type = change_type
        self.complexity_related = complexity_related
        # An explicit min_change always wins; otherwise fall back on the
        # type-dependent default.
        if min_change is not None:
            self.min_change = min_change
        elif type == int:
            self.min_change = 1.0
        else:
            self.min_change = float('+inf')
 def config_space(estimator, data_size, objective_name="regression"):
    '''Build the hyperparameter search space of one estimator.

    Args:
        estimator: A string naming the learner. Matching is done in this
            order: contains 'xgboost'; equals 'rf' or 'extra_tree';
            contains 'lgbm'; contains 'lr'; contains 'catboost';
            equals 'nn'; contains 'kneighbor'
        data_size: A number used to cap the upper bound of the
            size-dependent hyperparameters
        objective_name: A string; for 'rf' and 'extra_tree' a 'criterion'
            entry is added unless this equals 'regression'

    Returns:
        A dict mapping each hyperparameter name to its ConfigSearchInfo.

    Raises:
        NotImplementedError: if the estimator matches none of the above.
    '''
    space = {}

    def add(name, kind, lower, init, upper, change_type='log'):
        # Register one hyperparameter under its own name.
        space[name] = ConfigSearchInfo(
            name=name, type=kind, lower=lower, init=init, upper=upper,
            change_type=change_type)

    # The number of trees and of leaves is never searched beyond the
    # data size.
    n_estimators_cap = max_leaves_cap = min(32768, int(data_size))

    if 'xgboost' in estimator:
        add('n_estimators', int, 4, 4, n_estimators_cap)
        add('max_leaves', int, 4, 4, max_leaves_cap)
        add('min_child_weight', float, 0.001, 20.0, 20.0)
        add('learning_rate', float, 0.01, 0.1, 1.0)
        add('subsample', float, 0.6, 1.0, 1.0, change_type='linear')
        add('reg_alpha', float, 1e-10, 1e-10, 1.0)
        add('reg_lambda', float, 1e-10, 1.0, 1.0)
        add('colsample_bylevel', float, 0.6, 1.0, 1.0, change_type='linear')
        add('colsample_bytree', float, 0.7, 1.0, 1.0, change_type='linear')
    elif estimator in ('rf', 'extra_tree'):
        # Forests get a tighter cap on the number of trees. max_leaves,
        # min_samples_split and min_samples_leaf are not searched.
        n_estimators_cap = min(2048, n_estimators_cap)
        add('n_estimators', int, 4, 4, n_estimators_cap)
        if objective_name != 'regression':
            add('criterion', int, 1, 1, 2)
        add('max_features', float, 0.1, 1.0, 1.0)
    elif 'lgbm' in estimator:
        add('n_estimators', int, 4, 4, n_estimators_cap)
        add('max_leaves', int, 4, 4, max_leaves_cap)
        add('min_child_weight', float, 0.001, 20, 20.0)
        add('learning_rate', float, 0.01, 0.1, 1.0)
        add('subsample', float, 0.6, 1.0, 1.0)
        add('log_max_bin', int, 3, 8, 10)
        add('reg_alpha', float, 1e-10, 1e-10, 1.0)
        add('reg_lambda', float, 1e-10, 1.0, 1.0)
        add('colsample_bytree', float, 0.7, 1.0, 1.0)
    elif 'lr' in estimator:
        add('C', float, 0.03125, 1.0, 32768.0)
    elif 'catboost' in estimator:
        # Upper bound of 'rounds' shrinks with the data size, clamped to
        # the range [10, 150]. n_estimators, exp_max_depth and l2_leaf_reg
        # are not searched.
        rounds_cap = max(min(round(1500000/data_size), 150), 10)
        add('rounds', int, 10, 10, rounds_cap)
        add('learning_rate', float, 0.005, 0.1, .2)
    elif 'nn' == estimator:
        add('learning_rate', float, 1e-4, 3e-4, 3e-2)
        add('weight_decay', float, 1e-12, 1e-6, .1)
        add('dropout_prob', float, 1.0, 1.1, 1.5)
    elif 'kneighbor' in estimator:
        n_neighbors_cap = min(512, int(data_size/2))
        add('n_neighbors', int, 1, 5, n_neighbors_cap)
    else:
        raise NotImplementedError
    return space
 def estimator_size(config, estimator):
    '''Estimate the size of the model a configuration would produce.

    Args:
        config: A dict-like of hyperparameter values
        estimator: A string of the estimator name

    Returns:
        For 'xgboost', 'lgbm', 'rf' and 'extra_tree': a float computed
        from 'max_leaves' and 'n_estimators', or 0 when either value is
        missing or not a usable number. For names containing 'catboost':
        the same formula over 'n_estimators' (default 8192) and
        'exp_max_depth' (default 64). For any other estimator: 1.0.
    '''
    if estimator in ['xgboost', 'lgbm', 'rf', 'extra_tree']:
        try:
            max_leaves = int(round(config['max_leaves']))
            n_estimators = int(round(config['n_estimators']))
            model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1) *
                               n_estimators*8)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Only the failures an incomplete or malformed config can cause
            # are treated as "size unknown"; anything else (for instance a
            # KeyboardInterrupt) must propagate.
            model_size = 0
        return model_size
    elif 'catboost' in estimator:
        n_estimators = int(round(config.get('n_estimators', 8192)))
        max_leaves = int(round(config.get('exp_max_depth', 64)))
        model_size = float((max_leaves*3 + (max_leaves-1)*4 + 1) *
                           n_estimators*8)
        return model_size
    else:
        model_size = 1.0
    return model_size
 def generate_config_ini(estimator, estimator_configspace):
    '''Collect the initial value of every hyperparameter in a search space.

    Args:
        estimator: Not used by this function
        estimator_configspace: A dict whose values expose name, init, type
            and complexity_related

    Returns:
        A 4-tuple: the initial values of the complexity-related
        hyperparameters, those of the remaining ones, both merged into one
        dict, and a dict mapping each name to its type.
    '''
    complexity_cfg, other_cfg, type_map = {}, {}, {}
    for info in estimator_configspace.values():
        type_map[info.name] = info.type
        bucket = complexity_cfg if info.complexity_related else other_cfg
        bucket[info.name] = info.init
    merged = dict(complexity_cfg)
    merged.update(other_cfg)
    return complexity_cfg, other_cfg, merged, type_map
 def generate_config_min(estimator, estimator_configspace, max_config_size):
    '''Collect the lower bound of every hyperparameter in a search space.

    Args:
        estimator: Not used by this function
        estimator_configspace: A dict whose values expose name, lower and
            complexity_related
        max_config_size: Not used by this function

    Returns:
        A 3-tuple: the lower bounds of the complexity-related
        hyperparameters, those of the remaining ones, and both merged into
        one dict.
    '''
    complexity_cfg, other_cfg = {}, {}
    for info in estimator_configspace.values():
        bucket = complexity_cfg if info.complexity_related else other_cfg
        bucket[info.name] = info.lower
    merged = dict(complexity_cfg)
    merged.update(other_cfg)
    return complexity_cfg, other_cfg, merged
 def generate_config_max(estimator, estimator_configspace, max_config_size):
    '''Collect the upper bound of every hyperparameter in a search space.

    Args:
        estimator: Not used by this function
        estimator_configspace: A dict whose values expose name, upper and
            complexity_related
        max_config_size: A number capping the upper bound of the
            complexity-related 'n_estimators' and 'max_leaves'

    Returns:
        A 3-tuple: the upper bounds of the complexity-related
        hyperparameters, those of the remaining ones, and both merged into
        one dict.
    '''
    capped_names = ('n_estimators', 'max_leaves')
    complexity_cfg, other_cfg = {}, {}
    for info in estimator_configspace.values():
        if not info.complexity_related:
            other_cfg[info.name] = info.upper
        elif info.name in capped_names:
            complexity_cfg[info.name] = min(info.upper, max_config_size)
        else:
            complexity_cfg[info.name] = info.upper
    merged = dict(complexity_cfg)
    merged.update(other_cfg)
    return complexity_cfg, other_cfg, merged
 def get_config_values(config_dic, config_type_dic):
    '''List the values of a config, rounding the int-typed ones.

    Args:
        config_dic: A dict mapping hyperparameter name to value
        config_type_dic: A dict mapping hyperparameter name to its type

    Returns:
        A list of the values in the iteration order of config_dic, where
        every value whose type is int has been rounded to an int.
    '''
    return [
        int(round(value)) if config_type_dic[name] == int else value
        for name, value in config_dic.items()
    ]
--- a/flaml/training_log.py
+++ b/flaml/training_log.py
@ -0,0 +1,168 @@
 '''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. 
 '''
 import json
 from typing import IO
 from contextlib import contextmanager
 import warnings
 class TrainingLogRecord(object):
    '''One entry of the training log.

    Every constructor argument is stored under an attribute of the same
    name. The order in which the attributes are set is the order of the
    keys in the JSON produced by dump().
    '''
    def __init__(self,
                 record_id: int,
                 iter_per_learner: int,
                 logged_metric: float,
                 trial_time: float,
                 total_search_time: float,
                 validation_loss,
                 config,
                 best_validation_loss,
                 best_config,
                 learner,
                 sample_size):
        self.record_id, self.iter_per_learner = record_id, iter_per_learner
        self.logged_metric = logged_metric
        self.trial_time, self.total_search_time = \
            trial_time, total_search_time
        self.validation_loss, self.config = validation_loss, config
        self.best_validation_loss, self.best_config = \
            best_validation_loss, best_config
        self.learner, self.sample_size = learner, sample_size

    def dump(self, fp: IO[str]):
        '''Write the attributes of this record to fp as one JSON object.'''
        return json.dump(self.__dict__, fp)

    @classmethod
    def load(cls, json_str: str):
        '''Build a record from a JSON object holding the constructor
        arguments by name.'''
        return cls(**json.loads(json_str))
 class TrainingLogCheckPoint(TrainingLogRecord):
    '''Log entry that stores only the id of the best record so far.

    The parent initializer is intentionally not called, so an instance
    carries a single attribute and the inherited dump() writes a JSON
    object with exactly one key.
    '''
    def __init__(self, curr_best_record_id: int) -> None:
        # Id of the record with the best validation loss at checkpoint time.
        self.curr_best_record_id = curr_best_record_id
 class TrainingLogWriter(object):
    '''Writes training records and checkpoints to a log file, one JSON
    object per line.

    The writer also tracks which record has the best validation loss so
    far; among records with equal loss the one with the larger sample size
    wins.
    '''
    def __init__(self, output_filename: str):
        self.output_filename = output_filename
        self.file = None
        self.current_best_loss_record_id = None
        self.current_best_loss = float('+inf')
        self.current_sample_size = None
        self.current_record_id = 0

    def open(self):
        '''Open the output file for writing, truncating existing content.'''
        self.file = open(self.output_filename, 'w')

    def append(self,
               it_counter: int,
               train_loss: float,
               trial_time: float,
               total_search_time: float,
               validation_loss,
               config,
               best_validation_loss,
               best_config,
               learner,
               sample_size):
        '''Write one record and update the best-record bookkeeping.

        Raises:
            IOError: if open() has not been called.
            ValueError: if validation_loss is None.
        '''
        if self.file is None:
            raise IOError("Call open() to open the outpute file first.")
        if validation_loss is None:
            raise ValueError('TEST LOSS NONE ERROR!!!')
        record = TrainingLogRecord(self.current_record_id,
                                   it_counter,
                                   train_loss,
                                   trial_time,
                                   total_search_time,
                                   validation_loss,
                                   config,
                                   best_validation_loss,
                                   best_config,
                                   learner,
                                   sample_size)
        # current_sample_size stays None until a best record exists; a first
        # record whose loss equals the initial +inf must not be compared
        # against None.
        if validation_loss < self.current_best_loss or \
            validation_loss == self.current_best_loss and (
                self.current_sample_size is None
                or sample_size > self.current_sample_size):
            self.current_best_loss = validation_loss
            self.current_sample_size = sample_size
            self.current_best_loss_record_id = self.current_record_id
        self.current_record_id += 1
        record.dump(self.file)
        self.file.write('\n')
        self.file.flush()

    def checkpoint(self):
        '''Write a checkpoint line holding the id of the best record so far.

        Emits a warning and writes nothing when no best record exists yet.

        Raises:
            IOError: if open() has not been called.
        '''
        if self.file is None:
            raise IOError("Call open() to open the outpute file first.")
        if self.current_best_loss_record_id is None:
            warnings.warn("checkpoint() called before any record is written, "
                          "skipped.")
            return
        record = TrainingLogCheckPoint(self.current_best_loss_record_id)
        record.dump(self.file)
        self.file.write('\n')
        self.file.flush()

    def close(self):
        '''Close the output file; a no-op when it was never opened.'''
        if self.file is not None:
            self.file.close()
 class TrainingLogReader(object):
    def __init__(self, filename: str):
        self.filename = filename
        self.file = None
    def open(self):
        self.file = open(self.filename)
    def records(self):
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        for line in self.file:
            data = json.loads(line)
            if len(data) == 1:
                # Skip checkpoints.
                continue
            yield TrainingLogRecord(**data)
    def close(self):
        self.file.close()
    def get_record(self, record_id) -> TrainingLogRecord:
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        for rec in self.records():
            if rec.record_id == record_id:
                return rec
        raise ValueError(f"Cannot find record with id {record_id}.")
@contextmanager
def training_log_writer(filename: str):
    '''Context manager yielding an opened TrainingLogWriter for filename.

    The writer is closed on exit, whether or not the body raised.
    '''
    # Create and open the writer before entering the try block: if either
    # step fails there is nothing to close, and the original error must not
    # be masked by a second one raised from the finally clause.
    w = TrainingLogWriter(filename)
    w.open()
    try:
        yield w
    finally:
        w.close()
@contextmanager
def training_log_reader(filename: str):
    '''Context manager yielding an opened TrainingLogReader for filename.

    The reader is closed on exit, whether or not the body raised.
    '''
    # Create and open the reader before entering the try block: if either
    # step fails (e.g. the log file does not exist) there is nothing to
    # close, and the original error must not be masked by a second one
    # raised from the finally clause.
    r = TrainingLogReader(filename)
    r.open()
    try:
        yield r
    finally:
        r.close()
--- a/flaml/version.py
+++ b/flaml/version.py
@ -0,0 +1 @@
 __version__="0.1.0"
--- a/notebook/flaml_demo.ipynb
+++ b/notebook/flaml_demo.ipynb
--- a/settings.json
+++ b/settings.json
@ -0,0 +1,4 @@
 {
    "keep_max_logfiles": 30,
    "logging_level": "info"
 }
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,56 @@
 import setuptools
 import os
 here = os.path.abspath(os.path.dirname(__file__))
 # Read the long description relative to this file rather than the current
 # working directory, and decode it as UTF-8 regardless of the platform
 # default encoding.
 with open(os.path.join(here, "README.md"), "r", encoding="utf-8") as fh:
    long_description = fh.read()
 # Get the code version by executing flaml/version.py into a scratch
 # namespace, so the package itself does not have to be imported.
 version = {}
 with open(os.path.join(here, "flaml/version.py")) as fp:
    exec(fp.read(), version)
 __version__ = version["__version__"]
 # Runtime dependencies. This must be a plain list of requirement strings:
 # a trailing comma after the closing bracket would turn it into a
 # one-element tuple that wraps the list.
 install_requires = [
    "NumPy>=1.16.2",
    "lightgbm>=2.3.1",
    "xgboost>=0.90",
    "scipy>=1.4.1",
    "catboost>=0.23",
    "scikit-learn>=0.23",
 ]
 # Package metadata; version, long_description and install_requires are
 # computed above.
 setuptools.setup(
    name="FLAML",
    version=__version__,
    author="Microsoft Corporation",
    author_email="hpo@microsoft.com",
    description="A fast and lightweight autoML system",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/microsoft/FLAML",
    packages=["flaml"],
    install_requires=install_requires,
    # Optional dependency groups, installable as FLAML[notebook] and
    # FLAML[test].
    extras_require={
        "notebook": [
            "openml==0.10.2",
            "jupyter",
            "matplotlib==3.2.0",
            "rgf-python",
        ],
        "test": [
            "flake8>=3.8.4",
            "pytest>=6.1.1",
            "coverage>=5.3",
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",        
    ],
    python_requires=">=3.6",
 )
--- a/test/init.py
+++ b/test/init.py
--- a/test/test_automl.py
+++ b/test/test_automl.py
@ -0,0 +1,235 @@
 import unittest
 import numpy as np
 import scipy.sparse
 from sklearn.datasets import load_boston, load_iris
 from flaml import AutoML, get_output_from_log
 def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    '''Custom metric that penalizes the gap between test and train loss.

    Returns:
        A pair: the value test_loss * (1 + alpha) - alpha * train_loss
        with alpha = 0.5, and the list [test_loss, train_loss], where both
        losses are the log loss of estimator.predict_proba.
    '''
    from sklearn.metrics import log_loss
    alpha = 0.5
    test_loss = log_loss(
        y_test, estimator.predict_proba(X_test), labels=labels)
    train_loss = log_loss(
        y_train, estimator.predict_proba(X_train), labels=labels)
    combined = test_loss * (1 + alpha) - alpha * train_loss
    return combined, [test_loss, train_loss]
 class TestAutoML(unittest.TestCase):
    # Smoke tests for AutoML: each test fits with a small time budget and
    # prints the resulting attributes; only a few of them assert on values.
    def test_dataframe(self):
        # Re-run the classification test with as_frame=True.
        self.test_classification(True)
    def test_custom_metric(self):
        # Fit on iris with a user-defined metric and holdout evaluation,
        # then read the log back.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            'eval_method': 'holdout',
            "metric": custom_metric,
            "task": 'classification',
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            'log_type': 'all',
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        # A fresh AutoML instance rebuilds an estimator from the first log
        # record.
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0,
            objective='multi')
        print(estimator)
        time_history, best_valid_loss_history, valid_loss_history, \
            config_history, train_loss_history = get_output_from_log(
                filename=automl_settings['log_file_name'], time_budget=6)
        print(train_loss_history)
    def test_classification(self, as_frame=False):
        # Fit on iris with the accuracy metric, then retrain from the log.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        # A fresh AutoML instance retrains the first logged configuration.
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train, y_train=y_train,
            train_full=True, record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
    def test_regression(self):
        # Fit on the first half of the boston data and pass the second half
        # as an explicit validation set.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mse',
            "task": 'regression',
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = len(y_train)
        automl_experiment.fit(X_train=X_train[:n >> 1], y_train=y_train[:n >> 1],
                              X_val=X_train[n >> 1:], y_val=y_train[n >> 1:],
                              **automl_settings)
        # The supplied validation set must be kept as is, with holdout
        # evaluation.
        assert automl_experiment.y_val.shape[0] == n - (n >> 1)
        assert automl_experiment.eval_method == 'holdout'
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
    def test_sparse_matrix_classification(self):
        # Classification on a random sparse integer matrix with three
        # random classes and the uniform split type.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'auto',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "model_history": True
        }
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
    def test_sparse_matrix_regression(self):
        # Regression on very sparse random matrices with an explicit sparse
        # validation set.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              X_val=X_val, y_val=y_val,
                              **automl_settings)
        # The supplied validation matrix must keep its shape.
        assert automl_experiment.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)
    def test_sparse_matrix_xgboost(self):
        # Binary classification restricted to xgboost on a 900000 x 900000
        # sparse identity matrix.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'ap',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
    def test_sparse_matrix_lr(self):
        # Binary classification restricted to the "lrl1" and "lrl2"
        # estimators on a random sparse matrix.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'f1',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
    def test_sparse_matrix_regression_cv(self):
        # Regression on a random sparse matrix with cross-validation as the
        # evaluation method.
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            'eval_method': 'cv',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(100, 100)
        y_train = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
 if __name__ == "__main__":
    unittest.main()
--- a/test/test_split.py
+++ b/test/test_split.py
@ -0,0 +1,45 @@
 import unittest
 from sklearn.datasets import fetch_openml
 from flaml.automl import AutoML
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 dataset = "Airlines"
 def _test(split_type):
    '''Fit AutoML on the OpenML dataset named by the module-level
    `dataset` with the given split type and print the accuracy on a
    held-out third of the data.'''
    automl = AutoML()
    # The "metric" setting is deliberately left out.
    automl_settings = dict(
        time_budget=2,
        task='classification',
        log_file_name="test/{}.log".format(dataset),
        model_history=True,
        log_training_metric=True,
        split_type=split_type,
    )
    X, y = fetch_openml(name=dataset, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(accuracy_score(y_test, automl.predict(X_test)))
 def test_stratified():
    '''Run the shared scenario with the "stratified" split type.'''
    _test("stratified")
 def test_uniform():
    '''Run the shared scenario with the "uniform" split type.'''
    _test("uniform")
 if __name__ == "__main__":
    unittest.main()
--- a/test/test_version.py
+++ b/test/test_version.py
@ -0,0 +1,14 @@
 import unittest
 import flaml
 class TestVersion(unittest.TestCase):
    # Checks that the package exposes a non-empty version.
    def test_version(self):
        has_version = hasattr(flaml, '__version__')
        self.assertTrue(has_version)
        self.assertGreater(len(flaml.__version__), 0)
 if __name__ == "__main__":
    unittest.main()