set black with 120 line length (#975)

* set black with 120 line length

* apply pre-commit

* apply black
Jirka Borovec 2023-04-10 21:50:40 +02:00 committed by GitHub
Parent ef5a17cd83
Commit a701cd82f8
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
97 changed files: 809 additions and 2696 deletions
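For reference, the two steps in the commit message correspond roughly to the following commands (a sketch, assuming black and pre-commit are installed locally, with black pinned to the rev used in the config below):

    pip install pre-commit black==23.1.0
    pre-commit run --all-files       # "apply pre-commit": run all configured hooks over the repo
    black --line-length 120 .        # "apply black": reformat sources to the 120-character limit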

View file

@ -11,6 +11,7 @@ repos:
rev: 23.1.0
hooks:
- id: black
args: ["--line-length=120"]
- repo: https://github.com/pycqa/flake8
rev: 6.0.0
hooks:
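Since the hunk shows only the changed line plus context, the complete black entry in .pre-commit-config.yaml would look roughly like this (the psf/black repo URL is an assumption; that line sits outside the hunk):

    repos:
      - repo: https://github.com/psf/black   # assumed; not visible in the hunk
        rev: 23.1.0
        hooks:
          - id: black
            args: ["--line-length=120"]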

View file

@ -27,9 +27,7 @@ def execute_code(code: str, max_exec_time: Optional[int] = 3):
return int(result.returncode == 0)
def generate_assertions(
definition: str, model: Optional[str] = "gpt-3.5-turbo"
) -> Tuple[str, float]:
def generate_assertions(definition: str, model: Optional[str] = "gpt-3.5-turbo") -> Tuple[str, float]:
"""Generate assertions for a function.
Args:
@ -112,9 +110,7 @@ def eval_function_completions(
for i in range(n):
response = responses[i] = _remove_check(responses[i])
code = (
f"{response}\n{assertions}"
if response.startswith("def")
else f"{definition}{response}\n{assertions}"
f"{response}\n{assertions}" if response.startswith("def") else f"{definition}{response}\n{assertions}"
)
succeed_assertions = execute_code(code)
if succeed_assertions:
@ -149,9 +145,7 @@ def eval_function_completions(
def implement(
definition: str,
configs: List[Dict],
assertions: Optional[
Union[str, Callable[[str], Tuple[str, float]]]
] = generate_assertions,
assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = generate_assertions,
) -> Tuple[str, float]:
"""Implement a function from a definition.
@ -172,9 +166,7 @@ def implement(
response = oai.Completion.create({"definition": definition}, **config)
cost += oai.Completion.cost(config["model"], response)
responses = oai.Completion.extract_text(response)
metrics = eval_function_completions(
responses, definition, assertions=assertions
)
metrics = eval_function_completions(responses, definition, assertions=assertions)
assertions = metrics["assertions"]
cost += metrics["gen_cost"]
if metrics["succeed_assertions"] or i == len(configs) - 1:

View file

@ -21,9 +21,7 @@ try:
ERROR = None
except ImportError:
ERROR = ImportError(
"please install flaml[openai] option to use the flaml.oai subpackage."
)
ERROR = ImportError("please install flaml[openai] option to use the flaml.oai subpackage.")
logger = logging.getLogger(__name__)
if not logger.handlers:
# Add the console handler.
@ -142,17 +140,11 @@ class Completion:
if response is not None and (response != -1 or not eval_only):
# print("using cached response")
return response
openai_completion = (
openai.ChatCompletion
if config["model"] in cls.chat_models
else openai.Completion
)
openai_completion = openai.ChatCompletion if config["model"] in cls.chat_models else openai.Completion
start_time = time.time()
while True:
try:
response = openai_completion.create(
request_timeout=cls.request_timeout, **config
)
response = openai_completion.create(request_timeout=cls.request_timeout, **config)
cls._cache.set(key, response)
return response
except (
@ -176,9 +168,7 @@ class Completion:
if "azure" == openai.api_type and "model" in config:
# azure api uses "engine" instead of "model"
config = config.copy()
config["engine"] = config.pop("model").replace(
"gpt-3.5-turbo", "gpt-35-turbo"
)
config["engine"] = config.pop("model").replace("gpt-3.5-turbo", "gpt-35-turbo")
else:
raise
logger.warning(
@ -193,11 +183,7 @@ class Completion:
# find the max value in max_valid_n_per_max_tokens
# whose key is equal or larger than max_tokens
return max(
(
value
for k, value in cls._max_valid_n_per_max_tokens.get(key, {}).items()
if k >= max_tokens
),
(value for k, value in cls._max_valid_n_per_max_tokens.get(key, {}).items() if k >= max_tokens),
default=1,
)
@ -206,11 +192,7 @@ class Completion:
# find the min value in min_invalid_n_per_max_tokens
# whose key is equal or smaller than max_tokens
return min(
(
value
for k, value in cls._min_invalid_n_per_max_tokens.get(key, {}).items()
if k <= max_tokens
),
(value for k, value in cls._min_invalid_n_per_max_tokens.get(key, {}).items() if k <= max_tokens),
default=None,
)
@ -227,12 +209,10 @@ class Completion:
def _update_invalid_n(cls, prune, region_key, max_tokens, num_completions):
if prune:
# update invalid n and prune this config
cls._min_invalid_n_per_max_tokens[
region_key
] = invalid_n = cls._min_invalid_n_per_max_tokens.get(region_key, {})
invalid_n[max_tokens] = min(
num_completions, invalid_n.get(max_tokens, np.inf)
cls._min_invalid_n_per_max_tokens[region_key] = invalid_n = cls._min_invalid_n_per_max_tokens.get(
region_key, {}
)
invalid_n[max_tokens] = min(num_completions, invalid_n.get(max_tokens, np.inf))
@classmethod
def _pop_subspace(cls, config):
@ -280,16 +260,12 @@ class Completion:
model = config["model"]
data_length = len(data)
price = cls.price1K.get(model)
price_input, price_output = (
price if isinstance(price, tuple) else (price, price)
)
price_input, price_output = price if isinstance(price, tuple) else (price, price)
inference_budget = getattr(cls, "inference_budget", None)
prune_hp = getattr(cls, "_prune_hp", "n")
metric = cls._metric
config_n = config.get(prune_hp, 1) # default value in OpenAI is 1
max_tokens = config.get(
"max_tokens", np.inf if model in cls.chat_models else 16
)
max_tokens = config.get("max_tokens", np.inf if model in cls.chat_models else 16)
prompt, messages = cls._get_prompt_messages_from_config(model, config)
stop = cls._stops and cls._stops[config["stop"]]
target_output_tokens = None
@ -300,9 +276,7 @@ class Completion:
region_key = cls._get_region_key(config)
max_valid_n = cls._get_max_valid_n(region_key, max_tokens)
if cls.avg_input_tokens:
target_output_tokens = (
inference_budget * 1000 - cls.avg_input_tokens * price_input
) / price_output
target_output_tokens = (inference_budget * 1000 - cls.avg_input_tokens * price_input) / price_output
# max_tokens bounds the maximum tokens
# so using it we can calculate a valid n according to the avg # input tokens
max_valid_n = max(
@ -340,16 +314,12 @@ class Completion:
while True: # data_limit <= data_length
# limit the number of data points to avoid rate limit
for i in range(prev_data_limit, data_limit):
logger.debug(
f"num_completions={num_completions}, data instance={i}"
)
logger.debug(f"num_completions={num_completions}, data instance={i}")
data_i = data[i]
params = cls._construct_params(data_i, params, prompt, messages)
response = cls._get_response(params, eval_only)
if response == -1: # rate limit error, treat as invalid
cls._update_invalid_n(
prune, region_key, max_tokens, num_completions
)
cls._update_invalid_n(prune, region_key, max_tokens, num_completions)
result[metric] = 0
result["cost"] = cost
return result
@ -361,16 +331,10 @@ class Completion:
if not cls.avg_input_tokens and not input_tokens[i]:
# store the # input tokens
input_tokens[i] = n_input_tokens
query_cost = (
price_input * n_input_tokens + price_output * n_output_tokens
) / 1000
query_cost = (price_input * n_input_tokens + price_output * n_output_tokens) / 1000
cls._total_cost += query_cost
cost += query_cost
if (
cls.optimization_budget
and cls._total_cost >= cls.optimization_budget
and not eval_only
):
if cls.optimization_budget and cls._total_cost >= cls.optimization_budget and not eval_only:
# limit the total tuning cost
return {
metric: 0,
@ -393,14 +357,8 @@ class Completion:
)
# Hoeffding-Serfling bound
ratio = 0.1 * np.sqrt(rho / data_limit)
if (
target_output_tokens
and avg_n_tokens > target_output_tokens * (1 + ratio)
and not eval_only
):
cls._update_invalid_n(
prune, region_key, max_tokens, num_completions
)
if target_output_tokens and avg_n_tokens > target_output_tokens * (1 + ratio) and not eval_only:
cls._update_invalid_n(prune, region_key, max_tokens, num_completions)
result[metric] = 0
result["total_cost"] = cls._total_cost
result["cost"] = cost
@ -409,19 +367,13 @@ class Completion:
prune
and target_output_tokens
and avg_n_tokens <= target_output_tokens * (1 - ratio)
and (
num_completions < config_n
or num_completions == config_n
and data_limit == data_length
)
and (num_completions < config_n or num_completions == config_n and data_limit == data_length)
):
# update valid n
cls._max_valid_n_per_max_tokens[
region_key
] = valid_n = cls._max_valid_n_per_max_tokens.get(region_key, {})
valid_n[max_tokens] = max(
num_completions, valid_n.get(max_tokens, 0)
cls._max_valid_n_per_max_tokens[region_key] = valid_n = cls._max_valid_n_per_max_tokens.get(
region_key, {}
)
valid_n[max_tokens] = max(num_completions, valid_n.get(max_tokens, 0))
if num_completions < config_n:
# valid already, skip the rest of the data
data_limit = data_length
@ -455,9 +407,7 @@ class Completion:
target_output_tokens = (
inference_budget * 1000 - cls.avg_input_tokens * price_input
) / price_output
result["inference_cost"] = (
avg_n_tokens * price_output + cls.avg_input_tokens * price_input
) / 1000
result["inference_cost"] = (avg_n_tokens * price_output + cls.avg_input_tokens * price_input) / 1000
break
else:
if data_early_stop:
@ -552,9 +502,7 @@ class Completion:
space.pop("temperature_or_top_p")
space["temperature"] = temperature
space["top_p"] = top_p
logger.warning(
"temperature and top_p are not recommended to vary together."
)
logger.warning("temperature and top_p are not recommended to vary together.")
cls._max_valid_n_per_max_tokens, cls._min_invalid_n_per_max_tokens = {}, {}
cls.optimization_budget = optimization_budget
cls.inference_budget = inference_budget
@ -569,12 +517,8 @@ class Completion:
cls._messages = [cls._messages]
space["messages"] = tune.choice(list(range(len(cls._messages))))
else:
assert (
space.get("messages") is None
), "messages and prompt cannot be provided at the same time."
assert isinstance(
cls._prompts, (str, list)
), "prompt must be a string or a list of strings."
assert space.get("messages") is None, "messages and prompt cannot be provided at the same time."
assert isinstance(cls._prompts, (str, list)), "prompt must be a string or a list of strings."
if isinstance(cls._prompts, str):
cls._prompts = [cls._prompts]
space["prompt"] = tune.choice(list(range(len(cls._prompts))))
@ -703,9 +647,7 @@ class Completion:
# or "messages" should be in config (for tuning chat models only)
if prompt is None and model in cls.chat_models:
if messages is None:
raise ValueError(
"Either prompt or messages should be in config for chat models."
)
raise ValueError("Either prompt or messages should be in config for chat models.")
if prompt is None:
params["messages"] = [
{
@ -725,18 +667,12 @@ class Completion:
params["messages"] = [
{
"role": "user",
"content": prompt_msg
if isinstance(prompt, str)
else prompt(data_instance),
"content": prompt_msg if isinstance(prompt, str) else prompt(data_instance),
},
]
params.pop("prompt", None)
else:
params["prompt"] = (
prompt.format(**data_instance)
if isinstance(prompt, str)
else prompt(data_instance)
)
params["prompt"] = prompt.format(**data_instance) if isinstance(prompt, str) else prompt(data_instance)
return params
@classmethod
@ -855,9 +791,7 @@ class Completion:
elif isinstance(agg_method, dict):
for key in metric_keys:
metric_agg_method = agg_method[key]
assert callable(
metric_agg_method
), "please provide a callable for each metric"
assert callable(metric_agg_method), "please provide a callable for each metric"
result_agg[key] = metric_agg_method([r[key] for r in result_list])
else:
raise ValueError(

View file

@ -387,15 +387,11 @@ class AutoML(BaseEstimator):
settings["free_mem_ratio"] = settings.get("free_mem_ratio", 0)
settings["metric_constraints"] = settings.get("metric_constraints", [])
settings["cv_score_agg_func"] = settings.get("cv_score_agg_func", None)
settings["fit_kwargs_by_estimator"] = settings.get(
"fit_kwargs_by_estimator", {}
)
settings["fit_kwargs_by_estimator"] = settings.get("fit_kwargs_by_estimator", {})
settings["custom_hp"] = settings.get("custom_hp", {})
settings["skip_transform"] = settings.get("skip_transform", False)
self._estimator_type = (
"classifier" if settings["task"] in CLASSIFICATION else "regressor"
)
self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
def get_params(self, deep: bool = False) -> dict:
return self._settings.copy()
@ -452,18 +448,14 @@ class AutoML(BaseEstimator):
def best_config_per_estimator(self):
"""A dictionary of all estimators' best configuration."""
return {
e: e_search_state.best_config
and AutoMLState.sanitize(e_search_state.best_config)
e: e_search_state.best_config and AutoMLState.sanitize(e_search_state.best_config)
for e, e_search_state in self._search_states.items()
}
@property
def best_loss_per_estimator(self):
"""A dictionary of all estimators' best loss."""
return {
e: e_search_state.best_loss
for e, e_search_state in self._search_states.items()
}
return {e: e_search_state.best_loss for e, e_search_state in self._search_states.items()}
@property
def best_loss(self):
@ -482,16 +474,12 @@ class AutoML(BaseEstimator):
associated with the best config. These two objects correspond to the returned
objects by the customized metric function for the config with the best loss."""
state = self._search_states.get(self._best_estimator)
return self._state.best_loss, state and getattr(state, "best_result", {}).get(
"metric_for_logging"
)
return self._state.best_loss, state and getattr(state, "best_result", {}).get("metric_for_logging")
@property
def best_config_train_time(self):
"""A float of the seconds taken by training the best config."""
return getattr(
self._search_states[self._best_estimator], "best_config_train_time", None
)
return getattr(self._search_states[self._best_estimator], "best_config_train_time", None)
def save_best_config(self, filename):
best = {
@ -544,9 +532,7 @@ class AutoML(BaseEstimator):
):
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
logger.warning("No estimator is trained. Please run fit with enough budget.")
return None
X = self._state.task.preprocess(X, self._transformer)
if self._label_transformer:
@ -588,23 +574,15 @@ class AutoML(BaseEstimator):
"""
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
logger.warning("No estimator is trained. Please run fit with enough budget.")
return None
X = self._state.task.preprocess(X, self._transformer)
y_pred = estimator.predict(X, **pred_kwargs)
if (
isinstance(y_pred, np.ndarray)
and y_pred.ndim > 1
and isinstance(y_pred, np.ndarray)
):
if isinstance(y_pred, np.ndarray) and y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
return self._label_transformer.inverse_transform(
pd.Series(y_pred.astype(int))
)
return self._label_transformer.inverse_transform(pd.Series(y_pred.astype(int)))
else:
return y_pred
@ -623,9 +601,7 @@ class AutoML(BaseEstimator):
"""
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
logger.warning("No estimator is trained. Please run fit with enough budget.")
return None
X = self._state.task.preprocess(X, self._transformer)
proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
@ -640,9 +616,7 @@ class AutoML(BaseEstimator):
"""
self._state.learner_classes[learner_name] = learner_class
def get_estimator_from_log(
self, log_file_name: str, record_id: int, task: Union[str, Task]
):
def get_estimator_from_log(self, log_file_name: str, record_id: int, task: Union[str, Task]):
"""Get the estimator from log file.
Args:
@ -820,30 +794,18 @@ class AutoML(BaseEstimator):
split_ratio = split_ratio or self._settings.get("split_ratio")
n_splits = n_splits or self._settings.get("n_splits")
split_type = split_type or self._settings.get("split_type")
auto_augment = (
self._settings.get("auto_augment") if auto_augment is None else auto_augment
)
auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment
self._state.task = task
self._estimator_type = "classifier" if task.is_classification() else "regressor"
self._state.fit_kwargs = fit_kwargs
self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
self._skip_transform = (
self._settings.get("skip_transform")
if skip_transform is None
else skip_transform
)
self._state.fit_kwargs_by_estimator = (
fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
)
self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
self._state.fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
self.preserve_checkpoint = (
self._settings.get("preserve_checkpoint")
if preserve_checkpoint is None
else preserve_checkpoint
)
task.validate_data(
self, self._state, X_train, y_train, dataframe, label, groups=groups
self._settings.get("preserve_checkpoint") if preserve_checkpoint is None else preserve_checkpoint
)
task.validate_data(self, self._state, X_train, y_train, dataframe, label, groups=groups)
logger.info("log file name {}".format(log_file_name))
@ -877,9 +839,7 @@ class AutoML(BaseEstimator):
best_val_loss = val_loss
sample_size = size
if not training_duration:
logger.warning(
f"No estimator found within time_budget={time_budget}"
)
logger.warning(f"No estimator found within time_budget={time_budget}")
from .model import BaseEstimator as Estimator
self._trained_estimator = Estimator()
@ -901,9 +861,7 @@ class AutoML(BaseEstimator):
self._state.fit_kwargs_by_estimator[best_estimator] = self._state.fit_kwargs
logger.info(
"estimator = {}, config = {}, #training instances = {}".format(
best_estimator, best_config, sample_size
)
"estimator = {}, config = {}, #training instances = {}".format(best_estimator, best_config, sample_size)
)
# Partially copied from fit() function
# Initialize some attributes required for retrain_from_log
@ -944,9 +902,7 @@ class AutoML(BaseEstimator):
"auto",
"cv",
], "eval_method must be 'auto' or 'cv' for custom data splitter."
assert (
self._state.X_val is None
), "custom splitter and custom validation data can't be used together."
assert self._state.X_val is None, "custom splitter and custom validation data can't be used together."
return "cv"
if self._state.X_val is not None:
assert eval_method in [
@ -1051,10 +1007,7 @@ class AutoML(BaseEstimator):
c = self._search_states[estimator].cat_hp_cost
configs.append(c)
configs.append(
[
self._state.learner_classes.get(estimator).cost_relative2lgbm()
for estimator in self.estimator_list
]
[self._state.learner_classes.get(estimator).cost_relative2lgbm() for estimator in self.estimator_list]
)
config = {"ml": configs}
return config
@ -1112,9 +1065,7 @@ class AutoML(BaseEstimator):
for estimator in self.estimator_list:
search_state = self._search_states[estimator]
if hasattr(search_state, "training_function"):
estimator_to_training_function[
estimator
] = search_state.training_function
estimator_to_training_function[estimator] = search_state.training_function
del search_state.training_function
with open(output_file_name, "wb") as f:
@ -1577,74 +1528,44 @@ class AutoML(BaseEstimator):
eval_method = eval_method or self._settings.get("eval_method")
split_ratio = split_ratio or self._settings.get("split_ratio")
n_splits = n_splits or self._settings.get("n_splits")
auto_augment = (
self._settings.get("auto_augment") if auto_augment is None else auto_augment
)
auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment
metric = metric or self._settings.get("metric")
estimator_list = estimator_list or self._settings.get("estimator_list")
log_file_name = (
self._settings.get("log_file_name")
if log_file_name is None
else log_file_name
)
log_file_name = self._settings.get("log_file_name") if log_file_name is None else log_file_name
max_iter = self._settings.get("max_iter") if max_iter is None else max_iter
sample_is_none = sample is None
if sample_is_none:
sample = self._settings.get("sample")
ensemble = self._settings.get("ensemble") if ensemble is None else ensemble
log_type = log_type or self._settings.get("log_type")
model_history = (
self._settings.get("model_history")
if model_history is None
else model_history
)
model_history = self._settings.get("model_history") if model_history is None else model_history
log_training_metric = (
self._settings.get("log_training_metric")
if log_training_metric is None
else log_training_metric
self._settings.get("log_training_metric") if log_training_metric is None else log_training_metric
)
mem_thres = mem_thres or self._settings.get("mem_thres")
pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit")
train_time_limit = train_time_limit or self._settings.get("train_time_limit")
self._metric_constraints = metric_constraints or self._settings.get(
"metric_constraints"
)
self._metric_constraints = metric_constraints or self._settings.get("metric_constraints")
if np.isfinite(pred_time_limit):
self._metric_constraints.append(("pred_time", "<=", pred_time_limit))
verbose = self._settings.get("verbose") if verbose is None else verbose
retrain_full = (
self._settings.get("retrain_full") if retrain_full is None else retrain_full
)
retrain_full = self._settings.get("retrain_full") if retrain_full is None else retrain_full
split_type = split_type or self._settings.get("split_type")
hpo_method = hpo_method or self._settings.get("hpo_method")
learner_selector = learner_selector or self._settings.get("learner_selector")
no_starting_points = starting_points is None
if no_starting_points:
starting_points = self._settings.get("starting_points")
n_concurrent_trials = n_concurrent_trials or self._settings.get(
"n_concurrent_trials"
)
keep_search_state = (
self._settings.get("keep_search_state")
if keep_search_state is None
else keep_search_state
)
n_concurrent_trials = n_concurrent_trials or self._settings.get("n_concurrent_trials")
keep_search_state = self._settings.get("keep_search_state") if keep_search_state is None else keep_search_state
self.preserve_checkpoint = (
self._settings.get("preserve_checkpoint")
if preserve_checkpoint is None
else preserve_checkpoint
)
early_stop = (
self._settings.get("early_stop") if early_stop is None else early_stop
)
force_cancel = (
self._settings.get("force_cancel") if force_cancel is None else force_cancel
self._settings.get("preserve_checkpoint") if preserve_checkpoint is None else preserve_checkpoint
)
early_stop = self._settings.get("early_stop") if early_stop is None else early_stop
force_cancel = self._settings.get("force_cancel") if force_cancel is None else force_cancel
# no search budget is provided?
no_budget = time_budget < 0 and max_iter is None and not early_stop
append_log = (
self._settings.get("append_log") if append_log is None else append_log
)
append_log = self._settings.get("append_log") if append_log is None else append_log
min_sample_size = min_sample_size or self._settings.get("min_sample_size")
use_ray = self._settings.get("use_ray") if use_ray is None else use_ray
use_spark = self._settings.get("use_spark") if use_spark is None else use_spark
@ -1698,11 +1619,7 @@ class AutoML(BaseEstimator):
if self._use_ray is not False:
import ray
n_cpus = (
ray.is_initialized()
and ray.available_resources()["CPU"]
or os.cpu_count()
)
n_cpus = ray.is_initialized() and ray.available_resources()["CPU"] or os.cpu_count()
self._state.resources_per_trial = (
# when using gpu, default cpu is 1 per job; otherwise, default cpu is n_cpus / n_concurrent_trials
@ -1724,30 +1641,16 @@ class AutoML(BaseEstimator):
dataframe = ray.get(dataframe)
else:
# TODO: Integrate with Spark
self._state.resources_per_trial = (
{"cpu": n_jobs} if n_jobs > 0 else {"cpu": 1}
)
self._state.free_mem_ratio = (
self._settings.get("free_mem_ratio")
if free_mem_ratio is None
else free_mem_ratio
)
self._state.resources_per_trial = {"cpu": n_jobs} if n_jobs > 0 else {"cpu": 1}
self._state.free_mem_ratio = self._settings.get("free_mem_ratio") if free_mem_ratio is None else free_mem_ratio
self._state.task = task
self._state.log_training_metric = log_training_metric
self._state.fit_kwargs = fit_kwargs
custom_hp = custom_hp or self._settings.get("custom_hp")
self._skip_transform = (
self._settings.get("skip_transform")
if skip_transform is None
else skip_transform
)
fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get(
"fit_kwargs_by_estimator"
)
self._state.fit_kwargs_by_estimator = (
fit_kwargs_by_estimator.copy()
) # shallow copy of fit_kwargs_by_estimator
self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
self._state.fit_kwargs_by_estimator = fit_kwargs_by_estimator.copy() # shallow copy of fit_kwargs_by_estimator
self._state.weight_val = sample_weight_val
task.validate_data(
@ -1777,13 +1680,9 @@ class AutoML(BaseEstimator):
eval_method = self._decide_eval_method(eval_method, time_budget)
self._state.eval_method = eval_method
logger.info("Evaluation method: {}".format(eval_method))
self._state.cv_score_agg_func = cv_score_agg_func or self._settings.get(
"cv_score_agg_func"
)
self._state.cv_score_agg_func = cv_score_agg_func or self._settings.get("cv_score_agg_func")
self._retrain_in_budget = retrain_full == "budget" and (
eval_method == "holdout" and self._state.X_val is None
)
self._retrain_in_budget = retrain_full == "budget" and (eval_method == "holdout" and self._state.X_val is None)
self._auto_augment = auto_augment
_sample_size_from_starting_points = {}
@ -1805,9 +1704,7 @@ class AutoML(BaseEstimator):
]
)
if _sample_size_set:
_sample_size_from_starting_points[_estimator] = min(
_sample_size_set
)
_sample_size_from_starting_points[_estimator] = min(_sample_size_set)
if len(_sample_size_set) > 1:
logger.warning(
"Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
@ -1831,10 +1728,7 @@ class AutoML(BaseEstimator):
sample
and not task.is_rank()
and eval_method != "cv"
and (
self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
< self._state.data_size[0]
),
and (self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR < self._state.data_size[0]),
)
for k in self._min_sample_size.keys()
}
@ -1843,10 +1737,7 @@ class AutoML(BaseEstimator):
sample
and not task.is_rank()
and eval_method != "cv"
and (
self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
< self._state.data_size[0]
)
and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size[0])
)
metric = task.default_metric(metric)
@ -1874,10 +1765,7 @@ class AutoML(BaseEstimator):
if task.is_nlp():
from flaml.automl.ml import huggingface_metric_to_mode
if (
metric in huggingface_metric_to_mode
and huggingface_metric_to_mode[metric] == "max"
):
if metric in huggingface_metric_to_mode and huggingface_metric_to_mode[metric] == "max":
return True, f"-{metric}"
return False, None
@ -1891,9 +1779,7 @@ class AutoML(BaseEstimator):
error_metric = "customized metric"
logger.info(f"Minimizing error metric: {error_metric}")
is_spark_dataframe = isinstance(X_train, psDataFrame) or isinstance(
dataframe, psDataFrame
)
is_spark_dataframe = isinstance(X_train, psDataFrame) or isinstance(dataframe, psDataFrame)
estimator_list = task.default_estimator_list(estimator_list, is_spark_dataframe)
if is_spark_dataframe and self._use_spark:
@ -1954,9 +1840,7 @@ class AutoML(BaseEstimator):
location,
k=1,
)
starting_points[estimator_name] = [
x["hyperparameters"] for x in configs
]
starting_points[estimator_name] = [x["hyperparameters"] for x in configs]
except FileNotFoundError:
pass
try:
@ -1978,9 +1862,7 @@ class AutoML(BaseEstimator):
for estimator_name in estimator_list:
estimator_class = self._state.learner_classes[estimator_name]
estimator_class.init()
this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(
estimator_name
)
this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(estimator_name)
if this_estimator_kwargs:
# make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
this_estimator_kwargs = this_estimator_kwargs.copy()
@ -1991,9 +1873,7 @@ class AutoML(BaseEstimator):
estimator_name
] = this_estimator_kwargs # set self._state.fit_kwargs_by_estimator[estimator_name] to the update, so only self._state.fit_kwargs_by_estimator will be updated
else:
self._state.fit_kwargs_by_estimator[
estimator_name
] = self._state.fit_kwargs
self._state.fit_kwargs_by_estimator[estimator_name] = self._state.fit_kwargs
self._search_states[estimator_name] = SearchState(
learner_class=estimator_class,
@ -2004,9 +1884,7 @@ class AutoML(BaseEstimator):
"period"
), # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
custom_hp=custom_hp and custom_hp.get(estimator_name),
max_iter=max_iter / len(estimator_list)
if self._learner_selector == "roundrobin"
else max_iter,
max_iter=max_iter / len(estimator_list) if self._learner_selector == "roundrobin" else max_iter,
budget=self._state.time_budget,
)
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
@ -2040,9 +1918,7 @@ class AutoML(BaseEstimator):
self._search()
if self._best_estimator:
logger.info("fit succeeded")
logger.info(
f"Time taken to find the best model: {self._time_taken_best_iter}"
)
logger.info(f"Time taken to find the best model: {self._time_taken_best_iter}")
if (
self._hpo_method in ("cfo", "bs")
and self._state.time_budget > 0
@ -2085,10 +1961,7 @@ class AutoML(BaseEstimator):
from ray.tune.search import ConcurrencyLimiter
import ray
except (ImportError, AssertionError):
raise ImportError(
"use_ray=True requires installation of ray. "
"Please run pip install flaml[ray]"
)
raise ImportError("use_ray=True requires installation of ray. " "Please run pip install flaml[ray]")
else:
from flaml.tune.searcher.suggestion import ConcurrencyLimiter
@ -2116,16 +1989,11 @@ class AutoML(BaseEstimator):
from flaml.tune.searcher.suggestion import OptunaSearch as SearchAlgo
else:
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "
"'auto', 'cfo' and 'bs' are supported."
f"hpo_method={self._hpo_method} is not recognized. " "'auto', 'cfo' and 'bs' are supported."
)
space = self.search_space
self._state.time_from_start = time.time() - self._start_time_flag
time_budget_s = (
self._state.time_budget - self._state.time_from_start
if self._state.time_budget >= 0
else None
)
time_budget_s = self._state.time_budget - self._state.time_from_start if self._state.time_budget >= 0 else None
if self._hpo_method != "optuna":
min_resource = self.min_resource
if isinstance(min_resource, dict):
@ -2146,9 +2014,7 @@ class AutoML(BaseEstimator):
resource_attr=self.resource_attr,
min_resource=min_resource_all_estimator,
max_resource=self.max_resource,
config_constraints=[
(partial(size, self._state.learner_classes), "<=", self._mem_thres)
],
config_constraints=[(partial(size, self._state.learner_classes), "<=", self._mem_thres)],
metric_constraints=self.metric_constraints,
seed=self._seed,
time_budget_s=time_budget_s,
@ -2171,9 +2037,7 @@ class AutoML(BaseEstimator):
search_alg = SearchAlgo(
metric="val_loss",
mode="min",
points_to_evaluate=[
p for p in new_points_to_evaluate if len(p) == len(converted_space)
],
points_to_evaluate=[p for p in new_points_to_evaluate if len(p) == len(converted_space)],
)
search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
resources_per_trial = self._state.resources_per_trial
@ -2218,8 +2082,7 @@ class AutoML(BaseEstimator):
(
trial
for trial in analysis.trials
if trial.last_result
and trial.last_result.get("wall_clock_time") is not None
if trial.last_result and trial.last_result.get("wall_clock_time") is not None
),
key=lambda x: x.last_result["wall_clock_time"],
)
@ -2272,14 +2135,10 @@ class AutoML(BaseEstimator):
if (search_state.metric_for_logging is not None) and (
"intermediate_results" in search_state.metric_for_logging
):
for each_entry in search_state.metric_for_logging[
"intermediate_results"
]:
for each_entry in search_state.metric_for_logging["intermediate_results"]:
with mlflow.start_run(nested=True):
mlflow.log_metrics(each_entry)
mlflow.log_metric(
"iter_counter", self._iter_per_learner[estimator]
)
mlflow.log_metric("iter_counter", self._iter_per_learner[estimator])
del search_state.metric_for_logging["intermediate_results"]
if search_state.metric_for_logging:
mlflow.log_metrics(search_state.metric_for_logging)
@ -2325,8 +2184,7 @@ class AutoML(BaseEstimator):
from flaml.tune.searcher.cfo_cat import CFOCat as SearchAlgo
else:
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "
"'cfo' and 'bs' are supported."
f"hpo_method={self._hpo_method} is not recognized. " "'cfo' and 'bs' are supported."
)
est_retrain_time = next_trial_time = 0
@ -2348,8 +2206,7 @@ class AutoML(BaseEstimator):
if not self._retrain_in_budget
or better
or (not self.best_estimator)
or self._search_states[self.best_estimator].sample_size
< self._state.data_size[0]
or self._search_states[self.best_estimator].sample_size < self._state.data_size[0]
else time_left - est_retrain_time
)
if not search_state.search_alg:
@ -2363,8 +2220,7 @@ class AutoML(BaseEstimator):
resource_attr = "FLAML_sample_size"
min_resource = (
self._min_sample_size[estimator]
if isinstance(self._min_sample_size, dict)
and estimator in self._min_sample_size
if isinstance(self._min_sample_size, dict) and estimator in self._min_sample_size
else self._min_sample_size_input
)
max_resource = self._state.data_size[0]
@ -2391,9 +2247,7 @@ class AutoML(BaseEstimator):
low_cost_partial_config = search_state.low_cost_partial_config
time_budget_s = (
min(budget_left, self._state.train_time_limit or np.inf)
if self._state.time_budget >= 0
else None
min(budget_left, self._state.train_time_limit or np.inf) if self._state.time_budget >= 0 else None
)
if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"):
algo = SearchAlgo(
@ -2406,9 +2260,7 @@ class AutoML(BaseEstimator):
resource_attr=resource_attr,
min_resource=min_resource,
max_resource=max_resource,
config_constraints=[
(learner_class.size, "<=", self._mem_thres)
],
config_constraints=[(learner_class.size, "<=", self._mem_thres)],
metric_constraints=self.metric_constraints,
seed=self._seed,
allow_empty_config=True,
@ -2419,9 +2271,7 @@ class AutoML(BaseEstimator):
# if self._hpo_method is optuna, sometimes the search space and the initial config dimension do not match
# need to remove the extra keys from the search space to be consistent with the initial config
converted_space = SearchAlgo.convert_search_space(search_space)
removed_keys = set(search_space.keys()).difference(
converted_space.keys()
)
removed_keys = set(search_space.keys()).difference(converted_space.keys())
new_points_to_evaluate = []
for idx in range(len(points_to_evaluate)):
r = points_to_evaluate[idx].copy()
@ -2434,9 +2284,7 @@ class AutoML(BaseEstimator):
metric="val_loss",
mode="min",
space=search_space,
points_to_evaluate=[
p for p in points_to_evaluate if len(p) == len(search_space)
],
points_to_evaluate=[p for p in points_to_evaluate if len(p) == len(search_space)],
)
search_state.search_alg = ConcurrencyLimiter(algo, max_concurrent=1)
# search_state.search_alg = algo
@ -2467,9 +2315,7 @@ class AutoML(BaseEstimator):
eci_base = search_state.init_eci
self._eci.append(search_state.estimated_cost4improvement)
for e in self.estimator_list[1:]:
self._eci.append(
self._search_states[e].init_eci / eci_base * self._eci[0]
)
self._eci.append(self._search_states[e].init_eci / eci_base * self._eci[0])
self._estimator_index = 0
min_budget = max(10 * self._eci[0], sum(self._eci))
max_budget = 10000 * self._eci[0]
@ -2536,14 +2382,10 @@ class AutoML(BaseEstimator):
if (
self._hpo_method in ("cfo", "bs")
and all(
state.search_alg
and state.search_alg.searcher.is_ls_ever_converged
state.search_alg and state.search_alg.searcher.is_ls_ever_converged
for state in self._search_states.values()
)
and (
self._state.time_from_start
> self._warn_threshold * self._time_taken_best_iter
)
and (self._state.time_from_start > self._warn_threshold * self._time_taken_best_iter)
):
logger.warning(
"All estimator hyperparameters local search has "
@ -2566,8 +2408,7 @@ class AutoML(BaseEstimator):
and best_config_sig
and est_retrain_time
and not better
and self._search_states[self._best_estimator].sample_size
== self._state.data_size[0]
and self._search_states[self._best_estimator].sample_size == self._state.data_size[0]
and (
est_retrain_time
<= self._state.time_budget - self._state.time_from_start
@ -2580,18 +2421,11 @@ class AutoML(BaseEstimator):
state.best_config,
self.data_size_full,
)
logger.info(
"retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
)
self._retrained_config[
best_config_sig
] = state.best_config_train_time = retrain_time
logger.info("retrain {} for {:.1f}s".format(self._best_estimator, retrain_time))
self._retrained_config[best_config_sig] = state.best_config_train_time = retrain_time
est_retrain_time = 0
self._state.time_from_start = time.time() - self._start_time_flag
if (
self._state.time_from_start >= self._state.time_budget >= 0
or not self._active_estimators
):
if self._state.time_from_start >= self._state.time_budget >= 0 or not self._active_estimators:
break
if self._ensemble and self._best_estimator:
time_left = self._state.time_budget - self._state.time_from_start
@ -2636,9 +2470,7 @@ class AutoML(BaseEstimator):
self._state.time_from_start = time.time() - self._start_time_flag
if self._best_estimator:
self._selected = self._search_states[self._best_estimator]
self.modelcount = sum(
search_state.total_iter for search_state in self._search_states.values()
)
self.modelcount = sum(search_state.total_iter for search_state in self._search_states.values())
if self._trained_estimator:
logger.info(f"selected model: {self._trained_estimator.model}")
estimators = []
@ -2647,9 +2479,7 @@ class AutoML(BaseEstimator):
"multiclass",
"regression",
):
search_states = list(
x for x in self._search_states.items() if x[1].best_config
)
search_states = list(x for x in self._search_states.items() if x[1].best_config)
search_states.sort(key=lambda x: x[1].best_loss)
estimators = [
(
@ -2674,9 +2504,7 @@ class AutoML(BaseEstimator):
for x in search_states[2:]
if x[1].best_loss < 4 * self._selected.best_loss
]
logger.info(
[(estimator[0], estimator[1].params) for estimator in estimators]
)
logger.info([(estimator[0], estimator[1].params) for estimator in estimators])
if len(estimators) > 1:
if self._state.task.is_classification():
from sklearn.ensemble import StackingClassifier as Stacker
@ -2685,11 +2513,7 @@ class AutoML(BaseEstimator):
if self._use_ray is not False:
import ray
n_cpus = (
ray.is_initialized()
and ray.available_resources()["CPU"]
or os.cpu_count()
)
n_cpus = ray.is_initialized() and ray.available_resources()["CPU"] or os.cpu_count()
elif self._use_spark:
from flaml.tune.spark.utils import get_n_cpus
@ -2698,15 +2522,12 @@ class AutoML(BaseEstimator):
n_cpus = os.cpu_count()
ensemble_n_jobs = (
-self._state.n_jobs # maximize total parallelization degree
if abs(self._state.n_jobs)
== 1 # 1 and -1 correspond to min/max parallelization
if abs(self._state.n_jobs) == 1 # 1 and -1 correspond to min/max parallelization
else max(1, int(n_cpus / 2 / self._state.n_jobs))
# the total degree of parallelization = parallelization degree per estimator * parallelization degree of ensemble
)
if isinstance(self._ensemble, dict):
final_estimator = self._ensemble.get(
"final_estimator", self._trained_estimator
)
final_estimator = self._ensemble.get("final_estimator", self._trained_estimator)
passthrough = self._ensemble.get("passthrough", True)
ensemble_n_jobs = self._ensemble.get("n_jobs", ensemble_n_jobs)
else:
@ -2719,9 +2540,7 @@ class AutoML(BaseEstimator):
passthrough=passthrough,
)
sample_weight_dict = (
(self._sample_weight_full is not None)
and {"sample_weight": self._sample_weight_full}
or {}
(self._sample_weight_full is not None) and {"sample_weight": self._sample_weight_full} or {}
)
for e in estimators:
e[1].__class__.init()
@ -2776,8 +2595,7 @@ class AutoML(BaseEstimator):
or self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
)
and self._selected.best_config_sample_size
== self._state.data_size[0]
and self._selected.best_config_sample_size == self._state.data_size[0]
):
state = self._search_states[self._best_estimator]
(
@ -2788,11 +2606,7 @@ class AutoML(BaseEstimator):
state.best_config,
self.data_size_full,
)
logger.info(
"retrain {} for {:.1f}s".format(
self._best_estimator, retrain_time
)
)
logger.info("retrain {} for {:.1f}s".format(self._best_estimator, retrain_time))
state.best_config_train_time = retrain_time
if self._trained_estimator:
logger.info(f"retrained model: {self._trained_estimator.model}")
@ -2827,16 +2641,12 @@ class AutoML(BaseEstimator):
self._state.time_budget >= 0
and self._search_states[estimator].time2eval_best
> self._state.time_budget - self._state.time_from_start
or self._iter_per_learner_fullsize[estimator]
>= self._max_iter_per_learner
or self._iter_per_learner_fullsize[estimator] >= self._max_iter_per_learner
):
inv.append(0)
continue
estimated_cost = search_state.estimated_cost4improvement
if (
search_state.sample_size < self._state.data_size[0]
and self._state.time_budget >= 0
):
if search_state.sample_size < self._state.data_size[0] and self._state.time_budget >= 0:
estimated_cost = min(
estimated_cost,
search_state.time2eval_best
@ -2847,12 +2657,8 @@ class AutoML(BaseEstimator):
)
gap = search_state.best_loss - self._state.best_loss
if gap > 0 and not self._ensemble:
delta_loss = (
search_state.best_loss_old - search_state.best_loss
) or search_state.best_loss
delta_time = (
search_state.total_time_used - search_state.time_best_found_old
) or 1e-10
delta_loss = (search_state.best_loss_old - search_state.best_loss) or search_state.best_loss
delta_time = (search_state.total_time_used - search_state.time_best_found_old) or 1e-10
speed = delta_loss / delta_time
if speed:
estimated_cost = max(2 * gap / speed, estimated_cost)

View file

@ -35,9 +35,7 @@ TS_TIMESTAMP_COL = "ds"
TS_VALUE_COL = "y"
def load_openml_dataset(
dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
):
def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"):
"""Load dataset from open ML.
If the file is not cached locally, download it from open ML.
@ -77,9 +75,7 @@ def load_openml_dataset(
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print("Dataset name:", dataset.name)
try:
X, y, *__ = dataset.get_data(
target=dataset.default_target_attribute, dataset_format=dataset_format
)
X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
except ValueError:
from sklearn.datasets import fetch_openml
@ -267,9 +263,7 @@ def add_time_idx_col(X):
class DataTransformer:
"""Transform input training data."""
def fit_transform(
self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Task"]
):
def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Task"]):
"""Fit transformer and process the input training data according to the task type.
Args:
@ -312,21 +306,13 @@ class DataTransformer:
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ("object", "category"):
if (
X[column].nunique() == 1
or X[column].nunique(dropna=True)
== n - X[column].isnull().sum()
):
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
X.drop(columns=column, inplace=True)
drop = True
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
if "__NAN__" not in current_categories:
X[column] = (
X[column]
.cat.add_categories("__NAN__")
.fillna("__NAN__")
)
X[column] = X[column].cat.add_categories("__NAN__").fillna("__NAN__")
cat_columns.append(column)
else:
X[column] = X[column].fillna("__NAN__")
@ -349,10 +335,7 @@ class DataTransformer:
f"quarter_{column}": tmp_dt.quarter,
}
for key, value in new_columns_dict.items():
if (
key not in X.columns
and value.nunique(dropna=False) >= 2
):
if key not in X.columns and value.nunique(dropna=False) >= 2:
X[key] = value
num_columns.append(key)
X[column] = X[column].map(datetime.toordinal)
@ -368,9 +351,7 @@ class DataTransformer:
if num_columns:
X_num = X[num_columns]
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop
or min(X_num.columns) != 0
or max(X_num.columns) != X_num.shape[1] - 1
drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1
):
X_num.columns = range(X_num.shape[1])
drop = True
@ -395,11 +376,7 @@ class DataTransformer:
datetime_columns,
)
self._drop = drop
if (
task.is_classification()
or not pd.api.types.is_numeric_dtype(y)
and not task.is_nlg()
):
if task.is_classification() or not pd.api.types.is_numeric_dtype(y) and not task.is_nlg():
if not task.is_token_classification():
from sklearn.preprocessing import LabelEncoder
@ -466,9 +443,7 @@ class DataTransformer:
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
if "__NAN__" not in current_categories:
X[column] = (
X[column].cat.add_categories("__NAN__").fillna("__NAN__")
)
X[column] = X[column].cat.add_categories("__NAN__").fillna("__NAN__")
if cat_columns:
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:

View file

@ -132,9 +132,7 @@ def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
if "xgboost" == estimator_name:
estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
elif "xgb_limitdepth" == estimator_name:
estimator_class = (
XGBoostLimitDepth_TS if task in TS_FORECAST else XGBoostLimitDepthEstimator
)
estimator_class = XGBoostLimitDepth_TS if task in TS_FORECAST else XGBoostLimitDepthEstimator
elif "rf" == estimator_name:
estimator_class = RF_TS if task in TS_FORECAST else RandomForestEstimator
elif "lgbm" == estimator_name:
@ -203,40 +201,27 @@ def metric_loss_score(
try:
import datasets
datasets_metric_name = huggingface_submetric_to_metric.get(
metric_name, metric_name.split(":")[0]
)
datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
metric = datasets.load_metric(datasets_metric_name)
metric_mode = huggingface_metric_to_mode[datasets_metric_name]
if metric_name.startswith("seqeval"):
y_processed_true = [
[labels[tr] for tr in each_list] for each_list in y_processed_true
]
y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
elif metric in ("pearsonr", "spearmanr"):
y_processed_true = (
y_processed_true.to_list()
if isinstance(y_processed_true, pd.Series)
else list(y_processed_true)
y_processed_true.to_list() if isinstance(y_processed_true, pd.Series) else list(y_processed_true)
)
score_dict = metric.compute(
predictions=y_processed_predict, references=y_processed_true
)
score_dict = metric.compute(predictions=y_processed_predict, references=y_processed_true)
if "rouge" in metric_name:
score = score_dict[metric_name].mid.fmeasure
elif metric_name.startswith("seqeval"):
metric_submetric_names = metric_name.split(":")
score = score_dict[
metric_submetric_names[1]
if len(metric_submetric_names) > 1
else "overall_accuracy"
]
score = score_dict[metric_submetric_names[1] if len(metric_submetric_names) > 1 else "overall_accuracy"]
else:
score = score_dict[metric_name]
except ImportError:
raise ValueError(
metric_name
+ " is not an built-in sklearn metric and [hf] is not installed. "
metric_name + " is not an built-in sklearn metric and [hf] is not installed. "
"Currently built-in sklearn metrics are: "
"r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
"log_loss, mape, f1, micro_f1, macro_f1, ap. "
@ -303,9 +288,7 @@ def sklearn_metric_loss_score(
if "r2" == metric_name:
score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "rmse":
score = np.sqrt(
mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
)
score = np.sqrt(mean_squared_error(y_true, y_predict, sample_weight=sample_weight))
elif metric_name == "mae":
score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "mse":
@ -315,17 +298,11 @@ def sklearn_metric_loss_score(
elif metric_name == "roc_auc":
score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "roc_auc_ovr":
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
)
score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, multi_class="ovr")
elif metric_name == "roc_auc_ovo":
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
)
score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, multi_class="ovo")
elif metric_name == "roc_auc_weighted":
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, average="weighted"
)
score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, average="weighted")
elif metric_name == "roc_auc_ovo_weighted":
score = 1.0 - roc_auc_score(
y_true,
@ -350,19 +327,13 @@ def sklearn_metric_loss_score(
except ValueError:
return np.inf
elif "micro_f1" == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average="micro"
)
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight, average="micro")
elif "macro_f1" == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average="macro"
)
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight, average="macro")
elif "f1" == metric_name:
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
elif "ap" == metric_name:
score = 1 - average_precision_score(
y_true, y_predict, sample_weight=sample_weight
)
score = 1 - average_precision_score(y_true, y_predict, sample_weight=sample_weight)
elif "ndcg" in metric_name:
if "@" in metric_name:
k = int(metric_name.split("@", 1)[-1])
@ -524,9 +495,7 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
if metrics_to_log:
n = len(val_loss_folds)
metrics_to_log = (
{k: v / n for k, v in metrics_to_log.items()}
if isinstance(metrics_to_log, dict)
else metrics_to_log / n
{k: v / n for k, v in metrics_to_log.items()} if isinstance(metrics_to_log, dict) else metrics_to_log / n
)
return metric_to_minimize, metrics_to_log
@ -546,9 +515,7 @@ def compute_estimator(
eval_method: str,
eval_metric: Union[str, Callable],
best_val_loss=np.Inf,
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
n_jobs: Optional[int] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
cv_score_agg_func: Optional[callable] = None,
log_training_metric: Optional[bool] = False,
@ -583,9 +550,7 @@ def compute_estimator(
groups_val,
eval_metric,
task,
labels=fit_kwargs.get(
"label_list"
), # pass the label list on to compute the evaluation metric
labels=fit_kwargs.get("label_list"), # pass the label list on to compute the evaluation metric
budget=budget,
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
@ -619,9 +584,7 @@ def train_estimator(
y_train,
task: str,
estimator_name: str,
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
n_jobs: Optional[int] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
budget=None,
fit_kwargs: Optional[dict] = None,
@ -642,18 +605,14 @@ def train_estimator(
fit_kwargs["metric"] = eval_metric
if X_train is not None:
train_time = estimator.fit(
X_train, y_train, budget, free_mem_ratio, **fit_kwargs
)
train_time = estimator.fit(X_train, y_train, budget, free_mem_ratio, **fit_kwargs)
else:
estimator = estimator.estimator_class(**estimator.params)
train_time = time.time() - start_time
return estimator, train_time
def norm_confusion_matrix(
y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
):
def norm_confusion_matrix(y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]):
"""normalized confusion matrix.
Args:

View file

@ -139,9 +139,7 @@ class BaseEstimator:
if "_estimator_type" in config:
self._estimator_type = self.params.pop("_estimator_type")
else:
self._estimator_type = (
"classifier" if task in CLASSIFICATION else "regressor"
)
self._estimator_type = "classifier" if task in CLASSIFICATION else "regressor"
def get_params(self, deep=False):
params = self.params.copy()
@ -255,8 +253,7 @@ class BaseEstimator:
mem = psutil.virtual_memory() if psutil is not None else None
try:
with limit_resource(
mem.available * (1 - free_mem_ratio)
+ psutil.Process(os.getpid()).memory_info().rss
mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
if mem is not None
else -1,
budget,
@ -290,9 +287,7 @@ class BaseEstimator:
X = self._preprocess(X)
return self._model.predict(X, **kwargs)
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X.shape[0])
def predict_proba(self, X, **kwargs):
@ -341,9 +336,7 @@ class BaseEstimator:
if self._model is not None:
if self._task == "rank":
raise NotImplementedError(
"AutoML.score() is not implemented for ranking"
)
raise NotImplementedError("AutoML.score() is not implemented for ranking")
else:
X_val = self._preprocess(X_val)
metric = kwargs.pop("metric", None)
@ -356,9 +349,7 @@ class BaseEstimator:
else:
return self._model.score(X_val, y_val, **kwargs)
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return 0.0
def cleanup(self):
@ -425,9 +416,7 @@ class SparkEstimator(BaseEstimator):
def __init__(self, task="binary", **config):
if not _have_spark:
raise ImportError(
"pyspark is not installed. Try `pip install flaml[spark]`."
)
raise ImportError("pyspark is not installed. Try `pip install flaml[spark]`.")
super().__init__(task, **config)
self.df_train = None
@ -473,9 +462,7 @@ class SparkEstimator(BaseEstimator):
current_time = time.time()
pipeline_model = self.estimator_class(**self.params, **kwargs)
if logger.level == logging.DEBUG:
logger.debug(
f"flaml.model - {pipeline_model} fit started with params {self.params}"
)
logger.debug(f"flaml.model - {pipeline_model} fit started with params {self.params}")
pipeline_model.fit(df_train)
if logger.level == logging.DEBUG:
logger.debug(f"flaml.model - {pipeline_model} fit finished")
@ -494,9 +481,7 @@ class SparkEstimator(BaseEstimator):
"""
if self._model is not None:
X = self._preprocess(X, index_col=index_col)
predictions = to_pandas_on_spark(
self._model.transform(X), index_col=index_col
)
predictions = to_pandas_on_spark(self._model.transform(X), index_col=index_col)
predictions.index.name = None
pred_y = predictions["prediction"]
if return_all:
@ -504,9 +489,7 @@ class SparkEstimator(BaseEstimator):
else:
return pred_y
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X.shape[0])
def predict_proba(self, X, index_col="tmp_index_col", return_all=False, **kwargs):
@ -524,9 +507,7 @@ class SparkEstimator(BaseEstimator):
assert self._task in CLASSIFICATION, "predict_proba() only for classification."
if self._model is not None:
X = self._preprocess(X, index_col=index_col)
predictions = to_pandas_on_spark(
self._model.transform(X), index_col=index_col
)
predictions = to_pandas_on_spark(self._model.transform(X), index_col=index_col)
predictions.index.name = None
pred_y = predictions["probability"]
@ -535,9 +516,7 @@ class SparkEstimator(BaseEstimator):
else:
return pred_y
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X.shape[0])
@ -600,9 +579,7 @@ class SparkLGBMEstimator(SparkEstimator):
@classmethod
def size(cls, config):
num_leaves = int(
round(config.get("numLeaves") or 1 << config.get("maxDepth", 16))
)
num_leaves = int(round(config.get("numLeaves") or 1 << config.get("maxDepth", 16)))
n_estimators = int(round(config["numIterations"]))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
@ -653,23 +630,17 @@ class SparkLGBMEstimator(SparkEstimator):
):
start_time = time.time()
if self.model_n_classes_ is None and self._task not in ["regression", "rank"]:
self.model_n_classes_, self.model_classes_ = len_labels(
y_train, return_labels=True
)
self.model_n_classes_, self.model_classes_ = len_labels(y_train, return_labels=True)
df_train = self._preprocess(X_train, y_train, index_col=index_col)
# n_iter = self.params.get(self.ITER_HP, self.DEFAULT_ITER)
# trained = False
# mem0 = psutil.virtual_memory().available if psutil is not None else 1
_kwargs = kwargs.copy()
if self._task not in ["regression", "rank"] and "objective" not in _kwargs:
_kwargs["objective"] = (
"binary" if self.model_n_classes_ == 2 else "multiclass"
)
_kwargs["objective"] = "binary" if self.model_n_classes_ == 2 else "multiclass"
for k in list(_kwargs.keys()):
if k not in self.estimator_params:
logger.warning(
f"[SparkLGBMEstimator] [Warning] Ignored unknown parameter: {k}"
)
logger.warning(f"[SparkLGBMEstimator] [Warning] Ignored unknown parameter: {k}")
_kwargs.pop(k)
# TODO: find a better estimation of early stopping
# if (
@ -803,9 +774,7 @@ class TransformersEstimator(BaseEstimator):
"If you need to fix the value of {} to {}, the only way is to add a single-value domain in the search "
"space by adding:\n '{}': {{ 'domain': {} }} to 'custom_hp'. For example:"
'automl_settings["custom_hp"] = {{ "transformer": {{ "model_path": {{ "domain" : '
'"google/electra-small-discriminator" }} }} }}'.format(
key, key, val, key, val
)
'"google/electra-small-discriminator" }} }} }}'.format(key, key, val, key, val)
)
"""
@ -823,25 +792,18 @@ class TransformersEstimator(BaseEstimator):
"""
Update the attributes in TrainingArguments that depend on the values of self.params
"""
local_dir = os.path.join(
self._training_args.output_dir, "train_{}".format(date_str())
)
local_dir = os.path.join(self._training_args.output_dir, "train_{}".format(date_str()))
if self._use_ray is True:
import ray
self._training_args.output_dir = ray.tune.get_trial_dir()
else:
self._training_args.output_dir = Counter.get_trial_fold_name(
local_dir, self.params, self.trial_id
)
self._training_args.output_dir = Counter.get_trial_fold_name(local_dir, self.params, self.trial_id)
self._training_args.fp16 = self.fp16
self._training_args.no_cuda = self.no_cuda
if (
self._task == TOKENCLASSIFICATION
and self._training_args.max_seq_length is not None
):
if self._task == TOKENCLASSIFICATION and self._training_args.max_seq_length is not None:
logger.warning(
"For token classification task, FLAML currently does not support customizing the max_seq_length, max_seq_length will be reset to None."
)
@ -938,10 +900,7 @@ class TransformersEstimator(BaseEstimator):
}
for key in list(kwargs.keys()):
if (
key not in data_collator_class.__dict__.keys()
and key != "tokenizer"
):
if key not in data_collator_class.__dict__.keys() and key != "tokenizer":
del kwargs[key]
return data_collator_class(**kwargs)
else:
@ -984,9 +943,7 @@ class TransformersEstimator(BaseEstimator):
) # If using roberta model, must set add_prefix_space to True to avoid the assertion error at
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249
train_dataset, self._X_train, self._y_train = self._preprocess_data(
X_train, y_train
)
train_dataset, self._X_train, self._y_train = self._preprocess_data(X_train, y_train)
if X_val is not None:
eval_dataset, self._X_val, self._y_val = self._preprocess_data(X_val, y_val)
else:
@ -1007,10 +964,7 @@ class TransformersEstimator(BaseEstimator):
self.time_per_iter = time.time() - self.step_begin_time
if (
budget
and (
time.time() + self.time_per_iter
> self.train_begin_time + budget
)
and (time.time() + self.time_per_iter > self.train_begin_time + budget)
or state.global_step >= this_params[TransformersEstimator.ITER_HP]
):
control.should_training_stop = True
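# A hedged worked example of the stopping test above, with made-up timings:
# each step takes 2s and 59s of a 60s budget have elapsed, so the projected
# end of the next step (59 + 2 = 61) exceeds the budget and training stops.
time_per_iter, elapsed, budget = 2.0, 59.0, 60.0
print(elapsed + time_per_iter > budget)  # True -> control.should_training_stop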
@ -1019,10 +973,7 @@ class TransformersEstimator(BaseEstimator):
return control
def on_epoch_end(self, args, state, control, **callback_kwargs):
if (
control.should_training_stop
or state.epoch + 1 >= args.num_train_epochs
):
if control.should_training_stop or state.epoch + 1 >= args.num_train_epochs:
control.should_save = True
control.should_evaluate = True
@ -1051,9 +1002,7 @@ class TransformersEstimator(BaseEstimator):
# if gpu_per_trial == 0:
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
if tmp_cuda_visible_devices.count(",") != math.ceil(gpu_per_trial) - 1:
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
[str(x) for x in range(math.ceil(gpu_per_trial))]
)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(x) for x in range(math.ceil(gpu_per_trial))])
import time
@ -1070,10 +1019,7 @@ class TransformersEstimator(BaseEstimator):
if hasattr(self._trainer, "intermediate_results"):
self.intermediate_results = [
x[1]
for x in sorted(
self._trainer.intermediate_results.items(), key=lambda x: x[0]
)
x[1] for x in sorted(self._trainer.intermediate_results.items(), key=lambda x: x[0])
]
self._trainer = None
@ -1094,9 +1040,7 @@ class TransformersEstimator(BaseEstimator):
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
if trainer.ckpt_to_metric:
best_ckpt, _ = min(
trainer.ckpt_to_metric.items(), key=lambda x: x[1]["eval_automl_metric"]
)
best_ckpt, _ = min(trainer.ckpt_to_metric.items(), key=lambda x: x[1]["eval_automl_metric"])
best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
for each_ckpt in list(trainer.ckpt_to_metric):
if each_ckpt != best_ckpt:
@ -1158,9 +1102,7 @@ class TransformersEstimator(BaseEstimator):
Need to reinit training_args because of a bug in deepspeed: if not reinitialized, the deepspeed config will be inconsistent
with the HF config https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py#L947
"""
training_args = self._TrainingArguments(
local_rank=-1, model_path=self._checkpoint_path, fp16=self.fp16
)
training_args = self._TrainingArguments(local_rank=-1, model_path=self._checkpoint_path, fp16=self.fp16)
for key, val in self._training_args.__dict__.items():
if key not in ("local_rank", "model_path", "fp16"):
setattr(training_args, key, val)
@ -1183,9 +1125,7 @@ class TransformersEstimator(BaseEstimator):
for key, val in pred_kwargs.items():
setattr(self._training_args, key, val)
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
assert self._task in CLASSIFICATION, "predict_proba() only for classification tasks."
X_test, _ = self._tokenize_text(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
@ -1245,9 +1185,7 @@ class TransformersEstimator(BaseEstimator):
def config2params(self, config: dict) -> dict:
params = super().config2params(config)
params[TransformersEstimator.ITER_HP] = params.get(
TransformersEstimator.ITER_HP, sys.maxsize
)
params[TransformersEstimator.ITER_HP] = params.get(TransformersEstimator.ITER_HP, sys.maxsize)
return params
@ -1257,9 +1195,7 @@ class TransformersEstimatorModelSelection(TransformersEstimator):
@classmethod
def search_space(cls, data_size, task, **params):
search_space_dict = TransformersEstimator.search_space(
data_size, task, **params
)
search_space_dict = TransformersEstimator.search_space(data_size, task, **params)
"""
For model selection, use the same search space regardless of memory constraint
@ -1368,11 +1304,7 @@ class LGBMEstimator(BaseEstimator):
@classmethod
def size(cls, config):
num_leaves = int(
round(
config.get("num_leaves")
or config.get("max_leaves")
or 1 << config.get("max_depth", 16)
)
round(config.get("num_leaves") or config.get("max_leaves") or 1 << config.get("max_depth", 16))
)
n_estimators = int(round(config["n_estimators"]))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
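# Sanity check on the size() estimate above (config values are illustrative):
# num_leaves=32, n_estimators=100 gives (32*3 + 31*4 + 1) * 100 * 8 bytes.
def lgbm_size(config: dict) -> float:
    num_leaves = int(round(config.get("num_leaves") or config.get("max_leaves") or 1 << config.get("max_depth", 16)))
    n_estimators = int(round(config["n_estimators"]))
    return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

print(lgbm_size({"num_leaves": 32, "n_estimators": 100}))  # 176800.0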
@ -1399,11 +1331,7 @@ class LGBMEstimator(BaseEstimator):
self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0, 0) is not None
def _preprocess(self, X):
if (
not isinstance(X, DataFrame)
and issparse(X)
and np.issubdtype(X.dtype, np.integer)
):
if not isinstance(X, DataFrame) and issparse(X) and np.issubdtype(X.dtype, np.integer):
X = X.astype(float)
elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
# numpy array is not of numeric dtype
@ -1422,10 +1350,7 @@ class LGBMEstimator(BaseEstimator):
if not self.HAS_CALLBACK:
mem0 = psutil.virtual_memory().available if psutil is not None else 1
if (
(
not self._time_per_iter
or abs(self._train_size - X_train.shape[0]) > 4
)
(not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4)
and budget is not None
or self._mem_per_iter < 0
and psutil is not None
@ -1445,9 +1370,7 @@ class LGBMEstimator(BaseEstimator):
# elif self._mem2 <= 0:
# self._mem_per_iter = self._mem1
# else:
self._mem_per_iter = min(
self._mem1, self._mem2 / self.params[self.ITER_HP]
)
self._mem_per_iter = min(self._mem1, self._mem2 / self.params[self.ITER_HP])
# if self._mem_per_iter <= 1 and psutil is not None:
# n_iter = self.params[self.ITER_HP]
self._time_per_iter = (
@ -1458,11 +1381,7 @@ class LGBMEstimator(BaseEstimator):
else 0.001
)
self._train_size = X_train.shape[0]
if (
budget is not None
and self._t1 + self._t2 >= budget
or n_iter == self.params[self.ITER_HP]
):
if budget is not None and self._t1 + self._t2 >= budget or n_iter == self.params[self.ITER_HP]:
# self.params[self.ITER_HP] = n_iter
return time.time() - start_time
trained = True
@ -1471,11 +1390,7 @@ class LGBMEstimator(BaseEstimator):
if n_iter > 1:
max_iter = min(
n_iter,
int(
(budget - time.time() + start_time - self._t1)
/ self._time_per_iter
+ 1
)
int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
if budget is not None
else n_iter,
int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
@ -1489,9 +1404,7 @@ class LGBMEstimator(BaseEstimator):
if self.HAS_CALLBACK:
kwargs_callbacks = kwargs.get("callbacks")
if kwargs_callbacks:
callbacks = kwargs_callbacks + self._callbacks(
start_time, deadline, free_mem_ratio
)
callbacks = kwargs_callbacks + self._callbacks(start_time, deadline, free_mem_ratio)
kwargs.pop("callbacks")
else:
callbacks = self._callbacks(start_time, deadline, free_mem_ratio)
@ -1816,9 +1729,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
def config2params(self, config: dict) -> dict:
params = super().config2params(config)
if "max_leaves" in params:
params["max_leaf_nodes"] = params.get(
"max_leaf_nodes", params.pop("max_leaves")
)
params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
if self._task not in CLASSIFICATION and "criterion" in config:
params.pop("criterion")
if "random_state" not in params:
@ -1952,12 +1863,7 @@ class CatBoostEstimator(BaseEstimator):
if not cat_columns.empty:
X = X.copy()
X[cat_columns] = X[cat_columns].apply(
lambda x: x.cat.rename_categories(
[
str(c) if isinstance(c, float) else c
for c in x.cat.categories
]
)
lambda x: x.cat.rename_categories([str(c) if isinstance(c, float) else c for c in x.cat.categories])
)
elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
# numpy array is not of numeric dtype
@ -2005,19 +1911,11 @@ class CatBoostEstimator(BaseEstimator):
else:
cat_features = []
use_best_model = kwargs.get("use_best_model", True)
n = (
max(int(len(y_train) * 0.9), len(y_train) - 1000)
if use_best_model
else len(y_train)
)
n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
X_tr, y_tr = X_train[:n], y_train[:n]
from catboost import Pool, __version__
eval_set = (
Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
if use_best_model
else None
)
eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features) if use_best_model else None
if "sample_weight" in kwargs:
weight = kwargs["sample_weight"]
if weight is not None:
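# Hedged worked example of the use_best_model holdout above (row count made
# up): with 20,000 training rows, n = max(int(20000 * 0.9), 20000 - 1000)
# = max(18000, 19000) = 19000, so the last 1,000 rows form the eval_set.
n_rows = 20_000
n = max(int(n_rows * 0.9), n_rows - 1000)
print(n, n_rows - n)  # 19000 1000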
@ -2190,9 +2088,7 @@ class Prophet(SKLearnEstimator):
forecast = self._model.predict(X, **kwargs)
return forecast["yhat"]
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X.shape[0])
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
@ -2279,9 +2175,7 @@ class ARIMA(Prophet):
if len(X.columns) > 1:
X = self._preprocess(X.drop(columns=TS_TIMESTAMP_COL))
regressors = list(X)
forecast = self._model.predict(
start=start, end=end, exog=X[regressors], **kwargs
)
forecast = self._model.predict(start=start, end=end, exog=X[regressors], **kwargs)
else:
forecast = self._model.predict(start=start, end=end, **kwargs)
else:
@ -2398,9 +2292,7 @@ class HoltWinters(ARIMA):
},
"use_boxcox": {"domain": tune.choice([False, True]), "init_value": False},
"seasonal_periods": { # statsmodels casts this to None if "seasonal" is None
"domain": tune.choice(
[7, 12, 4, 52, 6]
), # weekly, yearly, quarterly, weekly w yearly data
"domain": tune.choice([7, 12, 4, 52, 6]), # weekly, yearly, quarterly, weekly w yearly data
"init_value": 7,
},
}
@ -2486,9 +2378,7 @@ class TS_SKLearn(SKLearnEstimator):
"low_cost_init_value": False,
},
"lags": {
"domain": tune.randint(
lower=1, upper=max(2, int(np.sqrt(data_size[0])))
),
"domain": tune.randint(lower=1, upper=max(2, int(np.sqrt(data_size[0])))),
"init_value": 3,
},
}
@ -2498,9 +2388,7 @@ class TS_SKLearn(SKLearnEstimator):
def __init__(self, task="ts_forecast", **params):
super().__init__(task, **params)
self.hcrystaball_model = None
self.ts_task = (
"regression" if task in TS_FORECASTREGRESSION else "classification"
)
self.ts_task = "regression" if task in TS_FORECASTREGRESSION else "classification"
def transform_X(self, X):
cols = list(X)
@ -2532,9 +2420,7 @@ class TS_SKLearn(SKLearnEstimator):
(
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, i
)
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_train, y_train, i)
self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit)
model_list.append(model)
@ -2543,9 +2429,7 @@ class TS_SKLearn(SKLearnEstimator):
(
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, kwargs["period"]
)
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_train, y_train, kwargs["period"])
self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit)
self._model = model
@ -2569,9 +2453,7 @@ class TS_SKLearn(SKLearnEstimator):
(
X_pred,
_,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X.iloc[:i, :]
)
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X.iloc[:i, :])
preds.append(self._model[i - 1].predict(X_pred, **kwargs)[-1])
forecast = Series(preds)
else:
@ -2582,9 +2464,7 @@ class TS_SKLearn(SKLearnEstimator):
forecast = self._model.predict(X_pred, **kwargs)
return forecast
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
logger.warning("Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X.shape[0])
@ -2681,13 +2561,9 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator):
max_prediction_length=max_prediction_length,
static_categoricals=kwargs.get("static_categoricals", []),
static_reals=kwargs.get("static_reals", []),
time_varying_known_categoricals=kwargs.get(
"time_varying_known_categoricals", []
),
time_varying_known_categoricals=kwargs.get("time_varying_known_categoricals", []),
time_varying_known_reals=kwargs.get("time_varying_known_reals", []),
time_varying_unknown_categoricals=kwargs.get(
"time_varying_unknown_categoricals", []
),
time_varying_unknown_categoricals=kwargs.get("time_varying_unknown_categoricals", []),
time_varying_unknown_reals=kwargs.get("time_varying_unknown_reals", []),
variable_groups=kwargs.get(
"variable_groups", {}
@ -2703,18 +2579,12 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator):
# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(
training, self.data, predict=True, stop_randomization=True
)
validation = TimeSeriesDataSet.from_dataset(training, self.data, predict=True, stop_randomization=True)
# create dataloaders for model
batch_size = kwargs.get("batch_size", 64)
train_dataloader = training.to_dataloader(
train=True, batch_size=batch_size, num_workers=0
)
val_dataloader = validation.to_dataloader(
train=False, batch_size=batch_size * 10, num_workers=0
)
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)
return training, train_dataloader, val_dataloader
@ -2728,27 +2598,19 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator):
warnings.filterwarnings("ignore")
current_time = time.time()
training, train_dataloader, val_dataloader = self.transform_ds(
X_train, y_train, **kwargs
)
training, train_dataloader, val_dataloader = self.transform_ds(X_train, y_train, **kwargs)
params = self.params.copy()
gradient_clip_val = params.pop("gradient_clip_val")
params.pop("n_jobs")
max_epochs = kwargs.get("max_epochs", 20)
early_stop_callback = EarlyStopping(
monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"
)
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
def _fit(log):
default_trainer_kwargs = dict(
gpus=kwargs.get("gpu_per_trial", [0])
if torch.cuda.is_available()
else None,
gpus=kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None,
max_epochs=max_epochs,
gradient_clip_val=gradient_clip_val,
callbacks=[LearningRateMonitor(), early_stop_callback]
if log
else [early_stop_callback],
callbacks=[LearningRateMonitor(), early_stop_callback] if log else [early_stop_callback],
logger=log,
)
trainer = pl.Trainer(
@ -2794,9 +2656,7 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator):
ids = self.group_ids.copy()
ids.append(TS_TIMESTAMP_COL)
encoder_data = self.data[
lambda x: x.time_idx > x.time_idx.max() - self.max_encoder_length
]
encoder_data = self.data[lambda x: x.time_idx > x.time_idx.max() - self.max_encoder_length]
# following pytorchforecasting example, make all target values equal to the last data
last_data_cols = self.group_ids.copy()
last_data_cols.append(TS_VALUE_COL)
@ -2804,9 +2664,7 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator):
decoder_data = X
if "time_idx" not in decoder_data:
decoder_data = add_time_idx_col(decoder_data)
decoder_data["time_idx"] += (
encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()
)
decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()
# decoder_data[TS_VALUE_COL] = 0
decoder_data = decoder_data.merge(last_data, how="inner", on=self.group_ids)
decoder_data = decoder_data.sort_values(ids)


@ -22,22 +22,15 @@ class DataCollatorForMultipleChoiceClassification(DataCollatorWithPadding):
import torch
label_name = "label" if "label" in features[0].keys() else "labels"
labels = (
[feature.pop(label_name) for feature in features]
if label_name in features[0]
else None
)
labels = [feature.pop(label_name) for feature in features] if label_name in features[0] else None
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
for feature in features
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = list(chain(*flattened_features))
batch = super(DataCollatorForMultipleChoiceClassification, self).__call__(
flattened_features
)
batch = super(DataCollatorForMultipleChoiceClassification, self).__call__(flattened_features)
# Un-flatten
batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
# Add back labels

@ -24,9 +24,7 @@ class TrainerForAuto(Seq2SeqTrainer):
num_beams=num_beams,
)
else:
return super(Seq2SeqTrainer, self).predict(
test_dataset, ignore_keys, metric_key_prefix
)
return super(Seq2SeqTrainer, self).predict(test_dataset, ignore_keys, metric_key_prefix)
def prediction_step(
self,
@ -36,13 +34,9 @@ class TrainerForAuto(Seq2SeqTrainer):
ignore_keys,
):
if getattr(self, "_is_seq2seq", None):
return super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys
)
return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
else:
return super(Seq2SeqTrainer, self).prediction_step(
model, inputs, prediction_loss_only, ignore_keys
)
return super(Seq2SeqTrainer, self).prediction_step(model, inputs, prediction_loss_only, ignore_keys)
def log(self, logs) -> None:
if getattr(self, "_is_seq2seq", None):
@ -66,9 +60,7 @@ class TrainerForAuto(Seq2SeqTrainer):
"""Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path."""
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
ckpt_dir = os.path.join(
self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
)
ckpt_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
# TODO: if your task is seq2seq (i.e., SUMMARIZATION), uncomment the code below (add indentation before metrics = eval_dataset...

@ -69,21 +69,13 @@ class TrainingArgumentsForAuto(TrainingArguments):
metadata={"help": "per gpu evaluation batch size"},
)
label_list: Optional[List[str]] = field(
default=None, metadata={"help": "The string list of the label names. "}
)
label_list: Optional[List[str]] = field(default=None, metadata={"help": "The string list of the label names. "})
eval_steps: int = field(
default=500, metadata={"help": "Run an evaluation every X steps."}
)
eval_steps: int = field(default=500, metadata={"help": "Run an evaluation every X steps."})
save_steps: int = field(
default=500, metadata={"help": "Save checkpoint every X updates steps."}
)
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
logging_steps: int = field(
default=500, metadata={"help": "Log every X updates steps."}
)
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
@staticmethod
def load_args_from_console():
@ -96,12 +88,8 @@ class TrainingArgumentsForAuto(TrainingArguments):
"--" + each_field.name,
type=each_field.type,
help=each_field.metadata["help"],
required=each_field.metadata["required"]
if "required" in each_field.metadata
else False,
choices=each_field.metadata["choices"]
if "choices" in each_field.metadata
else None,
required=each_field.metadata["required"] if "required" in each_field.metadata else False,
choices=each_field.metadata["choices"] if "choices" in each_field.metadata else None,
default=each_field.default,
)
console_args, unknown = arg_parser.parse_known_args()
@ -112,19 +100,13 @@ class TrainingArgumentsForAuto(TrainingArguments):
class Seq2SeqTrainingArgumentsForAuto(TrainingArgumentsForAuto):
model_path: str = field(
default="t5-small",
metadata={
"help": "model path for HPO natural language generation tasks, default is set to t5-small"
},
metadata={"help": "model path for HPO natural language generation tasks, default is set to t5-small"},
)
sortish_sampler: bool = field(
default=False, metadata={"help": "Whether to use SortishSampler or not."}
)
sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
predict_with_generate: bool = field(
default=True,
metadata={
"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."
},
metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."},
)
generation_max_length: Optional[int] = field(
default=None,

@ -38,16 +38,12 @@ def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
Y_tokenized = Y
label_col_name = ["label"]
elif task == TOKENCLASSIFICATION:
X_tokenized, Y_tokenized = tokenize_text_tokclassification(
X, Y, tokenizer=tokenizer, hf_args=hf_args
)
X_tokenized, Y_tokenized = tokenize_text_tokclassification(X, Y, tokenizer=tokenizer, hf_args=hf_args)
label_col_name = ["labels"]
elif task in NLG_TASKS:
return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
elif task == MULTICHOICECLASSIFICATION:
X_tokenized = tokenize_text_multiplechoice(
X, tokenizer=tokenizer, hf_args=hf_args
)
X_tokenized = tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)
label_col_name = ["label"]
Y_tokenized = Y
Y_tokenized = todf(X_tokenized, Y_tokenized, label_col_name)
@ -75,9 +71,7 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
[(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
for label in model_outputs["input_ids"]
]
model_outputs = model_outputs.drop(
columns=["attention_mask", "input_ids", "decoder_input_ids"]
)
model_outputs = model_outputs.drop(columns=["attention_mask", "input_ids", "decoder_input_ids"])
return model_inputs, model_outputs
@ -116,9 +110,7 @@ def tokenize_and_align_labels(
# Use the label_all_tokens to control whether to copy the label to all subtokens or to pad the additional tokens as -100
if hf_args.label_all_tokens:
# If the B- word is converted into multiple subtokens, map the additional subtokens to I-
label_ids.append(
b_to_i_label[label_to_id[examples[Y_sent_key][word_idx]]]
)
label_ids.append(b_to_i_label[label_to_id[examples[Y_sent_key][word_idx]]])
else:
label_ids.append(-100)
previous_word_idx = word_idx
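# A hedged sketch of the B- to I- propagation described in the comment above:
# when a B-* word splits into several subtokens, label_all_tokens gives the
# extra subtokens the matching I-* label (the label list is illustrative).
label_list = ["O", "B-PER", "I-PER"]
b_to_i_label = [
    label_list.index(name.replace("B-", "I-")) if name.startswith("B-") else i
    for i, name in enumerate(label_list)
]
print(b_to_i_label)  # [0, 2, 2]: a B-PER continuation is labeled I-PER (id 2)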
@ -173,9 +165,7 @@ def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
result_type="expand",
)
label_idx = tokenized_column_names.index("labels")
other_indices = sorted(
set(range(len(tokenized_column_names))).difference({label_idx})
)
other_indices = sorted(set(range(len(tokenized_column_names))).difference({label_idx}))
other_column_names = [tokenized_column_names[x] for x in other_indices]
d = X_and_Y_tokenized.iloc[:, other_indices]
y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
@ -298,10 +288,7 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
# get each 1st sentence and repeat it 4 times
question_headers = this_row["sent2"]
# sent2 is the noun part of the 2nd line
second_sentences = [
question_headers + " " + this_row[key]
for key in ["ending0", "ending1", "ending2", "ending3"]
]
second_sentences = [question_headers + " " + this_row[key] for key in ["ending0", "ending1", "ending2", "ending3"]]
# now the 2nd-sentences are formed by combining the noun part and 4 ending parts
# Flatten out
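# A hedged toy example of the expansion above (row content is made up): sent2
# is combined with each of the four endings to form the candidate sentences.
this_row = {"sent2": "He", "ending0": "runs.", "ending1": "eats.", "ending2": "sleeps.", "ending3": "reads."}
second_sentences = [this_row["sent2"] + " " + this_row[key] for key in ["ending0", "ending1", "ending2", "ending3"]]
print(second_sentences)  # ['He runs.', 'He eats.', 'He sleeps.', 'He reads.']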
@ -322,18 +309,14 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
return [tokenized_example[x] for x in tmp_column_names]
def postprocess_prediction_and_true(
task, y_pred, tokenizer, hf_args, y_true=None, X=None
):
def postprocess_prediction_and_true(task, y_pred, tokenizer, hf_args, y_true=None, X=None):
# postprocess the matrix prediction y_pred and ground truth y_true into a user-readable format, e.g., for summarization, decode into text
if task == SEQCLASSIFICATION:
return np.argmax(y_pred, axis=1), y_true
elif task == SEQREGRESSION:
return np.squeeze(y_pred), y_true # predictions.reshape((len(predictions),))
elif task == TOKENCLASSIFICATION:
assert (y_true is not None) or (
X is not None
), "One of y_true and X must not be None"
assert (y_true is not None) or (X is not None), "One of y_true and X must not be None"
## If y_true is not None, we use y_true to remove the -100 in the prediction (postprocessing), and return the postprocessed y_true and prediction
# If y_true is None, we use X to compute y_is_pad (i.e., whether y_true is -100 in that position), and use y_is_pad to remove the -100 in the prediction, and return the postprocessed prediction (not the y_true)
y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist())
@ -354,17 +337,12 @@ def postprocess_prediction_and_true(
for (each_pred, each_is_pad) in zip(y_predict, y_is_pad)
]
y_pred_label = [
[
hf_args.label_list[p] if 0 <= p < label_len else -1
for (p, ispd) in each_list
]
[hf_args.label_list[p] if 0 <= p < label_len else -1 for (p, ispd) in each_list]
for each_list in zip_pred_ispad
] # To compute precision and recall, y_pred and y_true must be converted to string labels
# (B-PER, I-PER, etc.), so that the category-based precision/recall (i.e., PER, LOC, etc.) scores can be computed
if y_true is not None:
y_true_label = [
[tr for (p, tr) in each_list] for each_list in zip_pred_ispad
]
y_true_label = [[tr for (p, tr) in each_list] for each_list in zip_pred_ispad]
else:
y_true_label = None
return y_pred_label, y_true_label
@ -381,13 +359,9 @@ def postprocess_prediction_and_true(
if y_true is not None:
y_true_labels = np.where(y_true != -100, y_true, tokenizer.pad_token_id)
decoded_y_true_labels = tokenizer.batch_decode(
y_true_labels, skip_special_tokens=True
)
decoded_y_true_labels = tokenizer.batch_decode(y_true_labels, skip_special_tokens=True)
decoded_y_true_labels = [label.strip() for label in decoded_y_true_labels]
decoded_y_true_labels = [
"\n".join(nltk.sent_tokenize(label)) for label in decoded_y_true_labels
]
decoded_y_true_labels = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_y_true_labels]
else:
decoded_y_true_labels = None
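# A hedged illustration of the -100 masking step above (toy ids): padded label
# positions are swapped back to the tokenizer's pad id before decoding.
import numpy as np
pad_token_id = 0  # illustrative; the real value comes from the tokenizer
y_true = np.array([[17, 23, -100, -100]])
print(np.where(y_true != -100, y_true, pad_token_id))  # [[17 23  0  0]]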
@ -419,17 +393,11 @@ def load_model(checkpoint_path, task, num_labels=None):
checkpoint_path, config=model_config, ignore_mismatched_sizes=True
)
elif task == TOKENCLASSIFICATION:
return AutoModelForTokenClassification.from_pretrained(
checkpoint_path, config=model_config
)
return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)
elif task in NLG_TASKS:
return AutoModelForSeq2SeqLM.from_pretrained(
checkpoint_path, config=model_config
)
return AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path, config=model_config)
elif task == MULTICHOICECLASSIFICATION:
return AutoModelForMultipleChoice.from_pretrained(
checkpoint_path, config=model_config
)
return AutoModelForMultipleChoice.from_pretrained(checkpoint_path, config=model_config)
def _set_model_config(checkpoint_path):
if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):

@ -85,12 +85,8 @@ class Counter:
@staticmethod
def get_trial_fold_name(local_dir, trial_config, trial_id):
Counter.counter += 1
experiment_tag = "{0}_{1}".format(
str(Counter.counter), format_vars(trial_config)
)
logdir = get_logdir_name(
_generate_dirname(experiment_tag, trial_id=trial_id), local_dir
)
experiment_tag = "{0}_{1}".format(str(Counter.counter), format_vars(trial_config))
logdir = get_logdir_name(_generate_dirname(experiment_tag, trial_id=trial_id), local_dir)
return logdir
@ -99,15 +95,11 @@ class LabelEncoderforTokenClassification:
# if the labels are tokens, convert them to ids
if any(isinstance(id, str) for id in y[0]):
self.label_list = sorted(list(set().union(*y)))
self._tokenlabel_to_id = {
self.label_list[id]: id for id in range(len(self.label_list))
}
self._tokenlabel_to_id = {self.label_list[id]: id for id in range(len(self.label_list))}
y = y.apply(lambda sent: [self._tokenlabel_to_id[token] for token in sent])
# if the labels are not tokens, they must be ids
else:
assert all(
isinstance(id, (int, np.integer)) for id in y[0]
), "The labels must either be tokens or ids"
assert all(isinstance(id, (int, np.integer)) for id in y[0]), "The labels must either be tokens or ids"
return y
def transform(self, y):
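# A hedged sketch of the token-label encoding in fit() above (labels made up):
# string labels are sorted and mapped to ids, then each sentence is converted.
label_list = sorted({"B-PER", "I-PER", "O"})  # -> ['B-PER', 'I-PER', 'O']
tokenlabel_to_id = {label_list[i]: i for i in range(len(label_list))}
print([tokenlabel_to_id[tok] for tok in ["O", "B-PER", "I-PER"]])  # [2, 0, 1]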

@ -40,9 +40,7 @@ def _process_df(df, label_col, prediction_col):
def _compute_label_from_probability(df, probability_col, prediction_col):
# array_max finds the maximum value in the 'probability' array
# array_position finds the index of the maximum value in the 'probability' array
max_index_expr = F.expr(
f"array_position({probability_col}, array_max({probability_col}))-1"
)
max_index_expr = F.expr(f"array_position({probability_col}, array_max({probability_col}))-1")
# Create a new column 'prediction' based on the maximum probability value
df = df.withColumn(prediction_col, max_index_expr.cast("double"))
return df
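# A standalone check of the array_position/array_max expression above; assumes
# pyspark is installed and a local SparkSession can be started (illustrative).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([([0.1, 0.7, 0.2],)], ["probability"])
df = df.withColumn("prediction", F.expr("array_position(probability, array_max(probability)) - 1").cast("double"))
df.show()  # prediction == 1.0, the 0-based index of the largest probability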
@ -143,9 +141,7 @@ def spark_metric_loss_score(
)
elif metric_name == "log_loss":
# For log_loss, prediction_col should be probability, and we need to convert it to label
df = _compute_label_from_probability(
df, prediction_col, prediction_col + "_label"
)
df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
evaluator = MulticlassClassificationEvaluator(
metricName="logLoss",
labelCol=label_col,
@ -214,17 +210,11 @@ def spark_metric_loss_score(
score /= len(counts)
score += 1
else:
evaluator = RankingEvaluator(
metricName="ndcgAtK", labelCol=label_col, predictionCol=prediction_col
)
evaluator = RankingEvaluator(metricName="ndcgAtK", labelCol=label_col, predictionCol=prediction_col)
df = _process_df(df, label_col, prediction_col)
score = 1 - evaluator.evaluate(df)
return score
else:
raise ValueError(f"Unknown metric name: {metric_name} for spark models.")
return (
evaluator.evaluate(df)
if metric_name in min_mode_metrics
else 1 - evaluator.evaluate(df)
)
return evaluator.evaluate(df) if metric_name in min_mode_metrics else 1 - evaluator.evaluate(df)
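# A hedged illustration of the convention above: FLAML minimizes a loss, so
# higher-is-better metrics are flipped to 1 - score (names/values made up).
min_mode_metrics = ("log_loss", "rmse", "mae")
def as_loss(metric_name: str, score: float) -> float:
    return score if metric_name in min_mode_metrics else 1 - score
print(as_loss("accuracy", 0.93))  # ~0.07 (lower is better)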

@ -73,9 +73,7 @@ def to_pandas_on_spark(
elif isinstance(df, (ps.DataFrame, ps.Series)):
return df
else:
raise TypeError(
f"{type(df)} is not one of pandas.DataFrame, pandas.Series and pyspark.sql.DataFrame"
)
raise TypeError(f"{type(df)} is not one of pandas.DataFrame, pandas.Series and pyspark.sql.DataFrame")
def train_test_split_pyspark(
@ -106,10 +104,7 @@ def train_test_split_pyspark(
if stratify_column:
# Test data
test_fraction_dict = (
df.select(stratify_column)
.distinct()
.withColumn("fraction", F.lit(test_fraction))
.rdd.collectAsMap()
df.select(stratify_column).distinct().withColumn("fraction", F.lit(test_fraction)).rdd.collectAsMap()
)
df_test = df.stat.sampleBy(stratify_column, test_fraction_dict, seed)
# Train data
@ -128,9 +123,7 @@ def train_test_split_pyspark(
return [df_train, df_test]
def unique_pandas_on_spark(
psds: Union[ps.Series, ps.DataFrame]
) -> Tuple[np.ndarray, np.ndarray]:
def unique_pandas_on_spark(psds: Union[ps.Series, ps.DataFrame]) -> Tuple[np.ndarray, np.ndarray]:
"""Get the unique values and counts of a pandas_on_spark series."""
if isinstance(psds, ps.DataFrame):
psds = psds.iloc[:, 0]
@ -140,9 +133,7 @@ def unique_pandas_on_spark(
return label_set, counts
def len_labels(
y: Union[ps.Series, np.ndarray], return_labels=False
) -> Union[int, Optional[np.ndarray]]:
def len_labels(y: Union[ps.Series, np.ndarray], return_labels=False) -> Union[int, Optional[np.ndarray]]:
"""Get the number of unique labels in y."""
if not isinstance(y, (ps.DataFrame, ps.Series)):
labels = np.unique(y)
@ -153,9 +144,7 @@ def len_labels(
return len(labels)
def unique_value_first_index(
y: Union[pd.Series, ps.Series, np.ndarray]
) -> Tuple[np.ndarray, np.ndarray]:
def unique_value_first_index(y: Union[pd.Series, ps.Series, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
"""Get the unique values and indices of a pandas series,
pandas_on_spark series or numpy array."""
if isinstance(y, ps.Series):
@ -196,9 +185,7 @@ def iloc_pandas_on_spark(
psdfiloc = psdfiloc.drop(columns=[index_col])
return psdfiloc
else:
raise TypeError(
f"{type(index)} is not one of int, slice and list for pandas_on_spark iloc"
)
raise TypeError(f"{type(index)} is not one of int, slice and list for pandas_on_spark iloc")
def spark_kFold(
@ -241,9 +228,7 @@ def spark_kFold(
condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
validation = to_pandas_on_spark(df.filter(condition), index_col=index_col)
train = to_pandas_on_spark(df.filter(~condition), index_col=index_col)
datasets.append(
(train.drop(columns=[randCol]), validation.drop(columns=[randCol]))
)
datasets.append((train.drop(columns=[randCol]), validation.drop(columns=[randCol])))
else:
# Use user-specified fold column
def get_fold_num(foldNum: int) -> int:

@ -63,9 +63,7 @@ class SearchState:
Notice that (2) includes the case where the starting point is not in the user-specified search space custom_hp
"""
if isinstance(domain_one_dim, sample.Domain):
renamed_type = list(
inspect.signature(domain_one_dim.is_valid).parameters.values()
)[0].annotation
renamed_type = list(inspect.signature(domain_one_dim.is_valid).parameters.values())[0].annotation
type_match = (
renamed_type == Any
or isinstance(value_one_dim, renamed_type)
@ -106,9 +104,7 @@ class SearchState:
self.learner_class = learner_class
self._budget = budget
if task in TS_FORECAST:
search_space = learner_class.search_space(
data_size=data_size, task=task, pred_horizon=period
)
search_space = learner_class.search_space(data_size=data_size, task=task, pred_horizon=period)
else:
search_space = learner_class.search_space(data_size=data_size, task=task)
@ -117,14 +113,10 @@ class SearchState:
if isinstance(starting_point, dict):
starting_point = AutoMLState.sanitize(starting_point)
if max_iter > 1 and not self.valid_starting_point(
starting_point, search_space
):
if max_iter > 1 and not self.valid_starting_point(starting_point, search_space):
# If the number of iterations is larger than 1, remove invalid point
logger.warning(
"Starting point {} removed because it is outside of the search space".format(
starting_point
)
"Starting point {} removed because it is outside of the search space".format(starting_point)
)
starting_point = None
elif isinstance(starting_point, list):
@ -132,11 +124,7 @@ class SearchState:
if max_iter > len(starting_point):
# If the number of starting points is no smaller than max iter, avoid the checking
starting_point_len = len(starting_point)
starting_point = [
x
for x in starting_point
if self.valid_starting_point(x, search_space)
]
starting_point = [x for x in starting_point if self.valid_starting_point(x, search_space)]
if starting_point_len > len(starting_point):
logger.warning(
"Starting points outside of the search space are removed. "
@ -145,9 +133,7 @@ class SearchState:
starting_point = starting_point or None
for name, space in search_space.items():
assert (
"domain" in space
), f"{name}'s domain is missing in the search space spec {space}"
assert "domain" in space, f"{name}'s domain is missing in the search space spec {space}"
if space["domain"] is None:
# don't search this hp
continue
@ -159,19 +145,14 @@ class SearchState:
self.cat_hp_cost[name] = space["cat_hp_cost"]
# if a starting point is provided, set the init config to be
# the starting point provided
if (
isinstance(starting_point, dict)
and starting_point.get(name) is not None
):
if isinstance(starting_point, dict) and starting_point.get(name) is not None:
if self.init_config is None:
self.init_config = {}
self.init_config[name] = starting_point[name]
elif (
not isinstance(starting_point, list)
and "init_value" in space
and self.valid_starting_point_one_dim(
space["init_value"], space["domain"]
)
and self.valid_starting_point_one_dim(space["init_value"], space["domain"])
):
if self.init_config is None:
self.init_config = {}
@ -241,11 +222,7 @@ class SearchState:
if time2eval:
self.time2eval_best_old = self.time2eval_best
self.time2eval_best = time2eval
if (
self.trained_estimator
and trained_estimator
and self.trained_estimator != trained_estimator
):
if self.trained_estimator and trained_estimator and self.trained_estimator != trained_estimator:
self.trained_estimator.cleanup()
if trained_estimator:
self.trained_estimator = trained_estimator
@ -260,9 +237,7 @@ class SearchState:
return config_sig
def est_retrain_time(self, retrain_sample_size):
assert (
self.best_config_sample_size is not None
), "need to first get best_config_sample_size"
assert self.best_config_sample_size is not None, "need to first get best_config_sample_size"
return self.time2eval_best * retrain_sample_size / self.best_config_sample_size
@ -283,9 +258,7 @@ class AutoMLState:
) # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
if weight is not None:
sampled_weight = (
weight.iloc[:sample_size]
if isinstance(weight, (pd.Series, psSeries))
else weight[:sample_size]
weight.iloc[:sample_size] if isinstance(weight, (pd.Series, psSeries)) else weight[:sample_size]
)
if self.groups is not None:
groups = (
@ -338,10 +311,7 @@ class AutoMLState:
if state.time_budget < 0
else state.time_budget - state.time_from_start
if sample_size == state.data_size[0]
else (state.time_budget - state.time_from_start)
/ 2
* sample_size
/ state.data_size[0]
else (state.time_budget - state.time_from_start) / 2 * sample_size / state.data_size[0]
)
(
@ -357,9 +327,7 @@ class AutoMLState:
state.y_val,
state.weight_val,
state.groups_val,
state.train_time_limit
if budget is None
else min(budget, state.train_time_limit or np.inf),
state.train_time_limit if budget is None else min(budget, state.train_time_limit or np.inf),
state.kf,
config,
state.task,
@ -406,9 +374,7 @@ class AutoMLState:
sample_size: Optional[int] = None,
):
if not sample_size:
sample_size = config_w_resource.get(
"FLAML_sample_size", len(self.y_train_all)
)
sample_size = config_w_resource.get("FLAML_sample_size", len(self.y_train_all))
config = AutoMLState.sanitize(config_w_resource)
this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
@ -432,9 +398,7 @@ class AutoMLState:
"groups"
] = groups # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
budget = (
None if self.time_budget < 0 else self.time_budget - self.time_from_start
)
budget = None if self.time_budget < 0 else self.time_budget - self.time_from_start
estimator, train_time = train_estimator(
X_train=sampled_X_train,

@ -108,44 +108,28 @@ class GenericTask(Task):
groups=None,
):
if X_train_all is not None and y_train_all is not None:
assert isinstance(
X_train_all, (np.ndarray, pd.DataFrame, psDataFrame)
) or issparse(X_train_all), (
assert isinstance(X_train_all, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
"X_train_all must be a numpy array, a pandas dataframe, "
"a Scipy sparse matrix or a pyspark.pandas dataframe."
)
assert isinstance(
y_train_all, (np.ndarray, pd.Series, psSeries)
), "y_train_all must be a numpy array, a pandas series or a pyspark.pandas series."
assert (
X_train_all.size != 0 and y_train_all.size != 0
), "Input data must not be empty."
assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty."
if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
if isinstance(y_train_all, np.ndarray):
y_train_all = y_train_all.flatten()
assert (
X_train_all.shape[0] == y_train_all.shape[0]
), "# rows in X_train must match length of y_train."
assert X_train_all.shape[0] == y_train_all.shape[0], "# rows in X_train must match length of y_train."
if isinstance(X_train_all, psDataFrame):
X_train_all = (
X_train_all.spark.cache()
) # cache data to improve compute speed
X_train_all = X_train_all.spark.cache() # cache data to improve compute speed
y_train_all = y_train_all.to_frame().spark.cache()[y_train_all.name]
logger.debug(
f"X_train_all and y_train_all cached, shape of X_train_all: {X_train_all.shape}"
)
logger.debug(f"X_train_all and y_train_all cached, shape of X_train_all: {X_train_all.shape}")
automl._df = isinstance(X_train_all, (pd.DataFrame, psDataFrame))
automl._nrow, automl._ndim = X_train_all.shape
if self.is_ts_forecast():
X_train_all = (
pd.DataFrame(X_train_all)
if isinstance(X_train_all, np.ndarray)
else X_train_all
)
X_train_all, y_train_all = self._validate_ts_data(
X_train_all, y_train_all
)
X_train_all = pd.DataFrame(X_train_all) if isinstance(X_train_all, np.ndarray) else X_train_all
X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all)
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
assert isinstance(
@ -155,9 +139,7 @@ class GenericTask(Task):
label in dataframe.columns
), f"The provided label column name `{label}` doesn't exist in the provided dataframe."
if isinstance(dataframe, psDataFrame):
dataframe = (
dataframe.spark.cache()
) # cache data to improve compute speed
dataframe = dataframe.spark.cache() # cache data to improve compute speed
logger.debug(f"dataframe cached, shape of dataframe: {dataframe.shape}")
automl._df = True
if self.is_ts_forecast():
@ -183,9 +165,7 @@ class GenericTask(Task):
for _, each_cell in X[column].items():
if each_cell is not None:
is_str = isinstance(each_cell, str)
is_list_of_int = isinstance(each_cell, list) and all(
isinstance(x, int) for x in each_cell
)
is_list_of_int = isinstance(each_cell, list) and all(isinstance(x, int) for x in each_cell)
is_list_of_str = is_a_list_of_str(each_cell)
if self.is_token_classification():
assert is_list_of_str, (
@ -222,9 +202,7 @@ class GenericTask(Task):
automl._label_transformer = automl._transformer.label_transformer
if self.is_token_classification():
if hasattr(automl._label_transformer, "label_list"):
state.fit_kwargs.update(
{"label_list": automl._label_transformer.label_list}
)
state.fit_kwargs.update({"label_list": automl._label_transformer.label_list})
elif "label_list" not in state.fit_kwargs:
for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
assert (
@ -232,34 +210,26 @@ class GenericTask(Task):
), "For the token-classification task, you must either (1) pass token labels; or (2) pass id labels and the label list. "
"Please refer to the documentation for more details: https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
automl._feature_names_in_ = (
automl._X_train_all.columns.to_list()
if hasattr(automl._X_train_all, "columns")
else None
automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
)
automl._sample_weight_full = state.fit_kwargs.get(
"sample_weight"
) # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
if X_val is not None and y_val is not None:
assert isinstance(
X_val, (np.ndarray, pd.DataFrame, psDataFrame)
) or issparse(X_train_all), (
assert isinstance(X_val, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
"X_val must be None, a numpy array, a pandas dataframe, "
"a Scipy sparse matrix or a pyspark.pandas dataframe."
)
assert isinstance(y_val, (np.ndarray, pd.Series, psSeries)), (
"y_val must be None, a numpy array, a pandas series "
"or a pyspark.pandas series."
"y_val must be None, a numpy array, a pandas series " "or a pyspark.pandas series."
)
assert X_val.size != 0 and y_val.size != 0, (
"Validation data are expected to be nonempty. "
"Use None for X_val and y_val if no validation data."
"Validation data are expected to be nonempty. " "Use None for X_val and y_val if no validation data."
)
if isinstance(y_val, np.ndarray):
y_val = y_val.flatten()
assert (
X_val.shape[0] == y_val.shape[0]
), "# rows in X_val must match length of y_val."
assert X_val.shape[0] == y_val.shape[0], "# rows in X_val must match length of y_val."
if automl._transformer:
state.X_val = automl._transformer.transform(X_val)
else:
@ -276,13 +246,9 @@ class GenericTask(Task):
if groups is not None and len(groups) != automl._nrow:
# groups is given as group counts
state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
assert (
len(state.groups) == automl._nrow
), "the sum of group counts must match the number of examples"
assert len(state.groups) == automl._nrow, "the sum of group counts must match the number of examples"
state.groups_val = (
np.concatenate([[i] * c for i, c in enumerate(groups_val)])
if groups_val is not None
else None
np.concatenate([[i] * c for i, c in enumerate(groups_val)]) if groups_val is not None else None
)
else:
state.groups_val = groups_val
@ -345,11 +311,7 @@ class GenericTask(Task):
if not isinstance(y_train_all, (psDataFrame, psSeries)):
raise ValueError("y_train_all must be a pyspark.pandas dataframe or series")
df_all_in_one = X_train_all.join(y_train_all)
stratify_column = (
y_train_all.name
if isinstance(y_train_all, psSeries)
else y_train_all.columns[0]
)
stratify_column = y_train_all.name if isinstance(y_train_all, psSeries) else y_train_all.columns[0]
ret_sample_weight = False
if (
"sample_weight" in state.fit_kwargs
@ -367,9 +329,7 @@ class GenericTask(Task):
test_fraction=split_ratio,
seed=RANDOM_SEED,
)
columns_to_drop = [
c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]
]
columns_to_drop = [c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]]
X_train = df_all_train.drop(columns_to_drop)
X_val = df_all_val.drop(columns_to_drop)
y_train = df_all_train[stratify_column]
@ -387,17 +347,13 @@ class GenericTask(Task):
return X_train, X_val, y_train, y_val
@staticmethod
def _train_test_split(
state, X, y, first=None, rest=None, split_ratio=0.2, stratify=None
):
def _train_test_split(state, X, y, first=None, rest=None, split_ratio=0.2, stratify=None):
condition_type = isinstance(X, (psDataFrame, psSeries))
# NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
condition_param = "sample_weight" in state.fit_kwargs
if not condition_type and condition_param:
sample_weight = (
state.fit_kwargs["sample_weight"]
if rest is None
else state.fit_kwargs["sample_weight"][rest]
state.fit_kwargs["sample_weight"] if rest is None else state.fit_kwargs["sample_weight"][rest]
)
(
X_train,
@ -448,9 +404,7 @@ class GenericTask(Task):
state.weight_val = weight_val
state.fit_kwargs["sample_weight"] = weight_train
else:
X_train, X_val, y_train, y_val = GenericTask._split_pyspark(
state, X, y, split_ratio, stratify
)
X_train, X_val, y_train, y_val = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)
return X_train, X_val, y_train, y_val
def prepare_data(
@ -498,21 +452,13 @@ class GenericTask(Task):
n = len(y_train_all)
while count < rare_threshld:
if data_is_df:
X_train_all = concat(
X_train_all, X_train_all.iloc[:n].loc[rare_index]
)
X_train_all = concat(X_train_all, X_train_all.iloc[:n].loc[rare_index])
else:
X_train_all = concat(
X_train_all, X_train_all[:n][rare_index, :]
)
X_train_all = concat(X_train_all, X_train_all[:n][rare_index, :])
if isinstance(y_train_all, (pd.Series, psSeries)):
y_train_all = concat(
y_train_all, y_train_all.iloc[:n].loc[rare_index]
)
y_train_all = concat(y_train_all, y_train_all.iloc[:n].loc[rare_index])
else:
y_train_all = np.concatenate(
[y_train_all, y_train_all[:n][rare_index]]
)
y_train_all = np.concatenate([y_train_all, y_train_all[:n][rare_index]])
count += rare_count
logger.info(f"class {label} augmented from {rare_count} to {count}")
SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
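# A hedged toy run of the augmentation loop above: rows of the rare label are
# re-appended until its count reaches the threshold (data and threshold made up).
import numpy as np
y = np.array([0, 0, 0, 0, 1])  # label 1 is rare
rare_index = y == 1
rare_count = int(rare_index.sum())
count, n, rare_threshld = rare_count, len(y), 3
while count < rare_threshld:
    y = np.concatenate([y, y[:n][rare_index]])
    count += rare_count
print(y)  # [0 0 0 0 1 1 1]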
@ -535,9 +481,7 @@ class GenericTask(Task):
if isinstance(state.sample_weight_all, pd.Series):
state.sample_weight_all.reset_index(drop=True, inplace=True)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED
)
X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
if data_is_df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
@ -569,21 +513,13 @@ class GenericTask(Task):
X_train_all = X_train_all.sort_values(ids)
y_train_all = y_train_all.sort_values(ids)
training_cutoff = X_train_all["time_idx"].max() - period
X_train = X_train_all[
X_train_all["time_idx"] <= training_cutoff
]
y_train = y_train_all[
y_train_all["time_idx"] <= training_cutoff
].drop(columns=ids)
X_train = X_train_all[X_train_all["time_idx"] <= training_cutoff]
y_train = y_train_all[y_train_all["time_idx"] <= training_cutoff].drop(columns=ids)
X_val = X_train_all[X_train_all["time_idx"] > training_cutoff]
y_val = y_train_all[
y_train_all["time_idx"] > training_cutoff
].drop(columns=ids)
y_val = y_train_all[y_train_all["time_idx"] > training_cutoff].drop(columns=ids)
else:
num_samples = X_train_all.shape[0]
assert (
period < num_samples
), f"period={period}>#examples={num_samples}"
assert period < num_samples, f"period={period}>#examples={num_samples}"
split_idx = num_samples - period
X_train = X_train_all[:split_idx]
y_train = y_train_all[:split_idx]
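# A hedged worked example of the holdout split above (sizes made up): with
# 100 samples and period=12, split_idx = 100 - 12 = 88, so the last 12 points
# form the validation horizon.
num_samples, period = 100, 12
split_idx = num_samples - period
print(split_idx, num_samples - split_idx)  # 88 12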
@ -627,20 +563,14 @@ class GenericTask(Task):
"sample_weight"
], # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
state.weight_val,
) = self._split_pyspark(
state, X_train_all, y_train_all, split_ratio
)
) = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
else:
X_train, X_val, y_train, y_val = self._split_pyspark(
state, X_train_all, y_train_all, split_ratio
)
elif split_type == "group":
gss = GroupShuffleSplit(
n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED
)
for train_idx, val_idx in gss.split(
X_train_all, y_train_all, state.groups_all
):
gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED)
for train_idx, val_idx in gss.split(X_train_all, y_train_all, state.groups_all):
if data_is_df:
X_train = X_train_all.iloc[train_idx]
X_val = X_train_all.iloc[val_idx]
@ -674,17 +604,9 @@ class GenericTask(Task):
state, X_rest, y_rest, first, rest, split_ratio, stratify
)
X_train = concat(X_first, X_train)
y_train = (
concat(label_set, y_train)
if data_is_df
else np.concatenate([label_set, y_train])
)
y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train])
X_val = concat(X_first, X_val)
y_val = (
concat(label_set, y_val)
if data_is_df
else np.concatenate([label_set, y_val])
)
y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val])
elif self.is_regression():
X_train, X_val, y_train, y_val = self._train_test_split(
state, X_train_all, y_train_all, split_ratio=split_ratio
@ -700,9 +622,7 @@ class GenericTask(Task):
return
if split_type == "group":
# logger.info("Using GroupKFold")
assert (
len(state.groups_all) == y_train_all_size
), "the length of groups must match the number of examples"
assert len(state.groups_all) == y_train_all_size, "the length of groups must match the number of examples"
assert (
len_labels(state.groups_all) >= n_splits
), "the number of groups must be equal or larger than n_splits"
@ -710,16 +630,13 @@ class GenericTask(Task):
elif split_type == "stratified":
# logger.info("Using StratifiedKFold")
assert y_train_all_size >= n_splits, (
f"{n_splits}-fold cross validation"
f" requires input data with at least {n_splits} examples."
f"{n_splits}-fold cross validation" f" requires input data with at least {n_splits} examples."
)
assert y_train_all_size >= 2 * n_splits, (
f"{n_splits}-fold cross validation with metric=r2 "
f"requires input data with at least {n_splits*2} examples."
)
state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
)
state.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
elif split_type == "time":
# logger.info("Using TimeSeriesSplit")
if self.is_ts_forecast() and not self.is_ts_forecastpanel():
@ -735,20 +652,14 @@ class GenericTask(Task):
logger.info(f"Using nsplits={n_splits} due to data size limit.")
state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
elif self.is_ts_forecastpanel():
n_groups = len(
X_train.groupby(state.fit_kwargs.get("group_ids")).size()
)
n_groups = len(X_train.groupby(state.fit_kwargs.get("group_ids")).size())
period = state.fit_kwargs.get("period")
state.kf = TimeSeriesSplit(
n_splits=n_splits, test_size=period * n_groups
)
state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
else:
state.kf = TimeSeriesSplit(n_splits=n_splits)
elif isinstance(split_type, str):
# logger.info("Using RepeatedKFold")
state.kf = RepeatedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
)
state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
else:
# logger.info("Using splitter object")
state.kf = split_type
@ -790,11 +701,7 @@ class GenericTask(Task):
elif self.is_classification():
assert split_type in ["auto", "stratified", "uniform", "time", "group"]
return (
split_type
if split_type != "auto"
else groups is None and "stratified" or "group"
)
return split_type if split_type != "auto" else groups is None and "stratified" or "group"
elif self.is_regression():
assert split_type in ["auto", "uniform", "time", "group"]
@ -825,9 +732,7 @@ class GenericTask(Task):
)
)
except IndexError:
raise IndexError(
"Test data contains more columns than training data, exiting"
)
raise IndexError("Test data contains more columns than training data, exiting")
elif isinstance(X, int):
return X
elif isinstance(X, psDataFrame):
@ -872,9 +777,7 @@ class GenericTask(Task):
if self.is_classification():
labels = _, labels = len_labels(y_train_all, return_labels=True)
else:
labels = fit_kwargs.get(
"label_list"
) # pass the label list on to compute the evaluation metric
labels = fit_kwargs.get("label_list") # pass the label list on to compute the evaluation metric
if "sample_weight" in fit_kwargs:
weight = fit_kwargs["sample_weight"]
weight_val = None
@ -889,9 +792,7 @@ class GenericTask(Task):
if isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
groups = kf.groups
dataframe = dataframe.join(groups)
kf = spark_kFold(
dataframe, nFolds=n, foldCol=groups.name if groups is not None else ""
)
kf = spark_kFold(dataframe, nFolds=n, foldCol=groups.name if groups is not None else "")
shuffle = False
else:
X_train_split, y_train_split = X_train_all, y_train_all
@ -934,15 +835,9 @@ class GenericTask(Task):
)
if groups is not None:
fit_kwargs["groups"] = (
groups[train_index]
if isinstance(groups, np.ndarray)
else groups.iloc[train_index]
)
groups_val = (
groups[val_index]
if isinstance(groups, np.ndarray)
else groups.iloc[val_index]
groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
)
groups_val = groups[val_index] if isinstance(groups, np.ndarray) else groups.iloc[val_index]
else:
groups_val = None
@ -983,16 +878,12 @@ class GenericTask(Task):
pred_time /= n
return val_loss, metric, train_time, pred_time
def default_estimator_list(
self, estimator_list: List[str], is_spark_dataframe: bool = False
) -> List[str]:
def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool = False) -> List[str]:
if "auto" != estimator_list:
n_estimators = len(estimator_list)
if is_spark_dataframe:
# For spark dataframe, only estimators ending with '_spark' are supported
estimator_list = [
est for est in estimator_list if est.endswith("_spark")
]
estimator_list = [est for est in estimator_list if est.endswith("_spark")]
if len(estimator_list) == 0:
raise ValueError(
"Spark dataframes only support estimator names ending with `_spark`. Non-supported "
@ -1005,9 +896,7 @@ class GenericTask(Task):
)
else:
# For non-spark dataframe, only estimators not ending with '_spark' are supported
estimator_list = [
est for est in estimator_list if not est.endswith("_spark")
]
estimator_list = [est for est in estimator_list if not est.endswith("_spark")]
if len(estimator_list) == 0:
raise ValueError(
"Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
@ -1069,11 +958,7 @@ class GenericTask(Task):
estimator_list = [
est
for est in estimator_list
if (
est.endswith("_spark")
if is_spark_dataframe
else not est.endswith("_spark")
)
if (est.endswith("_spark") if is_spark_dataframe else not est.endswith("_spark"))
]
return estimator_list
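
For reference, the Spark/non-Spark filtering reformatted in the hunks above reduces to a suffix test on estimator names. A minimal standalone sketch, with a toy estimator list that is not part of this commit:

def filter_estimators(estimator_list, is_spark_dataframe):
    # Spark dataframes support only estimators named "*_spark";
    # all other dataframes support only the remaining estimators.
    if is_spark_dataframe:
        kept = [est for est in estimator_list if est.endswith("_spark")]
    else:
        kept = [est for est in estimator_list if not est.endswith("_spark")]
    if not kept:
        raise ValueError("No estimator in the list supports this dataframe type.")
    return kept

print(filter_estimators(["lgbm", "xgboost", "lgbm_spark"], True))  # ['lgbm_spark']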

View file

@ -333,9 +333,7 @@ class Task(ABC):
return self.name == other
@classmethod
def estimator_class_from_str(
cls, estimator_name: str
) -> "flaml.automl.ml.BaseEstimator":
def estimator_class_from_str(cls, estimator_name: str) -> "flaml.automl.ml.BaseEstimator":
"""Determine the estimator class corresponding to the provided name.
Args:

View file

@ -111,9 +111,7 @@ class TrainingLogWriter(object):
if self.file is None:
raise IOError("Call open() to open the output file first.")
if self.current_best_loss_record_id is None:
logger.warning(
"flaml.training_log: checkpoint() called before any record is written, skipped."
)
logger.warning("flaml.training_log: checkpoint() called before any record is written, skipped.")
return
record = TrainingLogCheckPoint(self.current_best_loss_record_id)
record.dump(self.file)

View file

@ -75,10 +75,7 @@ def flamlize_estimator(super_class, name: str, task: str, alternatives=None):
break
estimator_name = (
"choose_xgb"
if (
estimator_name == "xgb_limitdepth"
and "max_depth" not in self._params
)
if (estimator_name == "xgb_limitdepth" and "max_depth" not in self._params)
else estimator_name
)
(
@ -88,18 +85,14 @@ def flamlize_estimator(super_class, name: str, task: str, alternatives=None):
y_transformed,
self._feature_transformer,
self._label_transformer,
) = preprocess_and_suggest_hyperparams(
task, X, y, estimator_name, self._default_location
)
) = preprocess_and_suggest_hyperparams(task, X, y, estimator_name, self._default_location)
assert estimator_class == super_class
hyperparams.update(self._params)
return hyperparams, estimator_name, X_transformed, y_transformed
@wraps(super_class.fit)
def fit(self, X, y, *args, **params):
hyperparams, estimator_name, X, y_transformed = self.suggest_hyperparams(
X, y
)
hyperparams, estimator_name, X, y_transformed = self.suggest_hyperparams(X, y)
self.set_params(**hyperparams)
if self._label_transformer and estimator_name in [
"rf",
@ -150,26 +143,16 @@ def flamlize_estimator(super_class, name: str, task: str, alternatives=None):
return EstimatorClass
RandomForestRegressor = flamlize_estimator(
ensemble.RandomForestRegressor, "rf", "regression"
)
RandomForestClassifier = flamlize_estimator(
ensemble.RandomForestClassifier, "rf", "classification"
)
ExtraTreesRegressor = flamlize_estimator(
ensemble.ExtraTreesRegressor, "extra_tree", "regression"
)
ExtraTreesClassifier = flamlize_estimator(
ensemble.ExtraTreesClassifier, "extra_tree", "classification"
)
RandomForestRegressor = flamlize_estimator(ensemble.RandomForestRegressor, "rf", "regression")
RandomForestClassifier = flamlize_estimator(ensemble.RandomForestClassifier, "rf", "classification")
ExtraTreesRegressor = flamlize_estimator(ensemble.ExtraTreesRegressor, "extra_tree", "regression")
ExtraTreesClassifier = flamlize_estimator(ensemble.ExtraTreesClassifier, "extra_tree", "classification")
try:
import lightgbm
LGBMRegressor = flamlize_estimator(lightgbm.LGBMRegressor, "lgbm", "regression")
LGBMClassifier = flamlize_estimator(
lightgbm.LGBMClassifier, "lgbm", "classification"
)
LGBMClassifier = flamlize_estimator(lightgbm.LGBMClassifier, "lgbm", "classification")
except ImportError:
pass
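
A usage sketch for the flamlized estimators defined above, based on the suggest_hyperparams/fit pair shown in this file; it assumes flaml and scikit-learn are installed, and the dataset is illustrative:

from sklearn.datasets import make_regression
from flaml.default import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
model = RandomForestRegressor()
# returns the zero-shot config without training the model
hyperparams, name, X_t, y_t = model.suggest_hyperparams(X, y)
print(name, hyperparams)
model.fit(X, y)  # applies the suggested hyperparameters, then trains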

View file

@ -71,17 +71,12 @@ def construct_portfolio(regret_matrix, meta_features, regret_bound):
sorted_losses = np.sort(losses)
if sorted_losses[1] - sorted_losses[0] < eps:
minloss = np.nanmin(losses)
print(
f"tie detected at loss = {sorted_losses[0]}, using alternative metric."
)
print(f"tie detected at loss = {sorted_losses[0]}, using alternative metric.")
tied = np.flatnonzero(losses - minloss < eps)
losses = [(avg_regret[i], i) for i in tied]
minloss, ind = min(losses)
if minloss > prev - eps:
print(
f"May be overfitting at k = {i + 1}, current = {minloss:.5f}, "
f"prev = {prev:.5f}. Stopping."
)
print(f"May be overfitting at k = {i + 1}, current = {minloss:.5f}, " f"prev = {prev:.5f}. Stopping.")
break
configs = candidates[ind]
prev = minloss
@ -89,9 +84,7 @@ def construct_portfolio(regret_matrix, meta_features, regret_bound):
configs = candidates[np.nanargmin(losses)]
i += 1
if sorted_losses[0] <= eps:
print(
f"Reached target regret bound of {regret_bound}! k = {i}. Declining to pick further!"
)
print(f"Reached target regret bound of {regret_bound}! k = {i}. Declining to pick further!")
break
return configs
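
A toy illustration of the tie-breaking in construct_portfolio above: when several candidates fall within eps of the minimum loss, average regret decides (all values here are assumed):

import numpy as np

losses = np.array([0.30, 0.10, 0.10 + 1e-9, 0.25])
avg_regret = np.array([0.40, 0.20, 0.10, 0.30])
eps = 1e-6

minloss = np.nanmin(losses)
tied = np.flatnonzero(losses - minloss < eps)  # candidates within eps of the best
minloss, ind = min((avg_regret[i], i) for i in tied)
print(ind)  # 2: among the tied candidates, the one with smaller average regret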

View file

@ -109,9 +109,7 @@ def serialize(configs, regret, meta_features, output_file, config_path):
except FileNotFoundError:
pass
meta_features_norm, preferences, proc = config_predictor_tuple(
regret.columns, configs, meta_features, regret
)
meta_features_norm, preferences, proc = config_predictor_tuple(regret.columns, configs, meta_features, regret)
portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs]
regret = regret.loc[configs]
@ -122,9 +120,7 @@ def serialize(configs, regret, meta_features, output_file, config_path):
"preprocessing": proc,
"neighbors": [
{"features": tuple(x), "choice": _filter(preferences[y], regret[y])}
for x, y in zip(
meta_features_norm.to_records(index=False), preferences.columns
)
for x, y in zip(meta_features_norm.to_records(index=False), preferences.columns)
],
"configsource": list(configs),
}
@ -164,9 +160,7 @@ def serialize(configs, regret, meta_features, output_file, config_path):
def main():
parser = argparse.ArgumentParser(description="Build a portfolio.")
parser.add_argument(
"--strategy", help="One of {greedy, greedy-feedback}", default="greedy"
)
parser.add_argument("--strategy", help="One of {greedy, greedy-feedback}", default="greedy")
parser.add_argument("--input", help="Input path")
parser.add_argument("--metafeatures", help="CSV of task metafeatures")
parser.add_argument("--exclude", help="One task name to exclude (for LOO purposes)")
@ -188,9 +182,7 @@ def main():
all_results = None
for estimator in args.estimator:
# produce regret
all, baseline = load_result(
f"{args.input}/{estimator}/results.csv", args.task, "result"
)
all, baseline = load_result(f"{args.input}/{estimator}/results.csv", args.task, "result")
regret = build_regret(all, baseline)
regret = regret.replace(np.inf, np.nan).dropna(axis=1, how="all")
@ -198,9 +190,7 @@ def main():
regret = regret.loc[[i for i in regret.index if args.exclude not in i]]
regret = regret[[c for c in regret.columns if args.exclude not in c]]
print(
f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%"
)
print(f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%")
print(f"Num models considered: {regret.shape[0]}")
configs = build_portfolio(meta_features, regret, args.strategy)
@ -214,11 +204,7 @@ def main():
configsource = meta_predictor["configsource"]
all = all.loc[configsource]
all.rename({x: f"{estimator}/{x}" for x in regret.index.values}, inplace=True)
baseline_best = (
baseline
if baseline_best is None
else pd.DataFrame({0: baseline_best, 1: baseline}).max(1)
)
baseline_best = baseline if baseline_best is None else pd.DataFrame({0: baseline_best, 1: baseline}).max(1)
all_results = all if all_results is None else pd.concat([all_results, all])
# analyze(regret, meta_predictor)
regrets = build_regret(all_results, baseline_best)

View file

@ -18,14 +18,8 @@ def load_result(filename, task_type, metric):
(df[metric].notnull()) & (df.type == task_type),
["task", "fold", "params", metric],
]
df["params"] = df["params"].apply(
lambda x: path.splitext(path.basename(eval(x)["_modeljson"]))[0]
)
baseline = (
df.loc[df["task"] == df["params"], ["task", metric]]
.groupby("task")
.mean()[metric]
)
df["params"] = df["params"].apply(lambda x: path.splitext(path.basename(eval(x)["_modeljson"]))[0])
baseline = df.loc[df["task"] == df["params"], ["task", metric]].groupby("task").mean()[metric]
df = df.pivot_table(index="params", columns="task", values=metric)
return df, baseline
@ -34,9 +28,7 @@ def main():
parser = argparse.ArgumentParser(description="Build a regret matrix.")
parser.add_argument("--result_csv", help="File of experiment results")
parser.add_argument("--task_type", help="Type of task")
parser.add_argument(
"--metric", help="Metric for calculating regret", default="result"
)
parser.add_argument("--metric", help="Metric for calculating regret", default="result")
parser.add_argument("--output", help="Location to write regret CSV to")
args = parser.parse_args()
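
For context, the baseline/pivot step condensed in load_result above works like this on a toy frame (column values assumed):

import pandas as pd

df = pd.DataFrame(
    {
        "task": ["t1", "t1", "t2"],
        "params": ["t1", "cfg_a", "cfg_a"],
        "result": [0.9, 0.8, 0.7],
    }
)
# baseline: mean metric of the rows whose config was tuned on that very task
baseline = df.loc[df["task"] == df["params"], ["task", "result"]].groupby("task").mean()["result"]
matrix = df.pivot_table(index="params", columns="task", values="result")
print(baseline, matrix, sep="\n")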

View file

@ -53,10 +53,7 @@ def meta_feature(task, X_train, y_train, meta_feature_names):
try:
# this feature is only supported for dataframe
this_feature.append(
X_train.select_dtypes(
include=[np.number, "float", "int", "long"]
).shape[1]
/ n_feat
X_train.select_dtypes(include=[np.number, "float", "int", "long"]).shape[1] / n_feat
)
except AttributeError:
# 'numpy.ndarray' object has no attribute 'select_dtypes'
@ -79,9 +76,7 @@ def load_config_predictor(estimator_name, task, location=None):
with open(f"{location}/{estimator_name}/{task}.json", "r") as f:
CONFIG_PREDICTORS[key] = predictor = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(
f"Portfolio has not been built for {estimator_name} on {task} task."
)
raise FileNotFoundError(f"Portfolio has not been built for {estimator_name} on {task} task.")
return predictor
@ -99,11 +94,7 @@ def suggest_config(
The returned configs can be used as starting points for AutoML.fit().
`FLAML_sample_size` is removed from the configs.
"""
task = (
get_classification_objective(len_labels(y))
if task == "classification" and y is not None
else task
)
task = get_classification_objective(len_labels(y)) if task == "classification" and y is not None else task
predictor = (
load_config_predictor(estimator_or_predictor, task, location)
if isinstance(estimator_or_predictor, str)
@ -112,15 +103,9 @@ def suggest_config(
older_version = "1.0.2"
# TODO: update older_version when the newer code can no longer handle the older version json file
assert (
version_parse(__version__)
>= version_parse(predictor["version"])
>= version_parse(older_version)
)
assert version_parse(__version__) >= version_parse(predictor["version"]) >= version_parse(older_version)
prep = predictor["preprocessing"]
feature = meta_feature_fn(
task, X_train=X, y_train=y, meta_feature_names=predictor["meta_feature_names"]
)
feature = meta_feature_fn(task, X_train=X, y_train=y, meta_feature_names=predictor["meta_feature_names"])
feature = (np.array(feature) - np.array(prep["center"])) / np.array(prep["scale"])
neighbors = predictor["neighbors"]
nn = NearestNeighbors(n_neighbors=1)
@ -138,9 +123,7 @@ def suggest_config(
return configs
def suggest_learner(
task, X, y, estimator_or_predictor="all", estimator_list=None, location=None
):
def suggest_learner(task, X, y, estimator_or_predictor="all", estimator_list=None, location=None):
"""Suggest best learner within estimator_list."""
configs = suggest_config(task, X, y, estimator_or_predictor, location)
if not estimator_list:
@ -193,9 +176,7 @@ def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None):
hyperparams: A dict of the hyperparameter configurations.
estimator_class: A class of the underlying estimator, e.g., lightgbm.LGBMClassifier.
"""
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[
0
]
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
estimator = config["class"]
model_class = get_estimator_class(task, estimator)
hyperparams = config["hyperparameters"]
@ -279,9 +260,7 @@ def preprocess_and_suggest_hyperparams(
estimator_list=["xgb_limitdepth", "xgboost"],
location=location,
)
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[
0
]
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
estimator = config["class"]
model_class = get_estimator_class(task, estimator)
hyperparams = config["hyperparameters"]

View file

@ -113,12 +113,8 @@ class AutoVW:
search_space = self._search_space.copy()
for k, v in self._search_space.items():
if k == self.VW_INTERACTION_ARG_NAME and v == self.AUTOMATIC:
raw_namespaces = self.get_ns_feature_dim_from_vw_example(
vw_example
).keys()
search_space[k] = polynomial_expansion_set(
init_monomials=set(raw_namespaces)
)
raw_namespaces = self.get_ns_feature_dim_from_vw_example(vw_example).keys()
search_space[k] = polynomial_expansion_set(init_monomials=set(raw_namespaces))
# setup the init config based on the input _init_config and search space
init_config = self._init_config.copy()
for k, v in search_space.items():
@ -158,10 +154,7 @@ class AutoVW:
self._best_trial = self._select_best_trial()
self._y_predict = self._best_trial.predict(data_sample)
# code for debugging purposes
if (
self._prediction_trial_id is None
or self._prediction_trial_id != self._best_trial.trial_id
):
if self._prediction_trial_id is None or self._prediction_trial_id != self._best_trial.trial_id:
self._prediction_trial_id = self._best_trial.trial_id
logger.info(
"prediction trial id changed to %s at iter %s, resource used: %s",
@ -183,14 +176,11 @@ class AutoVW:
def _select_best_trial(self):
"""Select a best trial from the running trials according to the _model_select_policy."""
best_score = (
float("+inf") if self._model_selection_mode == "min" else float("-inf")
)
best_score = float("+inf") if self._model_selection_mode == "min" else float("-inf")
new_best_trial = None
for trial in self._trial_runner.running_trials:
if trial.result is not None and (
"threshold" not in self._model_select_policy
or trial.result.resource_used >= self.WARMSTART_NUM
"threshold" not in self._model_select_policy or trial.result.resource_used >= self.WARMSTART_NUM
):
score = trial.result.get_score(self._model_select_policy)
if ("min" == self._model_selection_mode and score < best_score) or (
@ -199,18 +189,13 @@ class AutoVW:
best_score = score
new_best_trial = trial
if new_best_trial is not None:
logger.debug(
"best_trial resource used: %s", new_best_trial.result.resource_used
)
logger.debug("best_trial resource used: %s", new_best_trial.result.resource_used)
return new_best_trial
else:
# This branch will be triggered when the resource consumption of all trials is smaller
# than the WARMSTART_NUM threshold. In this case, we will select the _best_trial
# selected in the previous iteration.
if (
self._best_trial is not None
and self._best_trial.status == Trial.RUNNING
):
if self._best_trial is not None and self._best_trial.status == Trial.RUNNING:
logger.debug("old best trial %s", self._best_trial.trial_id)
return self._best_trial
else:

View file

@ -106,9 +106,7 @@ class OnlineResult:
self._loss_cb = self._update_loss_cb(bound_of_range, data_dimension)
self._loss_queue.append(new_loss)
def _update_loss_cb(
self, bound_of_range, data_dim, bound_name="sample_complexity_bound"
):
def _update_loss_cb(self, bound_of_range, data_dim, bound_name="sample_complexity_bound"):
"""Calculate the coefficient of the confidence bound."""
if bound_name == "sample_complexity_bound":
# set the coefficient in the loss bound
@ -119,9 +117,7 @@ class OnlineResult:
comp_F = math.sqrt(data_dim)
n = self.observation_count
return (
coef * comp_F * math.sqrt((np.log10(n / OnlineResult.prob_delta)) / n)
)
return coef * comp_F * math.sqrt((np.log10(n / OnlineResult.prob_delta)) / n)
else:
raise NotImplementedError
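
A standalone form of the sample-complexity confidence bound returned above; coef and prob_delta are illustrative here, since the real coefficient is set in lines elided from this hunk:

import math
import numpy as np

def loss_cb(n, data_dim, coef=0.1, prob_delta=0.1):
    # cb = coef * sqrt(d) * sqrt(log10(n / delta) / n); shrinks as n grows
    comp_F = math.sqrt(data_dim)
    return coef * comp_F * math.sqrt(np.log10(n / prob_delta) / n)

print(loss_cb(n=100, data_dim=26), loss_cb(n=10000, data_dim=26))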
@ -147,11 +143,7 @@ class OnlineResult:
@property
def loss_avg_recent(self):
return (
sum(self._loss_queue) / len(self._loss_queue)
if len(self._loss_queue) != 0
else self._init_loss
)
return sum(self._loss_queue) / len(self._loss_queue) if len(self._loss_queue) != 0 else self._init_loss
def get_score(self, score_name, cb_ratio=1):
if "lcb" in score_name:
@ -282,9 +274,7 @@ class VowpalWabbitTrial(BaseOnlineTrial):
try:
from vowpalwabbit import pyvw
except ImportError:
raise ImportError(
"To use AutoVW, please run pip install flaml[vw] to install vowpalwabbit"
)
raise ImportError("To use AutoVW, please run pip install flaml[vw] to install vowpalwabbit")
# attributes
self.trial_id = self._config_to_id(config) if trial_id is None else trial_id
logger.info("Create trial with trial_id: %s", self.trial_id)
@ -327,14 +317,10 @@ class VowpalWabbitTrial(BaseOnlineTrial):
def _initialize_vw_model(self, vw_example):
"""Initialize a vw model using the trainable_class"""
self._vw_config = self.config.copy()
ns_interactions = self.config.get(
VowpalWabbitTrial.interactions_config_key, None
)
ns_interactions = self.config.get(VowpalWabbitTrial.interactions_config_key, None)
# ensure the feature interaction config is a list (required by VW)
if ns_interactions is not None:
self._vw_config[VowpalWabbitTrial.interactions_config_key] = list(
ns_interactions
)
self._vw_config[VowpalWabbitTrial.interactions_config_key] = list(ns_interactions)
# get the dimensionality of the feature according to the namespace configuration
namespace_feature_dim = get_ns_feature_dim_from_vw_example(vw_example)
self._dim = self._get_dim_from_ns(namespace_feature_dim, ns_interactions)
@ -361,9 +347,7 @@ class VowpalWabbitTrial(BaseOnlineTrial):
# do one step of learning
self.model.learn(data_sample)
# update training related results accordingly
new_loss = self._get_loss(
y, y_pred, self._metric, self._y_min_observed, self._y_max_observed
)
new_loss = self._get_loss(y, y_pred, self._metric, self._y_min_observed, self._y_max_observed)
# update sample size, sum of loss, and cost
data_sample_size = 1
bound_of_range = self._y_max_observed - self._y_min_observed
@ -391,11 +375,7 @@ class VowpalWabbitTrial(BaseOnlineTrial):
loss_func = mean_squared_error
elif "mae" in loss_func_name or "absolute" in loss_func_name:
loss_func = mean_absolute_error
if (
y_min_observed is not None
and y_max_observed is not None
and "clip" in loss_func_name
):
if y_min_observed is not None and y_max_observed is not None and "clip" in loss_func_name:
# clip y_pred in the observed range of y
y_pred = min(y_max_observed, max(y_pred, y_min_observed))
else:
@ -410,9 +390,7 @@ class VowpalWabbitTrial(BaseOnlineTrial):
self._y_max_observed = y
@staticmethod
def _get_dim_from_ns(
namespace_feature_dim: dict, namespace_interactions: Union[set, list]
):
def _get_dim_from_ns(namespace_feature_dim: dict, namespace_interactions: Union[set, list]):
"""Get the dimensionality of the corresponding feature of input namespace set."""
total_dim = sum(namespace_feature_dim.values())
if namespace_interactions:
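
The loop body of _get_dim_from_ns is truncated above; a plausible sketch of the full computation follows, in which the interaction handling is an assumption rather than flaml's verbatim code:

import numpy as np

def get_dim_from_ns(namespace_feature_dim, namespace_interactions):
    total_dim = sum(namespace_feature_dim.values())
    for inter in namespace_interactions or []:
        # an interaction such as "ab" crosses namespaces a and b,
        # contributing dim(a) * dim(b) extra features (assumed)
        total_dim += int(np.prod([namespace_feature_dim[ns] for ns in inter]))
    return total_dim

print(get_dim_from_ns({"a": 3, "b": 4}, ["ab"]))  # 3 + 4 + 3*4 = 19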

View file

@ -33,12 +33,7 @@ class OnlineTrialRunner:
WARMSTART_NUM = 100
def __init__(
self,
max_live_model_num: int,
searcher=None,
scheduler=None,
champion_test_policy="loss_ucb",
**kwargs
self, max_live_model_num: int, searcher=None, scheduler=None, champion_test_policy="loss_ucb", **kwargs
):
"""Constructor.
@ -192,9 +187,7 @@ class OnlineTrialRunner:
def get_top_running_trials(self, top_ratio=None, top_metric="ucb") -> list:
"""Get a list of trial ids, whose performance is among the top running trials."""
running_valid_trials = [
trial for trial in self._running_trials if trial.result is not None
]
running_valid_trials = [trial for trial in self._running_trials if trial.result is not None]
if not running_valid_trials:
return
if top_ratio is None:
@ -215,20 +208,14 @@ class OnlineTrialRunner:
else:
raise NotImplementedError
top_running_valid_trials = []
logger.info(
"Running trial ids %s", [trial.trial_id for trial in running_valid_trials]
)
logger.info("Running trial ids %s", [trial.trial_id for trial in running_valid_trials])
self._random_state.shuffle(running_valid_trials)
results = [
trial.result.get_score(test_attribute) for trial in running_valid_trials
]
results = [trial.result.get_score(test_attribute) for trial in running_valid_trials]
# sorted result (small to large) index
sorted_index = np.argsort(np.array(results))
for i in range(min(top_number, len(running_valid_trials))):
top_running_valid_trials.append(running_valid_trials[sorted_index[i]])
logger.info(
"Top running ids %s", [trial.trial_id for trial in top_running_valid_trials]
)
logger.info("Top running ids %s", [trial.trial_id for trial in top_running_valid_trials])
return top_running_valid_trials
def _add_trial_from_searcher(self):
@ -240,16 +227,9 @@ class OnlineTrialRunner:
"""
# (optionally) upper bound the number of trials in the OnlineTrialRunner
if self._bound_trial_num and self._first_challenger_pool_size is not None:
active_trial_size = len(
[t for t in self._trials if t.status != Trial.TERMINATED]
)
active_trial_size = len([t for t in self._trials if t.status != Trial.TERMINATED])
trial_num_upper_bound = (
int(
round(
(np.log10(self._total_steps) + 1)
* self._first_challenger_pool_size
)
)
int(round((np.log10(self._total_steps) + 1) * self._first_challenger_pool_size))
if self._first_challenger_pool_size
else np.inf
)
@ -286,9 +266,7 @@ class OnlineTrialRunner:
if self._best_challenger_trial is not None:
assert self._best_challenger_trial.trial_id != self._champion_trial.trial_id
# test whether a new champion is found and set the trial properties accordingly
is_new_champion_found = self._better_than_champion_test(
self._best_challenger_trial
)
is_new_champion_found = self._better_than_champion_test(self._best_challenger_trial)
if is_new_champion_found:
self._set_champion(new_champion_trial=self._best_challenger_trial)
@ -303,10 +281,7 @@ class OnlineTrialRunner:
if worse_than_champion:
to_stop.append(trial_to_test)
# we want to ensure there are at least #max_live_model_num of challengers remaining
max_to_stop_num = (
len([t for t in self._trials if t.status != Trial.TERMINATED])
- self._max_live_model_num
)
max_to_stop_num = len([t for t in self._trials if t.status != Trial.TERMINATED]) - self._max_live_model_num
for i in range(min(max_to_stop_num, len(to_stop))):
self.stop_trial(to_stop[i])
@ -331,9 +306,7 @@ class OnlineTrialRunner:
]
if active_trials:
self._random_state.shuffle(active_trials)
results = [
trial.result.get_score(test_attribute) for trial in active_trials
]
results = [trial.result.get_score(test_attribute) for trial in active_trials]
best_index = np.argmin(results)
self._best_challenger_trial = active_trials[best_index]
@ -358,9 +331,7 @@ class OnlineTrialRunner:
# calling set_search_properties of searcher will trigger
# new challenger generation. we do not do this for init champion
# as this step is already done when first constructing the searcher
self._searcher.set_search_properties(
setting={self._searcher.CHAMPION_TRIAL_NAME: self._champion_trial}
)
self._searcher.set_search_properties(setting={self._searcher.CHAMPION_TRIAL_NAME: self._champion_trial})
else:
self._champion_update_times = 0
@ -450,13 +421,9 @@ class OnlineTrialRunner:
"""
if trial_to_test.result is not None and self._champion_trial.result is not None:
if "ucb" in self._champion_test_policy:
return self._test_lcb_ucb(
self._champion_trial, trial_to_test, self.WARMSTART_NUM
)
return self._test_lcb_ucb(self._champion_trial, trial_to_test, self.WARMSTART_NUM)
elif "avg" in self._champion_test_policy:
return self._test_avg_loss(
self._champion_trial, trial_to_test, self.WARMSTART_NUM
)
return self._test_avg_loss(self._champion_trial, trial_to_test, self.WARMSTART_NUM)
elif "martingale" in self._champion_test_policy:
return self._test_martingale(self._champion_trial, trial_to_test)
else:
@ -474,9 +441,7 @@ class OnlineTrialRunner:
trial.trial_id,
champion_trial.trial_id,
)
logger.info(
"trial %s %s %s", trial.config, trial.result, trial.resource_lease
)
logger.info("trial %s %s %s", trial.config, trial.result, trial.resource_lease)
logger.info(
"trial loss_avg:%s, trial loss_cb %s",
trial.result.loss_avg,
@ -508,13 +473,8 @@ class OnlineTrialRunner:
"""
assert trial.trial_id != champion_trial.trial_id
if trial.result.resource_used >= warmstart_num:
if (
trial.result.loss_ucb
< champion_trial.result.loss_lcb - champion_trial.result.loss_cb
):
logger.info(
"======new champion condition satisfied: using lcb vs ucb====="
)
if trial.result.loss_ucb < champion_trial.result.loss_lcb - champion_trial.result.loss_cb:
logger.info("======new champion condition satisfied: using lcb vs ucb=====")
logger.info(
"new champion trial %s %s %s",
trial.trial_id,
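
Toy numbers for the lcb/ucb champion test above, assuming loss_lcb and loss_ucb are loss_avg -/+ loss_cb: the challenger wins only when its upper bound sits below the champion's lower bound shifted down by one more confidence width:

champion = {"loss_avg": 0.30, "loss_cb": 0.05}
challenger = {"loss_avg": 0.15, "loss_cb": 0.04}

champion_lcb = champion["loss_avg"] - champion["loss_cb"]        # 0.25
challenger_ucb = challenger["loss_avg"] + challenger["loss_cb"]  # 0.19
print(challenger_ucb < champion_lcb - champion["loss_cb"])       # True -> new champion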

View file

@ -126,9 +126,7 @@ class ExperimentAnalysis:
'metric {} for scope {} not in ["all", "last", "avg", '
'"last-5-avg", "last-10-avg"]. '
"If you didn't pass a `metric` parameter to `tune.run()`, "
"you have to pass one when fetching the best trial.".format(
metric, scope
)
"you have to pass one when fetching the best trial.".format(metric, scope)
)
best_trial = None
best_metric_score = None
@ -155,10 +153,7 @@ class ExperimentAnalysis:
best_metric_score = metric_score
best_trial = trial
if not best_trial:
logger.warning(
"Could not find best trial. Did you pass the correct `metric` "
"parameter?"
)
logger.warning("Could not find best trial. Did you pass the correct `metric` " "parameter?")
return best_trial
def get_best_config(
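
The scope options listed in the error message above, applied to one trial's metric history with toy values: "last" takes the final report, "avg" the mean, and "last-5-avg" the mean of the five most recent reports:

import numpy as np

history = [0.50, 0.45, 0.40, 0.38, 0.37, 0.36, 0.35]
scores = {
    "last": history[-1],
    "avg": float(np.mean(history)),
    "last-5-avg": float(np.mean(history[-5:])),
}
print(scores)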

View file

@ -124,9 +124,7 @@ STDERR_FILE = "__stderr_file__"
# Where Tune writes result files by default
DEFAULT_RESULTS_DIR = (
os.environ.get("TEST_TMPDIR")
or os.environ.get("TUNE_RESULT_DIR")
or os.path.expanduser("~/ray_results")
os.environ.get("TEST_TMPDIR") or os.environ.get("TUNE_RESULT_DIR") or os.path.expanduser("~/ray_results")
)
# Meta file about status under each experiment directory, can be

View file

@ -53,13 +53,9 @@ except ImportError:
def __init__(
self,
generator_or_seed: Optional[
Union["np_random_generator", np.random.RandomState, int]
] = None,
generator_or_seed: Optional[Union["np_random_generator", np.random.RandomState, int]] = None,
):
if generator_or_seed is None or isinstance(
generator_or_seed, (np.random.RandomState, np_random_generator)
):
if generator_or_seed is None or isinstance(generator_or_seed, (np.random.RandomState, np_random_generator)):
self._rng = generator_or_seed
elif LEGACY_RNG:
self._rng = np.random.RandomState(generator_or_seed)
@ -85,9 +81,7 @@ except ImportError:
return getattr(self.rng, name)
RandomState = Union[
None, _BackwardsCompatibleNumpyRng, np_random_generator, np.random.RandomState, int
]
RandomState = Union[None, _BackwardsCompatibleNumpyRng, np_random_generator, np.random.RandomState, int]
class Domain:
@ -112,9 +106,7 @@ class Domain:
raise ValueError(
"You can only choose one sampler for parameter "
"domains. Existing sampler for parameter {}: "
"{}. Tried to add {}".format(
self.__class__.__name__, self.sampler, sampler
)
"{}. Tried to add {}".format(self.__class__.__name__, self.sampler, sampler)
)
self.sampler = sampler
@ -231,9 +223,7 @@ class Float(Domain):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert domain.lower > 0, "LogUniform needs a lower bound greater than 0"
assert (
0 < domain.upper < float("inf")
), "LogUniform needs a upper bound greater than 0"
assert 0 < domain.upper < float("inf"), "LogUniform needs a upper bound greater than 0"
logmin = np.log(domain.lower) / np.log(self.base)
logmax = np.log(domain.upper) / np.log(self.base)
@ -271,15 +261,9 @@ class Float(Domain):
def uniform(self):
if not self.lower > float("-inf"):
raise ValueError(
"Uniform requires a lower bound. Make sure to set the "
"`lower` parameter of `Float()`."
)
raise ValueError("Uniform requires a lower bound. Make sure to set the " "`lower` parameter of `Float()`.")
if not self.upper < float("inf"):
raise ValueError(
"Uniform requires a upper bound. Make sure to set the "
"`upper` parameter of `Float()`."
)
raise ValueError("Uniform requires a upper bound. Make sure to set the " "`upper` parameter of `Float()`.")
new = copy(self)
new.set_sampler(self._Uniform())
return new
@ -309,20 +293,10 @@ class Float(Domain):
return new
def quantized(self, q: float):
if self.lower > float("-inf") and not isclose(
self.lower / q, round(self.lower / q)
):
raise ValueError(
f"Your lower variable bound {self.lower} is not divisible by "
f"quantization factor {q}."
)
if self.upper < float("inf") and not isclose(
self.upper / q, round(self.upper / q)
):
raise ValueError(
f"Your upper variable bound {self.upper} is not divisible by "
f"quantization factor {q}."
)
if self.lower > float("-inf") and not isclose(self.lower / q, round(self.lower / q)):
raise ValueError(f"Your lower variable bound {self.lower} is not divisible by " f"quantization factor {q}.")
if self.upper < float("inf") and not isclose(self.upper / q, round(self.upper / q)):
raise ValueError(f"Your upper variable bound {self.upper} is not divisible by " f"quantization factor {q}.")
new = copy(self)
new.set_sampler(Quantized(new.get_sampler(), q), allow_override=True)
@ -361,9 +335,7 @@ class Integer(Domain):
if not isinstance(random_state, _BackwardsCompatibleNumpyRng):
random_state = _BackwardsCompatibleNumpyRng(random_state)
assert domain.lower > 0, "LogUniform needs a lower bound greater than 0"
assert (
0 < domain.upper < float("inf")
), "LogUniform needs a upper bound greater than 0"
assert 0 < domain.upper < float("inf"), "LogUniform needs a upper bound greater than 0"
logmin = np.log(domain.lower) / np.log(self.base)
logmax = np.log(domain.upper) / np.log(self.base)
@ -430,9 +402,7 @@ class Categorical(Domain):
random_state = _BackwardsCompatibleNumpyRng(random_state)
# do not use .choice() directly on domain.categories
# as that will coerce them to a single dtype
indices = random_state.choice(
np.arange(0, len(domain.categories)), size=size
)
indices = random_state.choice(np.arange(0, len(domain.categories)), size=size)
items = [domain.categories[index] for index in indices]
return items if len(items) > 1 else domain.cast(items[0])
@ -491,9 +461,7 @@ class Quantized(Sampler):
quantized_domain = copy(domain)
quantized_domain.lower = np.ceil(domain.lower / self.q) * self.q
quantized_domain.upper = np.floor(domain.upper / self.q) * self.q
values = self.sampler.sample(
quantized_domain, spec, size, random_state=random_state
)
values = self.sampler.sample(quantized_domain, spec, size, random_state=random_state)
quantized = np.round(np.divide(values, self.q)) * self.q
if not isinstance(quantized, np.ndarray):
@ -509,11 +477,7 @@ class PolynomialExpansionSet:
allow_self_inter: bool = False,
):
self._init_monomials = init_monomials
self._highest_poly_order = (
highest_poly_order
if highest_poly_order is not None
else len(self._init_monomials)
)
self._highest_poly_order = highest_poly_order if highest_poly_order is not None else len(self._init_monomials)
self._allow_self_inter = allow_self_inter
@property
@ -644,7 +608,5 @@ def qrandn(mean: float, sd: float, q: float):
return Float(None, None).normal(mean, sd).quantized(q)
def polynomial_expansion_set(
init_monomials: set, highest_poly_order: int = None, allow_self_inter: bool = False
):
def polynomial_expansion_set(init_monomials: set, highest_poly_order: int = None, allow_self_inter: bool = False):
return PolynomialExpansionSet(init_monomials, highest_poly_order, allow_self_inter)
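
The quantization logic reformatted above, in isolation: uniform samples are drawn inside ceil/floor-adjusted bounds and snapped to multiples of q (toy bounds assumed):

import numpy as np

q, lower, upper = 0.25, 0.1, 1.9
rng = np.random.default_rng(0)
values = rng.uniform(np.ceil(lower / q) * q, np.floor(upper / q) * q, size=5)
quantized = np.round(np.divide(values, q)) * q
print(quantized)  # every entry is a multiple of 0.25 within [0.25, 1.75]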

View file

@ -27,10 +27,7 @@ class OnlineScheduler(TrialScheduler):
min_paused_resource_trial = None
for trial in trial_runner.get_trials():
# if there is a tie, prefer the earlier added ones
if (
trial.status == Trial.PAUSED
and trial.resource_lease < min_paused_resource
):
if trial.status == Trial.PAUSED and trial.resource_lease < min_paused_resource:
min_paused_resource = trial.resource_lease
min_paused_resource_trial = trial
if min_paused_resource_trial is not None:
@ -122,8 +119,6 @@ class ChaChaScheduler(OnlineSuccessiveDoublingScheduler):
)
logger.debug("top_learners: %s", top_trials)
if trial in top_trials:
logger.debug(
"top runner %s: set from PAUSE to CONTINUE", trial.trial_id
)
logger.debug("top runner %s: set from PAUSE to CONTINUE", trial.trial_id)
return TrialScheduler.CONTINUE
return decision
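
A sketch of the trial-selection loop in OnlineScheduler's first hunk above: resume the PAUSED trial holding the smallest resource lease; the strict '<' keeps the earliest-added trial on ties (toy trials assumed):

trials = [
    {"id": "a", "status": "PAUSED", "resource_lease": 40},
    {"id": "b", "status": "PAUSED", "resource_lease": 10},
    {"id": "c", "status": "RUNNING", "resource_lease": 5},
]
min_lease, chosen = float("inf"), None
for trial in trials:
    if trial["status"] == "PAUSED" and trial["resource_lease"] < min_lease:
        min_lease, chosen = trial["resource_lease"], trial
print(chosen["id"])  # b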

View file

@ -56,9 +56,7 @@ class BlendSearch(Searcher):
max_resource: Optional[float] = None,
reduction_factor: Optional[float] = None,
global_search_alg: Optional[Searcher] = None,
config_constraints: Optional[
List[Tuple[Callable[[dict], float], str, float]]
] = None,
config_constraints: Optional[List[Tuple[Callable[[dict], float], str, float]]] = None,
metric_constraints: Optional[List[Tuple[str, str, float]]] = None,
seed: Optional[int] = 20,
cost_attr: Optional[str] = "auto",
@ -196,9 +194,7 @@ class BlendSearch(Searcher):
self._config_constraints = config_constraints
self._metric_constraints = metric_constraints
if metric_constraints:
assert all(
x[1] in ["<=", ">="] for x in metric_constraints
), "sign of metric constraints must be <= or >=."
assert all(x[1] in ["<=", ">="] for x in metric_constraints), "sign of metric constraints must be <= or >=."
# metric modified by lagrange
metric += self.lagrange
self._cat_hp_cost = cat_hp_cost or {}
@ -232,9 +228,7 @@ class BlendSearch(Searcher):
if experimental:
import optuna as ot
sampler = ot.samplers.TPESampler(
seed=gs_seed, multivariate=True, group=True
)
sampler = ot.samplers.TPESampler(seed=gs_seed, multivariate=True, group=True)
else:
sampler = None
try:
@ -260,11 +254,7 @@ class BlendSearch(Searcher):
else:
self._gs = None
self._experimental = experimental
if (
getattr(self, "__name__", None) == "CFO"
and points_to_evaluate
and len(self._points_to_evaluate) > 1
):
if getattr(self, "__name__", None) == "CFO" and points_to_evaluate and len(self._points_to_evaluate) > 1:
# use the best config in points_to_evaluate as the start point
self._candidate_start_points = {}
self._started_from_low_cost = not low_cost_partial_config
@ -383,9 +373,7 @@ class BlendSearch(Searcher):
if self._metric_constraints:
self._metric_constraint_satisfied = False
self._metric_constraint_penalty = [
self.penalty for _ in self._metric_constraints
]
self._metric_constraint_penalty = [self.penalty for _ in self._metric_constraints]
else:
self._metric_constraint_satisfied = True
self._metric_constraint_penalty = None
@ -424,9 +412,7 @@ class BlendSearch(Searcher):
def is_ls_ever_converged(self):
return self._is_ls_ever_converged
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
"""search thread updater and cleaner."""
metric_constraint_satisfied = True
if result and not error and self._metric_constraints:
@ -440,11 +426,7 @@ class BlendSearch(Searcher):
violation = (value - threshold) * sign_op
if violation > 0:
# add penalty term to the metric
objective += (
self._metric_constraint_penalty[i]
* violation
* self._ls.metric_op
)
objective += self._metric_constraint_penalty[i] * violation * self._ls.metric_op
metric_constraint_satisfied = False
if self._metric_constraint_penalty[i] < self.penalty:
self._metric_constraint_penalty[i] += violation
@ -455,9 +437,7 @@ class BlendSearch(Searcher):
self._metric_constraint_satisfied |= metric_constraint_satisfied
thread_id = self._trial_proposed_by.get(trial_id)
if thread_id in self._search_thread_pool:
self._search_thread_pool[thread_id].on_trial_complete(
trial_id, result, error
)
self._search_thread_pool[thread_id].on_trial_complete(trial_id, result, error)
del self._trial_proposed_by[trial_id]
if result:
config = result.get("config", {})
@ -467,9 +447,7 @@ class BlendSearch(Searcher):
config[key[7:]] = value
if self._allow_empty_config and not config:
return
signature = self._ls.config_signature(
config, self._subspace.get(trial_id, {})
)
signature = self._ls.config_signature(config, self._subspace.get(trial_id, {}))
if error: # remove from result cache
del self._result[signature]
else: # add to result cache
@ -489,11 +467,7 @@ class BlendSearch(Searcher):
self._ls_bound_max,
self._subspace.get(trial_id, self._ls.space),
)
if (
self._gs is not None
and self._experimental
and (not self._ls.hierarchical)
):
if self._gs is not None and self._experimental and (not self._ls.hierarchical):
self._gs.add_evaluated_point(flatten_dict(config), objective)
# TODO: recover when supported
# converted = convert_key(config, self._gs.space)
@ -502,17 +476,12 @@ class BlendSearch(Searcher):
elif metric_constraint_satisfied and self._create_condition(result):
# thread creator
thread_id = self._thread_count
self._started_from_given = (
self._candidate_start_points
and trial_id in self._candidate_start_points
)
self._started_from_given = self._candidate_start_points and trial_id in self._candidate_start_points
if self._started_from_given:
del self._candidate_start_points[trial_id]
else:
self._started_from_low_cost = True
self._create_thread(
config, result, self._subspace.get(trial_id, self._ls.space)
)
self._create_thread(config, result, self._subspace.get(trial_id, self._ls.space))
# reset admissible region to ls bounding box
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
@ -595,9 +564,7 @@ class BlendSearch(Searcher):
"""create thread condition"""
if len(self._search_thread_pool) < 2:
return True
obj_median = np.median(
[thread.obj_best1 for id, thread in self._search_thread_pool.items() if id]
)
obj_median = np.median([thread.obj_best1 for id, thread in self._search_thread_pool.items() if id])
return result[self._ls.metric] * self._ls.metric_op < obj_median
def _clean(self, thread_id: int):
@ -648,10 +615,7 @@ class BlendSearch(Searcher):
best_trial_id = None
obj_best = None
for trial_id, r in self._candidate_start_points.items():
if r and (
best_trial_id is None
or r[self._ls.metric] * self._ls.metric_op < obj_best
):
if r and (best_trial_id is None or r[self._ls.metric] * self._ls.metric_op < obj_best):
best_trial_id = trial_id
obj_best = r[self._ls.metric] * self._ls.metric_op
if best_trial_id:
@ -663,9 +627,7 @@ class BlendSearch(Searcher):
config[key[7:]] = value
self._started_from_given = True
del self._candidate_start_points[best_trial_id]
self._create_thread(
config, result, self._subspace.get(best_trial_id, self._ls.space)
)
self._create_thread(config, result, self._subspace.get(best_trial_id, self._ls.space))
def _expand_admissible_region(self, lower, upper, space):
"""expand the admissible region for the subspace `space`"""
@ -674,9 +636,7 @@ class BlendSearch(Searcher):
if isinstance(ub, list):
choice = space[key].get("_choice_")
if choice:
self._expand_admissible_region(
lower[key][choice], upper[key][choice], space[key]
)
self._expand_admissible_region(lower[key][choice], upper[key][choice], space[key])
elif isinstance(ub, dict):
self._expand_admissible_region(lower[key], ub, space[key])
else:
@ -752,9 +712,7 @@ class BlendSearch(Searcher):
if choice == backup:
# use CFO's init point
init_config = self._ls.init_config
config, space = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max
)
config, space = self._ls.complete_config(init_config, self._ls_bound_min, self._ls_bound_max)
self._trial_proposed_by[trial_id] = choice
self._search_thread_pool[choice].running += 1
else:
@ -801,9 +759,7 @@ class BlendSearch(Searcher):
if self._allow_empty_config and not init_config:
assert reward is None, "Empty config can't have reward."
return init_config
config, space = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max
)
config, space = self._ls.complete_config(init_config, self._ls_bound_min, self._ls_bound_max)
config_signature = self._ls.config_signature(config, space)
if reward is None:
result = self._result.get(config_signature)
@ -827,9 +783,7 @@ class BlendSearch(Searcher):
return
if self._use_incumbent_result_in_evaluation:
if self._trial_proposed_by[trial_id] > 0:
choice_thread = self._search_thread_pool[
self._trial_proposed_by[trial_id]
]
choice_thread = self._search_thread_pool[self._trial_proposed_by[trial_id]]
config[INCUMBENT_RESULT] = choice_thread.best_result
return config
@ -874,9 +828,7 @@ class BlendSearch(Searcher):
if choice >= 0: # not fallback to rs
result = self._result.get(config_signature)
if result: # finished
self._search_thread_pool[choice].on_trial_complete(
trial_id, result, error=False
)
self._search_thread_pool[choice].on_trial_complete(trial_id, result, error=False)
if choice:
# local search thread
self._clean(choice)
@ -938,9 +890,7 @@ class BlendSearch(Searcher):
backup_thread_id = thread_id
return top_thread_id, backup_thread_id
def _valid(
self, config: Dict, space: Dict, subspace: Dict, lower: Dict, upper: Dict
) -> bool:
def _valid(self, config: Dict, space: Dict, subspace: Dict, lower: Dict, upper: Dict) -> bool:
"""config validator"""
normalized_config = normalize(config, subspace, config, {})
for key, lb in lower.items():
@ -962,10 +912,7 @@ class BlendSearch(Searcher):
valid = self._valid(value, domain, nestedspace, lb, ub)
if not valid:
return False
elif (
value + self._ls.STEPSIZE < lower[key]
or value > upper[key] + self._ls.STEPSIZE
):
elif value + self._ls.STEPSIZE < lower[key] or value > upper[key] + self._ls.STEPSIZE:
return False
return True
@ -1033,9 +980,7 @@ class BlendSearchTuner(BlendSearch, NNITuner):
result = {
"config": parameters,
self._metric: extract_scalar_reward(value),
self.cost_attr: 1
if isinstance(value, float)
else value.get(self.cost_attr, value.get("sequence", 1))
self.cost_attr: 1 if isinstance(value, float) else value.get(self.cost_attr, value.get("sequence", 1))
# if nni does not report training cost,
# using sequence as an approximation.
# if no sequence, using a constant 1
@ -1145,11 +1090,7 @@ class CFO(BlendSearchTuner):
if self._candidate_start_points and self._thread_count == 1:
# result needs to match or exceed the best candidate start point
obj_best = min(
(
self._ls.metric_op * r[self._ls.metric]
for r in self._candidate_start_points.values()
if r
),
(self._ls.metric_op * r[self._ls.metric] for r in self._candidate_start_points.values() if r),
default=-np.inf,
)
@ -1157,9 +1098,7 @@ class CFO(BlendSearchTuner):
else:
return True
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
super().on_trial_complete(trial_id, result, error)
if self._candidate_start_points and trial_id in self._candidate_start_points:
# the trial is a candidate start point
@ -1177,9 +1116,7 @@ class RandomSearch(CFO):
config, _ = self._ls.complete_config({})
return config
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
return
def on_trial_result(self, trial_id: str, result: Dict):
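
The thread-creation condition reformatted in _create_condition above, with toy objectives; minimization is assumed, and thread id 0 (the global-search thread) is excluded by the `if id` filter:

import numpy as np

thread_best = {0: 0.20, 1: 0.30, 2: 0.50, 3: 0.40}  # thread id -> best objective
new_result, metric_op = 0.35, 1.0

obj_median = np.median([obj for tid, obj in thread_best.items() if tid])
print(new_result * metric_op < obj_median)  # True: 0.35 < 0.40, so spawn a thread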

View file

@ -124,21 +124,13 @@ class FLOW2(Searcher):
self.lexico_objectives = lexico_objectives
if self.lexico_objectives is not None:
if "modes" not in self.lexico_objectives.keys():
self.lexico_objectives["modes"] = ["min"] * len(
self.lexico_objectives["metrics"]
)
for t_metric, t_mode in zip(
self.lexico_objectives["metrics"], self.lexico_objectives["modes"]
):
self.lexico_objectives["modes"] = ["min"] * len(self.lexico_objectives["metrics"])
for t_metric, t_mode in zip(self.lexico_objectives["metrics"], self.lexico_objectives["modes"]):
if t_metric not in self.lexico_objectives["tolerances"].keys():
self.lexico_objectives["tolerances"][t_metric] = 0
if t_metric not in self.lexico_objectives["targets"].keys():
self.lexico_objectives["targets"][t_metric] = (
-float("inf") if t_mode == "min" else float("inf")
)
self.resource_multiple_factor = (
resource_multiple_factor or SAMPLE_MULTIPLY_FACTOR
)
self.lexico_objectives["targets"][t_metric] = -float("inf") if t_mode == "min" else float("inf")
self.resource_multiple_factor = resource_multiple_factor or SAMPLE_MULTIPLY_FACTOR
self.cost_attr = cost_attr
self.max_resource = max_resource
self._resource = None
@ -166,13 +158,9 @@ class FLOW2(Searcher):
q = sampler.q
sampler = sampler.get_sampler()
if str(sampler) == "Uniform":
self._step_lb = min(
self._step_lb, q / (domain.upper - domain.lower + 1)
)
self._step_lb = min(self._step_lb, q / (domain.upper - domain.lower + 1))
elif isinstance(domain, sample.Integer) and str(sampler) == "Uniform":
self._step_lb = min(
self._step_lb, 1.0 / (domain.upper - domain.lower)
)
self._step_lb = min(self._step_lb, 1.0 / (domain.upper - domain.lower))
if isinstance(domain, sample.Categorical):
if not domain.ordered:
self._unordered_cat_hp[key] = len(domain.categories)
@ -186,11 +174,7 @@ class FLOW2(Searcher):
if not hier:
self._space_keys = sorted(self._tunable_keys)
self.hierarchical = hier
if (
self.resource_attr
and self.resource_attr not in self._space
and self.max_resource
):
if self.resource_attr and self.resource_attr not in self._space and self.max_resource:
self.min_resource = self.min_resource or self._min_resource()
self._resource = self._round(self.min_resource)
if not hier:
@ -244,14 +228,12 @@ class FLOW2(Searcher):
if str(sampler_inner) == "LogUniform":
step_lb = min(
step_lb,
np.log(1.0 + q / self.best_config[key])
/ np.log(domain.upper / domain.lower),
np.log(1.0 + q / self.best_config[key]) / np.log(domain.upper / domain.lower),
)
elif isinstance(domain, sample.Integer) and str(sampler) == "LogUniform":
step_lb = min(
step_lb,
np.log(1.0 + 1.0 / self.best_config[key])
/ np.log((domain.upper - 1) / domain.lower),
np.log(1.0 + 1.0 / self.best_config[key]) / np.log((domain.upper - 1) / domain.lower),
)
if np.isinf(step_lb):
step_lb = self.STEP_LOWER_BOUND
@ -288,18 +270,14 @@ class FLOW2(Searcher):
"""
disturb = self._reset_times and partial_config == self.init_config
# if not the first time to complete init_config, use random gaussian
config, space = complete_config(
partial_config, self.space, self, disturb, lower, upper
)
config, space = complete_config(partial_config, self.space, self, disturb, lower, upper)
if partial_config == self.init_config:
self._reset_times += 1
if self._resource:
config[self.resource_attr] = self.min_resource
return config, space
def create(
self, init_config: Dict, obj: float, cost: float, space: Dict
) -> Searcher:
def create(self, init_config: Dict, obj: float, cost: float, space: Dict) -> Searcher:
# space is the subspace where the init_config is located
flow2 = self.__class__(
init_config,
@ -318,12 +296,7 @@ class FLOW2(Searcher):
flow2.best_obj = {}
for k, v in obj.items():
flow2.best_obj[k] = (
-v
if self.lexico_objectives["modes"][
self.lexico_objectives["metrics"].index(k)
]
== "max"
else v
-v if self.lexico_objectives["modes"][self.lexico_objectives["metrics"].index(k)] == "max" else v
)
else:
flow2.best_obj = obj * self.metric_op # minimize internally
@ -333,15 +306,11 @@ class FLOW2(Searcher):
def normalize(self, config, recursive=False) -> Dict:
"""normalize each dimension in config to [0,1]."""
return normalize(
config, self._space, self.best_config, self.incumbent, recursive
)
return normalize(config, self._space, self.best_config, self.incumbent, recursive)
def denormalize(self, config):
"""denormalize each dimension in config from [0,1]."""
return denormalize(
config, self._space, self.best_config, self.incumbent, self._random
)
return denormalize(config, self._space, self.best_config, self.incumbent, self._random)
def set_search_properties(
self,
@ -374,20 +343,13 @@ class FLOW2(Searcher):
feasible_value = k_values.take(feasible_index)
self._f_best[k_metric] = np.min(feasible_value)
if not isinstance(self.lexico_objectives["tolerances"][k_metric], str):
tolerance_bound = (
self._f_best[k_metric]
+ self.lexico_objectives["tolerances"][k_metric]
)
tolerance_bound = self._f_best[k_metric] + self.lexico_objectives["tolerances"][k_metric]
else:
assert (
self.lexico_objectives["tolerances"][k_metric][-1] == "%"
), "String tolerance of {} should use %% as the suffix".format(k_metric)
tolerance_bound = self._f_best[k_metric] * (
1
+ 0.01
* float(
self.lexico_objectives["tolerances"][k_metric].replace("%", "")
)
1 + 0.01 * float(self.lexico_objectives["tolerances"][k_metric].replace("%", ""))
)
feasible_index_filter = np.where(
feasible_value
@ -409,33 +371,20 @@ class FLOW2(Searcher):
for k in self.lexico_objectives["metrics"]:
self._histories[k].append(result[k])
self.update_fbest()
for k_metric, k_mode in zip(
self.lexico_objectives["metrics"], self.lexico_objectives["modes"]
):
for k_metric, k_mode in zip(self.lexico_objectives["metrics"], self.lexico_objectives["modes"]):
k_target = (
self.lexico_objectives["targets"][k_metric]
if k_mode == "min"
else -self.lexico_objectives["targets"][k_metric]
)
if not isinstance(self.lexico_objectives["tolerances"][k_metric], str):
tolerance_bound = (
self._f_best[k_metric]
+ self.lexico_objectives["tolerances"][k_metric]
)
tolerance_bound = self._f_best[k_metric] + self.lexico_objectives["tolerances"][k_metric]
else:
assert (
self.lexico_objectives["tolerances"][k_metric][-1] == "%"
), "String tolerance of {} should use %% as the suffix".format(
k_metric
)
), "String tolerance of {} should use %% as the suffix".format(k_metric)
tolerance_bound = self._f_best[k_metric] * (
1
+ 0.01
* float(
self.lexico_objectives["tolerances"][k_metric].replace(
"%", ""
)
)
1 + 0.01 * float(self.lexico_objectives["tolerances"][k_metric].replace("%", ""))
)
if (result[k_metric] < max(tolerance_bound, k_target)) and (
self.best_obj[k_metric]
@ -457,9 +406,7 @@ class FLOW2(Searcher):
else:
return False
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
"""
Compare with incumbent.
If better, move, reset num_complete and num_proposed.
@ -512,21 +459,12 @@ class FLOW2(Searcher):
proposed_by = self._proposed_by.get(trial_id)
if proposed_by == self.incumbent:
self._num_complete4incumbent += 1
cost = (
result.get(self.cost_attr, 1)
if result
else self._trial_cost.get(trial_id)
)
cost = result.get(self.cost_attr, 1) if result else self._trial_cost.get(trial_id)
if cost:
self._cost_complete4incumbent += cost
if (
self._num_complete4incumbent >= 2 * self.dim
and self._num_allowed4incumbent == 0
):
if self._num_complete4incumbent >= 2 * self.dim and self._num_allowed4incumbent == 0:
self._num_allowed4incumbent = 2
if self._num_complete4incumbent == self.dir and (
not self._resource or self._resource == self.max_resource
):
if self._num_complete4incumbent == self.dir and (not self._resource or self._resource == self.max_resource):
self._num_complete4incumbent -= 2
self._num_allowed4incumbent = max(self._num_allowed4incumbent, 2)
@ -593,10 +531,7 @@ class FLOW2(Searcher):
and self.cost_incumbent
and self._resource
and self._resource < self.max_resource
and (
self._cost_complete4incumbent
>= self.cost_incumbent * self.resource_multiple_factor
)
and (self._cost_complete4incumbent >= self.cost_incumbent * self.resource_multiple_factor)
):
return self._increase_resource(trial_id)
self._num_allowed4incumbent -= 1
@ -608,9 +543,7 @@ class FLOW2(Searcher):
self._direction_tried = None
else:
# propose a new direction
self._direction_tried = (
self.rand_vector_unit_sphere(self.dim, self._trunc) * self.step
)
self._direction_tried = self.rand_vector_unit_sphere(self.dim, self._trunc) * self.step
for i, key in enumerate(self._tunable_keys):
move[key] += self._direction_tried[i]
self._project(move)
@ -622,25 +555,17 @@ class FLOW2(Searcher):
if self._init_phase:
if self._direction_tried is None:
if self._same:
same = not any(
key not in best_config or value != best_config[key]
for key, value in config.items()
)
same = not any(key not in best_config or value != best_config[key] for key, value in config.items())
if same:
# increase step size
self.step += self.STEPSIZE
self.step = min(self.step, self.step_ub)
else:
same = not any(
key not in best_config or value != best_config[key]
for key, value in config.items()
)
same = not any(key not in best_config or value != best_config[key] for key, value in config.items())
self._same = same
if self._num_proposedby_incumbent == self.dir and (
not self._resource or self._resource == self.max_resource
):
if self._num_proposedby_incumbent == self.dir and (not self._resource or self._resource == self.max_resource):
# check stuck condition if using max resource
self._num_proposedby_incumbent -= 2
self._init_phase = False
@ -714,9 +639,7 @@ class FLOW2(Searcher):
# key must be in space
domain = space[key]
if self.hierarchical and not (
domain is None
or type(domain) in (str, int, float)
or isinstance(domain, sample.Domain)
domain is None or type(domain) in (str, int, float) or isinstance(domain, sample.Domain)
):
# not domain or hashable
# get rid of list type for hierarchical search space.
@ -746,10 +669,5 @@ class FLOW2(Searcher):
# unordered cat choice is hard to reach by chance
if config1[key] != config2.get(key):
return False
delta = np.array(
[
incumbent1[key] - incumbent2.get(key, np.inf)
for key in self._tunable_keys
]
)
delta = np.array([incumbent1[key] - incumbent2.get(key, np.inf) for key in self._tunable_keys])
return np.linalg.norm(delta) <= self.step
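
FLOW2's move proposal above, in isolation: draw a random direction on the unit sphere (reimplemented here by normalizing a Gaussian draw, a standard construction), scale it by the step size, and add it to the incumbent normalized config:

import numpy as np

rng = np.random.default_rng(0)
dim, step = 3, 0.1
direction = rng.normal(size=dim)
direction /= np.linalg.norm(direction)  # a point on the unit sphere
incumbent = np.array([0.5, 0.5, 0.5])   # incumbent config, normalized to [0, 1]
proposal = incumbent + direction * step
print(proposal)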

View file

@ -128,9 +128,7 @@ class ChampionFrontierSearcher(BaseSearcher):
self._challenger_list = []
# initialize the search in set_search_properties
self.set_search_properties(
setting={self.CHAMPION_TRIAL_NAME: None}, init_call=True
)
self.set_search_properties(setting={self.CHAMPION_TRIAL_NAME: None}, init_call=True)
logger.debug("using random seed %s in config oracle", self._seed)
def set_search_properties(
@ -202,16 +200,12 @@ class ChampionFrontierSearcher(BaseSearcher):
config_domain = self._space[k]
if isinstance(config_domain, PolynomialExpansionSet):
# get candidate configs for hyperparameters of the PolynomialExpansionSet type
partial_new_configs = self._generate_independent_hp_configs(
k, v, config_domain
)
partial_new_configs = self._generate_independent_hp_configs(k, v, config_domain)
if partial_new_configs:
hyperparameter_config_groups.append(partial_new_configs)
# does not have searcher_trial_ids
searcher_trial_ids_groups.append([])
elif isinstance(config_domain, Float) or isinstance(
config_domain, Categorical
):
elif isinstance(config_domain, Float) or isinstance(config_domain, Categorical):
# otherwise we need to deal with them in group
nonpoly_config[k] = v
if k not in self._space_of_nonpoly_hp:
@ -229,29 +223,17 @@ class ChampionFrontierSearcher(BaseSearcher):
metric=self.CFO_SEARCHER_METRIC_NAME,
)
# initialize the search in set_search_properties
self._searcher_for_nonpoly_hp[
seed_config_trial_id
].set_search_properties(
self._searcher_for_nonpoly_hp[seed_config_trial_id].set_search_properties(
setting={"metric_target": self.CFO_SEARCHER_LARGE_LOSS}
)
# We need to call this for once, such that the seed config in points_to_evaluate will be called
# to be tried
self._searcher_for_nonpoly_hp[seed_config_trial_id].suggest(
seed_config_searcher_trial_id
)
self._searcher_for_nonpoly_hp[seed_config_trial_id].suggest(seed_config_searcher_trial_id)
# assuming minimization
if (
self._searcher_for_nonpoly_hp[seed_config_trial_id].metric_target
is None
):
if self._searcher_for_nonpoly_hp[seed_config_trial_id].metric_target is None:
pseudo_loss = self.CFO_SEARCHER_LARGE_LOSS
else:
pseudo_loss = (
self._searcher_for_nonpoly_hp[
seed_config_trial_id
].metric_target
* 0.95
)
pseudo_loss = self._searcher_for_nonpoly_hp[seed_config_trial_id].metric_target * 0.95
pseudo_result_to_report = {}
for k, v in nonpoly_config.items():
pseudo_result_to_report["config/" + str(k)] = v
@ -264,14 +246,10 @@ class ChampionFrontierSearcher(BaseSearcher):
# suggest multiple times
new_searcher_trial_id = Trial.generate_id()
new_searcher_trial_ids.append(new_searcher_trial_id)
suggestion = self._searcher_for_nonpoly_hp[
seed_config_trial_id
].suggest(new_searcher_trial_id)
suggestion = self._searcher_for_nonpoly_hp[seed_config_trial_id].suggest(new_searcher_trial_id)
if suggestion is not None:
partial_new_nonpoly_configs.append(suggestion)
logger.info(
"partial_new_nonpoly_configs %s", partial_new_nonpoly_configs
)
logger.info("partial_new_nonpoly_configs %s", partial_new_nonpoly_configs)
else:
raise NotImplementedError
if partial_new_nonpoly_configs:
@ -298,20 +276,14 @@ class ChampionFrontierSearcher(BaseSearcher):
new_searcher_trial_id = searcher_trial_ids_groups[i][j]
else:
new_searcher_trial_id = None
new_trial = self._create_trial_from_config(
new_seed_config, new_searcher_trial_id
)
new_trial = self._create_trial_from_config(new_seed_config, new_searcher_trial_id)
new_trials.append(new_trial)
logger.info("new_configs %s", [t.trial_id for t in new_trials])
return new_trials
def _generate_independent_hp_configs(
self, hp_name, current_config_value, config_domain
) -> List:
def _generate_independent_hp_configs(self, hp_name, current_config_value, config_domain) -> List:
if isinstance(config_domain, PolynomialExpansionSet):
seed_interactions = list(current_config_value) + list(
config_domain.init_monomials
)
seed_interactions = list(current_config_value) + list(config_domain.init_monomials)
logger.info(
"**Important** Seed namespaces (singletons and interactions): %s",
seed_interactions,
@ -340,13 +312,7 @@ class ChampionFrontierSearcher(BaseSearcher):
champion_all_combinations = self._generate_all_comb(
seed_interactions, order, allow_self_inter, highest_poly_order
)
space = sorted(
list(
itertools.combinations(
champion_all_combinations, interaction_num_to_add
)
)
)
space = sorted(list(itertools.combinations(champion_all_combinations, interaction_num_to_add)))
self._random_state.shuffle(space)
candidate_configs = [set(seed_interactions) | set(item) for item in space]
final_candidate_configs = []
@ -413,15 +379,10 @@ class ChampionFrontierSearcher(BaseSearcher):
all_interactions_no_self_inter = []
for s in all_interactions:
s_no_inter = strip_self_inter(s)
if (
len(s_no_inter) > 1
and s_no_inter not in all_interactions_no_self_inter
):
if len(s_no_inter) > 1 and s_no_inter not in all_interactions_no_self_inter:
all_interactions_no_self_inter.append(s_no_inter)
all_interactions = all_interactions_no_self_inter
if highest_poly_order is not None:
all_interactions = [
c for c in all_interactions if len(c) <= highest_poly_order
]
all_interactions = [c for c in all_interactions if len(c) <= highest_poly_order]
logger.info("all_combinations %s", all_interactions)
return all_interactions
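A toy illustration of the combination expansion reformatted above, using made-up monomial names rather than FLAML's real search space:

import itertools

seed_interactions = ["a", "b", "c"]
champion_all_combinations = ["".join(c) for c in itertools.combinations(seed_interactions, 2)]
# ['ab', 'ac', 'bc']
space = sorted(itertools.combinations(champion_all_combinations, 1))  # interaction_num_to_add = 1
candidate_configs = [set(seed_interactions) | set(item) for item in space]
print(candidate_configs)  # each candidate adds one pairwise interaction to the seed set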

View file

@ -38,14 +38,10 @@ class SearchThread:
self._is_ls = isinstance(search_alg, FLOW2)
self._mode = mode
self._metric_op = 1 if mode == "min" else -1
self.cost_best = self.cost_last = self.cost_total = self.cost_best1 = getattr(
search_alg, "cost_incumbent", 0
)
self.cost_best = self.cost_last = self.cost_total = self.cost_best1 = getattr(search_alg, "cost_incumbent", 0)
self._eps = eps
self.cost_best2 = 0
self.obj_best1 = self.obj_best2 = getattr(
search_alg, "best_obj", np.inf
) # inherently minimize
self.obj_best1 = self.obj_best2 = getattr(search_alg, "best_obj", np.inf) # inherently minimize
self.best_result = None
# eci: estimated cost for improvement
self.eci = self.cost_best
@ -55,11 +51,7 @@ class SearchThread:
self.cost_attr = cost_attr
if search_alg:
self.space = self._space = search_alg.space # unflattened space
if (
self.space
and not isinstance(search_alg, FLOW2)
and isinstance(search_alg._space, dict)
):
if self.space and not isinstance(search_alg, FLOW2) and isinstance(search_alg._space, dict):
# remember const config
self._const = add_cost_to_space(self.space, {}, {})
@ -76,10 +68,7 @@ class SearchThread:
# define by run
config, self.space = unflatten_hierarchical(config, self._space)
except FloatingPointError:
logger.warning(
"The global search method raises FloatingPointError. "
"Ignoring for this iteration."
)
logger.warning("The global search method raises FloatingPointError. " "Ignoring for this iteration.")
config = None
if config is not None:
self.running += 1
@ -94,9 +83,7 @@ class SearchThread:
best_obj = metric_target * self._metric_op
if not self.speed:
self.speed = max_speed
self.eci = max(
self.cost_total - self.cost_best1, self.cost_best1 - self.cost_best2
)
self.eci = max(self.cost_total - self.cost_best1, self.cost_best1 - self.cost_best2)
if self.obj_best1 > best_obj and self.speed > 0:
self.eci = max(self.eci, 2 * (self.obj_best1 - best_obj) / self.speed)
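A worked check of the eci (estimated cost for improvement) update above, with illustrative numbers:

cost_total, cost_best1, cost_best2 = 10.0, 7.0, 4.0
obj_best1, best_obj, speed = 0.5, 0.25, 0.1
eci = max(cost_total - cost_best1, cost_best1 - cost_best2)  # max(3.0, 3.0) = 3.0
if obj_best1 > best_obj and speed > 0:
    eci = max(eci, 2 * (obj_best1 - best_obj) / speed)  # max(3.0, 5.0) = 5.0
print(eci)  # 5.0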
@ -105,31 +92,23 @@ class SearchThread:
if self.obj_best2 > self.obj_best1:
# discount the speed if there are unfinished trials
self.speed = (
(self.obj_best2 - self.obj_best1)
/ self.running
/ (max(self.cost_total - self.cost_best2, self._eps))
(self.obj_best2 - self.obj_best1) / self.running / (max(self.cost_total - self.cost_best2, self._eps))
)
else:
self.speed = 0
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
"""Update the statistics of the thread."""
if not self._search_alg:
return
if not hasattr(self._search_alg, "_ot_trials") or (
not error and trial_id in self._search_alg._ot_trials
):
if not hasattr(self._search_alg, "_ot_trials") or (not error and trial_id in self._search_alg._ot_trials):
# optuna doesn't handle error
if self._is_ls or not self._init_config:
try:
self._search_alg.on_trial_complete(trial_id, result, error)
except RuntimeError as e:
# rs is used in place of optuna sometimes
if not str(e).endswith(
"has already finished and can not be updated."
):
if not str(e).endswith("has already finished and can not be updated."):
raise e
else:
# init config is not proposed by self._search_alg
@ -138,9 +117,7 @@ class SearchThread:
if result:
self.cost_last = result.get(self.cost_attr, 1)
self.cost_total += self.cost_last
if self._search_alg.metric in result and (
getattr(self._search_alg, "lexico_objectives", None) is None
):
if self._search_alg.metric in result and (getattr(self._search_alg, "lexico_objectives", None) is None):
# TODO: Improve this behavior. When lexico_objectives is provided to CFO,
# related variables are not callable.
obj = result[self._search_alg.metric] * self._metric_op
@ -162,9 +139,7 @@ class SearchThread:
# TODO update the statistics of the thread with partial result?
if not self._search_alg:
return
if not hasattr(self._search_alg, "_ot_trials") or (
trial_id in self._search_alg._ot_trials
):
if not hasattr(self._search_alg, "_ot_trials") or (trial_id in self._search_alg._ot_trials):
try:
self._search_alg.on_trial_result(trial_id, result)
except RuntimeError as e:

View file

@ -112,22 +112,16 @@ class Searcher:
# Early return to avoid assertions
return
assert isinstance(
metric, type(mode)
), "metric and mode must be of the same type"
assert isinstance(metric, type(mode)), "metric and mode must be of the same type"
if isinstance(mode, str):
assert mode in ["min", "max"], "if `mode` is a str must be 'min' or 'max'!"
elif isinstance(mode, list):
assert len(mode) == len(metric), "Metric and mode must be the same length"
assert all(
mod in ["min", "max", "obs"] for mod in mode
), "All of mode must be 'min' or 'max' or 'obs'!"
assert all(mod in ["min", "max", "obs"] for mod in mode), "All of mode must be 'min' or 'max' or 'obs'!"
else:
raise ValueError("Mode must either be a list or string")
def set_search_properties(
self, metric: Optional[str], mode: Optional[str], config: Dict
) -> bool:
def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool:
"""Pass search properties to searcher.
This method acts as an alternative to instantiating search algorithms
with their own specific search spaces. Instead they can accept a
@ -193,18 +187,13 @@ class ConcurrencyLimiter(Searcher):
self.batch = batch
self.live_trials = set()
self.cached_results = {}
super(ConcurrencyLimiter, self).__init__(
metric=self.searcher.metric, mode=self.searcher.mode
)
super(ConcurrencyLimiter, self).__init__(metric=self.searcher.metric, mode=self.searcher.mode)
def suggest(self, trial_id: str) -> Optional[Dict]:
assert (
trial_id not in self.live_trials
), f"Trial ID {trial_id} must be unique: already found in set."
assert trial_id not in self.live_trials, f"Trial ID {trial_id} must be unique: already found in set."
if len(self.live_trials) >= self.max_concurrent:
logger.debug(
f"Not providing a suggestion for {trial_id} due to "
"concurrency limit: %s/%s.",
f"Not providing a suggestion for {trial_id} due to " "concurrency limit: %s/%s.",
len(self.live_trials),
self.max_concurrent,
)
@ -215,9 +204,7 @@ class ConcurrencyLimiter(Searcher):
self.live_trials.add(trial_id)
return suggestion
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
if trial_id not in self.live_trials:
return
elif self.batch:
@ -226,9 +213,7 @@ class ConcurrencyLimiter(Searcher):
# Update the underlying searcher once the
# full batch is completed.
for trial_id, (result, error) in self.cached_results.items():
self.searcher.on_trial_complete(
trial_id, result=result, error=error
)
self.searcher.on_trial_complete(trial_id, result=result, error=error)
self.live_trials.remove(trial_id)
self.cached_results = {}
else:
@ -257,9 +242,7 @@ class ConcurrencyLimiter(Searcher):
def on_unpause(self, trial_id: str):
self.searcher.on_unpause(trial_id)
def set_search_properties(
self, metric: Optional[str], mode: Optional[str], config: Dict
) -> bool:
def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool:
return self.searcher.set_search_properties(metric, mode, config)
@ -301,17 +284,10 @@ def validate_warmstart(
"""
if points_to_evaluate:
if not isinstance(points_to_evaluate, list):
raise TypeError(
"points_to_evaluate expected to be a list, got {}.".format(
type(points_to_evaluate)
)
)
raise TypeError("points_to_evaluate expected to be a list, got {}.".format(type(points_to_evaluate)))
for point in points_to_evaluate:
if not isinstance(point, (dict, list)):
raise TypeError(
f"points_to_evaluate expected to include list or dict, "
f"got {point}."
)
raise TypeError(f"points_to_evaluate expected to include list or dict, " f"got {point}.")
if validate_point_name_lengths and (not len(point) == len(parameter_names)):
raise ValueError(
@ -322,11 +298,7 @@ def validate_warmstart(
if points_to_evaluate and evaluated_rewards:
if not isinstance(evaluated_rewards, list):
raise TypeError(
"evaluated_rewards expected to be a list, got {}.".format(
type(evaluated_rewards)
)
)
raise TypeError("evaluated_rewards expected to be a list, got {}.".format(type(evaluated_rewards)))
if not len(evaluated_rewards) == len(points_to_evaluate):
raise ValueError(
"Dim of evaluated_rewards {}".format(evaluated_rewards)
@ -461,16 +433,12 @@ class OptunaSearch(Searcher):
evaluated_rewards: Optional[List] = None,
):
assert ot is not None, "Optuna must be installed! Run `pip install optuna`."
super(OptunaSearch, self).__init__(
metric=metric, mode=mode, max_concurrent=None, use_early_stopped_trials=None
)
super(OptunaSearch, self).__init__(metric=metric, mode=mode, max_concurrent=None, use_early_stopped_trials=None)
if isinstance(space, dict) and space:
resolved_vars, domain_vars, grid_vars = parse_spec_vars(space)
if domain_vars or grid_vars:
logger.warning(
UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self).__name__)
)
logger.warning(UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self).__name__))
space = self.convert_search_space(space)
else:
# Flatten to support nested dicts
@ -493,8 +461,7 @@ class OptunaSearch(Searcher):
self._sampler = sampler or ot.samplers.TPESampler(seed=seed)
assert isinstance(self._sampler, BaseSampler), (
"You can only pass an instance of `optuna.samplers.BaseSampler` "
"as a sampler to `OptunaSearcher`."
"You can only pass an instance of `optuna.samplers.BaseSampler` " "as a sampler to `OptunaSearcher`."
)
self._ot_trials = {}
@ -527,17 +494,13 @@ class OptunaSearch(Searcher):
validate_point_name_lengths=not callable(self._space),
)
if self._evaluated_rewards:
for point, reward in zip(
self._points_to_evaluate, self._evaluated_rewards
):
for point, reward in zip(self._points_to_evaluate, self._evaluated_rewards):
self.add_evaluated_point(point, reward)
else:
for point in self._points_to_evaluate:
self._ot_study.enqueue_trial(point)
def set_search_properties(
self, metric: Optional[str], mode: Optional[str], config: Dict
) -> bool:
def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool:
if self._space:
return False
space = self.convert_search_space(config)
@ -585,16 +548,10 @@ class OptunaSearch(Searcher):
def suggest(self, trial_id: str) -> Optional[Dict]:
if not self._space:
raise RuntimeError(
UNDEFINED_SEARCH_SPACE.format(
cls=self.__class__.__name__, space="space"
)
)
raise RuntimeError(UNDEFINED_SEARCH_SPACE.format(cls=self.__class__.__name__, space="space"))
if not self._metric or not self._mode:
raise RuntimeError(
UNDEFINED_METRIC_MODE.format(
cls=self.__class__.__name__, metric=self._metric, mode=self._mode
)
UNDEFINED_METRIC_MODE.format(cls=self.__class__.__name__, metric=self._metric, mode=self._mode)
)
if isinstance(self._space, list):
@ -607,9 +564,7 @@ class OptunaSearch(Searcher):
# getattr will fetch the trial.suggest_ function on Optuna trials
params = {
args[0]
if len(args) > 0
else kwargs["name"]: getattr(ot_trial, fn)(*args, **kwargs)
args[0] if len(args) > 0 else kwargs["name"]: getattr(ot_trial, fn)(*args, **kwargs)
for (fn, args, kwargs) in self._space
}
elif callable(self._space):
@ -622,9 +577,7 @@ class OptunaSearch(Searcher):
else:
# Use Optuna ask interface (since version 2.6.0)
if trial_id not in self._ot_trials:
self._ot_trials[trial_id] = self._ot_study.ask(
fixed_distributions=self._space
)
self._ot_trials[trial_id] = self._ot_study.ask(fixed_distributions=self._space)
ot_trial = self._ot_trials[trial_id]
params = ot_trial.params
@ -636,9 +589,7 @@ class OptunaSearch(Searcher):
ot_trial = self._ot_trials[trial_id]
ot_trial.report(metric, step)
def on_trial_complete(
self, trial_id: str, result: Optional[Dict] = None, error: bool = False
):
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None, error: bool = False):
ot_trial = self._ot_trials[trial_id]
val = result.get(self.metric, None) if result else None
@ -662,16 +613,10 @@ class OptunaSearch(Searcher):
intermediate_values: Optional[List[float]] = None,
):
if not self._space:
raise RuntimeError(
UNDEFINED_SEARCH_SPACE.format(
cls=self.__class__.__name__, space="space"
)
)
raise RuntimeError(UNDEFINED_SEARCH_SPACE.format(cls=self.__class__.__name__, space="space"))
if not self._metric or not self._mode:
raise RuntimeError(
UNDEFINED_METRIC_MODE.format(
cls=self.__class__.__name__, metric=self._metric, mode=self._mode
)
UNDEFINED_METRIC_MODE.format(cls=self.__class__.__name__, metric=self._metric, mode=self._mode)
)
ot_trial_state = OptunaTrialState.COMPLETE
@ -681,9 +626,7 @@ class OptunaSearch(Searcher):
ot_trial_state = OptunaTrialState.PRUNED
if intermediate_values:
intermediate_values_dict = {
i: value for i, value in enumerate(intermediate_values)
}
intermediate_values_dict = {i: value for i, value in enumerate(intermediate_values)}
else:
intermediate_values_dict = None
@ -736,10 +679,7 @@ class OptunaSearch(Searcher):
return {}
if grid_vars:
raise ValueError(
"Grid search parameters cannot be automatically converted "
"to an Optuna search space."
)
raise ValueError("Grid search parameters cannot be automatically converted " "to an Optuna search space.")
# Flatten and resolve again after checking for grid search.
spec = flatten_dict(spec, prevent_delimiter=True)
@ -766,18 +706,12 @@ class OptunaSearch(Searcher):
"Optuna does not support both quantization and "
"sampling from LogUniform. Dropped quantization."
)
return ot.distributions.LogUniformDistribution(
domain.lower, domain.upper
)
return ot.distributions.LogUniformDistribution(domain.lower, domain.upper)
elif isinstance(sampler, Uniform):
if quantize:
return ot.distributions.DiscreteUniformDistribution(
domain.lower, domain.upper, quantize
)
return ot.distributions.UniformDistribution(
domain.lower, domain.upper
)
return ot.distributions.DiscreteUniformDistribution(domain.lower, domain.upper, quantize)
return ot.distributions.UniformDistribution(domain.lower, domain.upper)
elif isinstance(domain, Integer):
if isinstance(sampler, LogUniform):
@ -798,9 +732,7 @@ class OptunaSearch(Searcher):
raise ValueError(
"Optuna search does not support parameters of type "
"`{}` with samplers of type `{}`".format(
type(domain).__name__, type(domain.sampler).__name__
)
"`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
)
# Parameter name is e.g. "a/b/c" for nested dicts
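A hedged sketch of the (fn, args, kwargs) list-space format consumed by the dict comprehension above; the parameter names and bounds are made up, and this drives plain Optuna rather than the searcher itself:

import optuna

space = [
    ("suggest_float", ("lr",), {"low": 1e-4, "high": 1e-1, "log": True}),
    ("suggest_int", (), {"name": "depth", "low": 2, "high": 8}),
]

def objective(trial):
    params = {
        args[0] if len(args) > 0 else kwargs["name"]: getattr(trial, fn)(*args, **kwargs)
        for (fn, args, kwargs) in space
    }
    return params["lr"] * params["depth"]  # dummy loss

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)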

View file

@ -143,9 +143,7 @@ def _generate_variants(
for resolved_spec in grid_search:
if not constant_grid_search or not all_resolved:
# In this path, we sample the remaining random variables
_, resolved_vars = _resolve_domain_vars(
resolved_spec, to_resolve, random_state=random_state
)
_, resolved_vars = _resolve_domain_vars(resolved_spec, to_resolve, random_state=random_state)
for resolved, spec in _generate_variants(
resolved_spec,
@ -155,11 +153,7 @@ def _generate_variants(
for path, value in grid_vars:
resolved_vars[path] = _get_value(spec, path)
for k, v in resolved.items():
if (
k in resolved_vars
and v != resolved_vars[k]
and _is_resolved(resolved_vars[k])
):
if k in resolved_vars and v != resolved_vars[k] and _is_resolved(resolved_vars[k]):
raise ValueError(
"The variable `{}` could not be unambiguously "
"resolved to a single value. Consider simplifying "
@ -197,9 +191,7 @@ def _resolve_domain_vars(
if path in resolved:
continue
try:
value = domain.sample(
_UnresolvedAccessGuard(spec), random_state=random_state
)
value = domain.sample(_UnresolvedAccessGuard(spec), random_state=random_state)
except RecursiveDependencyError as e:
error = e
# except Exception:
@ -217,9 +209,7 @@ def _resolve_domain_vars(
return True, resolved
def _grid_search_generator(
unresolved_spec: Dict, grid_vars: List
) -> Generator[Dict, None, None]:
def _grid_search_generator(unresolved_spec: Dict, grid_vars: List) -> Generator[Dict, None, None]:
value_indices = [0] * len(grid_vars)
def increment(i):
@ -260,9 +250,7 @@ def _try_resolve(v) -> Tuple[bool, Any]:
# Grid search values
grid_values = v["grid_search"]
if not isinstance(grid_values, list):
raise TuneError(
"Grid search expected list of values, got: {}".format(grid_values)
)
raise TuneError("Grid search expected list of values, got: {}".format(grid_values))
return False, Categorical(grid_values).grid()
return True, v
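A small analogue of grid expansion using itertools.product, with made-up variables; the generator above instead increments per-variable value indices:

import itertools

grid_vars = {"lr": [0.01, 0.1], "depth": [4, 8]}
variants = [dict(zip(grid_vars, values)) for values in itertools.product(*grid_vars.values())]
print(variants)  # 4 variants: the cross product of the two grid_search value lists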
@ -318,9 +306,7 @@ class _UnresolvedAccessGuard(dict):
def __getattribute__(self, item):
value = dict.__getattribute__(self, item)
if not _is_resolved(value):
raise RecursiveDependencyError(
"`{}` recursively depends on {}".format(item, value)
)
raise RecursiveDependencyError("`{}` recursively depends on {}".format(item, value))
elif isinstance(value, dict):
return _UnresolvedAccessGuard(value)
else:

View file

@ -70,15 +70,11 @@ def define_by_run_func(trial, space: Dict, path: str = "") -> Optional[Dict[str,
else:
raise ValueError(
"Optuna search does not support parameters of type "
"`{}` with samplers of type `{}`".format(
type(domain).__name__, type(domain.sampler).__name__
)
"`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
)
elif isinstance(domain, sample.Integer):
if isinstance(sampler, sample.LogUniform):
trial.suggest_int(
key, domain.lower, domain.upper - int(bool(not quantize)), log=True
)
trial.suggest_int(key, domain.lower, domain.upper - int(bool(not quantize)), log=True)
elif isinstance(sampler, sample.Uniform):
# Upper bound should be inclusive for quantization and
# exclusive otherwise
@ -103,9 +99,7 @@ def define_by_run_func(trial, space: Dict, path: str = "") -> Optional[Dict[str,
else:
raise ValueError(
"Optuna search does not support parameters of type "
"`{}` with samplers of type `{}`".format(
type(domain).__name__, type(domain.sampler).__name__
)
"`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
)
# Return all constants in a dictionary.
return config
@ -148,9 +142,7 @@ def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
pos = key.rfind(":")
true_key = key[:pos]
choice = int(key[pos + 1 :])
hier[true_key], subspace[true_key] = unflatten_hierarchical(
value, space[true_key][choice]
)
hier[true_key], subspace[true_key] = unflatten_hierarchical(value, space[true_key][choice])
else:
if key.endswith("_choice_"):
key = key[:-8]
@ -212,9 +204,7 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
choice_cost_dict = choice_cost_list[i]
else:
choice_cost_dict = {}
domain.const.append(
add_cost_to_space(cat, low_cost_dict, choice_cost_dict)
)
domain.const.append(add_cost_to_space(cat, low_cost_dict, choice_cost_dict))
else:
domain.const.append(None)
if choice_cost_list:
@ -233,18 +223,14 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
ordered = getattr(domain, "ordered", None)
if ordered is None:
# automatically decide whether to order the choices based on the value type
domain.ordered = ordered = all(
isinstance(x, (int, float)) for x in domain.categories
)
domain.ordered = ordered = all(isinstance(x, (int, float)) for x in domain.categories)
if ordered:
# sort the choices by value
ind = np.argsort(domain.categories)
domain.categories = [domain.categories[i] for i in ind]
if low_cost and low_cost not in domain.categories:
assert isinstance(
low_cost, list
), f"low cost {low_cost} not in domain {domain.categories}"
assert isinstance(low_cost, list), f"low cost {low_cost} not in domain {domain.categories}"
if domain.ordered:
sorted_points = [low_cost[i] for i in ind]
for i, point in enumerate(sorted_points):
@ -292,11 +278,7 @@ def normalize(
# low_cost_point list
norm = []
for i, cat in enumerate(domain.categories):
norm.append(
normalize(value[i], cat, reference_config[key][i], {})
if recursive
else value[i]
)
norm.append(normalize(value[i], cat, reference_config[key][i], {}) if recursive else value[i])
if len(value) > len(domain.categories):
# the low cost index was appended to low_cost_point list
index = value[-1]
@ -335,16 +317,10 @@ def normalize(
else:
quantize = None
if str(sampler) == "LogUniform":
upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)
)
config_norm[key] = np.log(value / domain.lower) / np.log(
upper / domain.lower
)
upper = domain.upper - (isinstance(domain, sample.Integer) & (quantize is None))
config_norm[key] = np.log(value / domain.lower) / np.log(upper / domain.lower)
elif str(sampler) == "Uniform":
upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)
)
upper = domain.upper - (isinstance(domain, sample.Integer) & (quantize is None))
config_norm[key] = (value - domain.lower) / (upper - domain.lower)
elif str(sampler) == "Normal":
# N(mean, sd) -> N(0,1)
@ -366,9 +342,7 @@ def denormalize(
if key in space:
# domain: sample.Categorical/Integer/Float/Function
domain = space[key]
if isinstance(value, dict) or not callable(
getattr(domain, "get_sampler", None)
):
if isinstance(value, dict) or not callable(getattr(domain, "get_sampler", None)):
config_denorm[key] = value
else:
if isinstance(domain, sample.Categorical):
@ -376,16 +350,12 @@ def denormalize(
n = len(domain.categories)
if isinstance(value, list):
# denormalize list
choice = min(
n - 1, int(np.floor(value[-1] * n))
) # max choice is n-1
choice = min(n - 1, int(np.floor(value[-1] * n))) # max choice is n-1
config_denorm[key] = point = value[choice]
point["_choice_"] = choice
continue
if domain.ordered:
config_denorm[key] = domain.categories[
min(n - 1, int(np.floor(value * n)))
]
config_denorm[key] = domain.categories[min(n - 1, int(np.floor(value * n)))]
else:
assert key in normalized_reference_config
if min(n - 1, np.floor(value * n)) == min(
@ -394,11 +364,7 @@ def denormalize(
config_denorm[key] = reference_config[key]
else: # ****random value each time!****
config_denorm[key] = random_state.choice(
[
x
for x in domain.categories
if x != reference_config[key]
]
[x for x in domain.categories if x != reference_config[key]]
)
continue
# Uniform/LogUniform/Normal/Base
@ -411,14 +377,10 @@ def denormalize(
quantize = None
# Handle Log/Uniform
if str(sampler) == "LogUniform":
upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)
)
upper = domain.upper - (isinstance(domain, sample.Integer) & (quantize is None))
config_denorm[key] = (upper / domain.lower) ** value * domain.lower
elif str(sampler) == "Uniform":
upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)
)
upper = domain.upper - (isinstance(domain, sample.Integer) & (quantize is None))
config_denorm[key] = value * (upper - domain.lower) + domain.lower
elif str(sampler) == "Normal":
# denormalization for 'Normal'
@ -427,9 +389,7 @@ def denormalize(
# config_denorm[key] = value
# Handle quantized
if quantize is not None:
config_denorm[key] = (
np.round(np.divide(config_denorm[key], quantize)) * quantize
)
config_denorm[key] = np.round(np.divide(config_denorm[key], quantize)) * quantize
# Handle int (4.6 -> 5)
if isinstance(domain, sample.Integer):
config_denorm[key] = int(round(config_denorm[key]))
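A worked round trip for the LogUniform normalize/denormalize pair above, assuming a float domain [1e-4, 1e-1] so no integer upper-bound adjustment applies:

import numpy as np

lower, upper = 1e-4, 1e-1
value = 1e-2
norm = np.log(value / lower) / np.log(upper / lower)  # log(100)/log(1000) ≈ 0.667
denorm = (upper / lower) ** norm * lower  # maps the normalized value back to 1e-2
assert np.isclose(denorm, value)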
@ -525,9 +485,7 @@ def complete_config(
for key, value in space.items():
if key not in config:
config[key] = value
for _, generated in generate_variants_compatible(
{"config": config}, random_state=flow2.rs_random
):
for _, generated in generate_variants_compatible({"config": config}, random_state=flow2.rs_random):
config = generated["config"]
break
subspace = {}
@ -550,9 +508,7 @@ def complete_config(
lower and lower.get(key) and lower[key][index],
upper and upper.get(key) and upper[key][index],
)
assert (
"_choice_" not in subspace[key]
), "_choice_ is a reserved key for hierarchical search space"
assert "_choice_" not in subspace[key], "_choice_ is a reserved key for hierarchical search space"
subspace[key]["_choice_"] = index
else:
config[key], subspace[key] = complete_config(

View file

@ -69,11 +69,7 @@ def get_n_cpus(node="driver"):
"""
assert node in ["driver", "executor"]
try:
n_cpus = int(
SparkSession.builder.getOrCreate()
.sparkContext.getConf()
.get(f"spark.{node}.cores")
)
n_cpus = int(SparkSession.builder.getOrCreate().sparkContext.getConf().get(f"spark.{node}.cores"))
except (TypeError, RuntimeError):
n_cpus = os.cpu_count()
return n_cpus
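A hedged sketch of the same conf lookup with its fallback, runnable only where pyspark is installed; the conf key comes from the code above:

import os

try:
    from pyspark.sql import SparkSession

    n_cpus = int(SparkSession.builder.getOrCreate().sparkContext.getConf().get("spark.driver.cores"))
except (ImportError, TypeError, RuntimeError):  # conf unset or Spark unavailable
    n_cpus = os.cpu_count()
print(n_cpus)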
@ -113,9 +109,7 @@ def with_parameters(trainable, **kwargs):
if not callable(trainable):
raise ValueError(
f"`with_parameters() only works with function trainables`. "
f"Got type: "
f"{type(trainable)}."
f"`with_parameters() only works with function trainables`. " f"Got type: " f"{type(trainable)}."
)
spark_available, spark_error_msg = check_spark()

View file

@ -116,25 +116,19 @@ class Trial:
self.metric_n_steps[metric][str(n)] = deque([value], maxlen=n)
else:
step = result["training_iteration"] or 1
self.metric_analysis[metric]["max"] = max(
value, self.metric_analysis[metric]["max"]
)
self.metric_analysis[metric]["min"] = min(
value, self.metric_analysis[metric]["min"]
)
self.metric_analysis[metric]["max"] = max(value, self.metric_analysis[metric]["max"])
self.metric_analysis[metric]["min"] = min(value, self.metric_analysis[metric]["min"])
self.metric_analysis[metric]["avg"] = (
1
/ step
* (value + (step - 1) * self.metric_analysis[metric]["avg"])
1 / step * (value + (step - 1) * self.metric_analysis[metric]["avg"])
)
self.metric_analysis[metric]["last"] = value
for n in self.n_steps:
key = "last-{:d}-avg".format(n)
self.metric_n_steps[metric][str(n)].append(value)
self.metric_analysis[metric][key] = sum(
self.metric_analysis[metric][key] = sum(self.metric_n_steps[metric][str(n)]) / len(
self.metric_n_steps[metric][str(n)]
) / len(self.metric_n_steps[metric][str(n)])
)
def set_status(self, status):
"""Sets the status of the trial."""

View file

@ -96,17 +96,13 @@ class BaseTrialRunner:
"""Stops trial."""
if trial.status not in [Trial.ERROR, Trial.TERMINATED]:
if self._scheduler_alg:
self._scheduler_alg.on_trial_complete(
self, trial.trial_id, trial.last_result
)
self._scheduler_alg.on_trial_complete(self, trial.trial_id, trial.last_result)
self._search_alg.on_trial_complete(trial.trial_id, trial.last_result)
trial.set_status(Trial.TERMINATED)
elif self._scheduler_alg:
self._scheduler_alg.on_trial_remove(self, trial)
if trial.status == Trial.ERROR:
self._search_alg.on_trial_complete(
trial.trial_id, trial.last_result, error=True
)
self._search_alg.on_trial_complete(trial.trial_id, trial.last_result, error=True)
class SequentialTrialRunner(BaseTrialRunner):

View file

@ -65,11 +65,7 @@ class ExperimentAnalysis(EA):
return self.get_best_config(self.default_metric, self.default_mode)
def lexico_best(self, trials):
results = {
index: trial.last_result
for index, trial in enumerate(trials)
if trial.last_result
}
results = {index: trial.last_result for index, trial in enumerate(trials) if trial.last_result}
metrics = self.lexico_objectives["metrics"]
modes = self.lexico_objectives["modes"]
f_best = {}
@ -79,15 +75,11 @@ class ExperimentAnalysis(EA):
for time_index in range(length):
for objective, mode in zip(metrics, modes):
histories[objective].append(
results[keys[time_index]][objective]
if mode == "min"
else -results[keys[time_index]][objective]
results[keys[time_index]][objective] if mode == "min" else -results[keys[time_index]][objective]
)
obj_initial = self.lexico_objectives["metrics"][0]
feasible_index = np.array([*range(len(histories[obj_initial]))])
for k_metric, k_mode in zip(
self.lexico_objectives["metrics"], self.lexico_objectives["modes"]
):
for k_metric, k_mode in zip(self.lexico_objectives["metrics"], self.lexico_objectives["modes"]):
k_values = np.array(histories[k_metric])
k_target = (
-self.lexico_objectives["targets"][k_metric]
@ -101,19 +93,9 @@ class ExperimentAnalysis(EA):
feasible_value
<= max(
f_best[k_metric] + self.lexico_objectives["tolerances"][k_metric]
if not isinstance(
self.lexico_objectives["tolerances"][k_metric], str
)
if not isinstance(self.lexico_objectives["tolerances"][k_metric], str)
else f_best[k_metric]
* (
1
+ 0.01
* float(
self.lexico_objectives["tolerances"][k_metric].replace(
"%", ""
)
)
),
* (1 + 0.01 * float(self.lexico_objectives["tolerances"][k_metric].replace("%", ""))),
k_target,
)
)[0]
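A worked sketch of the percentage tolerance above, with illustrative numbers: a "1%" string widens the feasible bound on f_best multiplicatively, while a plain float would be added directly.

f_best, tolerance = 100.0, "1%"
if isinstance(tolerance, str):
    bound = f_best * (1 + 0.01 * float(tolerance.replace("%", "")))
else:
    bound = f_best + tolerance
print(bound)  # 101.0: values at most 1% worse than f_best remain feasible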
@ -237,9 +219,7 @@ def run(
local_dir: Optional[str] = None,
num_samples: Optional[int] = 1,
resources_per_trial: Optional[dict] = None,
config_constraints: Optional[
List[Tuple[Callable[[dict], float], str, float]]
] = None,
config_constraints: Optional[List[Tuple[Callable[[dict], float], str, float]]] = None,
metric_constraints: Optional[List[Tuple[str, str, float]]] = None,
max_failure: Optional[int] = 100,
use_ray: Optional[bool] = False,
@ -463,9 +443,7 @@ def run(
os.makedirs(dir_name, exist_ok=True)
elif local_dir and verbose > 0:
os.makedirs(local_dir, exist_ok=True)
log_file_name = os.path.join(
local_dir, "tune_" + str(datetime.datetime.now()).replace(":", "-") + ".log"
)
log_file_name = os.path.join(local_dir, "tune_" + str(datetime.datetime.now()).replace(":", "-") + ".log")
if use_ray and use_spark:
raise ValueError("use_ray and use_spark cannot be both True.")
if not use_ray:
@ -506,9 +484,7 @@ def run(
from .searcher.blendsearch import BlendSearch, CFO
if lexico_objectives is not None:
logger.warning(
"If lexico_objectives is not None, search_alg is forced to be CFO"
)
logger.warning("If lexico_objectives is not None, search_alg is forced to be CFO")
search_alg = None
if search_alg is None:
flaml_scheduler_resource_attr = (
@ -529,14 +505,10 @@ def run(
import optuna as _
SearchAlgorithm = BlendSearch
logger.info(
"Using search algorithm {}.".format(SearchAlgorithm.__name__)
)
logger.info("Using search algorithm {}.".format(SearchAlgorithm.__name__))
except ImportError:
SearchAlgorithm = CFO
logger.warning(
"Using CFO for search. To use BlendSearch, run: pip install flaml[blendsearch]"
)
logger.warning("Using CFO for search. To use BlendSearch, run: pip install flaml[blendsearch]")
metric = metric or DEFAULT_METRIC
else:
SearchAlgorithm = CFO
@ -581,14 +553,8 @@ def run(
]
and use_incumbent_result_in_evaluation is not None
):
search_alg.use_incumbent_result_in_evaluation = (
use_incumbent_result_in_evaluation
)
searcher = (
search_alg.searcher
if isinstance(search_alg, ConcurrencyLimiter)
else search_alg
)
search_alg.use_incumbent_result_in_evaluation = use_incumbent_result_in_evaluation
searcher = search_alg.searcher if isinstance(search_alg, ConcurrencyLimiter) else search_alg
if isinstance(searcher, BlendSearch):
setting = {}
if time_budget_s:
@ -617,10 +583,7 @@ def run(
try:
from ray import tune
except ImportError:
raise ImportError(
"Failed to import ray tune. "
"Please install ray[tune] or set use_ray=False"
)
raise ImportError("Failed to import ray tune. " "Please install ray[tune] or set use_ray=False")
_use_ray = True
try:
analysis = tune.run(
@ -659,19 +622,14 @@ def run(
from joblib import Parallel, delayed, parallel_backend
from joblibspark import register_spark
except ImportError as e:
raise ImportError(
f"{e}. Try pip install flaml[spark] or set use_spark=False."
)
raise ImportError(f"{e}. Try pip install flaml[spark] or set use_spark=False.")
from flaml.tune.searcher.suggestion import ConcurrencyLimiter
from .trial_runner import SparkTrialRunner
register_spark()
spark = SparkSession.builder.getOrCreate()
sc = spark._jsc.sc()
num_executors = (
len([executor.host() for executor in sc.statusTracker().getExecutorInfos()])
- 1
)
num_executors = len([executor.host() for executor in sc.statusTracker().getExecutorInfos()]) - 1
"""
By default, the number of executors is the number of VMs in the cluster. And we can
launch one trial per executor. However, sometimes we can launch more trials than
@ -708,9 +666,7 @@ def run(
max_concurrent,
)
with parallel_backend("spark"):
with Parallel(
n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50)
) as parallel:
with Parallel(n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50)) as parallel:
try:
_runner = SparkTrialRunner(
search_alg=search_alg,
@ -722,9 +678,7 @@ def run(
if time_budget_s is None:
time_budget_s = np.inf
num_failures = 0
upperbound_num_failures = (
len(evaluated_rewards) if evaluated_rewards else 0
) + max_failure
upperbound_num_failures = (len(evaluated_rewards) if evaluated_rewards else 0) + max_failure
while (
time.time() - time_start < time_budget_s
and (num_samples < 0 or num_trials < num_samples)
@ -742,9 +696,7 @@ def run(
break
trials_to_run = _runner.running_trials
if not trials_to_run:
logger.warning(
f"fail to sample a trial for {max_failure} times in a row, stopping."
)
logger.warning(f"fail to sample a trial for {max_failure} times in a row, stopping.")
break
logger.info(
f"Number of trials: {num_trials}/{num_samples}, {len(_runner.running_trials)} RUNNING,"
@ -754,12 +706,9 @@ def run(
f"Configs of Trials to run: {[trial_to_run.config for trial_to_run in trials_to_run]}"
)
results = None
with PySparkOvertimeMonitor(
time_start, time_budget_s, force_cancel, parallel=parallel
):
with PySparkOvertimeMonitor(time_start, time_budget_s, force_cancel, parallel=parallel):
results = parallel(
delayed(evaluation_function)(trial_to_run.config)
for trial_to_run in trials_to_run
delayed(evaluation_function)(trial_to_run.config) for trial_to_run in trials_to_run
)
# results = [evaluation_function(trial_to_run.config) for trial_to_run in trials_to_run]
while results:
@ -775,9 +724,7 @@ def run(
# When the result returned is an empty dict, set the trial status to error
trial_to_run.set_status(Trial.ERROR)
else:
logger.info(
"Brief result: {}".format({metric: result})
)
logger.info("Brief result: {}".format({metric: result}))
report(_metric=result)
_runner.stop_trial(trial_to_run)
num_failures = 0
@ -817,9 +764,7 @@ def run(
if time_budget_s is None:
time_budget_s = np.inf
num_failures = 0
upperbound_num_failures = (
len(evaluated_rewards) if evaluated_rewards else 0
) + max_failure
upperbound_num_failures = (len(evaluated_rewards) if evaluated_rewards else 0) + max_failure
while (
time.time() - time_start < time_budget_s
and (num_samples < 0 or num_trials < num_samples)
@ -852,9 +797,7 @@ def run(
# break with upperbound_num_failures consecutive failures
num_failures += 1
if num_failures == upperbound_num_failures:
logger.warning(
f"fail to sample a trial for {max_failure} times in a row, stopping."
)
logger.warning(f"fail to sample a trial for {max_failure} times in a row, stopping.")
analysis = ExperimentAnalysis(
_runner.get_trials(),
metric=metric,

View file

@ -23,9 +23,5 @@ def choice(categories: Sequence, order=None):
Numerical categories have an order, while string categories do not.
"""
domain = sample.Categorical(categories).uniform()
domain.ordered = (
order
if order is not None
else all(isinstance(x, (int, float)) for x in categories)
)
domain.ordered = order if order is not None else all(isinstance(x, (int, float)) for x in categories)
return domain
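A quick illustration of the ordering rule above: numerical categories are treated as ordered by default, string categories as unordered, unless order= is passed explicitly.

from flaml import tune

print(tune.choice([1, 2, 3]).ordered)        # True
print(tune.choice(["a", "b", "c"]).ordered)  # False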

View file

@ -265,9 +265,7 @@ class TestClassification(unittest.TestCase):
import xgboost as xgb
callback = xgb.callback.TrainingCallback()
automl.fit(
X_train=X_train, y_train=y_train, callbacks=[callback], **automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, callbacks=[callback], **automl_settings)
print(automl.predict(X_train))
print(automl.model)
print(automl.config_history)
@ -279,16 +277,12 @@ class TestClassification(unittest.TestCase):
import subprocess
import sys
subprocess.check_call(
[sys.executable, "-m", "pip", "install", "xgboost==1.3.3", "--user"]
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost==1.3.3", "--user"])
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.feature_names_in_)
print(automl.feature_importances_)
subprocess.check_call(
[sys.executable, "-m", "pip", "install", "-U", "xgboost", "--user"]
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "xgboost", "--user"])
def test_ray_classification(self):
X, y = load_breast_cancer(return_X_y=True)
@ -337,9 +331,7 @@ class TestClassification(unittest.TestCase):
import ray
X_train_ref = ray.put(X_train)
automl_experiment.fit(
X_train=X_train_ref, y_train=y_train, **automl_settings
)
automl_experiment.fit(X_train=X_train_ref, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
@ -355,9 +347,7 @@ class TestClassification(unittest.TestCase):
def test_random_skip_oom(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
automl_settings = {
"time_budget": 2,
"task": "classification",
@ -396,9 +386,7 @@ class TestClassification(unittest.TestCase):
}
X_train = scipy.sparse.random(3000, 3000, density=0.1)
y_train = np.random.randint(2, size=3000)
automl_experiment.fit(
X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings
)
automl_experiment.fit(X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings)
automl_settings["time_budget"] = 5
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))

View file

@ -31,9 +31,7 @@ def test_metric_constraints():
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.estimator_list)
print(automl.search_space)
@ -55,9 +53,7 @@ def test_metric_constraints():
min_resource=automl.min_resource,
max_resource=automl.max_resource,
time_budget_s=automl._state.time_budget,
config_constraints=[
(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)
],
config_constraints=[(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)],
metric_constraints=automl.metric_constraints,
num_samples=5,
)
@ -121,18 +117,12 @@ def test_metric_constraints_custom():
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.estimator_list)
print(automl.search_space)
print(automl.points_to_evaluate)
print(
"Best minimization objective on validation data: {0:.4g}".format(
automl.best_loss
)
)
print("Best minimization objective on validation data: {0:.4g}".format(automl.best_loss))
print(
"pred_time of the best config on validation data: {0:.4g}".format(
automl.metrics_for_best_config[1]["pred_time"]
@ -161,9 +151,7 @@ def test_metric_constraints_custom():
min_resource=automl.min_resource,
max_resource=automl.max_resource,
time_budget_s=automl._state.time_budget,
config_constraints=[
(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)
],
config_constraints=[(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)],
metric_constraints=automl.metric_constraints,
num_samples=5,
)

View file

@ -2,20 +2,12 @@ import numpy as np
from flaml import AutoML
def test_forecast_automl(
budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]
):
def test_forecast_automl(budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]):
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
data = (
data.bfill()
.ffill()
.to_frame()
.reset_index()
.rename(columns={"index": "ds", "co2": "y"})
)
data = data.bfill().ffill().to_frame().reset_index().rename(columns={"index": "ds", "co2": "y"})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
@ -163,9 +155,7 @@ def load_multi_dataset():
return df
def test_multivariate_forecast_num(
budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]
):
def test_multivariate_forecast_num(budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]):
df = load_multi_dataset()
# split data into train and test
time_horizon = 180
@ -282,9 +272,7 @@ def load_multi_dataset_cat(time_horizon):
return 0
df["season"] = df["timeStamp"].apply(season)
df["above_monthly_avg"] = df.apply(
lambda x: above_monthly_avg(x["timeStamp"], x["temp"]), axis=1
)
df["above_monthly_avg"] = df.apply(lambda x: above_monthly_avg(x["timeStamp"], x["temp"]), axis=1)
# split data into train and test
num_samples = df.shape[0]
@ -297,9 +285,7 @@ def load_multi_dataset_cat(time_horizon):
return train_df, test_df
def test_multivariate_forecast_cat(
budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]
):
def test_multivariate_forecast_cat(budget=5, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]):
time_horizon = 180
train_df, test_df = load_multi_dataset_cat(time_horizon)
X_test = test_df[
@ -456,16 +442,10 @@ def get_stalliion_data():
data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
data["time_idx"] -= data["time_idx"].min()
# add additional features
data["month"] = data.date.dt.month.astype(str).astype(
"category"
    )  # categories have to be strings
    data["month"] = data.date.dt.month.astype(str).astype("category")  # categories have to be strings
data["log_volume"] = np.log(data.volume + 1e-8)
data["avg_volume_by_sku"] = data.groupby(
["time_idx", "sku"], observed=True
).volume.transform("mean")
data["avg_volume_by_agency"] = data.groupby(
["time_idx", "agency"], observed=True
).volume.transform("mean")
data["avg_volume_by_sku"] = data.groupby(["time_idx", "sku"], observed=True).volume.transform("mean")
data["avg_volume_by_agency"] = data.groupby(["time_idx", "agency"], observed=True).volume.transform("mean")
# we want to encode special days as one variable and thus need to first reverse one-hot encoding
special_days = [
"easter_day",
@ -479,11 +459,7 @@ def get_stalliion_data():
"beer_capital",
"music_fest",
]
data[special_days] = (
data[special_days]
.apply(lambda x: x.map({0: "-", 1: x.name}))
.astype("category")
)
data[special_days] = data[special_days].apply(lambda x: x.map({0: "-", 1: x.name})).astype("category")
return data, special_days
@ -571,8 +547,7 @@ def test_forecast_panel(budget=5):
y_test, y_pred = np.array(y_test), np.array(y_pred)
return round(
np.mean(np.abs(y_pred - y_test) / ((np.abs(y_pred) + np.abs(y_test)) / 2))
* 100,
np.mean(np.abs(y_pred - y_test) / ((np.abs(y_pred) + np.abs(y_test)) / 2)) * 100,
2,
)
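A worked check of the symmetric MAPE above, with toy arrays rather than the test's forecasts:

import numpy as np

y_test, y_pred = np.array([100.0, 200.0]), np.array([110.0, 180.0])
smape = round(np.mean(np.abs(y_pred - y_test) / ((np.abs(y_pred) + np.abs(y_test)) / 2)) * 100, 2)
print(smape)  # (10/105 + 20/190) / 2 * 100 ≈ 10.03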

View file

@ -156,9 +156,7 @@ class TestMultiClass(unittest.TestCase):
del settings["time_budget"]
settings["max_iter"] = 5
# test the "_choice_" issue when using ray
automl.fit(
X_train=X_train, y_train=y_train, n_concurrent_trials=2, **settings
)
automl.fit(X_train=X_train, y_train=y_train, n_concurrent_trials=2, **settings)
except ImportError:
return
@ -212,9 +210,7 @@ class TestMultiClass(unittest.TestCase):
print(automl.best_iteration)
print(automl.best_estimator)
automl = AutoML()
estimator = automl.get_estimator_from_log(
settings["log_file_name"], record_id=0, task="multiclass"
)
estimator = automl.get_estimator_from_log(settings["log_file_name"], record_id=0, task="multiclass")
print(estimator)
(
time_history,
@ -233,9 +229,7 @@ class TestMultiClass(unittest.TestCase):
del settings["time_budget"]
settings["max_iter"] = 2
automl.fit(**settings)
estimator = automl.get_estimator_from_log(
settings["log_file_name"], record_id=1, task="multiclass"
)
estimator = automl.get_estimator_from_log(settings["log_file_name"], record_id=1, task="multiclass")
except ImportError:
pass
@ -290,12 +284,8 @@ class TestMultiClass(unittest.TestCase):
"model_history": True,
}
X_train, y_train = load_iris(return_X_y=True)
automl_experiment_micro.fit(
X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings
)
automl_experiment_macro.fit(
X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings
)
automl_experiment_micro.fit(X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings)
automl_experiment_macro.fit(X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings)
estimator = automl_experiment_macro.model
y_pred = estimator.predict(X_train)
y_pred_proba = estimator.predict_proba(X_train)
@ -389,9 +379,7 @@ class TestMultiClass(unittest.TestCase):
def _test_memory_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
automl_settings = {
"time_budget": -1,
"task": "classification",
@ -403,19 +391,13 @@ class TestMultiClass(unittest.TestCase):
}
X_train, y_train = load_iris(return_X_y=True, as_frame=True)
automl_experiment.fit(
X_train=X_train, y_train=y_train, max_iter=1, **automl_settings
)
automl_experiment.fit(X_train=X_train, y_train=y_train, max_iter=1, **automl_settings)
print(automl_experiment.model)
def test_time_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(
learner_name="large_xgb", learner_class=MyLargeXGB
)
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
automl_experiment.add_learner(learner_name="large_xgb", learner_class=MyLargeXGB)
automl_settings = {
"time_budget": 0.5,
"task": "classification",
@ -450,21 +432,12 @@ class TestMultiClass(unittest.TestCase):
# test drop column
X_train.columns = range(X_train.shape[1])
X_train[X_train.shape[1]] = np.zeros(len(y_train))
automl.fit(
X_train=X_train,
y_train=y_train,
n_concurrent_trials=n_concurrent_trials,
**settings
)
automl.fit(X_train=X_train, y_train=y_train, n_concurrent_trials=n_concurrent_trials, **settings)
automl_val_accuracy = 1.0 - automl.best_loss
print("Best ML leaner:", automl.best_estimator)
print("Best hyperparmeter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
print(
"Training duration of best run: {0:.4g} s".format(
automl.best_config_train_time
)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
starting_points = automl.best_config_per_estimator
print("starting_points", starting_points)
@ -486,14 +459,8 @@ class TestMultiClass(unittest.TestCase):
new_automl_val_accuracy = 1.0 - new_automl.best_loss
print("Best ML leaner:", new_automl.best_estimator)
print("Best hyperparmeter config:", new_automl.best_config)
print(
"Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
)
print(
"Training duration of best run: {0:.4g} s".format(
new_automl.best_config_train_time
)
)
print("Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy))
print("Training duration of best run: {0:.4g} s".format(new_automl.best_config_train_time))
def test_fit_w_starting_point_2(self, as_frame=True):
try:
@ -520,21 +487,12 @@ class TestMultiClass(unittest.TestCase):
# test drop column
X_train.columns = range(X_train.shape[1])
X_train[X_train.shape[1]] = np.zeros(len(y_train))
automl.fit(
X_train=X_train,
y_train=y_train,
n_concurrent_trials=n_concurrent_trials,
**settings
)
automl.fit(X_train=X_train, y_train=y_train, n_concurrent_trials=n_concurrent_trials, **settings)
automl_val_accuracy = 1.0 - automl.best_loss
print("Best ML leaner:", automl.best_estimator)
print("Best hyperparmeter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
print(
"Training duration of best run: {0:.4g} s".format(
automl.best_config_train_time
)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
starting_points = {}
log_file_name = settings["log_file_name"]
@ -568,9 +526,7 @@ class TestMultiClass(unittest.TestCase):
new_automl_val_accuracy = 1.0 - new_automl.best_loss
        # print('Best ML learner:', new_automl.best_estimator)
        # print('Best hyperparameter config:', new_automl.best_config)
print(
"Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
)
print("Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy))
# print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time))

View file

@ -60,9 +60,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
print("Best ML leaner:", automl.best_estimator)
print("Best hyperparmeter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
print(
"Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
print(automl.model.estimator)
print(automl.best_config_per_estimator)
print("time taken to find best model:", automl.time_to_find_best_model)
@ -81,9 +79,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
accuracy = 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test)
print("accuracy", "=", accuracy)
print(
"roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test)
)
print("roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test))
print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test))
if budget is None:
assert accuracy >= 0.669, "the accuracy of flaml should be larger than 0.67"
@ -122,9 +118,7 @@ def test_mlflow():
from flaml.automl.data import load_openml_task
try:
X_train, X_test, y_train, y_test = load_openml_task(
task_id=7592, data_dir="test/"
)
X_train, X_test, y_train, y_test = load_openml_task(task_id=7592, data_dir="test/")
except (OpenMLServerException, ChunkedEncodingError, SSLError) as e:
print(e)
return

View file

@ -40,11 +40,7 @@ class TestLogging(unittest.TestCase):
n = len(y_train) >> 1
print(automl.model, automl.classes_, automl.predict(X_train))
automl.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings
X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings
)
logger.info(automl.search_space)
logger.info(automl.low_cost_partial_config)
@ -58,9 +54,7 @@ class TestLogging(unittest.TestCase):
sample = define_by_run_func(study.ask(), automl.search_space)
logger.info(sample)
logger.info(unflatten_hierarchical(sample, automl.search_space))
add_cost_to_space(
automl.search_space, automl.low_cost_partial_config, automl.cat_hp_cost
)
add_cost_to_space(automl.search_space, automl.low_cost_partial_config, automl.cat_hp_cost)
logger.info(automl.search_space["ml"].categories)
if automl.best_config:
config = automl.best_config.copy()

Просмотреть файл

@ -45,13 +45,7 @@ class TestRegression(unittest.TestCase):
}
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
automl.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings
)
automl.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings)
assert automl._state.eval_method == "holdout"
y_pred = automl.predict(X_train)
print(y_pred)
@ -88,10 +82,7 @@ class TestRegression(unittest.TestCase):
print(automl.model.estimator)
y_pred2 = automl.predict(X_train)
# In some rare case, the last config is early stopped and it's the best config. But the logged config's n_estimator is not reduced.
assert (
n_iter != automl.model.estimator.get_params("n_estimator")
or (y_pred == y_pred2).all()
)
assert n_iter != automl.model.estimator.get_params("n_estimator") or (y_pred == y_pred2).all()
def test_sparse_matrix_regression(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)
@ -110,9 +101,7 @@ class TestRegression(unittest.TestCase):
"verbose": 0,
"early_stop": True,
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)
assert automl._state.X_val.shape == X_val.shape
print(automl.predict(X_train))
print(automl.model)
@ -135,9 +124,7 @@ class TestRegression(unittest.TestCase):
"custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
}
)
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()
@ -203,13 +190,7 @@ class TestRegression(unittest.TestCase):
"keep_search_state": True,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl_experiment.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
@ -231,9 +212,7 @@ def test_multioutput():
X, y = make_regression(n_targets=3)
# split into train and test data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.30, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# train the model
model = MultiOutputRegressor(AutoML(task="regression", time_budget=1))


@ -11,12 +11,7 @@ class TestScore:
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
data = (
data.fillna(data.bfill())
.to_frame()
.reset_index()
.rename(columns={"index": "ds", "co2": "y"})
)
data = data.fillna(data.bfill()).to_frame().reset_index().rename(columns={"index": "ds", "co2": "y"})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
@ -48,9 +43,7 @@ class TestScore:
with open("automl.pkl", "rb") as f:
pickle.load(f) # v1.1 of prophet raises RecursionError
except (ImportError, RecursionError):
print(
"not using prophet due to ImportError or RecursionError (when unpickling in v1.1)"
)
print("not using prophet due to ImportError or RecursionError (when unpickling in v1.1)")
automl.fit(
dataframe=df,
**settings,


@ -29,13 +29,9 @@ def _test(split_type):
X, y = load_wine(return_X_y=True)
if split_type != "time":
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
else:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, shuffle=False
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
pred = automl.predict(X_test)
@ -83,9 +79,7 @@ def test_groups():
automl_settings["split_type"] = GroupKFold(n_splits=3)
try:
automl.fit(X, y, **automl_settings)
raise RuntimeError(
"GroupKFold object as split_type should fail when eval_method is holdout"
)
raise RuntimeError("GroupKFold object as split_type should fail when eval_method is holdout")
except AssertionError:
# eval_method must be 'auto' or 'cv' for custom data splitter.
pass
@ -140,9 +134,7 @@ def test_rank():
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"eval_method": "cv",
"groups": np.array( # group labels
[0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100
),
"groups": np.array([0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100), # group labels
"learner_selector": "roundrobin",
}
automl.fit(X, y, **automl_settings)
@ -197,9 +189,7 @@ def test_object():
"split_type": TestKFold(5),
}
automl.fit(X, y, **automl_settings)
assert (
automl._state.eval_method == "cv"
), "eval_method must be 'cv' for custom data splitter"
assert automl._state.eval_method == "cv", "eval_method must be 'cv' for custom data splitter"
kf = TestKFold(5)
kf.shuffle = True


@ -9,9 +9,7 @@ from flaml.automl.training_log import training_log_reader
class TestTrainingLog(unittest.TestCase):
def test_training_log(
self, path="test_training_log.log", estimator_list="auto", use_ray=False
):
def test_training_log(self, path="test_training_log.log", estimator_list="auto", use_ray=False):
with TemporaryDirectory() as d:
filename = os.path.join(d, path)
@ -64,11 +62,9 @@ class TestTrainingLog(unittest.TestCase):
assert (
str(model.estimator) == str(automl.model.estimator)
or estimator == "xgboost"
and str(model.estimator.get_dump())
== str(automl.model.estimator.get_dump())
and str(model.estimator.get_dump()) == str(automl.model.estimator.get_dump())
or estimator == "catboost"
and str(model.estimator.get_all_params())
== str(automl.model.estimator.get_all_params())
and str(model.estimator.get_all_params()) == str(automl.model.estimator.get_all_params())
)
automl.fit(
X_train=X_train,


@ -29,11 +29,7 @@ class TestWarmStart(unittest.TestCase):
print("Best ML learner:", automl.best_estimator)
print("Best hyperparameter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
print(
"Training duration of best run: {0:.4g} s".format(
automl.best_config_train_time
)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
# 1. Get starting points from previous experiments.
starting_points = automl.best_config_per_estimator
print("starting_points", starting_points)
@ -61,19 +57,13 @@ class TestWarmStart(unittest.TestCase):
revised_hps_to_search = {
"n_estimators": {
"domain": tune.lograndint(lower=10, upper=32768),
"init_value": starting_point.get("n_estimators")
or space["n_estimators"].get("init_value", 10),
"low_cost_init_value": space["n_estimators"].get(
"low_cost_init_value", 10
),
"init_value": starting_point.get("n_estimators") or space["n_estimators"].get("init_value", 10),
"low_cost_init_value": space["n_estimators"].get("low_cost_init_value", 10),
},
"num_leaves": {
"domain": tune.lograndint(lower=10, upper=3276),
"init_value": starting_point.get("num_leaves")
or space["num_leaves"].get("init_value", 10),
"low_cost_init_value": space["num_leaves"].get(
"low_cost_init_value", 10
),
"init_value": starting_point.get("num_leaves") or space["num_leaves"].get("init_value", 10),
"low_cost_init_value": space["num_leaves"].get("low_cost_init_value", 10),
},
# (3.2) Add a new hp which is not in the original search space
"subsample": {
@ -86,9 +76,7 @@ class TestWarmStart(unittest.TestCase):
new_estimator_name = "large_lgbm"
new_automl = AutoML()
new_automl.add_learner(
learner_name=new_estimator_name, learner_class=MyPartiallyFreezedLargeLGBM
)
new_automl.add_learner(learner_name=new_estimator_name, learner_class=MyPartiallyFreezedLargeLGBM)
automl_settings_resume = {
"time_budget": 3,
@ -108,14 +96,8 @@ class TestWarmStart(unittest.TestCase):
new_automl_val_accuracy = 1.0 - new_automl.best_loss
print("Best ML learner:", new_automl.best_estimator)
print("Best hyperparameter config:", new_automl.best_config)
print(
"Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
)
print(
"Training duration of best run: {0:.4g} s".format(
new_automl.best_config_train_time
)
)
print("Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy))
print("Training duration of best run: {0:.4g} s".format(new_automl.best_config_train_time))
def test_nobudget(self):
automl = AutoML()
@ -127,9 +109,7 @@ class TestWarmStart(unittest.TestCase):
from flaml.automl.data import load_openml_dataset
from flaml import AutoML
X_train, X_test, y_train, y_test = load_openml_dataset(
dataset_id=1169, data_dir="./"
)
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
automl_settings = {
"time_budget": 3,

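The TestWarmStart hunks above exercise FLAML's warm-start path: a finished run's best_config_per_estimator is fed back in as the starting_points of a new run. A minimal self-contained sketch of that pattern (dataset and budgets here are illustrative, not taken from the test):

from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=5)

# carry the per-estimator best configs into a second, warm-started run
starting_points = automl.best_config_per_estimator
new_automl = AutoML()
new_automl.fit(X_train, y_train, task="classification", time_budget=5, starting_points=starting_points)
print("warm-started best loss:", new_automl.best_loss)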

@ -49,9 +49,7 @@ def test_simple(method=None):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.estimator_list)
print(automl.search_space)
@ -77,9 +75,7 @@ def test_simple(method=None):
min_resource=automl.min_resource,
max_resource=automl.max_resource,
time_budget_s=automl._state.time_budget,
config_constraints=[
(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)
],
config_constraints=[(partial(size, automl._state.learner_classes), "<=", automl._mem_thres)],
metric_constraints=automl.metric_constraints,
num_samples=5,
)


@ -31,9 +31,7 @@ def _test_simple(method=None, size_ratio=1.0):
automl.add_learner(learner_name="XGBoost2D", learner_class=XGBoost2D)
X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
final_size = int(len(y_train) * size_ratio)
X_train = X_train[:final_size]


@ -69,23 +69,15 @@ def test_regret():
def test_suggest_classification():
location = "test/default"
X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
suggested = suggest_hyperparams(
"classification", X_train, y_train, "lgbm", location=location
)
suggested = suggest_hyperparams("classification", X_train, y_train, "lgbm", location=location)
print(suggested)
suggested = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "xgboost", location=location
)
suggested = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
print(suggested)
suggested = suggest_hyperparams(
"classification", X_train, y_train, "xgb_limitdepth", location=location
)
suggested = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
print(suggested)
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
(
hyperparams,
estimator_class,
@ -93,29 +85,21 @@ def test_suggest_classification():
y,
feature_transformer,
label_transformer,
) = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "lgbm", location=location
)
) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "lgbm", location=location)
with open("test/default/feature_transformer", "wb") as f:
pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL)
model = estimator_class(**hyperparams) # estimator_class is LGBMClassifier
model.fit(X, y)
X_test = feature_transformer.transform(X_test)
y_pred = label_transformer.inverse_transform(
pd.Series(model.predict(X_test).astype(int))
)
y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int)))
print(y_pred)
suggested = suggest_hyperparams(
"classification", X_train, y_train, "xgboost", location=location
)
suggested = suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
print(suggested)
suggested = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "xgb_limitdepth", location=location
)
print(suggested)
suggested = suggest_hyperparams(
"classification", X_train, y_train, "xgb_limitdepth", location=location
)
suggested = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
suggested = suggest_learner(
"classification",
X_train,
@ -129,17 +113,11 @@ def test_suggest_classification():
def test_suggest_regression():
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
suggested = suggest_hyperparams(
"regression", X_train, y_train, "lgbm", location=location
)
suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location)
print(suggested)
suggested = preprocess_and_suggest_hyperparams(
"regression", X_train, y_train, "xgboost", location=location
)
suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location)
print(suggested)
suggested = suggest_hyperparams(
"regression", X_train, y_train, "xgb_limitdepth", location=location
)
suggested = suggest_hyperparams("regression", X_train, y_train, "xgb_limitdepth", location=location)
print(suggested)
suggested = suggest_learner("regression", X_train, y_train, location=location)
print(suggested)

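Pieced together, the zero-shot API that the hunks above keep reformatting makes a full round trip: suggest hyperparameters from a portfolio, train with the returned estimator class, and undo the label encoding at predict time. A rough sketch, with location omitted so the packaged portfolio is used (an assumption; the tests pass a custom location):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from flaml.default import preprocess_and_suggest_hyperparams

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

(
    hyperparams,
    estimator_class,
    X_transformed,
    y_transformed,
    feature_transformer,
    label_transformer,
) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "lgbm")

model = estimator_class(**hyperparams)  # LGBMClassifier for the "lgbm" predictor
model.fit(X_transformed, y_transformed)
X_test_t = feature_transformer.transform(X_test)
y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test_t).astype(int)))
print(y_pred)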

@ -5,9 +5,7 @@ from flaml.automl.ml import sklearn_metric_loss_score
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")
lgbm = LGBMRegressor()
hyperparams, estimator_name, X_transformed, y_transformed = lgbm.suggest_hyperparams(
X_train, y_train
)
hyperparams, estimator_name, X_transformed, y_transformed = lgbm.suggest_hyperparams(X_train, y_train)
print(hyperparams)
lgbm.fit(X_train, y_train)


@ -21,13 +21,7 @@ def test_hf_data():
automl_settings["preserve_checkpoint"] = False
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
automl.score(X_val, y_val, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
except requests.exceptions.HTTPError:
@ -54,13 +48,7 @@ def test_hf_data():
automl_settings.pop("use_ray", None)
automl_settings.pop("estimator_list", None)
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,
train_full=True,
record_id=0,
**automl_settings
)
automl.retrain_from_log(X_train=X_train, y_train=y_train, train_full=True, record_id=0, **automl_settings)
automl.predict(X_test, **{"per_device_eval_batch_size": 2})
automl.predict(["", ""])
automl.predict_proba(["", ""])


@ -23,65 +23,47 @@ model_path_list = [
def test_switch_1_1():
data_idx, model_path_idx = 0, 0
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_1_2():
data_idx, model_path_idx = 0, 1
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_1_3():
data_idx, model_path_idx = 0, 2
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_2_1():
data_idx, model_path_idx = 1, 0
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_2_2():
data_idx, model_path_idx = 1, 1
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_2_3():
data_idx, model_path_idx = 1, 2
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_3_1():
data_idx, model_path_idx = 2, 0
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_3_2():
data_idx, model_path_idx = 2, 1
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def test_switch_3_3():
data_idx, model_path_idx = 2, 2
_test_switch_classificationhead(
data_list[data_idx], model_path_list[model_path_idx]
)
_test_switch_classificationhead(data_list[data_idx], model_path_list[model_path_idx])
def _test_switch_classificationhead(each_data, each_model_path):
@ -102,13 +84,7 @@ def _test_switch_classificationhead(each_data, each_model_path):
automl_settings["metric"] = "accuracy"
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return


@ -61,22 +61,14 @@ def test_custom_metric():
automl_settings["use_ray"] = {"local_dir": "data/output/"}
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return
# testing calling custom metric in TransformersEstimator._compute_metrics_by_dataset_name
automl_settings["max_iter"] = 3
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
automl.score(X_val, y_val, **{"metric": custom_metric})
automl.pickle("automl.pkl")


@ -5,9 +5,7 @@ import os
import shutil
@pytest.mark.skipif(
sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows"
)
@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
def test_cv():
from flaml import AutoML
import requests


@ -2,6 +2,4 @@ def test_load_args():
import subprocess
import sys
subprocess.call(
[sys.executable, "load_args.py", "--output_dir", "data/"], shell=True
)
subprocess.call([sys.executable, "load_args.py", "--output_dir", "data/"], shell=True)


@ -5,9 +5,7 @@ import os
import shutil
@pytest.mark.skipif(
sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows"
)
@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
def test_mcc():
from flaml import AutoML
import requests
@ -27,13 +25,7 @@ def test_mcc():
automl_settings["metric"] = "accuracy"
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return


@ -29,9 +29,7 @@ def test_regression():
ray.shutdown()
ray.init()
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
automl.predict(X_val)
if os.path.exists("test/data/output/"):


@ -22,18 +22,10 @@ def test_summarization():
automl_settings["task"] = "summarization"
automl_settings["metric"] = "rouge1"
automl_settings["time_budget"] = 2 * automl_settings["time_budget"]
automl_settings["fit_kwargs_by_estimator"]["transformer"][
"model_path"
] = "patrickvonplaten/t5-tiny-random"
automl_settings["fit_kwargs_by_estimator"]["transformer"]["model_path"] = "patrickvonplaten/t5-tiny-random"
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return
@ -41,13 +33,7 @@ def test_summarization():
automl_settings.pop("use_ray", None)
automl_settings.pop("estimator_list", None)
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,
train_full=True,
record_id=0,
**automl_settings
)
automl.retrain_from_log(X_train=X_train, y_train=y_train, train_full=True, record_id=0, **automl_settings)
automl.predict(X_test)
if os.path.exists("test/data/output/"):


@ -22,9 +22,7 @@ def test_tokenclassification_idlabel():
automl_settings = get_automl_settings()
automl_settings["task"] = "token-classification"
automl_settings[
"metric"
] = "seqeval:overall_f1" # evaluating based on the overall_f1 of seqeval
automl_settings["metric"] = "seqeval:overall_f1" # evaluating based on the overall_f1 of seqeval
automl_settings["fit_kwargs_by_estimator"]["transformer"]["label_list"] = [
"O",
"B-PER",
@ -38,13 +36,7 @@ def test_tokenclassification_idlabel():
]
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return
@ -83,18 +75,10 @@ def test_tokenclassification_tokenlabel():
automl_settings = get_automl_settings()
automl_settings["task"] = "token-classification"
automl_settings[
"metric"
] = "seqeval:overall_f1" # evaluating based on the overall_f1 of seqeval
automl_settings["metric"] = "seqeval:overall_f1" # evaluating based on the overall_f1 of seqeval
try:
automl.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
except requests.exceptions.HTTPError:
return


@ -32,15 +32,10 @@ def test_starting_point_not_in_search_space():
automl = AutoML()
automl_settings = get_automl_settings(estimator_name=this_estimator_name)
automl_settings["starting_points"] = {
this_estimator_name: [{"learning_rate": 2e-3}]
}
automl_settings["starting_points"] = {this_estimator_name: [{"learning_rate": 2e-3}]}
automl.fit(X_train, y_train, **automl_settings)
assert (
automl._search_states[this_estimator_name].init_config[0]["learning_rate"]
!= 2e-3
)
assert automl._search_states[this_estimator_name].init_config[0]["learning_rate"] != 2e-3
"""
test starting_points located outside of the search space, and custom_hp is set
@ -80,10 +75,7 @@ def test_starting_point_not_in_search_space():
len(automl_settings["custom_hp"][this_estimator_name]),
)
)
assert (
automl._search_states[this_estimator_name].search_space["model_path"]
== "albert-base-v2"
)
assert automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"
if os.path.exists("test/data/output/"):
try:
@ -103,11 +95,7 @@ def test_points_to_evaluate():
automl_settings["starting_points"] = "data:test/nlp/default/"
automl_settings["custom_hp"] = {
"transformer_ms": {
"model_path": {"domain": "google/electra-small-discriminator"}
}
}
automl_settings["custom_hp"] = {"transformer_ms": {"model_path": {"domain": "google/electra-small-discriminator"}}}
automl.fit(X_train, y_train, **automl_settings)
@ -137,13 +125,9 @@ def test_zero_shot_nomodel():
y_train,
_,
_,
) = preprocess_and_suggest_hyperparams(
"seq-classification", X_train, y_train, estimator_name, location=location
)
) = preprocess_and_suggest_hyperparams("seq-classification", X_train, y_train, estimator_name, location=location)
model = estimator_class(
**hyperparams
) # estimator_class is TransformersEstimatorModelSelection
model = estimator_class(**hyperparams) # estimator_class is TransformersEstimatorModelSelection
fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name)
fit_kwargs.update(automl_settings)


@ -314,8 +314,7 @@ def get_toy_data_multiplechoiceclassification():
" its false bottom. He stands and looks around, his eyes",
],
"sent1": [
"Someone leans out of the drive - thru "
"window, grinning at her, holding bags filled with fast food.",
"Someone leans out of the drive - thru " "window, grinning at her, holding bags filled with fast food.",
"Someone looks up suddenly when he hears.",
"Someone drives; someone sits beside her.",
"He opens the drawer in which we know"
@ -343,8 +342,7 @@ def get_toy_data_multiplechoiceclassification():
"ending2": [
"attempts to block her ransacked.",
"talks using the phone and walks away for a few seconds.",
"are too involved with each other to "
"notice someone watching them from the drive - thru window.",
"are too involved with each other to " "notice someone watching them from the drive - thru window.",
"finally landing on: the digicam and a stack of cassettes on a shelf.",
],
"ending3": [


@ -107,9 +107,7 @@ def main(args):
data_dir,
train=True,
download=True,
transform=transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
),
transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
),
batch_size=args["batch_size"],
shuffle=True,
@ -119,9 +117,7 @@ def main(args):
datasets.MNIST(
data_dir,
train=False,
transform=transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
),
transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
),
batch_size=1000,
shuffle=True,
@ -188,12 +184,8 @@ def get_params():
metavar="N",
help="number of epochs to train (default: 10)",
)
parser.add_argument(
"--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
)
parser.add_argument(
"--no_cuda", action="store_true", default=False, help="disables CUDA training"
)
parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
parser.add_argument("--no_cuda", action="store_true", default=False, help="disables CUDA training")
parser.add_argument(
"--log_interval",
type=int,


@ -8,9 +8,7 @@ import ray
data = fetch_california_housing(return_X_y=False, as_frame=True)
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train_ref = ray.put(X_train)
print(isinstance(X_train_ref, ray.ObjectRef))
@ -33,22 +31,14 @@ def train_lgbm(config: dict) -> dict:
# load a built-in search space from flaml
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
# specify the search space as a dict from hp name to domain; you can define your own search space the same way
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()}
# give guidance about hp values corresponding to low training cost, i.e., {"n_estimators": 4, "num_leaves": 4}
low_cost_partial_config = {
hp: space["low_cost_init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "low_cost_init_value" in space
hp: space["low_cost_init_value"] for hp, space in flaml_lgbm_search_space.items() if "low_cost_init_value" in space
}
# initial points to evaluate
points_to_evaluate = [
{
hp: space["init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "init_value" in space
}
{hp: space["init_value"] for hp, space in flaml_lgbm_search_space.items() if "init_value" in space}
]
# run the tuning, minimizing mse, with total time budget 3 seconds
analysis = tune.run(

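The hunk above comes from FLAML's tune-LightGBM-with-Ray example. Stripped of the Ray plumbing, the search-space machinery it reformats can be run locally, roughly as follows; the flaml.automl.model import path assumes a flaml 1.x layout, and LGBMEstimator(task="regression", **config).params is used to map a sampled flaml config onto lightgbm parameters:

from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from flaml import tune
from flaml.automl.model import LGBMEstimator

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()}
low_cost_partial_config = {
    hp: space["low_cost_init_value"]
    for hp, space in flaml_lgbm_search_space.items()
    if "low_cost_init_value" in space
}

def train_lgbm(config: dict) -> dict:
    # translate the sampled flaml config into lightgbm parameters
    params = LGBMEstimator(task="regression", **config).params
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    return {"mse": mean_squared_error(y_test, model.predict(X_test))}

analysis = tune.run(
    train_lgbm,
    metric="mse",
    mode="min",
    config=config_search_space,
    low_cost_partial_config=low_cost_partial_config,
    time_budget_s=3,
    num_samples=-1,
)
print(analysis.best_config)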

@ -17,9 +17,7 @@ from flaml.autogen.math_utils import eval_math_responses
reason="do not run on windows",
)
def test_humaneval(num_samples=1):
eval_with_generated_assertions = partial(
eval_function_completions, assertions=generate_assertions
)
eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions)
seed = 41
data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
@ -165,9 +163,7 @@ def test_math(num_samples=-1):
"stop": "###",
}
test_data_sample = test_data[0:3]
result = oai.ChatCompletion.test(
test_data_sample, vanilla_config, eval_math_responses
)
result = oai.ChatCompletion.test(test_data_sample, vanilla_config, eval_math_responses)
test_data_sample = test_data[3:6]
result = oai.ChatCompletion.test(
test_data_sample,


@ -83,9 +83,7 @@ def build_and_submit_aml_pipeline(config):
################################################
# load component functions
################################################
data_prep_component = Component.from_yaml(
ws, yaml_file=LOCAL_DIR / "data_prep/data_prep.yaml"
)
data_prep_component = Component.from_yaml(ws, yaml_file=LOCAL_DIR / "data_prep/data_prep.yaml")
train_component = Component.from_yaml(ws, yaml_file=LOCAL_DIR / "train/train.yaml")
################################################


@ -24,9 +24,7 @@ def remote_run():
# load component functions
################################################
pipeline_tuning_func = Component.from_yaml(
ws, yaml_file=LOCAL_DIR / "tuner/component_spec.yaml"
)
pipeline_tuning_func = Component.from_yaml(ws, yaml_file=LOCAL_DIR / "tuner/component_spec.yaml")
################################################
# build pipeline
@ -61,9 +59,7 @@ if __name__ == "__main__":
help="your_subscription_id",
required=False,
)
parser.add_argument(
"--resource_group", type=str, help="your_resource_group", required=False
)
parser.add_argument("--resource_group", type=str, help="your_resource_group", required=False)
parser.add_argument("--workspace", type=str, help="your_workspace", required=False)
parser.add_argument("--remote", dest="remote", action="store_true")


@ -26,9 +26,7 @@ if __name__ == "__main__":
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train_ref = ray.put(X_train)
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()}
low_cost_partial_config = {
hp: space["low_cost_init_value"]
for hp, space in flaml_lgbm_search_space.items()


@ -28,9 +28,7 @@ settings = {
for trial_num in range(8):
automl = AutoML()
automl.add_learner(
learner_name="extra_trees_seeded", learner_class=ExtraTreesEstimatorSeeded
)
automl.add_learner(learner_name="extra_trees_seeded", learner_class=ExtraTreesEstimatorSeeded)
automl.fit(X_train=X_train, y_train=y_train, **settings)
print(automl.best_loss)
print(automl.best_config)


@ -7,16 +7,12 @@ ray_environment_name = "aml-ray-cpu"
ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu"
# Build CPU image for Ray
ray_cpu_env = Environment.from_dockerfile(
name=ray_environment_name, dockerfile=ray_environment_dockerfile_path
)
ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)
while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
print(
f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}"
)
print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}")
time.sleep(10)
command = ["python distribute_automl.py"]


@ -7,16 +7,12 @@ ray_environment_name = "aml-ray-cpu"
ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu"
# Build CPU image for Ray
ray_cpu_env = Environment.from_dockerfile(
name=ray_environment_name, dockerfile=ray_environment_dockerfile_path
)
ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)
while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
print(
f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}"
)
print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}")
time.sleep(10)
command = ["python distribute_tune.py"]


@ -36,9 +36,7 @@ else:
skip_spark = True
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def _test_spark_synapseml_lightgbm(spark=None, task="classification"):
@ -83,9 +81,7 @@ def _test_spark_synapseml_lightgbm(spark=None, task="classification"):
columns = X_train.columns
feature_cols = [col for col in columns if col != "label"]
featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features")
X_train = featurizer.transform(X_train.to_spark(index_col="index"))[
"index", "features"
]
X_train = featurizer.transform(X_train.to_spark(index_col="index"))["index", "features"]
X_train = to_pandas_on_spark(X_train)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
@ -138,9 +134,7 @@ def test_spark_input_df():
spark.read.format("csv")
.option("header", True)
.option("inferSchema", True)
.load(
"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv"
)
.load("wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv")
)
train, test = df.randomSplit([0.8, 0.2], seed=1)
feature_cols = df.columns[1:]
@ -151,9 +145,7 @@ def test_spark_input_df():
settings = {
"time_budget": 30, # total running time in seconds
"metric": "roc_auc",
"estimator_list": [
"lgbm_spark"
], # list of ML learners; we tune lightgbm in this example
"estimator_list": ["lgbm_spark"], # list of ML learners; we tune lightgbm in this example
"task": "classification", # task type
"log_file_name": "flaml_experiment.log", # flaml log file
"seed": 7654321, # random seed
@ -187,9 +179,7 @@ def test_spark_input_df():
settings = {
"time_budget": 10, # total running time in seconds
"metric": "roc_auc",
"estimator_list": [
"lgbm"
], # list of ML learners; we tune lightgbm in this example
"estimator_list": ["lgbm"], # list of ML learners; we tune lightgbm in this example
"task": "classification", # task type
}
with pytest.raises(ValueError) as excinfo:

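The Spark hunks above all reformat the same data path: assemble the feature columns into a single vector column, convert the Spark DataFrame with to_pandas_on_spark, then fit the Spark-native lgbm_spark learner. A condensed sketch of that flow, assuming pyspark plus flaml's synapseml-backed spark extra are installed and the flaml.automl.spark.utils import path of flaml 1.x:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from flaml import AutoML
from flaml.automl.spark.utils import to_pandas_on_spark

spark = SparkSession.builder.getOrCreate()
pdf = pd.DataFrame({"f1": range(100), "f2": range(100), "label": [i % 2 for i in range(100)]})  # toy data
sdf = spark.createDataFrame(pdf)

featurizer = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
sdf = featurizer.transform(sdf)["label", "features"]
psdf = to_pandas_on_spark(sdf)

automl = AutoML()
automl.fit(
    dataframe=psdf,
    label="label",
    task="classification",
    estimator_list=["lgbm_spark"],  # the Spark-distributed LightGBM learner
    time_budget=30,
)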

@ -23,9 +23,7 @@ os.environ["FLAML_MAX_CONCURRENT"] = "2"
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_parallel_xgboost(hpo_method=None, data_size=1000):
@ -59,9 +57,7 @@ def test_parallel_xgboost_others():
test_parallel_xgboost(hpo_method="random")
@pytest.mark.skip(
reason="currently not supporting too large data, will support spark dataframe in the future"
)
@pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
def test_large_dataset():
test_parallel_xgboost(data_size=90000000)
@ -101,8 +97,6 @@ if __name__ == "__main__":
test_parallel_xgboost_others()
# test_large_dataset()
if skip_my_learner:
print(
"please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file"
)
print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
else:
test_custom_learner()


@ -7,17 +7,13 @@ import pytest
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
os.environ["FLAML_MAX_CONCURRENT"] = "2"
def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0):
X_train, X_test, y_train, y_test = load_openml_dataset(
dataset_id=537, data_dir="./"
)
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")
automl = AutoML()
settings = {
"time_budget": 3, # total running time in seconds
@ -37,9 +33,7 @@ def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0
print("Best ML learner:", automl.best_estimator)
print("Best hyperparameter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
print(
"Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
def test_both_ray_spark():

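base_automl above differs from a plain fit in only two arguments: how many trials run at once and which backend runs them. A minimal sketch of the Spark variant (swap use_spark for use_ray=True to get the Ray backend; the budget is illustrative):

from flaml import AutoML
from flaml.automl.data import load_openml_dataset

X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    time_budget=3,
    n_concurrent_trials=2,  # trials running in parallel
    use_spark=True,  # or use_ray=True
)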

@ -107,9 +107,7 @@ class TestMultiClass(unittest.TestCase):
valid_loss_history,
config_history,
metric_history,
) = get_output_from_log(
filename=automl_settings["log_file_name"], time_budget=6
)
) = get_output_from_log(filename=automl_settings["log_file_name"], time_budget=6)
print(metric_history)
def test_classification(self, as_frame=False):
@ -167,12 +165,8 @@ class TestMultiClass(unittest.TestCase):
"use_spark": True,
}
X_train, y_train = load_iris(return_X_y=True)
automl_experiment_micro.fit(
X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings
)
automl_experiment_macro.fit(
X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings
)
automl_experiment_micro.fit(X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings)
automl_experiment_macro.fit(X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings)
estimator = automl_experiment_macro.model
y_pred = estimator.predict(X_train)
y_pred_proba = estimator.predict_proba(X_train)
@ -280,9 +274,7 @@ class TestMultiClass(unittest.TestCase):
)
def _test_memory_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
automl_settings = {
"time_budget": -1,
"task": "classification",
@ -296,9 +288,7 @@ class TestMultiClass(unittest.TestCase):
}
X_train, y_train = load_iris(return_X_y=True, as_frame=True)
automl_experiment.fit(
X_train=X_train, y_train=y_train, max_iter=1, **automl_settings
)
automl_experiment.fit(X_train=X_train, y_train=y_train, max_iter=1, **automl_settings)
print(automl_experiment.model)
@unittest.skipIf(
@ -307,12 +297,8 @@ class TestMultiClass(unittest.TestCase):
)
def test_time_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(
learner_name="large_xgb", learner_class=MyLargeXGB
)
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
automl_experiment.add_learner(learner_name="large_xgb", learner_class=MyLargeXGB)
automl_settings = {
"time_budget": 0.5,
"task": "classification",
@ -356,11 +342,7 @@ class TestMultiClass(unittest.TestCase):
print("Best ML learner:", automl_experiment.best_estimator)
print("Best hyperparameter config:", automl_experiment.best_config)
print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
print(
"Training duration of best run: {0:.4g} s".format(
automl_experiment.best_config_train_time
)
)
print("Training duration of best run: {0:.4g} s".format(automl_experiment.best_config_train_time))
starting_points = automl_experiment.best_config_per_estimator
print("starting_points", starting_points)
@ -379,21 +361,13 @@ class TestMultiClass(unittest.TestCase):
"use_spark": True,
}
new_automl_experiment = AutoML()
new_automl_experiment.fit(
X_train=X_train, y_train=y_train, **automl_settings_resume
)
new_automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings_resume)
new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
print("Best ML learner:", new_automl_experiment.best_estimator)
print("Best hyperparameter config:", new_automl_experiment.best_config)
print(
"Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
)
print(
"Training duration of best run: {0:.4g} s".format(
new_automl_experiment.best_config_train_time
)
)
print("Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy))
print("Training duration of best run: {0:.4g} s".format(new_automl_experiment.best_config_train_time))
def test_fit_w_starting_points_list(self, as_frame=True):
automl_experiment = AutoML()
@ -418,11 +392,7 @@ class TestMultiClass(unittest.TestCase):
print("Best ML learner:", automl_experiment.best_estimator)
print("Best hyperparameter config:", automl_experiment.best_config)
print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
print(
"Training duration of best run: {0:.4g} s".format(
automl_experiment.best_config_train_time
)
)
print("Training duration of best run: {0:.4g} s".format(automl_experiment.best_config_train_time))
starting_points = {}
log_file_name = automl_settings["log_file_name"]
@ -453,16 +423,12 @@ class TestMultiClass(unittest.TestCase):
"use_spark": True,
}
new_automl_experiment = AutoML()
new_automl_experiment.fit(
X_train=X_train, y_train=y_train, **automl_settings_resume
)
new_automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings_resume)
new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
# print('Best ML learner:', new_automl_experiment.best_estimator)
# print('Best hyperparameter config:', new_automl_experiment.best_config)
print(
"Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
)
print("Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy))
# print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time))


@ -8,9 +8,7 @@ import pytest
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
here = os.path.abspath(os.path.dirname(__file__))
os.environ["FLAML_MAX_CONCURRENT"] = "2"


@ -24,9 +24,7 @@ try:
except ImportError:
skip_spark = True
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_overtime():
@ -56,11 +54,7 @@ def test_overtime():
start_time = time.time()
automl_experiment.fit(**automl_settings)
elapsed_time = time.time() - start_time
print(
"time budget: {:.2f}s, actual elapsed time: {:.2f}s".format(
time_budget, elapsed_time
)
)
print("time budget: {:.2f}s, actual elapsed time: {:.2f}s".format(time_budget, elapsed_time))
# assert abs(elapsed_time - time_budget) < 5 # cancel assertion because github VM sometimes is super slow, causing the test to fail
print(automl_experiment.predict(df))
print(automl_experiment.model)


@ -8,9 +8,7 @@ import pytest
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
os.environ["FLAML_MAX_CONCURRENT"] = "2"
@ -72,9 +70,7 @@ def run_automl(budget=3, dataset_format="dataframe", hpo_method=None):
print("Best ML learner:", automl.best_estimator)
print("Best hyperparameter config:", automl.best_config)
print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
print(
"Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
)
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
print(automl.model.estimator)
print(automl.best_config_per_estimator)
print("time taken to find best model:", automl.time_to_find_best_model)
@ -89,9 +85,7 @@ def run_automl(budget=3, dataset_format="dataframe", hpo_method=None):
accuracy = 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test)
print("accuracy", "=", accuracy)
print(
"roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test)
)
print("roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test))
print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test))
if performance_check_budget is None:
assert accuracy >= 0.669, "the accuracy of flaml should be larger than 0.67"


@ -12,9 +12,7 @@ import pytest
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
os.environ["FLAML_MAX_CONCURRENT"] = "2"
X, y = load_breast_cancer(return_X_y=True)
@ -35,9 +33,7 @@ def train_breast_cancer(config):
def test_tune_spark():
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()}
analysis = tune.run(
train_breast_cancer,

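test_tune_spark above drives the same kind of search through flaml.tune, where Spark parallelism is again a single flag. A toy stand-in for the test's train_breast_cancer objective keeps the sketch self-contained (use_spark assumes pyspark and joblib-spark are available):

from flaml import tune

def objective(config):
    # toy objective standing in for train_breast_cancer
    return {"auc": 1 - abs(config["x"] - 0.3)}

analysis = tune.run(
    objective,
    config={"x": tune.uniform(0, 1)},
    metric="auc",
    mode="max",
    num_samples=8,
    use_spark=True,  # distribute trials over the Spark cluster
)
print(analysis.best_config)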

@ -34,9 +34,7 @@ except ImportError:
print("Spark is not installed. Skip all spark tests.")
skip_spark = True
pytestmark = pytest.mark.skipif(
skip_spark, reason="Spark is not installed. Skip all spark tests."
)
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_with_parameters_spark():
@ -53,9 +51,7 @@ def test_with_parameters_spark():
spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize(list(range(2)))
t_partial = timeit(
lambda: rdd.map(lambda x: partial_train(config=x)).collect(), number=5
)
t_partial = timeit(lambda: rdd.map(lambda x: partial_train(config=x)).collect(), number=5)
print("python_partial_train: " + str(t_partial))
t_spark = timeit(
@ -139,12 +135,8 @@ def test_train_test_split_pyspark():
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pdf).repartition(1)
psdf = to_pandas_on_spark(sdf).spark.repartition(1)
train_sdf, test_sdf = train_test_split_pyspark(
sdf, test_fraction=0.5, to_pandas_spark=False, seed=1
)
train_psdf, test_psdf = train_test_split_pyspark(
psdf, test_fraction=0.5, stratify_column="y", seed=1
)
train_sdf, test_sdf = train_test_split_pyspark(sdf, test_fraction=0.5, to_pandas_spark=False, seed=1)
train_psdf, test_psdf = train_test_split_pyspark(psdf, test_fraction=0.5, stratify_column="y", seed=1)
assert isinstance(train_sdf, pyspark.sql.dataframe.DataFrame)
assert isinstance(test_sdf, pyspark.sql.dataframe.DataFrame)
assert isinstance(train_psdf, ps.DataFrame)
@ -190,9 +182,7 @@ def test_unique_value_first_index():
def test_n_current_trials():
spark = SparkSession.builder.getOrCreate()
sc = spark._jsc.sc()
num_executors = (
len([executor.host() for executor in sc.statusTracker().getExecutorInfos()]) - 1
)
num_executors = len([executor.host() for executor in sc.statusTracker().getExecutorInfos()]) - 1
def get_n_current_trials(n_concurrent_trials=0, num_executors=num_executors):
try:


@ -18,9 +18,7 @@ NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)
def oml_to_vw_w_grouping(
X, y, ds_dir, fname, orginal_dim, group_num, grouping_method="sequential"
):
def oml_to_vw_w_grouping(X, y, ds_dir, fname, orginal_dim, group_num, grouping_method="sequential"):
# split all_indexes into # group_num of groups
max_size_per_group = int(np.ceil(orginal_dim / float(group_num)))
# sequential grouping
@ -49,17 +47,11 @@ def oml_to_vw_w_grouping(
for i in range(len(X)):
NS_content = []
for zz in range(len(group_indexes)):
ns_features = " ".join(
"{}:{:.6f}".format(ind, X[i][ind])
for ind in group_indexes[zz]
)
ns_features = " ".join("{}:{:.6f}".format(ind, X[i][ind]) for ind in group_indexes[zz])
NS_content.append(ns_features)
ns_line = "{} |{}".format(
str(y[i]),
"|".join(
"{} {}".format(NS_LIST[j], NS_content[j])
for j in range(len(group_indexes))
),
"|".join("{} {}".format(NS_LIST[j], NS_content[j]) for j in range(len(group_indexes))),
)
f.write(ns_line)
f.write("\n")
@ -140,10 +132,7 @@ def load_vw_dataset(did, ds_dir, is_regression, max_ns_num):
fname = "ds_{}_{}_{}.vw".format(did, max_ns_num, 0)
vw_dataset_file = os.path.join(ds_dir, fname)
# if file does not exist, generate and save the datasets
if (
not os.path.exists(vw_dataset_file)
or os.stat(vw_dataset_file).st_size < 1000
):
if not os.path.exists(vw_dataset_file) or os.stat(vw_dataset_file).st_size < 1000:
get_oml_to_vw(did, max_ns_num)
print(ds_dir, vw_dataset_file)
if not os.path.exists(ds_dir):
@ -175,9 +164,7 @@ def get_data(
# Y = data.Y
if vw_format:
# vw_examples = data.vw_examples
vw_examples = load_vw_dataset(
did=data_id, ds_dir=VW_DS_DIR, is_regression=True, max_ns_num=max_ns_num
)
vw_examples = load_vw_dataset(did=data_id, ds_dir=VW_DS_DIR, is_regression=True, max_ns_num=max_ns_num)
Y = []
for i, e in enumerate(vw_examples):
Y.append(float(e.split("|")[0]))
@ -230,9 +217,7 @@ class VowpalWabbitNamesspaceTuningProblem:
}
self._problem_info.update(kwargs)
self._fixed_hp_config = kwargs.get("fixed_hp_config", {})
self.namespace_feature_dim = AutoVW.get_ns_feature_dim_from_vw_example(
self.vw_examples[0]
)
self.namespace_feature_dim = AutoVW.get_ns_feature_dim_from_vw_example(self.vw_examples[0])
self._raw_namespaces = list(self.namespace_feature_dim.keys())
self._setup_search()
@ -355,13 +340,9 @@ def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
"fixed_hp_config": online_vw_exp_setting["fixed_hp_config"],
}
if tuning_hp == "NamesapceInteraction":
vw_online_aml_problem = VowpalWabbitNamesspaceTuningProblem(
**vw_oml_problem_args
)
vw_online_aml_problem = VowpalWabbitNamesspaceTuningProblem(**vw_oml_problem_args)
elif tuning_hp == "NamesapceInteraction+LearningRate":
vw_online_aml_problem = VowpalWabbitNamesspaceLRTuningProblem(
**vw_oml_problem_args
)
vw_online_aml_problem = VowpalWabbitNamesspaceLRTuningProblem(**vw_oml_problem_args)
else:
raise NotImplementedError
@ -382,13 +363,9 @@ class TestAutoVW(unittest.TestCase):
vw_online_aml_problem.max_iter_num,
vw_online_aml_problem.vw_examples,
vanilla_vw,
loss_func=vw_oml_problem_args["fixed_hp_config"].get(
"loss_function", "squared"
),
)
print(
"final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list)
loss_func=vw_oml_problem_args["fixed_hp_config"].get("loss_function", "squared"),
)
print("final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list))
def test_supervised_vw_tune_namespace(self):
# basic experiment setting
@ -405,13 +382,9 @@ class TestAutoVW(unittest.TestCase):
vw_online_aml_problem.max_iter_num,
vw_online_aml_problem.vw_examples,
autovw,
loss_func=vw_oml_problem_args["fixed_hp_config"].get(
"loss_function", "squared"
),
)
print(
"final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list)
loss_func=vw_oml_problem_args["fixed_hp_config"].get("loss_function", "squared"),
)
print("final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list))
def test_supervised_vw_tune_namespace_learningrate(self):
# basic experiment setting
@ -430,13 +403,9 @@ class TestAutoVW(unittest.TestCase):
vw_online_aml_problem.max_iter_num,
vw_online_aml_problem.vw_examples,
autovw,
loss_func=vw_oml_problem_args["fixed_hp_config"].get(
"loss_function", "squared"
),
)
print(
"final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list)
loss_func=vw_oml_problem_args["fixed_hp_config"].get("loss_function", "squared"),
)
print("final average loss:", sum(cumulative_loss_list) / len(cumulative_loss_list))
def test_bandit_vw_tune_namespace(self):
pass

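The Vowpal Wabbit hunks above reformat helpers that turn OpenML data into VW's namespaced text format and stream it through AutoVW. The online loop the tests drive is, approximately, the following progressive-validation pattern (vowpalwabbit must be installed; the two examples are made-up VW-format lines):

from flaml import AutoVW

vw_examples = ["1 |a x:0.1 |b y:0.2", "-1 |a x:0.3 |b y:0.4"]  # assumed iterable of VW-format lines

autovw = AutoVW(
    max_live_model_num=5,  # number of VW models tuned concurrently
    search_space={"interactions": AutoVW.AUTOMATIC},  # tune namespace interactions automatically
)
for example in vw_examples:
    prediction = autovw.predict(example)  # predict first (progressive validation)...
    autovw.learn(example)  # ...then learn from the same example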

@ -23,9 +23,7 @@ def test_xgboost():
gpu_per_trial=1,
)
train, label = make_moons(
n_samples=300000, shuffle=True, noise=0.3, random_state=None
)
train, label = make_moons(n_samples=300000, shuffle=True, noise=0.3, random_state=None)
automl = AutoML()
automl.fit(
train,
@ -89,18 +87,10 @@ def _test_hf_data():
}
}
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
automl = AutoML()
automl.retrain_from_log(
X_train=X_train,
y_train=y_train,
train_full=True,
record_id=0,
**automl_settings
)
automl.retrain_from_log(X_train=X_train, y_train=y_train, train_full=True, record_id=0, **automl_settings)
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
with open("automl.pkl", "rb") as f:


@ -31,9 +31,7 @@ def setup_searcher(searcher_name):
from flaml.tune.searcher.blendsearch import BlendSearch, CFO, RandomSearch
if "cfo" in searcher_name:
searcher = CFO(
space=config_search_space, low_cost_partial_config=low_cost_partial_config
)
searcher = CFO(space=config_search_space, low_cost_partial_config=low_cost_partial_config)
elif searcher_name == "bs":
searcher = BlendSearch(
metric="metric",
@ -48,9 +46,7 @@ def setup_searcher(searcher_name):
return searcher
def _test_flaml_raytune_consistency(
num_samples=-1, max_concurrent_trials=1, searcher_name="cfo"
):
def _test_flaml_raytune_consistency(num_samples=-1, max_concurrent_trials=1, searcher_name="cfo"):
try:
from ray import tune as raytune, __version__ as ray_version
@ -59,9 +55,7 @@ def _test_flaml_raytune_consistency(
else:
from ray.tune.search import ConcurrencyLimiter
except ImportError:
print(
"skip _test_flaml_raytune_consistency because ray tune cannot be imported."
)
print("skip _test_flaml_raytune_consistency because ray tune cannot be imported.")
return
searcher = setup_searcher(searcher_name)
analysis = tune.run(
@ -110,21 +104,13 @@ def _test_flaml_raytune_consistency(
print("flaml config in results", searcher_name, flaml_config_in_results)
print("ray config in results", searcher_name, ray_config_in_results)
assert ray_best_config == flaml_best_config, "best config should be the same"
assert (
flaml_config_in_results == ray_config_in_results
), "results from raytune and flaml should be the same"
assert flaml_config_in_results == ray_config_in_results, "results from raytune and flaml should be the same"
def test_consistency():
_test_flaml_raytune_consistency(
num_samples=5, max_concurrent_trials=1, searcher_name="random"
)
_test_flaml_raytune_consistency(
num_samples=5, max_concurrent_trials=1, searcher_name="cfo"
)
_test_flaml_raytune_consistency(
num_samples=5, max_concurrent_trials=1, searcher_name="bs"
)
_test_flaml_raytune_consistency(num_samples=5, max_concurrent_trials=1, searcher_name="random")
_test_flaml_raytune_consistency(num_samples=5, max_concurrent_trials=1, searcher_name="cfo")
_test_flaml_raytune_consistency(num_samples=5, max_concurrent_trials=1, searcher_name="bs")
if __name__ == "__main__":

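The consistency test above runs one search twice: natively through flaml.tune, and through Ray Tune with a flaml searcher plugged in via a ConcurrencyLimiter. The Ray-side wiring looks roughly like this (Ray 2.x import path assumed; the quadratic objective is a toy stand-in):

from flaml import tune
from flaml.tune.searcher.blendsearch import CFO
from ray import tune as raytune
from ray.tune.search import ConcurrencyLimiter

config_search_space = {"x": tune.uniform(0, 10)}
low_cost_partial_config = {"x": 0}

def evaluate(config):
    return {"metric": (config["x"] - 3) ** 2}  # toy objective

searcher = CFO(
    metric="metric",
    mode="min",
    space=config_search_space,
    low_cost_partial_config=low_cost_partial_config,
)
analysis = raytune.run(
    evaluate,
    search_alg=ConcurrencyLimiter(searcher, max_concurrent=1),
    num_samples=5,
)
print(analysis.get_best_config(metric="metric", mode="min"))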

@ -89,9 +89,7 @@ def test_lexiflow():
correct += pred.eq(target.view_as(pred)).sum().item()
accuracy = correct / N_VALID_EXAMPLES
flops, params = thop.profile(
model, inputs=(torch.randn(1, 28 * 28).to(DEVICE),), verbose=False
)
flops, params = thop.profile(model, inputs=(torch.randn(1, 28 * 28).to(DEVICE),), verbose=False)
return np.log2(flops), 1 - accuracy, params
def evaluate_function(configuration):
@ -198,9 +196,7 @@ def test_lexiflow_performance():
print(analysis.best_config)
print(analysis.best_result)
assert (
analysis.best_result["currin"] <= 2.2
), "the value of currin function should be less than 2.2"
assert analysis.best_result["currin"] <= 2.2, "the value of currin function should be less than 2.2"
if __name__ == "__main__":


@ -48,17 +48,11 @@ except ImportError:
# __load_data_begin__
def load_data(data_dir="test/data"):
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(
root=data_dir, train=True, download=True, transform=transform
)
trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
root=data_dir, train=False, download=True, transform=transform
)
testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)
return trainset, testset
@ -93,9 +87,7 @@ def train_cifar(config, checkpoint_dir=None, data_dir=None):
trainset, testset = load_data(data_dir)
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(
trainset, [test_abs, len(trainset) - test_abs]
)
train_subset, val_subset = random_split(trainset, [test_abs, len(trainset) - test_abs])
trainloader = torch.utils.data.DataLoader(
train_subset,
@ -112,9 +104,7 @@ def train_cifar(config, checkpoint_dir=None, data_dir=None):
from ray import tune
for epoch in range(
int(round(config["num_epochs"]))
): # loop over the dataset multiple times
for epoch in range(int(round(config["num_epochs"]))): # loop over the dataset multiple times
running_loss = 0.0
epoch_steps = 0
for i, data in enumerate(trainloader, 0):
@ -135,10 +125,7 @@ def train_cifar(config, checkpoint_dir=None, data_dir=None):
running_loss += loss.item()
epoch_steps += 1
if i % 2000 == 1999: # print every 2000 mini-batches
print(
"[%d, %5d] loss: %.3f"
% (epoch + 1, i + 1, running_loss / epoch_steps)
)
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps))
running_loss = 0.0
# Validation loss
@ -178,9 +165,7 @@ def train_cifar(config, checkpoint_dir=None, data_dir=None):
def _test_accuracy(net, device="cpu"):
trainset, testset = load_data()
testloader = torch.utils.data.DataLoader(
testset, batch_size=4, shuffle=False, num_workers=2
)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)
correct = 0
total = 0
@ -200,9 +185,7 @@ def _test_accuracy(net, device="cpu"):
# __main_begin__
def cifar10_main(
method="BlendSearch", num_samples=10, max_num_epochs=100, gpus_per_trial=1
):
def cifar10_main(method="BlendSearch", num_samples=10, max_num_epochs=100, gpus_per_trial=1):
data_dir = os.path.abspath("test/data")
load_data(data_dir) # Download data for all trials before starting the run
if method == "BlendSearch":
@ -294,16 +277,8 @@ def cifar10_main(
logger.info(f"time={time.time()-start_time}")
best_trial = result.get_best_trial("loss", "min", "all")
logger.info("Best trial config: {}".format(best_trial.config))
logger.info(
"Best trial final validation loss: {}".format(
best_trial.metric_analysis["loss"]["min"]
)
)
logger.info(
"Best trial final validation accuracy: {}".format(
best_trial.metric_analysis["accuracy"]["max"]
)
)
logger.info("Best trial final validation loss: {}".format(best_trial.metric_analysis["loss"]["min"]))
logger.info("Best trial final validation accuracy: {}".format(best_trial.metric_analysis["accuracy"]["max"]))
best_trained_model = Net(2 ** best_trial.config["l1"], 2 ** best_trial.config["l2"])
device = "cpu"
@ -313,10 +288,7 @@ def cifar10_main(
best_trained_model = nn.DataParallel(best_trained_model)
best_trained_model.to(device)
checkpoint_value = (
getattr(best_trial.checkpoint, "dir_or_data", None)
or best_trial.checkpoint.value
)
checkpoint_value = getattr(best_trial.checkpoint, "dir_or_data", None) or best_trial.checkpoint.value
checkpoint_path = os.path.join(checkpoint_value, "checkpoint")
model_state, optimizer_state = torch.load(checkpoint_path)

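The 80/20 split in train_cifar is a direct use of torch.utils.data.random_split. A minimal sketch of the same pattern; the helper name, batch size, and worker count below are illustrative assumptions:

from torch.utils.data import DataLoader, random_split

def split_train_val(dataset, train_frac=0.8, batch_size=4):
    # hold out the tail of the dataset for validation, as train_cifar does above
    n_train = int(len(dataset) * train_frac)
    train_subset, val_subset = random_split(dataset, [n_train, len(dataset) - n_train])
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, num_workers=2)
    return train_loader, val_loader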
View file

@ -41,9 +41,7 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False)
metric="mean_loss",
mode="min",
)
assert (
searcher.cost_attr == "time_total_s"
), "when time_budget_s is provided, cost_attr should be time_total_s"
assert searcher.cost_attr == "time_total_s", "when time_budget_s is provided, cost_attr should be time_total_s"
searcher = BlendSearch(
space=search_space,
@ -51,9 +49,7 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False)
metric="mean_loss",
mode="min",
)
assert (
searcher.cost_attr is None
), "when time_budget_s is not provided, cost_attr should be None."
assert searcher.cost_attr is None, "when time_budget_s is not provided, cost_attr should be None."
searcher = BlendSearch(
space=search_space,
@ -116,9 +112,7 @@ def test_reproducibility():
best_config_2 = test_tune(externally_setup_searcher=True)
print(best_config_1)
print(best_config_2)
assert (
best_config_1 == best_config_2
), "flaml.tune not reproducible when the searcher is set up externally"
assert best_config_1 == best_config_2, "flaml.tune not reproducible when the searcher is set up externally"
def test_gs_reproducibility():

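The two assertions above document BlendSearch's cost accounting: given a time_budget_s it treats Ray Tune's time_total_s as the trial cost, and without one it assumes no cost attribute. A minimal sketch of both constructions, assuming flaml's top-level BlendSearch export and a stand-in one-dimensional space:

from flaml import BlendSearch, tune

search_space = {"x": tune.uniform(0, 1)}  # illustrative stand-in space

searcher = BlendSearch(space=search_space, metric="mean_loss", mode="min", time_budget_s=10)
assert searcher.cost_attr == "time_total_s"  # budgeted: trial wall time is the cost

searcher = BlendSearch(space=search_space, metric="mean_loss", mode="min")
assert searcher.cost_attr is None  # unbudgeted: no cost attribute is assumed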
View file

@ -25,9 +25,7 @@ class AbstractWarmStartTest:
np.random.seed(162)
search_alg, cost = self.set_basic_conf()
search_alg = ConcurrencyLimiter(search_alg, 1)
results_exp_1 = tune.run(
cost, num_samples=5, search_alg=search_alg, verbose=0, local_dir=self.tmpdir
)
results_exp_1 = tune.run(cost, num_samples=5, search_alg=search_alg, verbose=0, local_dir=self.tmpdir)
checkpoint_path = os.path.join(self.tmpdir, self.experiment_name)
search_alg.save(checkpoint_path)
return results_exp_1, np.random.get_state(), checkpoint_path

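For context on the checkpoint step, a minimal sketch of the save-then-restore warm start this test exercises; the objective function and file name below are illustrative assumptions:

from flaml import BlendSearch, tune

def cost(config):
    tune.report(mean_loss=(config["x"] - 0.5) ** 2)  # illustrative objective

searcher = BlendSearch(metric="mean_loss", mode="min")
tune.run(cost, config={"x": tune.uniform(0, 1)}, search_alg=searcher, num_samples=5, verbose=0)
searcher.save("searcher_checkpoint.pkl")     # persist the searcher state
searcher.restore("searcher_checkpoint.pkl")  # a later run can resume from here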
View file

@ -120,9 +120,7 @@ def test_asha_scheduler(use_ray=False, time_budget_s=1):
except ImportError:
print("skip the test as ray tune cannot be imported.")
return
best_config = test_scheduler(
scheduler="asha", use_ray=use_ray, time_budget_s=time_budget_s
)
best_config = test_scheduler(scheduler="asha", use_ray=use_ray, time_budget_s=time_budget_s)
print("Auto ASHA scheduler, test error:", abs(10 / 2 - best_config["z"] / 2))
@ -132,9 +130,7 @@ def test_custom_scheduler():
except ImportError:
print("skip the test as ray tune cannot be imported.")
return
my_scheduler = HyperBandScheduler(
time_attr="samplesize", max_t=1000, reduction_factor=2
)
my_scheduler = HyperBandScheduler(time_attr="samplesize", max_t=1000, reduction_factor=2)
best_config = test_scheduler(scheduler=my_scheduler)
print("Custom ASHA scheduler, test error:", abs(10 / 2 - best_config["z"] / 2))

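A minimal sketch of the custom-scheduler path above, assuming ray[tune] is installed; the objective function and the reported samplesize value are illustrative:

from ray.tune.schedulers import HyperBandScheduler
from flaml import tune

def evaluate(config):
    # illustrative objective; "samplesize" is the time_attr the scheduler watches
    tune.report(mean_loss=(config["z"] - 5) ** 2, samplesize=100)

my_scheduler = HyperBandScheduler(time_attr="samplesize", max_t=1000, reduction_factor=2)
analysis = tune.run(
    evaluate,
    config={"z": tune.uniform(0, 10)},
    metric="mean_loss",
    mode="min",
    scheduler=my_scheduler,
    num_samples=10,
)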
View file

@ -91,9 +91,7 @@ def test_searchers():
# 'set' object has no attribute 'keys'
pass
try:
searcher.add_evaluated_point(
{"a": 1, "b": 0.01}, None, intermediate_values=[0.1]
)
searcher.add_evaluated_point({"a": 1, "b": 0.01}, None, intermediate_values=[0.1])
except ValueError:
# `value` is supposed to be set for a complete trial.
pass
@ -113,16 +111,12 @@ def test_searchers():
# Dim of point {'a': 1} and parameter_names {'a': UniformDistribution(high=8.0, low=6.0), 'b': LogUniformDistribution(high=0.01, low=0.0001)} do not match.
pass
try:
searcher = OptunaSearch(
config, points_to_evaluate=[{"a": 1, "b": 0.01}], evaluated_rewards=1
)
searcher = OptunaSearch(config, points_to_evaluate=[{"a": 1, "b": 0.01}], evaluated_rewards=1)
except TypeError:
# evaluated_rewards expected to be a list, got <class 'int'>.
pass
try:
searcher = OptunaSearch(
config, points_to_evaluate=[{"a": 1, "b": 0.01}], evaluated_rewards=[1, 2]
)
searcher = OptunaSearch(config, points_to_evaluate=[{"a": 1, "b": 0.01}], evaluated_rewards=[1, 2])
except ValueError:
# Dim of evaluated_rewards [1, 2] and points_to_evaluate [{'a': 1, 'b': 0.01}] do not match.
pass
@ -197,9 +191,7 @@ def test_searchers():
searcher.save("test/tune/optuna.pkl")
searcher.restore("test/tune/optuna.pkl")
try:
searcher = BlendSearch(
metric="m", global_search_alg=searcher, metric_constraints=[("c", "<", 1)]
)
searcher = BlendSearch(metric="m", global_search_alg=searcher, metric_constraints=[("c", "<", 1)])
except AssertionError:
# sign of metric constraints must be <= or >=.
pass
@ -303,12 +295,8 @@ def test_searchers():
from flaml import tune
tune.run(lambda x: 1, config={}, use_ray=use_ray, log_file_name="logs/searcher.log")
searcher = BlendSearch(
space=config, cost_attr="cost", cost_budget=10, metric="m", mode="min"
)
analysis = tune.run(
lambda x: {"cost": 2, "m": x["b"]}, search_alg=searcher, num_samples=10
)
searcher = BlendSearch(space=config, cost_attr="cost", cost_budget=10, metric="m", mode="min")
analysis = tune.run(lambda x: {"cost": 2, "m": x["b"]}, search_alg=searcher, num_samples=10)
assert len(analysis.trials) == 5

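The closing assertion is plain arithmetic: each trial reports a cost of 2 against a cost_budget of 10, so tuning stops after 10 / 2 = 5 trials. The same pattern as a standalone sketch; the one-dimensional space is an illustrative assumption:

from flaml import BlendSearch, tune

def trainable(config):
    return {"cost": 2, "m": config["b"]}  # each trial consumes a cost of 2

searcher = BlendSearch(
    space={"b": tune.uniform(0, 1)}, cost_attr="cost", cost_budget=10, metric="m", mode="min"
)
analysis = tune.run(trainable, search_alg=searcher, num_samples=10)
assert len(analysis.trials) == 5  # budget 10 / cost 2 per trial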
View file

@ -85,9 +85,7 @@ def _test_xgboost(method="BlendSearch"):
else:
from ray import tune
search_space = {
"max_depth": tune.randint(1, 9)
if method in ["BlendSearch", "BOHB", "Optuna"]
else tune.randint(1, 9),
"max_depth": tune.randint(1, 9) if method in ["BlendSearch", "BOHB", "Optuna"] else tune.randint(1, 9),
"min_child_weight": tune.choice([1, 2, 3]),
"subsample": tune.uniform(0.5, 1.0),
"eta": tune.loguniform(1e-4, 1e-1),
@ -226,9 +224,7 @@ def test_nested_space():
}
def simple_func(config):
obj = (config["cost_related"]["a"] - 4) ** 2 + (
config["b"] - config["cost_related"]["a"]
) ** 2
obj = (config["cost_related"]["a"] - 4) ** 2 + (config["b"] - config["cost_related"]["a"]) ** 2
tune.report(obj=obj)
tune.report(obj=obj, ab=config["cost_related"]["a"] * config["b"])
@ -291,8 +287,7 @@ def test_nested_space():
low_cost_partial_config={"cost_related": {"a": 1}},
points_to_evaluate=points_to_evaluate,
evaluated_rewards=[
(config["cost_related"]["a"] - 4) ** 2
+ (config["b"] - config["cost_related"]["a"]) ** 2
(config["cost_related"]["a"] - 4) ** 2 + (config["b"] - config["cost_related"]["a"]) ** 2
for config in points_to_evaluate[:-1]
],
metric="obj",

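The nested space above groups hyperparameters under a sub-dict so they can be addressed as a unit, e.g. as the low-cost starting point. A minimal sketch with illustrative domains:

from flaml import tune

search_space = {
    "cost_related": {"a": tune.randint(1, 8)},
    "b": tune.uniform(0.5, 3),
}

def simple_func(config):
    a, b = config["cost_related"]["a"], config["b"]
    tune.report(obj=(a - 4) ** 2 + (b - a) ** 2)

analysis = tune.run(
    simple_func,
    config=search_space,
    low_cost_partial_config={"cost_related": {"a": 1}},
    metric="obj",
    mode="min",
    num_samples=10,
)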
View file

@ -7,9 +7,7 @@ from sklearn.metrics import mean_squared_error
data = fetch_california_housing(return_X_y=False, as_frame=True)
df, X, y = data.frame, data.data, data.target
df_train, _, X_train, X_test, _, y_test = train_test_split(
df, X, y, test_size=0.33, random_state=42
)
df_train, _, X_train, X_test, _, y_test = train_test_split(df, X, y, test_size=0.33, random_state=42)
csv_file_name = "test/housing.csv"
df_train.to_csv(csv_file_name, index=False)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
@ -24,9 +22,7 @@ def train_lgbm(config: dict) -> dict:
# train the model
# train_set = lightgbm.Dataset(X_train, y_train)
# LightGBM only accepts a csv with a valid number format, even if the string columns are set to be ignored.
train_set = lightgbm.Dataset(
csv_file_name, params={"label_column": "name:MedHouseVal", "header": True}
)
train_set = lightgbm.Dataset(csv_file_name, params={"label_column": "name:MedHouseVal", "header": True})
model = lightgbm.train(params, train_set)
# evaluate the model
pred = model.predict(X_test)
@ -39,9 +35,7 @@ def test_tune_lgbm_csv():
# load a built-in search space from flaml
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
# specify the search space as a dict from hp name to domain; you can define your own search space the same way
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()}
# give guidance about hp values corresponding to low training cost, i.e., {"n_estimators": 4, "num_leaves": 4}
low_cost_partial_config = {
hp: space["low_cost_init_value"]
@ -50,11 +44,7 @@ def test_tune_lgbm_csv():
}
# initial points to evaluate
points_to_evaluate = [
{
hp: space["init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "init_value" in space
}
{hp: space["init_value"] for hp, space in flaml_lgbm_search_space.items() if "init_value" in space}
]
# run the tuning, minimizing mse, with total time budget 3 seconds
analysis = tune.run(