Mirror of https://github.com/microsoft/FLAML.git
gpt-4 support; openai workflow fix; model str; timeout; voting (#958)
* workflow; model str; timeout
* voting
* notebook
* pull request
* recover workflow
* voted answer
* aoai
* ignore None answer
* default config
* note
* gpt-4
* n=5
* cleanup
* config name
* introduction
* readme
* avoid None
* add output/ to gitignore
* openai version
* invalid var
* comment long running cells
This commit is contained in:
Parent
50334f2c52
Commit
595f5a8025
.github/workflows/openai.yml
@@ -10,10 +10,11 @@ on:
       - 'flaml/integrations/oai/**'
       - 'test/openai/**'
       - 'notebook/integrate_openai.ipynb'
+      - 'notebook/integrate_chatgpt_math.ipynb'
       - '.github/workflows/openai.yml'
 
 jobs:
-  build:
+  test:
     strategy:
       matrix:
         os: [ubuntu-latest]
.gitignore
@@ -159,6 +159,6 @@ automl.pkl
 
 test/nlp/testtmp.py
 test/nlp/testtmpfl.py
 
+output/
 flaml/tune/spark/mylearner.py
 *.pkl
README.md
@@ -14,7 +14,7 @@
 <br>
 </p>
 
-:fire: OpenAI GPT-3 models support in v1.1.3. ChatGPT support is coming.
+:fire: OpenAI GPT-3 models support in v1.1.3. ChatGPT and GPT-4 support will be added in v1.2.0.
 
 :fire: A [lab forum](https://github.com/microsoft/FLAML/tree/tutorial-aaai23/tutorial) on FLAML at AAAI 2023.
flaml/integrations/oai/completion.py
@@ -1,6 +1,7 @@
 from time import sleep
 import logging
+import numpy as np
 import time
 from flaml import tune, BlendSearch
 
 try:
@@ -11,9 +12,9 @@ try:
         APIError,
         InvalidRequestError,
         APIConnectionError,
+        Timeout,
     )
     import diskcache
     from urllib3.exceptions import ReadTimeoutError
 
     ERROR = None
 except ImportError:
@@ -46,7 +47,14 @@ class Completion:
     """
 
     # set of models that support chat completion
-    chat_models = {"gpt-3.5-turbo"}
+    chat_models = {
+        "gpt-3.5-turbo",
+        "gpt-3.5-turbo-0301",
+        "gpt-4",
+        "gpt-4-32k",
+        "gpt-4-32k-0314",
+        "gpt-4-0314",
+    }
 
     # price per 1k tokens
     price1K = {
@@ -58,10 +66,23 @@ class Completion:
         "text-davinci-002": 0.02,
         "text-davinci-003": 0.02,
         "gpt-3.5-turbo": 0.002,
+        "gpt-3.5-turbo-0301": 0.002,
+        "gpt-4": (0.03, 0.06),
+        "gpt-4-0314": (0.03, 0.06),
+        "gpt-4-32k": (0.06, 0.12),
+        "gpt-4-32k-0314": (0.06, 0.12),
     }
 
     default_search_space = {
-        "model": tune.choice(list(price1K.keys())),
+        "model": tune.choice(
+            [
+                "text-ada-001",
+                "text-babbage-001",
+                "text-davinci-003",
+                "gpt-3.5-turbo",
+                "gpt-4",
+            ]
+        ),
         "temperature_or_top_p": tune.choice(
             [
                 {"temperature": tune.uniform(0, 1)},
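Note on the hunk above: the GPT-4 entries in `price1K` are `(input, output)` tuples of dollar prices per 1K tokens, while older models keep a single number. A minimal sketch of how such an entry prices one request, mirroring the normalization this diff introduces later in the file (`query_cost` is an illustrative name, not FLAML's API):

```python
# Sketch: price1K entries may be a float or an (input_price, output_price)
# tuple per 1K tokens, as in the hunk above.
price1K = {
    "text-davinci-003": 0.02,
    "gpt-4": (0.03, 0.06),
}

def query_cost(model, n_input_tokens, n_output_tokens):
    price = price1K[model]
    # a plain float means input and output tokens cost the same
    price_input, price_output = price if isinstance(price, tuple) else (price, price)
    return (price_input * n_input_tokens + price_output * n_output_tokens) / 1000

# 1000 prompt tokens + 500 completion tokens on gpt-4:
# (0.03 * 1000 + 0.06 * 500) / 1000 = 0.06 dollars
assert abs(query_cost("gpt-4", 1000, 500) - 0.06) < 1e-9
```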
@@ -107,13 +128,13 @@ class Completion:
         if response is not None and (response != -1 or not eval_only):
             # print("using cached response")
             return response
         retry = 0
         openai_completion = (
             openai.ChatCompletion
             if config["model"] in cls.chat_models
             else openai.Completion
         )
-        while eval_only or retry * cls.retry_time < cls.retry_timeout:
+        start_time = time.time()
+        while True:
             try:
                 response = openai_completion.create(**config)
                 cls._cache.set(key, response)
@@ -122,21 +143,26 @@ class Completion:
                     ServiceUnavailableError,
                     APIError,
                     APIConnectionError,
+                    ReadTimeoutError,
                 ):
                     # transient error
                     logger.warning(f"retrying in {cls.retry_time} seconds...", exc_info=1)
                     sleep(cls.retry_time)
-                except RateLimitError:
-                    logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1)
-                    retry += 1
+                except (RateLimitError, Timeout):
+                    # retry after retry_time seconds
+                    if time.time() - start_time + cls.retry_time < cls.retry_timeout:
+                        logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1)
+                    elif not eval_only:
+                        break
+                    sleep(cls.retry_time)
                 except InvalidRequestError:
-                    if "model" in config:
+                    if "azure" == openai.api_type and "model" in config:
+                        # azure api uses "engine" instead of "model"
                         config = config.copy()
                         config["engine"] = config.pop("model")
                     else:
                         raise
             logger.warning(
-                f"Failed to get response from openai api due to getting RateLimitError for {cls.retry_timeout} seconds."
+                f"Failed to get response from openai api due to getting RateLimitError or Timeout for {cls.retry_timeout} seconds."
             )
             response = -1
             cls._cache.set(key, response)
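The retry hunk above changes the give-up condition from a retry counter to wall-clock time: transient server errors are always retried every `retry_time` seconds, while `RateLimitError`/`Timeout` are retried only while the next attempt still fits inside `retry_timeout`, after which `-1` is cached as a "no response" sentinel. A standalone sketch of the same pattern, with placeholder exception classes (not FLAML's actual code):

```python
import time

class TransientError(Exception): ...  # stand-in for ServiceUnavailableError/APIError/...
class RateLimitError(Exception): ...
class Timeout(Exception): ...

RETRY_TIME = 10      # seconds between attempts, like cls.retry_time
RETRY_TIMEOUT = 120  # total budget for rate-limit/timeout retries, like cls.retry_timeout

def call_with_retry(create, eval_only=False, **config):
    start_time = time.time()
    while True:
        try:
            return create(**config)
        except TransientError:
            time.sleep(RETRY_TIME)  # transient errors are always retried
        except (RateLimitError, Timeout):
            # give up only when the next attempt would overrun the budget
            if time.time() - start_time + RETRY_TIME >= RETRY_TIMEOUT and not eval_only:
                return -1  # the diff caches -1 as the "no response" sentinel
            time.sleep(RETRY_TIME)
```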
@@ -205,16 +231,18 @@ class Completion:
         data = cls.data
         model = config["model"]
         data_length = len(data)
-        target_n_tokens = getattr(cls, "inference_budget", None) and (
-            1000 * cls.inference_budget / cls.price1K[model]
-            if cls.inference_budget and cls.price1K.get(model)
-            else None
-        )
+        price = cls.price1K.get(model)
+        price_input, price_output = (
+            price if isinstance(price, tuple) else (price, price)
+        )
+        inference_budget = getattr(cls, "inference_budget", None)
         prune_hp = getattr(cls, "_prune_hp", "n")
         metric = cls._metric
         config_n = config.get(prune_hp, 1)  # default value in OpenAI is 1
-        max_tokens = config.get("max_tokens", 16)  # default value in OpenAI is 16
-        region_key = cls._get_region_key(config)
+        max_tokens = config.get(
+            "max_tokens", np.inf if model in cls.chat_models else 16
+        )  # default value in OpenAI
         if model in cls.chat_models:
             # either "prompt" should be in config (for being compatible with non-chat models)
             # or "messages" should be in config (for tuning chat models only)
@@ -231,17 +259,23 @@ class Completion:
         else:
             prompt = cls._prompts[config["prompt"]]
         stop = cls._stops and cls._stops[config["stop"]]
-        if prune and target_n_tokens:
-            if not cls.avg_input_tokens:
-                input_tokens = [None] * data_length
+        target_output_tokens = None
+        prune = prune and inference_budget and not eval_only
+        if prune:
+            region_key = cls._get_region_key(config)
             max_valid_n = cls._get_max_valid_n(region_key, max_tokens)
             if cls.avg_input_tokens:
+                target_output_tokens = (
+                    inference_budget * 1000 - cls.avg_input_tokens * price_input
+                ) / price_output
                 # max_tokens bounds the maximum tokens
                 # so using it we can calculate a valid n according to the avg # input tokens
                 max_valid_n = max(
                     max_valid_n,
-                    int((target_n_tokens - cls.avg_input_tokens) // max_tokens),
+                    int(target_output_tokens // max_tokens),
                 )
+            else:
+                input_tokens = [None] * data_length
             if config_n <= max_valid_n:
                 start_n = config_n
             else:
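With tuple pricing, the per-instance `inference_budget` has to be split between input and output tokens, which is what the `target_output_tokens` expression above does. A worked instance with illustrative numbers:

```python
# cost per query = (price_input * n_in + price_output * n_out) / 1000 <= inference_budget
inference_budget = 0.05                 # dollars per instance, as in the notebook below
price_input, price_output = 0.03, 0.06  # gpt-4 prices per 1K tokens
avg_input_tokens = 500                  # observed average prompt length

target_output_tokens = (inference_budget * 1000 - avg_input_tokens * price_input) / price_output
print(target_output_tokens)  # 583.33...: affordable completion tokens per instance
```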
@@ -316,24 +350,15 @@ class Completion:
                     if model in cls.chat_models
                     else [r["text"].rstrip() for r in response["choices"]]
                 )
-                n_tokens = (
-                    response["usage"]["completion_tokens"]
-                    if previous_num_completions
-                    else response["usage"]["total_tokens"]
-                )
-                if (
-                    prune
-                    and target_n_tokens
-                    and not cls.avg_input_tokens
-                    and not input_tokens[i]
-                ):
+                usage = response["usage"]
+                n_input_tokens = usage["prompt_tokens"]
+                n_output_tokens = usage.get("completion_tokens", 0)
+                if not cls.avg_input_tokens and not input_tokens[i]:
                     # store the # input tokens
-                    input_tokens[i] = response["usage"]["prompt_tokens"]
+                    # Under Assumption 1, we should count both the input and output tokens in the first query,
+                    # and only count output tokens afterwards
+                    input_tokens[i] = n_input_tokens
                 query_cost = (
-                    response["usage"]["total_tokens"] * cls.price1K[model] / 1000
-                )
+                    price_input * n_input_tokens + price_output * n_output_tokens
+                ) / 1000
                 cls._total_cost += query_cost
                 cost += query_cost
                 if (
@@ -348,12 +373,12 @@ class Completion:
                     "cost": cost,
                 }
             if previous_num_completions:
-                n_tokens_list[i] += n_tokens
+                n_tokens_list[i] += n_output_tokens
                 responses_list[i].extend(responses)
+                # Assumption 1: assuming requesting n1, n2 responses separately then combining them
+                # is the same as requesting (n1+n2) responses together
             else:
-                n_tokens_list.append(n_tokens)
+                n_tokens_list.append(n_output_tokens)
                 responses_list.append(responses)
             avg_n_tokens = np.mean(n_tokens_list[:data_limit])
             rho = (
@@ -364,8 +389,8 @@ class Completion:
             # Hoeffding-Serfling bound
             ratio = 0.1 * np.sqrt(rho / data_limit)
             if (
-                target_n_tokens
-                and avg_n_tokens > target_n_tokens * (1 + ratio)
+                target_output_tokens
+                and avg_n_tokens > target_output_tokens * (1 + ratio)
                 and not eval_only
             ):
                 cls._update_invalid_n(
@@ -377,8 +402,8 @@ class Completion:
                 return result
             if (
                 prune
-                and target_n_tokens
-                and avg_n_tokens <= target_n_tokens * (1 - ratio)
+                and target_output_tokens
+                and avg_n_tokens <= target_output_tokens * (1 - ratio)
                 and (
                     num_completions < config_n
                     or num_completions == config_n
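The two hunks above apply the same band test in both directions: a config is pruned when its running average of output tokens is confidently above the target, and kept when confidently below, with the band width shrinking as more instances are evaluated. A small illustrative sketch of that decision (not FLAML's exact code):

```python
import numpy as np

def prune_decision(n_tokens_list, data_limit, rho, target_output_tokens):
    # running mean of output tokens over the first data_limit instances
    avg_n_tokens = np.mean(n_tokens_list[:data_limit])
    # band from the Hoeffding-Serfling-style bound; wider when few instances are seen
    ratio = 0.1 * np.sqrt(rho / data_limit)
    if avg_n_tokens > target_output_tokens * (1 + ratio):
        return "prune"       # confidently over budget
    if avg_n_tokens <= target_output_tokens * (1 - ratio):
        return "keep"        # confidently within budget
    return "need more data"  # otherwise keep evaluating

print(prune_decision([700, 650, 720], 3, 0.5, target_output_tokens=583.3))  # "prune"
```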
@@ -410,16 +435,24 @@ class Completion:
             metrics = cls._eval_func(responses, **data_i)
             if result:
                 for key, value in metrics.items():
-                    result[key] += value
+                    if isinstance(value, (float, int)):
+                        result[key] += value
             else:
                 result = metrics
             for key in result.keys():
-                result[key] /= data_limit
+                if isinstance(result[key], (float, int)):
+                    result[key] /= data_limit
             result["total_cost"] = cls._total_cost
             result["cost"] = cost
-            result["inference_cost"] = avg_n_tokens * cls.price1K[model] / 1000
-            if prune and target_n_tokens and not cls.avg_input_tokens:
+            if not cls.avg_input_tokens:
                 cls.avg_input_tokens = np.mean(input_tokens)
+                if prune:
+                    target_output_tokens = (
+                        inference_budget * 1000 - cls.avg_input_tokens * price_input
+                    ) / price_output
+            result["inference_cost"] = (
+                avg_n_tokens * price_output + cls.avg_input_tokens * price_input
+            ) / 1000
             break
         else:
             if data_early_stop:
@@ -559,11 +592,12 @@ class Completion:
             mode=mode,
             space=space,
         )
-        if len(space["model"]) > 1:
+        space_model = space["model"]
+        if not isinstance(space_model, str) and len(space_model) > 1:
             # start all the models with the same hp config
             config0 = search_alg.suggest("t0")
             points_to_evaluate = [config0]
-            for model in space["model"]:
+            for model in space_model:
                 if model != config0["model"]:
                     point = config0.copy()
                     point["model"] = model
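The hunk above also guards against `space["model"]` being a plain string; when it is a list, the search is seeded with one starting point per model, all sharing the first suggested hyperparameters. A sketch of that seeding with made-up values:

```python
# Illustrative: start the search with one point per candidate model,
# all sharing the first suggested hyperparameter configuration.
config0 = {"model": "text-davinci-003", "max_tokens": 347, "n": 1}  # e.g. search_alg.suggest("t0")
space_model = ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]

points_to_evaluate = [config0]
for model in space_model:
    if model != config0["model"]:
        point = config0.copy()
        point["model"] = model
        points_to_evaluate.append(point)

print([p["model"] for p in points_to_evaluate])
# ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4']
```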
@@ -652,8 +686,13 @@ class ChatCompletion(Completion):
 
     price1K = {
         "gpt-3.5-turbo": 0.002,
+        "gpt-3.5-turbo-0301": 0.002,
+        "gpt-4": (0.03, 0.06),
+        "gpt-4-0314": (0.03, 0.06),
+        "gpt-4-32k": (0.06, 0.12),
+        "gpt-4-32k-0314": (0.06, 0.12),
     }
 
     default_search_space = Completion.default_search_space.copy()
-    default_search_space["model"] = tune.choice(list(price1K.keys()))
+    default_search_space["model"] = tune.choice(["gpt-3.5-turbo", "gpt-4"])
     openai_completion_class = not ERROR and openai.ChatCompletion
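For context on how these class-level defaults are consumed: the tuning entry point accepts search-space overrides as keyword arguments, as the notebook diff below does for `prompt` and `stop`. A hedged sketch, assuming the tuner is exposed as `flaml.oai.Completion.tune` (the data and evaluation function are placeholders you must supply, and a valid OpenAI key is required):

```python
from flaml import oai, tune

tune_data = [...]  # placeholder: e.g. HumanEval instances, as in the notebook below

def success_metrics(responses, **instance):
    ...  # placeholder: return {"success": ...} for the given responses

config, analysis = oai.Completion.tune(
    data=tune_data,
    metric="success",
    mode="max",
    eval_func=success_metrics,
    inference_budget=0.05,  # dollars per instance
    optimization_budget=3,  # dollars in total
    num_samples=-1,         # bounded by the optimization budget only
    # override the default model choice, e.g. if you don't have gpt-4 access:
    model=tune.choice(["text-davinci-003", "gpt-3.5-turbo"]),
)
```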
(The diffs of three more changed files are not shown because of their large size.)
notebook/integrate_openai.ipynb
@@ -15,6 +15,8 @@
     "\n",
     "# Use FLAML to Tune OpenAI Models\n",
     "\n",
+    "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n",
+    "\n",
     "In this notebook, we tune OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n",
     "\n",
     "## Requirements\n",
@@ -126,7 +128,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "35cd066a31b242bb87b2c106ee72e5f2",
+      "model_id": "d025d7cf0bc3438ba290e24d97855d8f",
       "version_major": 2,
       "version_minor": 0
      },
@@ -441,7 +443,7 @@
     "\n",
     "### Perform tuning\n",
     "\n",
-    "The tuning will take a while to finish, depending on the optimization budget (~1 min for the current budget). The tuning will be performed under the specified optimization budgets.\n",
+    "The tuning will take a while to finish, depending on the optimization budget. The tuning will be performed under the specified optimization budgets.\n",
     "\n",
     "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.\n",
     "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.\n",
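To make the two budget knobs concrete, here is the arithmetic from the bullets above, assuming the $0.02 per 1K tokens Davinci price quoted there:

```python
# The two budgets in concrete terms (Davinci-style pricing of $0.02 per 1K tokens):
price_per_1k = 0.02
inference_budget = 0.02   # dollars per benchmark instance
optimization_budget = 5   # dollars for the whole tuning run

tokens_per_instance = inference_budget / price_per_1k * 1000  # -> 1000.0 tokens
total_tokens = optimization_budget / price_per_1k * 1000      # -> 250000.0 tokens
print(tokens_per_instance, total_tokens)
```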
@@ -450,19 +452,14 @@
     "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. The default search space is:\n",
     "\n",
     "```python\n",
-    "price1K = {\n",
-    "    \"text-ada-001\": 0.0004,\n",
-    "    \"text-babbage-001\": 0.0005,\n",
-    "    \"text-curie-001\": 0.002,\n",
-    "    \"code-cushman-001\": 0.024,\n",
-    "    \"code-davinci-002\": 0.1,\n",
-    "    \"text-davinci-002\": 0.02,\n",
-    "    \"text-davinci-003\": 0.02,\n",
-    "    \"gpt-3.5-turbo\": 0.002,\n",
-    "}\n",
-    "\n",
     "default_search_space = {\n",
-    "    \"model\": tune.choice(list(price1K.keys())),\n",
+    "    \"model\": tune.choice([\n",
+    "        \"text-ada-001\",\n",
+    "        \"text-babbage-001\",\n",
+    "        \"text-davinci-003\",\n",
+    "        \"gpt-3.5-turbo\",\n",
+    "        \"gpt-4\",\n",
+    "    ]),\n",
     "    \"temperature_or_top_p\": tune.choice(\n",
     "        [\n",
     "            {\"temperature\": tune.uniform(0, 1)},\n",
@@ -475,13 +472,13 @@
     "}\n",
     "```\n",
     "\n",
-    "The default search space can be overriden by users' input.\n",
-    "For example, the following code specifies two choices for the model, four choices for the prompt and a fixed list of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used."
+    "The default search space can be overridden by users' input.\n",
+    "For example, the following code specifies four choices for the prompt and a fixed list of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used. If you don't have access to gpt-4 or would like to modify the choice of models, you can provide a different search space for model."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 21,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-02-24T23:25:40.593603Z",
@@ -495,119 +492,69 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m[I 2023-02-24 23:25:40,643]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m[I 2023-02-24 23:25:40,646]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n"
"\u001b[32m[I 2023-03-26 02:53:26,384]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n",
"\u001b[32m[I 2023-03-26 02:53:26,387]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:25:40] {811} INFO - trial 1 config: {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:25:44] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.4624999999999999, 'cost': 0.4624999999999999, 'inference_cost': 0.023125, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'code-davinci-002', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.687161445617676}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:25:44] {811} INFO - trial 2 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:25:45] {215} INFO - result: {'expected_success': 0.35, 'success': 0.35, 'total_cost': 0.5671159999999997, 'cost': 0.104616, 'inference_cost': 0.0052308, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6666913032531738}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:25:45] {811} INFO - trial 3 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:01] {215} INFO - result: {'expected_success': 0.5080706992649381, 'success': 0.55, 'total_cost': 1.1424679999999998, 'cost': 0.575352, 'inference_cost': 0.0287676, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 16.66586470603943}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:01] {811} INFO - trial 4 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0.6186627404336135, 'success': 0.65, 'total_cost': 2.3693479999999987, 'cost': 1.2268800000000002, 'inference_cost': 0.059620799999999995, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.6125260668293881}, 'config/max_tokens': 433, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.605130434036255}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 5 config: {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'n': 65, 'prompt': 3, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.5295479999999984, 'cost': 0.1602, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'n': 65, 'prompt': 3, 'stop': 0}, 'config/model': 'code-davinci-002', 'config/temperature_or_top_p': {'temperature': 0.6177669784693172}, 'config/max_tokens': 231, 'config/n': 65, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0020499229431152344}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 6 config: {'model': 'code-davinci-002', 'max_tokens': 263, 'n': 41, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'top_p': 0.49834557213253655}}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.8578479999999984, 'cost': 0.32830000000000004, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'max_tokens': 263, 'n': 41, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'top_p': 0.49834557213253655}}, 'config/model': 'code-davinci-002', 'config/max_tokens': 263, 'config/n': 41, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'top_p': 0.49834557213253655}, 'experiment_tag': 'exp', 'time_total_s': 0.002808809280395508}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 7 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 3, 'stop': 0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0, 'total_cost': 4.028831999999999, 'cost': 1.170984, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 3, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'temperature': 0.8286813263076767}, 'config/max_tokens': 57, 'config/n': 63, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.015198230743408203}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.tune.tune: 02-24 23:26:38] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n"
"[flaml.tune.tune: 03-26 02:53:26] {811} INFO - trial 1 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:29] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.09264000000000001, 'cost': 0.09264000000000001, 'inference_cost': 0.004632, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.5772321224212646}\n",
"[flaml.tune.tune: 03-26 02:53:29] {811} INFO - trial 2 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:30] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09429879999999999, 'cost': 0.0016588, 'inference_cost': 7.264e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5873167514801025}\n",
"[flaml.tune.tune: 03-26 02:53:30] {811} INFO - trial 3 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09782479999999999, 'cost': 0.003526, 'inference_cost': 0.00016342499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6068365573883057}\n",
"[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.2, 'success': 0.2, 'total_cost': 0.10643079999999999, 'cost': 0.008606, 'inference_cost': 0.0004394, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5878369808197021}\n",
"[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 5 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:32] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'total_cost': 0.2603308, 'cost': 0.15389999999999998, 'inference_cost': 0.007861499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6071126461029053}\n",
"[flaml.tune.tune: 03-26 02:53:32] {811} INFO - trial 6 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.2629064, 'cost': 0.0025756000000000004, 'inference_cost': 0.00011848, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.4761645793914795}\n",
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 7 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.4132364, 'cost': 0.15033000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.14217004760152696}, 'config/max_tokens': 152, 'config/n': 67, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022079944610595703}\n",
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 8 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.6260264, 'cost': 0.21278999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.30070005663620336}, 'config/max_tokens': 70, 'config/n': 83, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022161006927490234}\n",
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 9 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:53:43] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6310854, 'cost': 0.005059, 'inference_cost': 0.00023457499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.16501589771914849}, 'config/max_tokens': 161, 'config/n': 10, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.868851661682129}\n",
"[flaml.tune.tune: 03-26 02:53:43] {811} INFO - trial 10 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:54:05] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6344234000000001, 'cost': 0.003338, 'inference_cost': 0.0001522, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.5902013629854229}, 'config/max_tokens': 56, 'config/n': 36, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 21.348156690597534}\n",
"[flaml.tune.tune: 03-26 02:54:05] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0.3476191678990812, 'success': 0.35, 'total_cost': 0.7530034000000003, 'cost': 0.11858000000000002, 'inference_cost': 0.005490999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.763240587143681}, 'config/max_tokens': 693, 'config/n': 42, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 28.24349284172058}\n",
"[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 12 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.9340534000000004, 'cost': 0.18105, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.2927979762895091}, 'config/max_tokens': 60, 'config/n': 97, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.002497434616088867}\n",
"[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:54:35] {215} INFO - result: {'expected_success': 0.28359375, 'success': 0.35, 'total_cost': 0.9496594000000004, 'cost': 0.015605999999999998, 'inference_cost': 0.0007894, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.7186028103822503}, 'config/max_tokens': 288, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.29030704498291}\n",
"[flaml.tune.tune: 03-26 02:54:35] {811} INFO - trial 14 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.9550898000000005, 'cost': 0.0054304, 'inference_cost': 0.00026122, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'top_p': 0.3653649712141158}, 'config/max_tokens': 96, 'config/n': 75, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 44.837317943573}\n",
"[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 15 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0, 'total_cost': 1.0798498000000005, 'cost': 0.12475999999999998, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.3814115349046321}, 'config/max_tokens': 791, 'config/n': 92, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0024149417877197266}\n",
"[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:55:29] {215} INFO - result: {'expected_success': 0.5484931390416686, 'success': 0.55, 'total_cost': 1.1118038000000003, 'cost': 0.031954, 'inference_cost': 0.0015885000000000003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4284507389678964}, 'config/max_tokens': 398, 'config/n': 11, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.271101951599121}\n",
"[flaml.tune.tune: 03-26 02:55:29] {811} INFO - trial 17 config: {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}\n",
"[flaml.tune.tune: 03-26 02:55:46] {215} INFO - result: {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n",
"[flaml.tune.tune: 03-26 02:55:46] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:56:05] {215} INFO - result: {'expected_success': 0.3551828400470255, 'success': 0.4, 'total_cost': 2.1919698000000003, 'cost': 0.061546, 'inference_cost': 0.0030944, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9761031076386442}, 'config/max_tokens': 349, 'config/n': 23, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 19.451276063919067}\n",
"[flaml.tune.tune: 03-26 02:56:05] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}\n",
"[flaml.tune.tune: 03-26 02:56:18] {215} INFO - result: {'expected_success': 0.2898979473186428, 'success': 0.35, 'total_cost': 2.2507018000000003, 'cost': 0.058732, 'inference_cost': 0.0029537, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9822374507369328}, 'config/max_tokens': 393, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.075204372406006}\n",
"[flaml.tune.tune: 03-26 02:56:18] {811} INFO - trial 20 config: {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}\n",
"[flaml.tune.tune: 03-26 02:56:19] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 2.4012418000000006, 'cost': 0.15053999999999995, 'inference_cost': 0.007693499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}, 'config/model': 'gpt-4', 'config/max_tokens': 348, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'experiment_tag': 'exp', 'time_total_s': 0.6143312454223633}\n",
"[flaml.tune.tune: 03-26 02:56:19] {811} INFO - trial 21 config: {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}\n",
"[flaml.tune.tune: 03-26 02:56:32] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4043414000000016, 'cost': 0.0030996000000000005, 'inference_cost': 0.00014468, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}, 'config/model': 'text-ada-001', 'config/max_tokens': 130, 'config/n': 22, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.22084263211180838}, 'experiment_tag': 'exp', 'time_total_s': 13.137321710586548}\n",
"[flaml.tune.tune: 03-26 02:56:32] {811} INFO - trial 22 config: {'model': 'text-ada-001', 'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}\n",
"[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4061918000000024, 'cost': 0.0018504, 'inference_cost': 8.222e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}, 'config/model': 'text-ada-001', 'config/max_tokens': 342, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.2881152790307279}, 'experiment_tag': 'exp', 'time_total_s': 2.4484035968780518}\n",
"[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 23 config: {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}\n",
"[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.618831800000003, 'cost': 0.21264, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}, 'config/model': 'gpt-4', 'config/max_tokens': 253, 'config/n': 23, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.41254458573656}, 'experiment_tag': 'exp', 'time_total_s': 0.003139972686767578}\n",
"[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 24 config: {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}\n",
"[flaml.tune.tune: 03-26 02:56:36] {215} INFO - result: {'expected_success': 0.8185185185185185, 'success': 0.85, 'total_cost': 2.912231800000003, 'cost': 0.29339999999999994, 'inference_cost': 0.014836499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}, 'config/model': 'gpt-4', 'config/max_tokens': 176, 'config/n': 3, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.0964133254059763}, 'experiment_tag': 'exp', 'time_total_s': 1.8556303977966309}\n",
"[flaml.tune.tune: 03-26 02:56:36] {811} INFO - trial 25 config: {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}\n",
"[flaml.tune.tune: 03-26 02:56:55] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9569863000000023, 'cost': 0.04475450000000001, 'inference_cost': 0.00222485, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 343, 'config/n': 27, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24286268913046594}, 'experiment_tag': 'exp', 'time_total_s': 19.013901472091675}\n",
"[flaml.tune.tune: 03-26 02:56:55] {811} INFO - trial 26 config: {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}\n",
"[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9595088000000023, 'cost': 0.0025224999999999996, 'inference_cost': 0.00011325, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 130, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.26609522201207036}, 'experiment_tag': 'exp', 'time_total_s': 0.5786199569702148}\n",
"[flaml.tune.tune: 03-26 02:56:56] {811} INFO - trial 27 config: {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}\n",
"[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0, 'total_cost': 3.0123088000000022, 'cost': 0.05279999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}, 'config/model': 'gpt-4', 'config/max_tokens': 212, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24802150727233283}, 'experiment_tag': 'exp', 'time_total_s': 0.0019483566284179688}\n",
"[flaml.tune.tune: 03-26 02:56:56] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n"
]
}
],
@@ -618,20 +565,11 @@
     " mode=\"max\", # the optimization mode\n",
     " eval_func=success_metrics, # the evaluation function to return the success metrics\n",
     " # log_file_name=\"logs/humaneval.log\", # the log file name\n",
-    " inference_budget=0.1, # the inference budget (dollar)\n",
-    " optimization_budget=4, # the optimization budget (dollar)\n",
+    " inference_budget=0.05, # the inference budget (dollar)\n",
+    " optimization_budget=3, # the optimization budget (dollar)\n",
     " # num_samples can further limit the number of trials for different hyperparameter configurations;\n",
     " # -1 means decided by the optimization budget only\n",
     " num_samples=-1,\n",
-    " model=tune.choice(\n",
-    "     [\n",
-    "         # These two models are currently free to use from OpenAI,\n",
-    "         # so no actual cost will incur. They are not free in Azure OpenAI.\n",
-    "         # The optimization is based on the price in Azure OpenAI.\n",
-    "         \"code-cushman-001\", \n",
-    "         \"code-davinci-002\",\n",
-    "     ]\n",
-    " ),\n",
     " prompt=[\n",
     "     \"{prompt}\",\n",
     "     \"# Python 3{prompt}\",\n",
@@ -654,7 +592,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 12,
+    "execution_count": 22,
     "metadata": {
      "execution": {
       "iopub.execute_input": "2023-02-24T23:26:38.352710Z",
@@ -668,8 +606,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"optimized config {'model': 'code-cushman-001', 'max_tokens': 433, 'n': 29, 'prompt': '{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'top_p': 0.6125260668293881}\n",
"best result on tuning data {'expected_success': 0.6186627404336135, 'success': 0.65, 'total_cost': 2.3693479999999987, 'cost': 1.2268800000000002, 'inference_cost': 0.059620799999999995, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.6125260668293881}, 'config/max_tokens': 433, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.605130434036255}\n"
"optimized config {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': '# Python 3{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'temperature': 0.25447895557126815}\n",
"best result on tuning data {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n"
]
}
],
@@ -694,7 +632,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 13,
+    "execution_count": 23,
     "metadata": {
      "execution": {
       "iopub.execute_input": "2023-02-24T23:26:38.359902Z",
@@ -717,194 +655,118 @@
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 0,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 1,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 2,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 3,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n\\n#print(compare([1,2,3,4,5,1],[1,2,3,4,2,-2]))\\n#print(compare([0,5,0,0,0,4],[4,1,1,0,0,-2]))\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 4,\n",
" \"logprobs\": null,\n",
" \"text\": \" #\\n # TODO: Define compare\\n #\\n return None\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 5,\n",
" \"logprobs\": null,\n",
" \"text\": \" # your code here\\n return [abs(game[i]-guess[i]) for i in range(len(game))]\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 6,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n \\n\\n# Recursion\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 7,\n",
" \"logprobs\": null,\n",
" \"text\": \" #return a list of the absolute difference between guess and score\\n return [abs(score-guess) for score,guess in zip(game,guess)]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 8,\n",
" \"logprobs\": null,\n",
" \"text\": \" #Your code goes here\\n guess = np.array(guess)\\n score = np.array(game)\\n res = np.zeros(len(guess))\\n for i in range(len(guess)):\\n if guess[i] == score[i]:\\n res[i] = 0\\n else:\\n res[i] = abs(guess[i] - score[i])\\n return res\\n\\n#print(compare([1,2,3,4,5,1],[1,2,3,4,2,-2]))\\n#print(compare([0,5,0,0,0,4],[4,1,1,0,0,-2]))\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n return [abs(a - b) for a, b in zip(game, guess)]\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 9,\n",
" \"logprobs\": null,\n",
" \"text\": \" if len(game) != len(guess):\\n return \\\"Your arrays are not of equal length.\\\"\\n else:\\n return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 10,\n",
" \"logprobs\": null,\n",
" \"text\": \" l = []\\n for i in range(len(guess)):\\n if guess[i] == game[i]:\\n l.append(0)\\n else:\\n l.append(abs(guess[i] - game[i]))\\n return l\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 11,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 12,\n",
" \"logprobs\": null,\n",
" \"text\": \" assert len(game) == len(guess), \\\"the length of game and guess must be equal\\\"\\n return [abs(guess[i] - game[i]) for i in range(len(game))]\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 13,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(a-b) for a,b in zip(game,guess)]\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 14,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 15,\n",
" \"logprobs\": null,\n",
" \"text\": \" answer = []\\n for i in range(len(guess)):\\n answer.append(guess[i]-game[i])\\n return answer\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 16,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n\\n#%%\\n#EXAMPLE\\n#%%\\ngame = [1,2,3,4,5,1]\\nguess = [1,2,3,4,2,-2]\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 17,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n \\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 18,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 19,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 20,\n",
" \"logprobs\": null,\n",
" \"text\": \" if len(game) != len(guess):\\n return []\\n results = []\\n for i in range(len(game)):\\n results.append(abs(guess[i] - game[i]))\\n return results\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 21,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 22,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(game))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 23,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(guess))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 24,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(guess))]\\n \"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 25,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n\\n#or use the following solution\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 26,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 27,\n",
" \"logprobs\": null,\n",
" \"text\": \" return [abs(score-guess) for score,guess in zip(game,guess)]\"\n",
" },\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 28,\n",
" \"logprobs\": null,\n",
" \"text\": \" results = []\\n for i in range(len(game)):\\n if guess[i] == game[i]:\\n results.append(0)\\n else:\\n results.append(abs(guess[i] - game[i]))\\n return results\\n\"\n",
" \"message\": {\n",
" \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n",
" \"role\": \"assistant\"\n",
" }\n",
" }\n",
" ],\n",
" \"created\": 1675617146,\n",
" \"id\": \"cmpl-6gcqwCz8JXC5eB62rsjxrcIgL3n4B\",\n",
" \"model\": \"code-cushman-001\",\n",
" \"object\": \"text_completion\",\n",
" \"created\": 1679636800,\n",
" \"id\": \"chatcmpl-6xUY4niTRrpJ5UShayb9QncgjS8rg\",\n",
" \"model\": \"gpt-4-0314\",\n",
" \"object\": \"chat.completion\",\n",
" \"usage\": {\n",
" \"completion_tokens\": 3959,\n",
" \"prompt_tokens\": 239,\n",
" \"total_tokens\": 4198\n",
" \"completion_tokens\": 440,\n",
" \"prompt_tokens\": 236,\n",
" \"total_tokens\": 676\n",
" }\n",
"}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"}\n",
"{'expected_success': 1.0, 'success': True}\n"
]
}
@@ -912,7 +774,7 @@
"source": [
"responses = oai.Completion.create(context=tune_data[1], **config)\n",
"print(responses)\n",
"print(success_metrics([response[\"text\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n"
"print(success_metrics([response[\"message\"][\"content\"] if config[\"model\"] in oai.Completion.chat_models else response[\"text\"] for response in responses[\"choices\"]], **tune_data[1]))\n"
]
},
{
@@ -922,12 +784,12 @@
"source": [
"### Evaluate the success rate on the test data\n",
"\n",
"You can use flaml's `oai.Completion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.Completion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. Compared to the baseline success rate (46%) on the [HELM benchmark](https://crfm.stanford.edu/helm/latest/?group=code_humaneval), the tuned config has a success rate of 68%. It can be further improved if the inference budget and optimization budget are further increased."
"You can use flaml's `oai.Completion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.Completion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. The cost is about $7 if you uncomment it and run it."
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 25,
"metadata": {
"execution": {
"iopub.execute_input": "2023-02-24T23:26:39.347295Z",
@@ -941,14 +803,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'expected_success': 0.6364503360372493, 'success': 0.6805555555555556, 'total_cost': 12.210191999999997, 'cost': 8.181360000000003, 'inference_cost': 0.056815}\n"
"{'expected_success': 0.8326778348739547, 'success': 0.8472222222222222, 'total_cost': 10.024478799999999, 'cost': 7.01217, 'inference_cost': 0.049131249999999994}\n"
]
}
],
"source": [
"oai.Completion.data = test_data\n",
"result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)\n",
"print(result)\n"
"# oai.Completion.data = test_data\n",
"# result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)\n",
"# print(result)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The result will vary with the inference budget and optimization budget.\n"
]
}
],
@@ -968,7 +838,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
"version": "3.9.16"
},
"vscode": {
"interpreter": {
setup.py (2 changed lines)
@@ -120,7 +120,7 @@ setuptools.setup(
"pytorch-forecasting>=0.9.0",
],
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"],
"openai": ["openai==0.23.1", "diskcache", "optuna==2.8.0"],
"openai": ["openai==0.27.0", "diskcache", "optuna==2.8.0"],
"synapse": ["joblibspark>=0.5.0", "optuna==2.8.0", "pyspark>=3.2.0"],
},
classifiers=[
@@ -109,7 +109,6 @@ def test_humaneval(num_samples=1):
)
responses = oai.ChatCompletion.create(context=test_data[0], **config)
print(responses)
return
# a more comprehensive tuning example
config, analysis = oai.Completion.tune(
data=tune_data,
@@ -43,7 +43,7 @@ def test_integrate_openai(save=False):
reason="do not run openai test if openai is not installed",
)
def test_integrate_chatgpt(save=False):
run_notebook("integrate_chatgpt_math.ipynb", save=save)
run_notebook("integrate_chatgpt.ipynb", save=save)
if __name__ == "__main__":
@@ -1,4 +1,5 @@
FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673). In this example, we will tune several hyperparameters for the OpenAI's completion API, including the temperature, prompt and n (number of completions), to optimize the inference performance for a code generation task. Our study shows that tuning hyperparameters can significantly affect the utility of the OpenAI API.
FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of the OpenAI API.
In this example, we will tune several hyperparameters for OpenAI's completion API, including the temperature, the prompt, and n (the number of completions), to optimize inference performance for a code generation task.
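
For a concrete sense of what is being searched, the space can be sketched with `flaml.tune` primitives roughly as follows. This is an illustration only: the ranges are placeholders, and the exact arguments accepted by `oai.Completion.tune` appear in the full call later on this page.

```python
from flaml import tune

# A sketch of the hyperparameter space explored in this example (placeholder ranges):
search_space_sketch = {
    "temperature": tune.uniform(0, 1),                          # sampling randomness
    "prompt": tune.choice(["{prompt}", "# Python 3{prompt}"]),  # candidate prompt templates
    "n": tune.randint(1, 101),                                  # completions per instance
}
```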
### Prerequisites
@@ -7,7 +8,6 @@ Install the [openai] option. The OpenAI integration is in preview. ChatGPT support
pip install "flaml[openai]==1.2.0"
```
Set up your OpenAI key:
```python
import os
@@ -122,7 +122,7 @@ def success_metrics(responses, prompt, test, entry_point):
### Tuning Hyperparameters for OpenAI
The tuning will take a while to finish, depending on the optimization budget (~1 min for the current budget). The tuning will be performed under the specified optimization budgets.
The tuning will be performed under the specified optimization budgets.
* inference_budget is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.
* optimization_budget is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.
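
As a quick sanity check on these numbers, the token equivalents follow directly from the per-1K-token price (a sketch assuming the $0.02/1K text-davinci rate quoted above; actual prices vary by model and may change):

```python
# Convert dollar budgets into approximate token budgets at $0.02 per 1K tokens.
price_per_1k_tokens = 0.02
inference_budget = 0.02         # dollars per instance
optimization_budget = 5         # dollars in total
tokens_per_instance = inference_budget / price_per_1k_tokens * 1000     # -> 1000 tokens
total_tuning_tokens = optimization_budget / price_per_1k_tokens * 1000  # -> 250,000 tokens
```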
@@ -142,15 +142,6 @@ config, analysis = oai.Completion.tune(
# num_samples can further limit the number of trials for different hyperparameter configurations;
# -1 means decided by the optimization budget only
num_samples=-1,
model=tune.choice(
[
# These two models are in Beta test and free to use from OpenAI as of Feb 2023,
# so no actual cost will incur (please double check when you run it). They are not free in Azure OpenAI.
# The optimization is based on the price in Azure OpenAI as of Feb 2023.
"code-cushman-001",
"code-davinci-002",
]
),
prompt=[
"{prompt}",
"# Python 3{prompt}",
@@ -182,7 +173,7 @@ print(success_metrics([response["text"].rstrip() for response in responses["choi
#### Evaluate the success rate on the test data
You can use flaml's oai.Completion.eval to evaluate the performance of an entire dataset with the tuned config. To do that you need to set oai.Completion.data to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. Compared to the baseline success rate (0.46) on the HELM benchmark, the tuned config has a success rate of 0.68. It can be further improved if the inference budget and optimization budget are further increased.
You can use flaml's `oai.Completion.eval` to evaluate the performance of a tuned config on an entire dataset. To do that, you need to set `oai.Completion.data` to the data to evaluate.
```python
oai.Completion.data = test_data
@@ -190,4 +181,6 @@ result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)
print(result)
```
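
When post-processing the raw responses yourself (as in the `success_metrics` call above), note that chat models such as gpt-3.5-turbo and gpt-4 return each completion as a message rather than plain text, so you need to read the right field. A minimal helper sketch (assuming `from flaml import oai` and a `responses` object returned by `oai.Completion.create`):

```python
def extract_texts(responses, model):
    # Chat models wrap the completion in a message dict; text-completion
    # models return it directly in the "text" field.
    if model in oai.Completion.chat_models:
        return [choice["message"]["content"] for choice in responses["choices"]]
    return [choice["text"].rstrip() for choice in responses["choices"]]
```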
The result will vary with the inference budget and optimization budget.
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb)