Update components for AOAI Evals (#3535)

* add new environment aoai-evaluation

* remove azureml-metrics from dependencies

* pin packages

* reorder logic for batch score result

* fix code style issue

* remove env

* update env name

---------

Co-authored-by: Ankush Bhatia <coolankush07@gmail.com>
Rahul Kumar 2024-11-08 17:53:20 +05:30, committed by GitHub
Parent cc7556dd57
Commit 702cec7349
No key matching this signature was found
GPG key ID: B5690EEEBB952194
4 changed files: 70 additions and 41 deletions


@@ -4,7 +4,7 @@ type: command
name: batch_benchmark_config_generator
display_name: Batch Benchmark Config Generator
description: Generates the config for the batch score component.
-version: 0.0.8
+version: 0.0.9
is_deterministic: true

inputs:
@@ -86,7 +86,7 @@ outputs:
    description: The config json file for the batch score component.
code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest
command: >-
  python -m aml_benchmark.batch_config_generator.main
  --scoring_url '${{inputs.scoring_url}}'


@@ -4,7 +4,7 @@ type: command
name: batch_inference_preparer
display_name: Batch Inference Preparer
description: Prepare the jsonl file and endpoint for batch inference component.
-version: 0.0.13
+version: 0.0.14
is_deterministic: true

inputs:
@@ -65,7 +65,7 @@ outputs:
    description: Path to the folder where the ground truth metadata will be stored.
code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest
command: >-
  python -m aml_benchmark.batch_inference_preparer.main
  --batch_input_pattern '${{inputs.batch_input_pattern}}'


@@ -1,5 +1,5 @@
name: batch_output_formatter
-version: 0.0.13
+version: 0.0.14
display_name: Batch Output Formatter
is_deterministic: True
type: command
@@ -71,7 +71,7 @@ outputs:
  unsafe_content_blocked_requests:
    type: uri_file
code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest
resources:
  instance_count: 1


@@ -5,7 +5,7 @@
import argparse
import os
-from typing import Optional
+from typing import Optional, List, Dict, Any

import pandas as pd

from aml_benchmark.utils.io import resolve_io_path, read_jsonl_files, save_list_to_jsonl_if_path_provided
@@ -13,6 +13,7 @@ from aml_benchmark.utils.logging import get_logger, log_params_and_metrics
from aml_benchmark.utils.exceptions import swallow_all_exceptions
from aml_benchmark.utils.aml_run_utils import str2bool
from aml_benchmark.utils.online_endpoint.online_endpoint_model import OnlineEndpointModel
+from aml_benchmark.utils.online_endpoint.endpoint_utils import EndpointUtilities
from aml_benchmark.utils.exceptions import BenchmarkUserException
from aml_benchmark.utils.error_definitions import BenchmarkUserError
from azureml._common._error_definition.azureml_error import AzureMLError
@@ -58,6 +59,25 @@ def parse_args() -> argparse.Namespace:
    return args


+def _reorder_batch_score_result(
+        batch_score_result: List[Dict[str, Any]],
+        ground_truth: List[Dict[str, Any]],
+        model_type: str,
+) -> List[Dict[str, Any]]:
+    """Reorder the batch score result based on the ground truth."""
+    model = OnlineEndpointModel(model=None, model_version=None, model_type=model_type)
+    batch_score_dict = {
+        EndpointUtilities.hash_payload_prompt(row["request"], model): row for row in batch_score_result
+    }
+    batch_score_result = []
+    for ground_truth_dict in ground_truth:
+        hash_val = ground_truth_dict["payload_id"]
+        batch_score_result.append(batch_score_dict[hash_val])
+    return batch_score_result


@swallow_all_exceptions(logger)
def main(
    batch_inference_output: str,
@@ -108,10 +128,12 @@ def main(
    :return: None
    """
    logger.info("Read batch output data now.")
-    data_files = [
-        f for f in os.listdir(batch_inference_output) if f.endswith("json") or f.endswith("jsonl")
+    data_files = [os.path.join(
+        batch_inference_output, f
+    ) for f in os.listdir(batch_inference_output) if f.endswith("json") or f.endswith("jsonl")
    ]
-    logger.info(f"Receiving {data_files}")
+    logger.info(f"Receiving {len(data_files)} files.")
+    batch_score_result: List[Dict[str, Any]] = read_jsonl_files(data_files)
    prediction_list = []
    perf_list = []
@@ -121,7 +143,17 @@
    successful_response_list = []
    if ground_truth_input:
        input_file_paths = resolve_io_path(ground_truth_input)
-        ground_truth_df = pd.DataFrame(read_jsonl_files(input_file_paths))
+        _ground_truth_data = read_jsonl_files(input_file_paths)
+        try:
+            batch_score_result = _reorder_batch_score_result(batch_score_result, _ground_truth_data, model_type)
+            logger.info("Reordered batch score result successfully.")
+        except Exception as e:
+            logger.warning(
+                "Failed to reorder batch score result, falling back to original order. "
+                f"This exception does not lead to run failure. Exception details:\n{e}"
+            )
+        ground_truth_df = pd.DataFrame(_ground_truth_data)
+        _ground_truth_data = []
    else:
        ground_truth_df = None
    online_model = OnlineEndpointModel(None, None, model_type, endpoint_url=endpoint_url)
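The two hunks above introduce the reordering helper and then invoke it with a warn-and-fall-back guard. As a rough illustration of the idea (outside the diff), the sketch below rebuilds it with a simplified stand-in for EndpointUtilities.hash_payload_prompt, whose real implementation is not part of this change: scored rows are indexed by a hash of the request that produced them, then re-emitted in the order given by the ground truth's payload_id values. The names hash_payload and reorder_by_ground_truth are illustrative only.

import hashlib
import json
from typing import Any, Dict, List


def hash_payload(payload: Dict[str, Any]) -> str:
    """Stand-in for EndpointUtilities.hash_payload_prompt: a stable hash of a request payload."""
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode("utf-8")).hexdigest()


def reorder_by_ground_truth(
    scored_rows: List[Dict[str, Any]],
    ground_truth: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Return scored rows in the order dictated by the ground truth's payload_id values."""
    # Index every scored row by the hash of the request that produced it.
    scored_by_hash = {hash_payload(row["request"]): row for row in scored_rows}
    # Ground-truth rows carry the matching hash in payload_id, so a lookup restores input order.
    return [scored_by_hash[gt["payload_id"]] for gt in ground_truth]


if __name__ == "__main__":
    requests = [{"prompt": "first"}, {"prompt": "second"}]
    truth = [{"payload_id": hash_payload(r), "ground_truth": r["prompt"]} for r in requests]
    # Simulate the batch endpoint returning results out of order.
    scored = [
        {"request": requests[1], "response": "answer to second"},
        {"request": requests[0], "response": "answer to first"},
    ]
    print([row["response"] for row in reorder_by_ground_truth(scored, truth)])
    # -> ['answer to first', 'answer to second']

A missing or mismatched hash raises a KeyError in this sketch; the component wraps the real call in the try/except above so that such a mismatch only logs a warning and falls back to the original order instead of failing the run.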
@@ -135,36 +167,33 @@
    failed_requests = 0
    successful_requests = 0
    safety_blocked_requests = 0
-    for f in data_files:
-        logger.info(f"Processing file {f}")
-        df = pd.read_json(os.path.join(batch_inference_output, f), lines=True)
-        for index, row in df.iterrows():
-            if rc.is_result_content_safety_failure(row):
-                # check for safety failure before result success.
-                # blocked requests can be fail or success responses.
-                safety_blocked_requests += 1
-                logger.warning("Met request blocked due to safety at index {} of file {}".format(index, f))
-                blocked_response_list.append(row)
-                if handle_response_failure == 'neglect':
-                    continue
-            elif not rc.is_result_success(row):
-                failed_requests += 1
-                logger.warning("Met failed response at index {} of file {}".format(index, f))
-                failed_response_list.append(row)
-                if handle_response_failure == 'neglect':
-                    continue
-            else:
-                successful_requests += 1
-                successful_response_list.append(row)
-            prediction_list.append(rc.convert_result(row))
-            if rc.is_result_success(row):
-                # Don't calculate perf for failed requests.
-                perf_list.append(rc.convert_result_perf(row, use_tiktoken))
-            if not is_performance_test:
-                ground_truth_list.append(rc.convert_result_ground_truth(row))
-            else:
-                logger.info("is performance test")
-                ground_truth_list.append({"ground_truth": ''})
+    for index, row in enumerate(batch_score_result):
+        if rc.is_result_content_safety_failure(row):
+            # check for safety failure before result success.
+            # blocked requests can be fail or success responses.
+            safety_blocked_requests += 1
+            logger.warning("Met request blocked due to safety at index {}".format(index))
+            blocked_response_list.append(row)
+            if handle_response_failure == 'neglect':
+                continue
+        elif not rc.is_result_success(row):
+            failed_requests += 1
+            logger.warning("Met failed response at index {}".format(index))
+            failed_response_list.append(row)
+            if handle_response_failure == 'neglect':
+                continue
+        else:
+            successful_requests += 1
+            successful_response_list.append(row)
+        prediction_list.append(rc.convert_result(row))
+        if rc.is_result_success(row):
+            # Don't calculate perf for failed requests.
+            perf_list.append(rc.convert_result_perf(row, use_tiktoken))
+        if not is_performance_test:
+            ground_truth_list.append(rc.convert_result_ground_truth(row))
+        else:
+            logger.info("is performance test")
+            ground_truth_list.append({"ground_truth": ''})
    logger.info("Output data now.")
    save_list_to_jsonl_if_path_provided(prediction_list, prediction_path)
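The rewritten loop works because the per-file pd.read_json(..., lines=True) reads are replaced by a single read_jsonl_files(data_files) call earlier in main, which yields one flat list of row dictionaries. The sketch below shows the assumed shape of that consolidation; the real helper lives in aml_benchmark.utils.io and is not shown in this diff, and list_output_files merely mirrors the updated data_files comprehension, so both names are illustrative.

import json
import os
from typing import Any, Dict, List


def list_output_files(output_dir: str) -> List[str]:
    """Mirror of the updated data_files comprehension: full paths to .json/.jsonl outputs."""
    return [
        os.path.join(output_dir, name)
        for name in os.listdir(output_dir)
        if name.endswith("json") or name.endswith("jsonl")
    ]


def read_jsonl_rows(paths: List[str]) -> List[Dict[str, Any]]:
    """Assumed behaviour of read_jsonl_files: parse every line of every file into one flat list."""
    rows: List[Dict[str, Any]] = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as handle:
            rows.extend(json.loads(line) for line in handle if line.strip())
    return rows

Working over one flat list is what makes the earlier reordering against the ground truth possible, and it lets the classification loop index rows with a single enumerate instead of tracking per-file indices.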