Update components for AOAI Evals (#3535)
* add new environment aoai-evaluation
* remove azureml-metrics from dependencies
* pin packages
* reorder logic for batch score result
* fix code style issue
* remove env
* update env name

---------

Co-authored-by: Ankush Bhatia <coolankush07@gmail.com>
Parent: cc7556dd57
Commit: 702cec7349
@@ -4,7 +4,7 @@ type: command
 name: batch_benchmark_config_generator
 display_name: Batch Benchmark Config Generator
 description: Generates the config for the batch score component.
-version: 0.0.8
+version: 0.0.9
 is_deterministic: true

 inputs:
@@ -86,7 +86,7 @@ outputs:
     description: The config json file for the batch score component.

 code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest
 command: >-
   python -m aml_benchmark.batch_config_generator.main
   --scoring_url '${{inputs.scoring_url}}'

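The hunk above swaps the config generator's environment reference from `model-evaluation` to `evaluation`, still pinned to the registry's `latest` label. As a rough illustration (not part of this change), a `labels/latest` reference like the one in the spec can be resolved with the `azure-ai-ml` SDK; the credential setup below is an assumption for the sketch.

```python
# Sketch: resolve what azureml://registries/azureml/environments/evaluation/labels/latest
# points to. Assumes azure-ai-ml and azure-identity are installed; auth setup is illustrative.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Client scoped to the public "azureml" registry rather than a workspace.
ml_client = MLClient(credential=DefaultAzureCredential(), registry_name="azureml")

# "labels/latest" in the spec corresponds to fetching the environment by label.
env = ml_client.environments.get(name="evaluation", label="latest")
print(env.name, env.version)
```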
@@ -4,7 +4,7 @@ type: command
 name: batch_inference_preparer
 display_name: Batch Inference Preparer
 description: Prepare the jsonl file and endpoint for batch inference component.
-version: 0.0.13
+version: 0.0.14
 is_deterministic: true

 inputs:
@@ -65,7 +65,7 @@ outputs:
     description: Path to the folder where the ground truth metadata will be stored.

 code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest
 command: >-
   python -m aml_benchmark.batch_inference_preparer.main
   --batch_input_pattern '${{inputs.batch_input_pattern}}'

@@ -1,5 +1,5 @@
 name: batch_output_formatter
-version: 0.0.13
+version: 0.0.14
 display_name: Batch Output Formatter
 is_deterministic: True
 type: command
@@ -71,7 +71,7 @@ outputs:
   unsafe_content_blocked_requests:
     type: uri_file
 code: ../src
-environment: azureml://registries/azureml/environments/model-evaluation/labels/latest
+environment: azureml://registries/azureml/environments/evaluation/labels/latest

 resources:
   instance_count: 1

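All three component specs get the same treatment: a one-patch version bump plus the environment rename. A consumer pinning these components from the registry would pick up the new environment by requesting the bumped versions; the sketch below assumes the components are published to the public `azureml` registry, as the environment URIs suggest.

```python
# Sketch: fetch the bumped component versions from the registry (illustrative only).
# Assumes azure-ai-ml is installed and the components are published to the "azureml" registry.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

registry_client = MLClient(credential=DefaultAzureCredential(), registry_name="azureml")

# Versions match the bumps in the hunks above.
config_generator = registry_client.components.get("batch_benchmark_config_generator", version="0.0.9")
inference_preparer = registry_client.components.get("batch_inference_preparer", version="0.0.14")
output_formatter = registry_client.components.get("batch_output_formatter", version="0.0.14")
```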
@@ -5,7 +5,7 @@

 import argparse
 import os
-from typing import Optional
+from typing import Optional, List, Dict, Any
 import pandas as pd

 from aml_benchmark.utils.io import resolve_io_path, read_jsonl_files, save_list_to_jsonl_if_path_provided
@@ -13,6 +13,7 @@ from aml_benchmark.utils.logging import get_logger, log_params_and_metrics
 from aml_benchmark.utils.exceptions import swallow_all_exceptions
 from aml_benchmark.utils.aml_run_utils import str2bool
 from aml_benchmark.utils.online_endpoint.online_endpoint_model import OnlineEndpointModel
+from aml_benchmark.utils.online_endpoint.endpoint_utils import EndpointUtilities
 from aml_benchmark.utils.exceptions import BenchmarkUserException
 from aml_benchmark.utils.error_definitions import BenchmarkUserError
 from azureml._common._error_definition.azureml_error import AzureMLError
@@ -58,6 +59,25 @@ def parse_args() -> argparse.Namespace:
     return args


+def _reorder_batch_score_result(
+        batch_score_result: List[Dict[str, Any]],
+        ground_truth: List[Dict[str, Any]],
+        model_type: str,
+) -> List[Dict[str, Any]]:
+    """Reorder the batch score result based on the ground truth."""
+    model = OnlineEndpointModel(model=None, model_version=None, model_type=model_type)
+    batch_score_dict = {
+        EndpointUtilities.hash_payload_prompt(row["request"], model): row for row in batch_score_result
+    }
+
+    batch_score_result = []
+    for ground_truth_dict in ground_truth:
+        hash_val = ground_truth_dict["payload_id"]
+        batch_score_result.append(batch_score_dict[hash_val])
+
+    return batch_score_result
+
+
 @swallow_all_exceptions(logger)
 def main(
     batch_inference_output: str,
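The new `_reorder_batch_score_result` helper aligns out-of-order batch scoring output with the ground-truth order: each result's request payload is hashed and then looked up by the `payload_id` stored alongside the ground truth. A minimal, self-contained sketch of the same idea is below; the `stable_hash` helper and the toy records are stand-ins for `EndpointUtilities.hash_payload_prompt` and the real payloads, not the component's actual code.

```python
# Minimal sketch of the reorder-by-payload-hash idea (illustrative, not the component code).
# stable_hash stands in for EndpointUtilities.hash_payload_prompt.
import hashlib
import json
from typing import Any, Dict, List


def stable_hash(payload: Dict[str, Any]) -> str:
    """Deterministic key for a request payload."""
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()


def reorder(results: List[Dict[str, Any]], ground_truth: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return results re-sorted into the order of the ground-truth rows."""
    by_id = {stable_hash(row["request"]): row for row in results}
    return [by_id[gt["payload_id"]] for gt in ground_truth]


# Toy data: scoring output arrives in a different order than the ground truth.
results = [{"request": {"prompt": "b"}, "response": 2},
           {"request": {"prompt": "a"}, "response": 1}]
ground_truth = [{"payload_id": stable_hash({"prompt": "a"}), "label": "A"},
                {"payload_id": stable_hash({"prompt": "b"}), "label": "B"}]
print(reorder(results, ground_truth))  # responses now line up as 1, 2
```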
@@ -108,10 +128,12 @@ def main(
     :return: None
     """
     logger.info("Read batch output data now.")
-    data_files = [
-        f for f in os.listdir(batch_inference_output) if f.endswith("json") or f.endswith("jsonl")
+    data_files = [os.path.join(
+        batch_inference_output, f
+    ) for f in os.listdir(batch_inference_output) if f.endswith("json") or f.endswith("jsonl")
     ]
-    logger.info(f"Receiving {data_files}")
+    logger.info(f"Receiving {len(data_files)} files.")
+    batch_score_result: List[Dict[str, Any]] = read_jsonl_files(data_files)

     prediction_list = []
     perf_list = []
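The rewritten `data_files` comprehension joins each filename onto `batch_inference_output`, since `os.listdir` returns bare names and downstream readers need full paths; the whole output is then loaded once as a list of dicts via `read_jsonl_files`. The sketch below shows the behaviour this refactor assumes, with a stand-in reader and an illustrative directory name in place of the repo's `read_jsonl_files` and the real output path.

```python
# Sketch: collect full paths to the batch output files and read them as one list of dicts.
# read_jsonl demonstrates the behaviour assumed of aml_benchmark.utils.io.read_jsonl_files.
import json
import os
from typing import Any, Dict, List


def read_jsonl(paths: List[str]) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    for path in paths:
        with open(path, encoding="utf-8") as fh:
            rows.extend(json.loads(line) for line in fh if line.strip())
    return rows


output_dir = "batch_inference_output"  # illustrative directory name
data_files = [os.path.join(output_dir, f)
              for f in os.listdir(output_dir)
              if f.endswith("json") or f.endswith("jsonl")]
batch_score_result = read_jsonl(data_files)
```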
@@ -121,7 +143,17 @@ def main(
     successful_response_list = []
     if ground_truth_input:
         input_file_paths = resolve_io_path(ground_truth_input)
-        ground_truth_df = pd.DataFrame(read_jsonl_files(input_file_paths))
+        _ground_truth_data = read_jsonl_files(input_file_paths)
+        try:
+            batch_score_result = _reorder_batch_score_result(batch_score_result, _ground_truth_data, model_type)
+            logger.info("Reordered batch score result successfully.")
+        except Exception as e:
+            logger.warning(
+                "Failed to reorder batch score result, falling back to original order. "
+                f"This exception does not lead to run failure. Exception details:\n{e}"
+            )
+        ground_truth_df = pd.DataFrame(_ground_truth_data)
+        _ground_truth_data = []
     else:
         ground_truth_df = None
     online_model = OnlineEndpointModel(None, None, model_type, endpoint_url=endpoint_url)
@@ -135,36 +167,33 @@ def main(
     failed_requests = 0
     successful_requests = 0
     safety_blocked_requests = 0
-    for f in data_files:
-        logger.info(f"Processing file {f}")
-        df = pd.read_json(os.path.join(batch_inference_output, f), lines=True)
-        for index, row in df.iterrows():
-            if rc.is_result_content_safety_failure(row):
-                # check for safety failure before result success.
-                # blocked requests can be fail or success responses.
-                safety_blocked_requests += 1
-                logger.warning("Met request blocked due to safety at index {} of file {}".format(index, f))
-                blocked_response_list.append(row)
-                if handle_response_failure == 'neglect':
-                    continue
-            elif not rc.is_result_success(row):
-                failed_requests += 1
-                logger.warning("Met failed response at index {} of file {}".format(index, f))
-                failed_response_list.append(row)
-                if handle_response_failure == 'neglect':
-                    continue
-            else:
-                successful_requests += 1
-                successful_response_list.append(row)
-            prediction_list.append(rc.convert_result(row))
-            if rc.is_result_success(row):
-                # Don't calculate perf for failed requests.
-                perf_list.append(rc.convert_result_perf(row, use_tiktoken))
-            if not is_performance_test:
-                ground_truth_list.append(rc.convert_result_ground_truth(row))
-            else:
-                logger.info("is performance test")
-                ground_truth_list.append({"ground_truth": ''})
+    for index, row in enumerate(batch_score_result):
+        if rc.is_result_content_safety_failure(row):
+            # check for safety failure before result success.
+            # blocked requests can be fail or success responses.
+            safety_blocked_requests += 1
+            logger.warning("Met request blocked due to safety at index {}".format(index))
+            blocked_response_list.append(row)
+            if handle_response_failure == 'neglect':
+                continue
+        elif not rc.is_result_success(row):
+            failed_requests += 1
+            logger.warning("Met failed response at index {}".format(index))
+            failed_response_list.append(row)
+            if handle_response_failure == 'neglect':
+                continue
+        else:
+            successful_requests += 1
+            successful_response_list.append(row)
+        prediction_list.append(rc.convert_result(row))
+        if rc.is_result_success(row):
+            # Don't calculate perf for failed requests.
+            perf_list.append(rc.convert_result_perf(row, use_tiktoken))
+        if not is_performance_test:
+            ground_truth_list.append(rc.convert_result_ground_truth(row))
+        else:
+            logger.info("is performance test")
+            ground_truth_list.append({"ground_truth": ''})
     logger.info("Output data now.")

     save_list_to_jsonl_if_path_provided(prediction_list, prediction_path)
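With the results already materialised as a list of dicts, the classification loop no longer re-reads each file into a DataFrame; it walks `batch_score_result` directly and buckets rows into blocked, failed, and successful. The sketch below mirrors that control flow with stub predicates standing in for `rc.is_result_content_safety_failure` and `rc.is_result_success`; it is a simplified illustration, not the component's converter logic.

```python
# Sketch of the row-classification flow with stub predicates (the real checks live in the
# result converter `rc`); 'neglect' drops blocked/failed rows from the prediction output.
from typing import Any, Dict, List


def is_safety_blocked(row: Dict[str, Any]) -> bool:  # stand-in for rc.is_result_content_safety_failure
    return row.get("status") == "blocked"


def is_success(row: Dict[str, Any]) -> bool:  # stand-in for rc.is_result_success
    return row.get("status") == "success"


def classify(rows: List[Dict[str, Any]], handle_response_failure: str = "neglect"):
    blocked, failed, predictions = [], [], []
    for index, row in enumerate(rows):
        if is_safety_blocked(row):  # safety check comes before the success check
            blocked.append(row)
            if handle_response_failure == "neglect":
                continue
        elif not is_success(row):
            failed.append(row)
            if handle_response_failure == "neglect":
                continue
        predictions.append(row.get("response"))
    return blocked, failed, predictions


rows = [{"status": "success", "response": "ok"},
        {"status": "failed"},
        {"status": "blocked"}]
print(classify(rows))  # ([{'status': 'blocked'}], [{'status': 'failed'}], ['ok'])
```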