add score logic extraction from evaluation sdk for online evaluation feature (#3584)

* add score logic extraction from evaluation sdk

* fix indentation

* fix indentation

* fix doc

* add default trace_id, span_id if user doesn't provide them

* fix flake8 reqs

* change score to float and correct logs

* resolve flake issues

* revert unnecessary file changes

* made numeric values compulsory, added base condition

* flake8 whitespace fix

---------

Co-authored-by: apeddauppari <apeddauppari@microsoft.com>
This commit is contained in:
Sai Kothinti 2024-11-11 20:03:49 +05:30 committed by GitHub
Parent f6e0a97a18
Commit 6337cb99ee
No known key found for this signature
GPG key ID: B5690EEEBB952194
3 changed files with 63 additions and 18 deletions


@@ -5,19 +5,20 @@
import argparse
import json
import logging
from collections import defaultdict
import importlib
import sys
import shutil
import mlflow
import os
import pandas as pd
from typing import Any, Dict, List, Tuple
from promptflow.client import load_flow
from azure.ai.evaluation import evaluate
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
import pandas as pd
from utils import get_mlclient, extract_model_info
from collections import defaultdict
import os
from utils import get_mlclient, extract_model_info
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -157,13 +158,9 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
         for evaluator_name in evaluators:
             result_key = f"outputs.{evaluator_name}"
             filtered_result = {k: v for k, v in result.items() if k.startswith(result_key)}
-            if len(filtered_result) == 1:
-                final_results[evaluator_name].append(filtered_result[list(filtered_result.keys())[0]])
-            else:
-                if len(filtered_result) == 0:
-                    logger.warning(f"No output score generated for current evaluator {evaluator_name}")
-                logger.info(f"Found multiple results for {evaluator_name}. Adding as json string.")
-                final_results[evaluator_name].append(json.dumps(filtered_result))
+            _, score_value = extract_score(filtered_result)
+            final_results[evaluator_name].append(score_value)
     final_results = pd.DataFrame(final_results)
     logger.info(final_results)
     final_results.to_json(command_line_args["evaluated_data"], orient="records", lines=True)
@@ -176,6 +173,48 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
         mlflow.log_artifact("instance_results.jsonl")
+def extract_score(data: Dict[str, Any]) -> Tuple[List[str], float]:
+    """Extract the float score value from the evaluation result."""
+    # Step 1: If data is None/Empty, return empty list and 0.0
+    if not data:
+        return [], 0.0
+    # Step 2: Filter out non-numeric valued keys
+    numeric_keys = {}
+    for k, v in data.items():
+        try:
+            numeric_keys[k] = float(v)
+        except Exception:
+            continue
+    if len(numeric_keys) == 0:
+        return [], 0.0
+    if len(numeric_keys) == 1:
+        return list(numeric_keys.keys()), float(list(numeric_keys.values())[0])
+    # Step 3: Try for keys with '_score' suffix
+    score_keys = {k: v for k, v in numeric_keys.items() if k.endswith('_score')}
+    if len(score_keys) == 1:
+        return list(score_keys.keys()), float(list(score_keys.values())[0])
+    # Step 4: Deal with no '_score' suffix
+    if len(score_keys) == 0:
+        non_gpt_keys = {k: v for k, v in numeric_keys.items() if not k.startswith('gpt_')}
+        if len(non_gpt_keys) == 1:
+            return list(non_gpt_keys.keys()), float(list(non_gpt_keys.values())[0])
+        if len(non_gpt_keys) == 0:
+            return list(numeric_keys.keys()), sum(numeric_keys.values()) / len(numeric_keys)
+        return list(non_gpt_keys.keys()), sum(non_gpt_keys.values()) / len(non_gpt_keys)
+    # Step 5: If multiple '_score' keys, return average of values
+    return list(score_keys.keys()), sum(score_keys.values()) / len(score_keys)
 rai_evaluators = [
     "Sexual-Content-Evaluator",
     "Hate-and-Unfairness-Evaluator",


@@ -20,6 +20,9 @@ import logging
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
+DEFAULT_TRACE_ID_COLUMN = "operation_Id"
+DEFAULT_SPAN_ID_COLUMN = "operation_ParentId"
 def get_args():
     """Get arguments from the command line."""
@@ -51,7 +54,7 @@ def configure_logging(args) -> LoggerProvider:
 def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):
     """Log evaluation event."""
     for name, value in evaluation.items():
-        attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": json.dumps(value),
+        attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": value,
                      "gen_ai.response_id": response_id, "service.name": service_name}
         body = f"gen_ai.evaluation for response_id: {response_id}"
@@ -71,11 +74,14 @@ def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):
 def log_evaluation_event(row) -> None:
     """Log evaluation event."""
-    if "trace_id" not in row or "span_id" not in row or "evaluation" not in row:
-        logger.warning("Missing required fields in the row: trace_id, span_id, evaluation")
-    trace_id = int(row.get("trace_id", "0"), 16)
-    span_id = int(row.get("span_id", "0"), 16)
+    if "evaluation" not in row:
+        logger.warning("Missing required fields in the row: evaluation")
+    if "trace_id" not in row:
+        logger.debug(f"Missing trace_id from user query result, taking default of column {DEFAULT_TRACE_ID_COLUMN}")
+    trace_id = int(row.get("trace_id", row.get(DEFAULT_TRACE_ID_COLUMN, "0")), 16)
+    if "span_id" not in row:
+        logger.debug(f"Missing span_id from user query result, taking default of column {DEFAULT_SPAN_ID_COLUMN}")
+    span_id = int(row.get("span_id", row.get(DEFAULT_SPAN_ID_COLUMN, "0")), 16)
     trace_flags = TraceFlags(TraceFlags.SAMPLED)
     response_id = row.get("gen_ai_response_id", "")
     evaluation_results = row.get("evaluation", {})
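
As a reference for the defaulting logic above, a minimal sketch with a made-up Application Insights row; the column values are hypothetical, and only the fallback to DEFAULT_TRACE_ID_COLUMN / DEFAULT_SPAN_ID_COLUMN and the base-16 parsing mirror the changed code:

row = {
    "operation_Id": "0af7651916cd43dd8448eb211c80319c",   # hypothetical trace context
    "operation_ParentId": "b7ad6b7169203331",
    "gen_ai_response_id": "resp-123",
    "evaluation": {"relevance": 4.0},                     # now emitted as a float score attribute, not a JSON string
}
trace_id = int(row.get("trace_id", row.get("operation_Id", "0")), 16)
span_id = int(row.get("span_id", row.get("operation_ParentId", "0")), 16)
print(trace_id, span_id)  # hex ids parsed to the integers OpenTelemetry expects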


@@ -9,4 +9,4 @@ azure-monitor-query=={{latest-pypi-version}}
croniter=={{latest-pypi-version}}
azure-monitor-opentelemetry=={{latest-pypi-version}}
promptflow-azure=={{latest-pypi-version}}
azure-identity=={{latest-pypi-version}}
azure-identity=={{latest-pypi-version}}