add score logic extraction from evaluation sdk for online evaluation feature (#3584)
* add score logic extraction from evaluation sdk
* fix indentation
* fix indentation
* fix doc
* add default trace_id, span_id if the user doesn't provide them
* fix flake8 reqs
* change score to float and correct logs
* resolve flake issues
* revert unnecessary file changes
* made numeric values compulsory, add base condition
* flake8 whitespace fix
---------
Co-authored-by: apeddauppari <apeddauppari@microsoft.com>
This commit is contained in:
Parent: f6e0a97a18
Commit: 6337cb99ee
@@ -5,19 +5,20 @@
import argparse
import json
import logging
from collections import defaultdict
import importlib
import sys
import shutil
import mlflow
import os
import pandas as pd

from typing import Any, Dict, List, Tuple
from promptflow.client import load_flow
from azure.ai.evaluation import evaluate
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
import pandas as pd
from utils import get_mlclient, extract_model_info
from collections import defaultdict

import os
from utils import get_mlclient, extract_model_info

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -157,13 +158,9 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
    for evaluator_name in evaluators:
        result_key = f"outputs.{evaluator_name}"
        filtered_result = {k: v for k, v in result.items() if k.startswith(result_key)}
        if len(filtered_result) == 1:
            final_results[evaluator_name].append(filtered_result[list(filtered_result.keys())[0]])
        else:
            if len(filtered_result) == 0:
                logger.warning(f"No output score generated for current evaluator {evaluator_name}")
            logger.info(f"Found multiple results for {evaluator_name}. Adding as json string.")
            final_results[evaluator_name].append(json.dumps(filtered_result))
            _, score_value = extract_score(filtered_result)
            final_results[evaluator_name].append(score_value)

    final_results = pd.DataFrame(final_results)
    logger.info(final_results)
    final_results.to_json(command_line_args["evaluated_data"], orient="records", lines=True)
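For context, a minimal sketch (not from the repository; the row contents are invented) of how the outputs.<evaluator> prefix filtering above behaves before the result is handed to the new extract_score call:

# Hypothetical evaluate() output row, for illustration only.
result = {
    "inputs.query": "What is the capital of France?",
    "outputs.relevance.relevance_score": 5.0,
    "outputs.relevance.relevance_reason": "The answer is grounded.",
    "outputs.fluency.fluency_score": 4.0,
}
evaluator_name = "relevance"
result_key = f"outputs.{evaluator_name}"
filtered_result = {k: v for k, v in result.items() if k.startswith(result_key)}
# filtered_result keeps both relevance keys, so the updated code passes it to
# extract_score() instead of appending json.dumps(filtered_result).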
@@ -176,6 +173,48 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
    mlflow.log_artifact("instance_results.jsonl")


def extract_score(data: Dict[str, Any]) -> Tuple[List[str], float]:
    """Extract the float score value from the evaluation result."""
    # Step 1: If data is None/Empty, return empty list and 0.0
    if not data:
        return [], 0.0

    # Step 2: Filter out non-numeric valued keys
    numeric_keys = {}
    for k, v in data.items():
        try:
            numeric_keys[k] = float(v)
        except Exception:
            continue

    if len(numeric_keys) == 0:
        return [], 0.0

    if len(numeric_keys) == 1:
        return list(numeric_keys.keys()), float(list(numeric_keys.values())[0])

    # Step 3: Try for keys with '_score' suffix
    score_keys = {k: v for k, v in numeric_keys.items() if k.endswith('_score')}

    if len(score_keys) == 1:
        return list(score_keys.keys()), float(list(score_keys.values())[0])

    # Step 4: Deal with no '_score' suffix
    if len(score_keys) == 0:
        non_gpt_keys = {k: v for k, v in numeric_keys.items() if not k.startswith('gpt_')}

        if len(non_gpt_keys) == 1:
            return list(non_gpt_keys.keys()), float(list(non_gpt_keys.values())[0])

        if len(non_gpt_keys) == 0:
            return list(numeric_keys.keys()), sum(numeric_keys.values()) / len(numeric_keys)

        return list(non_gpt_keys.keys()), sum(non_gpt_keys.values()) / len(non_gpt_keys)

    # Step 5: If multiple '_score' keys, return average of values
    return list(score_keys.keys()), sum(score_keys.values()) / len(score_keys)


rai_evaluators = [
    "Sexual-Content-Evaluator",
    "Hate-and-Unfairness-Evaluator",
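As a rough illustration (assuming the extract_score function above is in scope; the key names and values below are invented), the selection order is: a single numeric key wins, then a lone '_score' key, then non-'gpt_' keys, with averaging as the fallback:

# Illustrative inputs only; the keys are hypothetical evaluator outputs.
single = {"outputs.fluency.fluency_score": "4"}
print(extract_score(single))            # (['outputs.fluency.fluency_score'], 4.0)

mixed = {
    "outputs.relevance.gpt_relevance": 4.0,
    "outputs.relevance.relevance_score": 5.0,
    "outputs.relevance.relevance_reason": "grounded",  # non-numeric, dropped in Step 2
}
print(extract_score(mixed))             # (['outputs.relevance.relevance_score'], 5.0)

no_score_suffix = {"accuracy": 0.5, "f1": 1.0}
print(extract_score(no_score_suffix))   # (['accuracy', 'f1'], 0.75) -- average fallback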
@@ -20,6 +20,9 @@ import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

DEFAULT_TRACE_ID_COLUMN = "operation_Id"
DEFAULT_SPAN_ID_COLUMN = "operation_ParentId"


def get_args():
    """Get arguments from the command line."""
@@ -51,7 +54,7 @@ def configure_logging(args) -> LoggerProvider:
def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):
    """Log evaluation event."""
    for name, value in evaluation.items():
        attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": json.dumps(value),
        attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": value,
                      "gen_ai.response_id": response_id, "service.name": service_name}
        body = f"gen_ai.evaluation for response_id: {response_id}"
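The one-line change above means the gen_ai.evaluation.score attribute now carries the numeric value itself rather than its JSON-encoded string, matching the "change score to float" item in the commit message. A minimal sketch with an invented value:

import json

value = 4.0
old_attr = json.dumps(value)   # "4.0" -- previously logged as a string
new_attr = value               # 4.0   -- now logged as a float
print(type(old_attr).__name__, type(new_attr).__name__)   # str float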
@@ -71,11 +74,14 @@ def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):

def log_evaluation_event(row) -> None:
    """Log evaluation event."""
    if "trace_id" not in row or "span_id" not in row or "evaluation" not in row:
        logger.warning("Missing required fields in the row: trace_id, span_id, evaluation")

    trace_id = int(row.get("trace_id", "0"), 16)
    span_id = int(row.get("span_id", "0"), 16)
    if "evaluation" not in row:
        logger.warning("Missing required fields in the row: evaluation")
    if "trace_id" not in row:
        logger.debug(f"Missing trace_id from user query result, taking default of column {DEFAULT_TRACE_ID_COLUMN}")
    trace_id = int(row.get("trace_id", row.get(DEFAULT_TRACE_ID_COLUMN, "0")), 16)
    if "span_id" not in row:
        logger.debug(f"Missing span_id from user query result, taking default of column {DEFAULT_SPAN_ID_COLUMN}")
    span_id = int(row.get("span_id", row.get(DEFAULT_SPAN_ID_COLUMN, "0")), 16)
    trace_flags = TraceFlags(TraceFlags.SAMPLED)
    response_id = row.get("gen_ai_response_id", "")
    evaluation_results = row.get("evaluation", {})
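A small sketch of the fallback added above: when the evaluated row has no explicit trace_id/span_id, the Application Insights columns are parsed as hex instead (the sample row and its values are invented):

DEFAULT_TRACE_ID_COLUMN = "operation_Id"
DEFAULT_SPAN_ID_COLUMN = "operation_ParentId"

# Hypothetical query result row with only the default columns present.
row = {
    "operation_Id": "4bf92f3577b34da6a3ce929d0e0e4736",
    "operation_ParentId": "00f067aa0ba902b7",
    "evaluation": {"relevance": 5.0},
}

trace_id = int(row.get("trace_id", row.get(DEFAULT_TRACE_ID_COLUMN, "0")), 16)
span_id = int(row.get("span_id", row.get(DEFAULT_SPAN_ID_COLUMN, "0")), 16)
print(hex(trace_id), hex(span_id))   # falls back to the operation_* columns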
@@ -9,4 +9,4 @@ azure-monitor-query=={{latest-pypi-version}}
croniter=={{latest-pypi-version}}
azure-monitor-opentelemetry=={{latest-pypi-version}}
promptflow-azure=={{latest-pypi-version}}
azure-identity=={{latest-pypi-version}}
azure-identity=={{latest-pypi-version}}