Merge branch 'main' into kaiyi/fix/designer-vulnerable-issues
This commit is contained in:
Коммит
4bcbbc581d
|
@ -0,0 +1,61 @@
|
|||
name: training-model-eval-unittests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- assets/training/model_evaluation/**
|
||||
- .github/workflows/training-model-eval-unittests.yaml
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
conda_env_prefix: /opt/conda/envs/model_eval
|
||||
model_eval_unittests_dir: assets/training/model_evaluation/tests
|
||||
model_eval_conda_yaml: assets/training/model_evaluation/environment/context/conda.yaml
|
||||
pytest_reports: pytest-reports
|
||||
|
||||
jobs:
|
||||
run_unit_tests:
|
||||
name: Run
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
# Required for EnricoMi/publish-unit-test-result-action
|
||||
checks: write
|
||||
issues: read
|
||||
pull-requests: write
|
||||
|
||||
steps:
|
||||
- name: Clone branch
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Use Python 3.10 or newer
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '>=3.10'
|
||||
|
||||
- name: Create conda env
|
||||
run: |
|
||||
set -ex
|
||||
# Steps on GitHub-hosted runners run as a non-root user, so apt needs sudo; -y keeps upgrade from aborting at the confirmation prompt.
sudo apt-get update && sudo apt-get upgrade -y && sudo apt-get install -y build-essential
|
||||
echo $conda_env_prefix
|
||||
sed -i 's/- pip:/- pip:\n - --extra-index-url https:\/\/download.pytorch.org\/whl\/cpu/' $model_eval_conda_yaml
|
||||
sed -i 's/=={{latest-pypi-version}}//g' $model_eval_conda_yaml
|
||||
sed -i 's/=={{latest-pypi-version:pre}}//g' $model_eval_conda_yaml
|
||||
echo " - pytest" >> $model_eval_conda_yaml
|
||||
echo " - pytest-xdist" >> $model_eval_conda_yaml
|
||||
cat $model_eval_conda_yaml
|
||||
conda env create -p $conda_env_prefix -f $model_eval_conda_yaml
|
||||
echo "conda env successfully created at $conda_env_prefix"
|
||||
conda list -p $conda_env_prefix
|
||||
|
||||
- name: Execute tests
|
||||
run: conda run -p $conda_env_prefix python -m pytest $model_eval_unittests_dir --tb=native --junitxml=$pytest_reports/test-result.xml -x -n 1 -ra --show-capture=no
|
||||
|
||||
- name: Publish test results
|
||||
uses: EnricoMi/publish-unit-test-result-action@v2
|
||||
if: always()
|
||||
with:
|
||||
check_name: Test Results for ${{ github.workflow }}
|
||||
junit_files: ${{ env.pytest_reports }}/**/*.xml
|
|
@ -5,19 +5,20 @@
|
|||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
import importlib
|
||||
import sys
|
||||
import shutil
|
||||
import mlflow
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
from typing import Any, Dict, List, Tuple
|
||||
from promptflow.client import load_flow
|
||||
from azure.ai.evaluation import evaluate
|
||||
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
|
||||
import pandas as pd
|
||||
from utils import get_mlclient, extract_model_info
|
||||
from collections import defaultdict
|
||||
|
||||
import os
|
||||
from utils import get_mlclient, extract_model_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
@ -157,13 +158,9 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
|
|||
for evaluator_name in evaluators:
|
||||
result_key = f"outputs.{evaluator_name}"
|
||||
filtered_result = {k: v for k, v in result.items() if k.startswith(result_key)}
|
||||
if len(filtered_result) == 1:
|
||||
final_results[evaluator_name].append(filtered_result[list(filtered_result.keys())[0]])
|
||||
else:
|
||||
if len(filtered_result) == 0:
|
||||
logger.warning(f"No output score generated for current evaluator {evaluator_name}")
|
||||
logger.info(f"Found multiple results for {evaluator_name}. Adding as json string.")
|
||||
final_results[evaluator_name].append(json.dumps(filtered_result))
|
||||
_, score_value = extract_score(filtered_result)
|
||||
final_results[evaluator_name].append(score_value)
|
||||
|
||||
final_results = pd.DataFrame(final_results)
|
||||
logger.info(final_results)
|
||||
final_results.to_json(command_line_args["evaluated_data"], orient="records", lines=True)
|
||||
|
@ -176,6 +173,48 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
|
|||
mlflow.log_artifact("instance_results.jsonl")
|
||||
|
||||
|
||||
def extract_score(data: Dict[str, Any]) -> Tuple[List[str], float]:
    """Extract a single float score from one evaluator's result entries.

    Selection rules, applied in order to the entries whose value can be
    converted with ``float()``:

    1. No convertible entries (or empty/``None`` input): ``([], 0.0)``.
    2. Exactly one convertible entry: that entry.
    3. Entries whose key ends with ``'_score'``: their mean.
    4. Otherwise, entries whose metric name does not start with ``'gpt_'``:
       their mean.
    5. Otherwise (every entry is a ``gpt_`` one): the mean of all entries.

    The metric name is the part of the key after the last ``'.'``. Callers
    in this file pass keys prefixed with ``outputs.<evaluator_name>``, so
    testing the whole key for a ``'gpt_'`` prefix would never match.

    Args:
        data: Mapping of result key to raw value; may be empty or ``None``.

    Returns:
        A tuple ``(keys, score)`` where ``keys`` lists the keys that
        contributed to ``score``.
    """
    # Step 1: nothing to score.
    if not data:
        return [], 0.0

    # Step 2: keep only the entries float() can convert. float() raises
    # TypeError (None, list, dict, ...), ValueError (non-numeric strings)
    # or OverflowError (ints too large for a float) for the rest.
    numeric_values: Dict[str, float] = {}
    for key, value in data.items():
        try:
            numeric_values[key] = float(value)
        except (TypeError, ValueError, OverflowError):
            continue

    if not numeric_values:
        return [], 0.0

    if len(numeric_values) == 1:
        (only_key, only_value), = numeric_values.items()
        return [only_key], only_value

    # Step 3: prefer keys carrying the '_score' suffix.
    selected = {k: v for k, v in numeric_values.items() if k.endswith('_score')}

    # Step 4: otherwise drop 'gpt_'-prefixed metrics, judged on the last
    # dot-separated segment so that dotted keys are handled as well.
    if not selected:
        selected = {
            k: v for k, v in numeric_values.items()
            if not k.rsplit('.', 1)[-1].startswith('gpt_')
        }

    # Step 5: only 'gpt_' metrics are available, so use all of them.
    if not selected:
        selected = numeric_values

    # A single selected entry averages to itself, so one expression covers
    # both the single-key and the multi-key outcome.
    return list(selected), sum(selected.values()) / len(selected)
|
||||
|
||||
|
||||
rai_evaluators = [
|
||||
"Sexual-Content-Evaluator",
|
||||
"Hate-and-Unfairness-Evaluator",
|
||||
|
|
|
@ -20,6 +20,9 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
DEFAULT_TRACE_ID_COLUMN = "operation_Id"
|
||||
DEFAULT_SPAN_ID_COLUMN = "operation_ParentId"
|
||||
|
||||
|
||||
def get_args():
|
||||
"""Get arguments from the command line."""
|
||||
|
@ -51,7 +54,7 @@ def configure_logging(args) -> LoggerProvider:
|
|||
def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):
|
||||
"""Log evaluation event."""
|
||||
for name, value in evaluation.items():
|
||||
attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": json.dumps(value),
|
||||
attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": value,
|
||||
"gen_ai.response_id": response_id, "service.name": service_name}
|
||||
body = f"gen_ai.evaluation for response_id: {response_id}"
|
||||
|
||||
|
@ -71,11 +74,14 @@ def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, eva
|
|||
|
||||
def log_evaluation_event(row) -> None:
|
||||
"""Log evaluation event."""
|
||||
if "trace_id" not in row or "span_id" not in row or "evaluation" not in row:
|
||||
logger.warning("Missing required fields in the row: trace_id, span_id, evaluation")
|
||||
|
||||
trace_id = int(row.get("trace_id", "0"), 16)
|
||||
span_id = int(row.get("span_id", "0"), 16)
|
||||
if "evaluation" not in row:
|
||||
logger.warning("Missing required fields in the row: evaluation")
|
||||
if "trace_id" not in row:
|
||||
logger.debug(f"Missing trace_id from user query result, taking default of column {DEFAULT_TRACE_ID_COLUMN}")
|
||||
trace_id = int(row.get("trace_id", row.get(DEFAULT_TRACE_ID_COLUMN, "0")), 16)
|
||||
if "span_id" not in row:
|
||||
logger.debug(f"Missing span_id from user query result, taking default of column {DEFAULT_SPAN_ID_COLUMN}")
|
||||
span_id = int(row.get("span_id", row.get(DEFAULT_SPAN_ID_COLUMN, "0")), 16)
|
||||
trace_flags = TraceFlags(TraceFlags.SAMPLED)
|
||||
response_id = row.get("gen_ai_response_id", "")
|
||||
evaluation_results = row.get("evaluation", {})
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: amazonpolarityclassification_cohere-embed-v3-english_classification
|
||||
version: 2.04.11
|
||||
display_name: AmazonPolarityClassification_cohere-embed-v3-english_classification
|
||||
description: cohere-embed-v3-english run for AmazonPolarityClassification dataset
|
||||
dataset_name: AmazonPolarityClassification
|
||||
dataset_family: AmazonPolarityClassification
|
||||
|
||||
model_name: cohere-embed-v3-english
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-english
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.927643
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: amazonpolarityclassification_cohere-embed-v3-multilingual_classification
|
||||
version: 2.04.11
|
||||
display_name: AmazonPolarityClassification_cohere-embed-v3-multilingual_classification
|
||||
description: cohere-embed-v3-multilingual run for AmazonPolarityClassification dataset
|
||||
dataset_name: AmazonPolarityClassification
|
||||
dataset_family: AmazonPolarityClassification
|
||||
|
||||
model_name: cohere-embed-v3-multilingual
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-multilingual
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.912307
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: amazonpolarityclassification_text-embedding-3-large_classification
|
||||
version: 2.04.11
|
||||
display_name: AmazonPolarityClassification_text-embedding-3-large_classification
|
||||
description: text-embedding-3-large run for AmazonPolarityClassification dataset
|
||||
dataset_name: AmazonPolarityClassification
|
||||
dataset_family: AmazonPolarityClassification
|
||||
|
||||
model_name: text-embedding-3-large
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-large
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.92868975
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: amazonpolarityclassification_text-embedding-3-small_classification
|
||||
version: 2.04.11
|
||||
display_name: AmazonPolarityClassification_text-embedding-3-small_classification
|
||||
description: text-embedding-3-small run for AmazonPolarityClassification dataset
|
||||
dataset_name: AmazonPolarityClassification
|
||||
dataset_family: AmazonPolarityClassification
|
||||
|
||||
model_name: text-embedding-3-small
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-small
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.90878075
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: amazonpolarityclassification_text-embedding-ada-002_classification
|
||||
version: 2.04.11
|
||||
display_name: AmazonPolarityClassification_text-embedding-ada-002_classification
|
||||
description: text-embedding-ada-002 run for AmazonPolarityClassification dataset
|
||||
dataset_name: AmazonPolarityClassification
|
||||
dataset_family: AmazonPolarityClassification
|
||||
|
||||
model_name: text-embedding-ada-002
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-ada-002
|
||||
azure_latest_model_version: 2
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
metrics:
|
||||
accuracy: 0.867263
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arguana_cohere-embed-v3-english_retrieval
|
||||
version: 2.04.11
|
||||
display_name: ArguAna_cohere-embed-v3-english_retrieval
|
||||
description: cohere-embed-v3-english run for ArguAna dataset
|
||||
dataset_name: ArguAna
|
||||
dataset_family: ArguAna
|
||||
|
||||
model_name: cohere-embed-v3-english
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: retrieval
|
||||
primary_metric: ndcg_at_10
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-english
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
|
||||
|
||||
metrics:
|
||||
ndcg_at_10: 0.57529
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arguana_cohere-embed-v3-multilingual_retrieval
|
||||
version: 2.04.11
|
||||
display_name: ArguAna_cohere-embed-v3-multilingual_retrieval
|
||||
description: cohere-embed-v3-multilingual run for ArguAna dataset
|
||||
dataset_name: ArguAna
|
||||
dataset_family: ArguAna
|
||||
|
||||
model_name: cohere-embed-v3-multilingual
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: retrieval
|
||||
primary_metric: ndcg_at_10
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-multilingual
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
|
||||
|
||||
metrics:
|
||||
ndcg_at_10: 0.57989
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arguana_text-embedding-3-large_retrieval
|
||||
version: 2.04.11
|
||||
display_name: ArguAna_text-embedding-3-large_retrieval
|
||||
description: text-embedding-3-large run for ArguAna dataset
|
||||
dataset_name: ArguAna
|
||||
dataset_family: ArguAna
|
||||
|
||||
model_name: text-embedding-3-large
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: retrieval
|
||||
primary_metric: ndcg_at_10
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-large
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
metrics:
|
||||
ndcg_at_10: 0.58013
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arguana_text-embedding-3-small_retrieval
|
||||
version: 2.04.11
|
||||
display_name: ArguAna_text-embedding-3-small_retrieval
|
||||
description: text-embedding-3-small run for ArguAna dataset
|
||||
dataset_name: ArguAna
|
||||
dataset_family: ArguAna
|
||||
|
||||
model_name: text-embedding-3-small
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: retrieval
|
||||
primary_metric: ndcg_at_10
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-small
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
metrics:
|
||||
ndcg_at_10: 0.55694
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arguana_text-embedding-ada-002_retrieval
|
||||
version: 2.04.11
|
||||
display_name: ArguAna_text-embedding-ada-002_retrieval
|
||||
description: text-embedding-ada-002 run for ArguAna dataset
|
||||
dataset_name: ArguAna
|
||||
dataset_family: ArguAna
|
||||
|
||||
model_name: text-embedding-ada-002
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: retrieval
|
||||
primary_metric: ndcg_at_10
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-ada-002
|
||||
azure_latest_model_version: 2
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
metrics:
|
||||
ndcg_at_10: 0.57455
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusteringp2p.v2_cohere-embed-v3-english_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringP2P.v2_cohere-embed-v3-english_clustering
|
||||
description: cohere-embed-v3-english run for ArxivClusteringP2P.v2 dataset
|
||||
dataset_name: ArxivClusteringP2P.v2
|
||||
dataset_family: ArxivClusteringP2P.v2
|
||||
|
||||
model_name: cohere-embed-v3-english
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-english
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.5081042703542442
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusteringp2p.v2_cohere-embed-v3-multilingual_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringP2P.v2_cohere-embed-v3-multilingual_clustering
|
||||
description: cohere-embed-v3-multilingual run for ArxivClusteringP2P.v2 dataset
|
||||
dataset_name: ArxivClusteringP2P.v2
|
||||
dataset_family: ArxivClusteringP2P.v2
|
||||
|
||||
model_name: cohere-embed-v3-multilingual
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-multilingual
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.5029184573976476
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusteringp2p.v2_text-embedding-3-large_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringP2P.v2_text-embedding-3-large_clustering
|
||||
description: text-embedding-3-large run for ArxivClusteringP2P.v2 dataset
|
||||
dataset_name: ArxivClusteringP2P.v2
|
||||
dataset_family: ArxivClusteringP2P.v2
|
||||
|
||||
model_name: text-embedding-3-large
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-large
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.519053128352996
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusteringp2p.v2_text-embedding-3-small_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringP2P.v2_text-embedding-3-small_clustering
|
||||
description: text-embedding-3-small run for ArxivClusteringP2P.v2 dataset
|
||||
dataset_name: ArxivClusteringP2P.v2
|
||||
dataset_family: ArxivClusteringP2P.v2
|
||||
|
||||
model_name: text-embedding-3-small
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-small
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.496692276507199
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusteringp2p.v2_text-embedding-ada-002_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringP2P.v2_text-embedding-ada-002_clustering
|
||||
description: text-embedding-ada-002 run for ArxivClusteringP2P.v2 dataset
|
||||
dataset_name: ArxivClusteringP2P.v2
|
||||
dataset_family: ArxivClusteringP2P.v2
|
||||
|
||||
model_name: text-embedding-ada-002
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-ada-002
|
||||
azure_latest_model_version: 2
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
metrics:
|
||||
v_measure: 0.4794210912494528
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusterings2s_cohere-embed-v3-english_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringS2S_cohere-embed-v3-english_clustering
|
||||
description: cohere-embed-v3-english run for ArxivClusteringS2S dataset
|
||||
dataset_name: ArxivClusteringS2S
|
||||
dataset_family: ArxivClusteringS2S
|
||||
|
||||
model_name: cohere-embed-v3-english
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-english
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.38872349524931893
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusterings2s_cohere-embed-v3-multilingual_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringS2S_cohere-embed-v3-multilingual_clustering
|
||||
description: cohere-embed-v3-multilingual run for ArxivClusteringS2S dataset
|
||||
dataset_name: ArxivClusteringS2S
|
||||
dataset_family: ArxivClusteringS2S
|
||||
|
||||
model_name: cohere-embed-v3-multilingual
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-multilingual
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.3910885755785807
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusterings2s_text-embedding-3-large_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringS2S_text-embedding-3-large_clustering
|
||||
description: text-embedding-3-large run for ArxivClusteringS2S dataset
|
||||
dataset_name: ArxivClusteringS2S
|
||||
dataset_family: ArxivClusteringS2S
|
||||
|
||||
model_name: text-embedding-3-large
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-large
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.4429783426306228
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusterings2s_text-embedding-3-small_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringS2S_text-embedding-3-small_clustering
|
||||
description: text-embedding-3-small run for ArxivClusteringS2S dataset
|
||||
dataset_name: ArxivClusteringS2S
|
||||
dataset_family: ArxivClusteringS2S
|
||||
|
||||
model_name: text-embedding-3-small
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-small
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
metrics:
|
||||
v_measure: 0.3940951744128959
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: arxivclusterings2s_text-embedding-ada-002_clustering
|
||||
version: 2.04.11
|
||||
display_name: ArxivClusteringS2S_text-embedding-ada-002_clustering
|
||||
description: text-embedding-ada-002 run for ArxivClusteringS2S dataset
|
||||
dataset_name: ArxivClusteringS2S
|
||||
dataset_family: ArxivClusteringS2S
|
||||
|
||||
model_name: text-embedding-ada-002
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: clustering
|
||||
primary_metric: v_measure
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-ada-002
|
||||
azure_latest_model_version: 2
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
metrics:
|
||||
v_measure: 0.3719179506563676
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: banking77classification_cohere-embed-v3-english_classification
|
||||
version: 2.04.11
|
||||
display_name: Banking77Classification_cohere-embed-v3-english_classification
|
||||
description: cohere-embed-v3-english run for Banking77Classification dataset
|
||||
dataset_name: Banking77Classification
|
||||
dataset_family: Banking77Classification
|
||||
|
||||
model_name: cohere-embed-v3-english
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-english
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.7934415584415586
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: banking77classification_cohere-embed-v3-multilingual_classification
|
||||
version: 2.04.11
|
||||
display_name: Banking77Classification_cohere-embed-v3-multilingual_classification
|
||||
description: cohere-embed-v3-multilingual run for Banking77Classification dataset
|
||||
dataset_name: Banking77Classification
|
||||
dataset_family: Banking77Classification
|
||||
|
||||
model_name: cohere-embed-v3-multilingual
|
||||
model_version: "3"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-embed-v3-multilingual
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.7934415584415585
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: banking77classification_text-embedding-3-large_classification
|
||||
version: 2.04.11
|
||||
display_name: Banking77Classification_text-embedding-3-large_classification
|
||||
description: text-embedding-3-large run for Banking77Classification dataset
|
||||
dataset_name: Banking77Classification
|
||||
dataset_family: Banking77Classification
|
||||
|
||||
model_name: text-embedding-3-large
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-large
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8572402597402597
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: banking77classification_text-embedding-3-small_classification
|
||||
version: 2.04.11
|
||||
display_name: Banking77Classification_text-embedding-3-small_classification
|
||||
description: text-embedding-3-small run for Banking77Classification dataset
|
||||
dataset_name: Banking77Classification
|
||||
dataset_family: Banking77Classification
|
||||
|
||||
model_name: text-embedding-3-small
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-3-small
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8299025974025973
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,28 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: banking77classification_text-embedding-ada-002_classification
|
||||
version: 2.04.11
|
||||
display_name: Banking77Classification_text-embedding-ada-002_classification
|
||||
description: text-embedding-ada-002 run for Banking77Classification dataset
|
||||
dataset_name: Banking77Classification
|
||||
dataset_family: Banking77Classification
|
||||
|
||||
model_name: text-embedding-ada-002
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_embeddings
|
||||
task: classification
|
||||
primary_metric: accuracy
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: text-embedding-ada-002
|
||||
azure_latest_model_version: 2
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8053246753246753
|
||||
|
||||
properties: {}
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq__gpt-4-0125-preview__question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "0125-Preview"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/0125-Preview
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4/versions/0125-Preview
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4
|
||||
azure_latest_model_version: turbo-2024-04-09
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
metrics:
|
||||
accuracy: 0.904892966
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Meta-Llama-3.1-70B-Instruct
|
||||
azure_latest_model_version: 3
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/3
|
||||
|
||||
metrics:
|
||||
accuracy: 0.909785933
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Meta-Llama-3.1-8B-Instruct
|
||||
azure_latest_model_version: 3
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/3
|
||||
|
||||
metrics:
|
||||
accuracy: 0.868501529
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_cohere_command_r_plus_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_Cohere_command_r_plus_question_answering
|
||||
description: Cohere-command-r-plus run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Cohere-command-r-plus
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-command-r-plus
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.909480122
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_cohere_command_r_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_Cohere_command_r_question_answering
|
||||
description: Cohere-command-r run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Cohere-command-r
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-cohere
|
||||
azure_model_name: Cohere-command-r
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8819571865443425
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_databricks-dbrx-base_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_databricks-dbrx-base_question_answering
|
||||
description: databricks-dbrx-base run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: databricks-dbrx-base
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-restricted
|
||||
azure_model_name: databricks-dbrx-base
|
||||
azure_latest_model_version: 3
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/3
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9159021
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_databricks-dbrx-instruct_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_databricks-dbrx-instruct_question_answering
|
||||
description: databricks-dbrx-instruct run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: databricks-dbrx-instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-restricted
|
||||
azure_model_name: databricks-dbrx-instruct
|
||||
azure_latest_model_version: 3
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/3
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9051988
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt-4-turbo-2024-04-09_chat_completion
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt-4-turbo-2024-04-09_chat_completion
|
||||
description: boolq_gpt-4-turbo-2024-04-09_chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-turbo-2024-04-09
|
||||
model_version: "turbo-2024-04-09"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4
|
||||
azure_latest_model_version: turbo-2024-04-09
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9125382262996942
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt-4o_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt-4o_question_answering
|
||||
description: gpt-4o run for boolq
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4o-2024-05-13
|
||||
model_version: "2024-05-13"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4o/versions/2024-05-13
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4o/versions/2024-05-13
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4o
|
||||
azure_latest_model_version: 2024-08-06
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06
|
||||
|
||||
metrics:
|
||||
accuracy: 0.908562691
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_35_turbo_0301_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_35_turbo_0301_question_answering
|
||||
description: gpt-35-turbo-0301 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-35-turbo-0301
|
||||
model_version: "0301"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0301
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0301
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-35-turbo
|
||||
azure_latest_model_version: 0125
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0125
|
||||
|
||||
metrics:
|
||||
accuracy: 0.867
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_35_turbo_0613_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_35_turbo_0613_question_answering
|
||||
description: gpt-35-turbo-0613 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-35-turbo-0613
|
||||
model_version: "0613"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0613
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0613
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-35-turbo
|
||||
azure_latest_model_version: 0125
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0125
|
||||
|
||||
metrics:
|
||||
accuracy: 0.864
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_4_0314_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_4_0314_question_answering
|
||||
description: gpt-4-0314 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-0314
|
||||
model_version: "0314"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4
|
||||
azure_latest_model_version: turbo-2024-04-09
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
metrics:
|
||||
accuracy: 0.911
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_4_0613_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_4_0613_question_answering
|
||||
description: gpt-4-0613 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-0613
|
||||
model_version: "0613"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/0613
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4/versions/0613
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4
|
||||
azure_latest_model_version: turbo-2024-04-09
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
|
||||
|
||||
metrics:
|
||||
accuracy: 0.912
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_4_32k_0314_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_4_32k_0314_question_answering
|
||||
description: gpt-4-32k-0314 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-32k-0314
|
||||
model_version: "0314"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0314
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-32k/versions/0314
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4-32k
|
||||
azure_latest_model_version: 0613
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
|
||||
|
||||
metrics:
|
||||
accuracy: 0.913
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt_4_32k_0613_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_gpt_4_32k_0613_question_answering
|
||||
description: gpt-4-32k-0613 run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-32k-0613
|
||||
model_version: "0613"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azure-openai
|
||||
azure_model_name: gpt-4-32k
|
||||
azure_latest_model_version: 0613
|
||||
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
|
||||
|
||||
metrics:
|
||||
accuracy: 0.911
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_13b_chat_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_13b_chat_question_answering
|
||||
description: llama-2-13b-chat run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-13b-chat
|
||||
model_version: "12"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/12
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/12
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-13b-chat
|
||||
azure_latest_model_version: 20
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/20
|
||||
|
||||
metrics:
|
||||
accuracy: 0.801
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_13b_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_13b_question_answering
|
||||
description: llama-2-13b run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-13b
|
||||
model_version: "12"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b/versions/12
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-13b/versions/12
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-13b
|
||||
azure_latest_model_version: 23
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b/versions/23
|
||||
|
||||
metrics:
|
||||
accuracy: 0.723
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_70b_chat_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_70b_chat_question_answering
|
||||
description: llama-2-70b-chat run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-70b-chat
|
||||
model_version: "12"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/12
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/12
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-70b-chat
|
||||
azure_latest_model_version: 20
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/20
|
||||
|
||||
metrics:
|
||||
accuracy: 0.826
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_70b_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_70b_question_answering
|
||||
description: llama-2-70b run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-70b
|
||||
model_version: "13"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b/versions/13
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-70b/versions/13
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-70b
|
||||
azure_latest_model_version: 24
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b/versions/24
|
||||
|
||||
metrics:
|
||||
accuracy: 0.853
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_7b_chat_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_7b_chat_question_answering
|
||||
description: llama-2-7b-chat run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-7b-chat
|
||||
model_version: "14"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/14
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/14
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-7b-chat
|
||||
azure_latest_model_version: 24
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/24
|
||||
|
||||
metrics:
|
||||
accuracy: 0.771
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_llama_2_7b_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_llama_2_7b_question_answering
|
||||
description: llama-2-7b run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: llama-2-7b
|
||||
model_version: "12"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b/versions/12
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Llama-2-7b/versions/12
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Llama-2-7b
|
||||
azure_latest_model_version: 22
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b/versions/22
|
||||
|
||||
metrics:
|
||||
accuracy: 0.628
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_meta-llama-3-70b-instruct_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_Meta-Llama-3-70B-Instruct_question_answering
|
||||
description: Meta-Llama-3-70B-Instruct run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3-70B-Instruct
|
||||
model_version: "2"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/2
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/2
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Meta-Llama-3-70B-Instruct
|
||||
azure_latest_model_version: 8
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/8
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9027522935779817
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,35 +0,0 @@
|
|||
type: evaluationresult
|
||||
name: boolq_meta-llama-3-70b_question_answering
|
||||
version: 2.04.11
|
||||
display_name: boolq_Meta-Llama-3-70B_question_answering
|
||||
description: Meta-Llama-3-70B run for boolq dataset
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3-70B
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
azure_registry_name: azureml-meta
|
||||
azure_model_name: Meta-Llama-3-70B
|
||||
azure_latest_model_version: 6
|
||||
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/6
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8917431192660551
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,3 +0,0 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче