Merge branch 'main' into kaiyi/fix/designer-vulnerable-issues

This commit is contained in:
Kai 2024-11-12 11:12:33 +08:00 коммит произвёл GitHub
Родитель a9f9d657d7 92b77d33e5
Коммит 4bcbbc581d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
1462 изменённых файлов: 352 добавлений и 27055 удалений

61
.github/workflows/training-model-eval-unittests.yaml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,61 @@
# Runs the model-evaluation unit tests inside a freshly created conda
# environment and publishes the JUnit results as a check.
name: training-model-eval-unittests

# Triggered by pull requests against main that touch the model evaluation
# assets or this workflow file, and on demand via workflow_dispatch.
on:
  pull_request:
    branches:
      - main
    paths:
      - assets/training/model_evaluation/**
      - .github/workflows/training-model-eval-unittests.yaml
  workflow_dispatch:

# Shared locations used by the steps below.
env:
  conda_env_prefix: /opt/conda/envs/model_eval
  model_eval_unittests_dir: assets/training/model_evaluation/tests
  model_eval_conda_yaml: assets/training/model_evaluation/environment/context/conda.yaml
  pytest_reports: pytest-reports

jobs:
  run_unit_tests:
    name: Run
    runs-on: ubuntu-latest

    permissions:
      # Required for EnricoMi/publish-unit-test-result-action
      checks: write
      issues: read
      pull-requests: write

    steps:
      - name: Clone branch
        uses: actions/checkout@v3

      - name: Use Python 3.10 or newer
        uses: actions/setup-python@v4
        with:
          python-version: '>=3.10'

      # Patches the component's conda.yaml (CPU-only PyTorch index, unpinned
      # template versions, pytest + pytest-xdist appended) and builds the env.
      # apt-get is run through sudo because hosted runners execute steps as a
      # non-root user, and `upgrade` gets -y so it cannot stop at a prompt
      # under `set -e`.
      # NOTE(review): the whitespace inside the sed replacement and the echo
      # strings must match the pip list indentation in conda.yaml - confirm.
      - name: Create conda env
        run: |
          set -ex
          sudo apt-get update && sudo apt-get upgrade -y && sudo apt-get install -y build-essential
          echo $conda_env_prefix
          sed -i 's/- pip:/- pip:\n - --extra-index-url https:\/\/download.pytorch.org\/whl\/cpu/' $model_eval_conda_yaml
          sed -i 's/=={{latest-pypi-version}}//g' $model_eval_conda_yaml
          sed -i 's/=={{latest-pypi-version:pre}}//g' $model_eval_conda_yaml
          echo " - pytest" >> $model_eval_conda_yaml
          echo " - pytest-xdist" >> $model_eval_conda_yaml
          cat $model_eval_conda_yaml
          conda env create -p $conda_env_prefix -f $model_eval_conda_yaml
          echo "conda env successfully created at $conda_env_prefix"
          conda list -p $conda_env_prefix

      # -x stops at the first failure; the JUnit XML feeds the publish step.
      - name: Execute tests
        run: conda run -p $conda_env_prefix python -m pytest $model_eval_unittests_dir --tb=native --junitxml=$pytest_reports/test-result.xml -x -n 1 -ra --show-capture=no

      # always() so results are published even when the test step failed.
      - name: Publish test results
        uses: EnricoMi/publish-unit-test-result-action@v2
        if: always()
        with:
          check_name: Test Results for ${{ github.workflow }}
          junit_files: ${{ env.pytest_reports }}/**/*.xml

Просмотреть файл

@ -5,19 +5,20 @@
import argparse
import json
import logging
from collections import defaultdict
import importlib
import sys
import shutil
import mlflow
import os
import pandas as pd
from typing import Any, Dict, List, Tuple
from promptflow.client import load_flow
from azure.ai.evaluation import evaluate
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
import pandas as pd
from utils import get_mlclient, extract_model_info
from collections import defaultdict
import os
from utils import get_mlclient, extract_model_info
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@ -157,13 +158,9 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
for evaluator_name in evaluators:
result_key = f"outputs.{evaluator_name}"
filtered_result = {k: v for k, v in result.items() if k.startswith(result_key)}
if len(filtered_result) == 1:
final_results[evaluator_name].append(filtered_result[list(filtered_result.keys())[0]])
else:
if len(filtered_result) == 0:
logger.warning(f"No output score generated for current evaluator {evaluator_name}")
logger.info(f"Found multiple results for {evaluator_name}. Adding as json string.")
final_results[evaluator_name].append(json.dumps(filtered_result))
_, score_value = extract_score(filtered_result)
final_results[evaluator_name].append(score_value)
final_results = pd.DataFrame(final_results)
logger.info(final_results)
final_results.to_json(command_line_args["evaluated_data"], orient="records", lines=True)
@ -176,6 +173,48 @@ def run_evaluation(command_line_args, evaluators, evaluator_configs):
mlflow.log_artifact("instance_results.jsonl")
def extract_score(data: Dict[str, Any]) -> Tuple[List[str], float]:
    """Extract a single float score from one evaluator's result entries.

    Values that cannot be converted with ``float()`` are ignored. The
    remaining entries are reduced to one score using this precedence:

    1. Exactly one numeric entry: return it.
    2. Entries whose key ends with ``_score``: return the only one, or the
       mean when there are several.
    3. Otherwise, entries whose metric name does not start with ``gpt_``:
       return the only one, or the mean when there are several.
    4. Otherwise (every numeric entry is a ``gpt_`` metric): return the mean
       of all numeric entries.

    The metric name is the part of the key after the last ``.``, so a key
    such as ``outputs.<evaluator>.gpt_coherence`` is recognised as a
    ``gpt_`` metric even though the full key starts with ``outputs.``.

    Args:
        data: Mapping of result key to raw value; may be empty or ``None``.

    Returns:
        A tuple ``(keys, score)`` where ``keys`` are the full keys that
        contributed to ``score``. ``([], 0.0)`` is returned when ``data`` is
        empty or holds no numeric value.
    """
    if not data:
        return [], 0.0

    # Keep only the entries whose value is convertible to float.
    numeric_values: Dict[str, float] = {}
    for key, value in data.items():
        try:
            numeric_values[key] = float(value)
        except (TypeError, ValueError, OverflowError):
            continue
    if not numeric_values:
        return [], 0.0

    def _collapse(selected: Dict[str, float]) -> Tuple[List[str], float]:
        """Return the selected keys with their sole value or their mean."""
        values = list(selected.values())
        if len(values) == 1:
            return list(selected), values[0]
        return list(selected), sum(values) / len(values)

    if len(numeric_values) == 1:
        return _collapse(numeric_values)

    # Prefer explicit '*_score' entries when any exist.
    score_values = {
        key: value
        for key, value in numeric_values.items()
        if key.endswith('_score')
    }
    if score_values:
        return _collapse(score_values)

    # Drop 'gpt_'-prefixed metrics, judged on the last dotted segment so that
    # namespaced keys are filtered as well; fall back to every numeric entry
    # when nothing else is left.
    non_gpt_values = {
        key: value
        for key, value in numeric_values.items()
        if not key.rsplit('.', 1)[-1].startswith('gpt_')
    }
    return _collapse(non_gpt_values or numeric_values)
rai_evaluators = [
"Sexual-Content-Evaluator",
"Hate-and-Unfairness-Evaluator",

Просмотреть файл

@ -20,6 +20,9 @@ import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
DEFAULT_TRACE_ID_COLUMN = "operation_Id"
DEFAULT_SPAN_ID_COLUMN = "operation_ParentId"
def get_args():
"""Get arguments from the command line."""
@ -51,7 +54,7 @@ def configure_logging(args) -> LoggerProvider:
def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, evaluation, service_name):
"""Log evaluation event."""
for name, value in evaluation.items():
attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": json.dumps(value),
attributes = {"event.name": f"gen_ai.evaluation.{name}", "gen_ai.evaluation.score": value,
"gen_ai.response_id": response_id, "service.name": service_name}
body = f"gen_ai.evaluation for response_id: {response_id}"
@ -71,11 +74,14 @@ def log_evaluation_event_single(trace_id, span_id, trace_flags, response_id, eva
def log_evaluation_event(row) -> None:
"""Log evaluation event."""
if "trace_id" not in row or "span_id" not in row or "evaluation" not in row:
logger.warning("Missing required fields in the row: trace_id, span_id, evaluation")
trace_id = int(row.get("trace_id", "0"), 16)
span_id = int(row.get("span_id", "0"), 16)
if "evaluation" not in row:
logger.warning("Missing required fields in the row: evaluation")
if "trace_id" not in row:
logger.debug(f"Missing trace_id from user query result, taking default of column {DEFAULT_TRACE_ID_COLUMN}")
trace_id = int(row.get("trace_id", row.get(DEFAULT_TRACE_ID_COLUMN, "0")), 16)
if "span_id" not in row:
logger.debug(f"Missing span_id from user query result, taking default of column {DEFAULT_SPAN_ID_COLUMN}")
span_id = int(row.get("span_id", row.get(DEFAULT_SPAN_ID_COLUMN, "0")), 16)
trace_flags = TraceFlags(TraceFlags.SAMPLED)
response_id = row.get("gen_ai_response_id", "")
evaluation_results = row.get("evaluation", {})

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: amazonpolarityclassification_cohere-embed-v3-english_classification
version: 2.04.11
display_name: AmazonPolarityClassification_cohere-embed-v3-english_classification
description: cohere-embed-v3-english run for AmazonPolarityClassification dataset
dataset_name: AmazonPolarityClassification
dataset_family: AmazonPolarityClassification
model_name: cohere-embed-v3-english
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-english
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
metrics:
accuracy: 0.927643
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: amazonpolarityclassification_cohere-embed-v3-multilingual_classification
version: 2.04.11
display_name: AmazonPolarityClassification_cohere-embed-v3-multilingual_classification
description: cohere-embed-v3-multilingual run for AmazonPolarityClassification dataset
dataset_name: AmazonPolarityClassification
dataset_family: AmazonPolarityClassification
model_name: cohere-embed-v3-multilingual
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-multilingual
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
metrics:
accuracy: 0.912307
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: amazonpolarityclassification_text-embedding-3-large_classification
version: 2.04.11
display_name: AmazonPolarityClassification_text-embedding-3-large_classification
description: text-embedding-3-large run for AmazonPolarityClassification dataset
dataset_name: AmazonPolarityClassification
dataset_family: AmazonPolarityClassification
model_name: text-embedding-3-large
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-large
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
metrics:
accuracy: 0.92868975
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: amazonpolarityclassification_text-embedding-3-small_classification
version: 2.04.11
display_name: AmazonPolarityClassification_text-embedding-3-small_classification
description: text-embedding-3-small run for AmazonPolarityClassification dataset
dataset_name: AmazonPolarityClassification
dataset_family: AmazonPolarityClassification
model_name: text-embedding-3-small
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-small
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
metrics:
accuracy: 0.90878075
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: amazonpolarityclassification_text-embedding-ada-002_classification
version: 2.04.11
display_name: AmazonPolarityClassification_text-embedding-ada-002_classification
description: text-embedding-ada-002 run for AmazonPolarityClassification dataset
dataset_name: AmazonPolarityClassification
dataset_family: AmazonPolarityClassification
model_name: text-embedding-ada-002
model_version: "2"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-ada-002
azure_latest_model_version: 2
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
metrics:
accuracy: 0.867263
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arguana_cohere-embed-v3-english_retrieval
version: 2.04.11
display_name: ArguAna_cohere-embed-v3-english_retrieval
description: cohere-embed-v3-english run for ArguAna dataset
dataset_name: ArguAna
dataset_family: ArguAna
model_name: cohere-embed-v3-english
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
tags:
evaluation_type: text_embeddings
task: retrieval
primary_metric: ndcg_at_10
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-english
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
metrics:
ndcg_at_10: 0.57529
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arguana_cohere-embed-v3-multilingual_retrieval
version: 2.04.11
display_name: ArguAna_cohere-embed-v3-multilingual_retrieval
description: cohere-embed-v3-multilingual run for ArguAna dataset
dataset_name: ArguAna
dataset_family: ArguAna
model_name: cohere-embed-v3-multilingual
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
tags:
evaluation_type: text_embeddings
task: retrieval
primary_metric: ndcg_at_10
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-multilingual
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
metrics:
ndcg_at_10: 0.57989
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arguana_text-embedding-3-large_retrieval
version: 2.04.11
display_name: ArguAna_text-embedding-3-large_retrieval
description: text-embedding-3-large run for ArguAna dataset
dataset_name: ArguAna
dataset_family: ArguAna
model_name: text-embedding-3-large
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
tags:
evaluation_type: text_embeddings
task: retrieval
primary_metric: ndcg_at_10
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-large
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
metrics:
ndcg_at_10: 0.58013
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arguana_text-embedding-3-small_retrieval
version: 2.04.11
display_name: ArguAna_text-embedding-3-small_retrieval
description: text-embedding-3-small run for ArguAna dataset
dataset_name: ArguAna
dataset_family: ArguAna
model_name: text-embedding-3-small
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
tags:
evaluation_type: text_embeddings
task: retrieval
primary_metric: ndcg_at_10
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-small
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
metrics:
ndcg_at_10: 0.55694
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arguana_text-embedding-ada-002_retrieval
version: 2.04.11
display_name: ArguAna_text-embedding-ada-002_retrieval
description: text-embedding-ada-002 run for ArguAna dataset
dataset_name: ArguAna
dataset_family: ArguAna
model_name: text-embedding-ada-002
model_version: "2"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
tags:
evaluation_type: text_embeddings
task: retrieval
primary_metric: ndcg_at_10
azure_registry_name: azure-openai
azure_model_name: text-embedding-ada-002
azure_latest_model_version: 2
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
metrics:
ndcg_at_10: 0.57455
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusteringp2p.v2_cohere-embed-v3-english_clustering
version: 2.04.11
display_name: ArxivClusteringP2P.v2_cohere-embed-v3-english_clustering
description: cohere-embed-v3-english run for ArxivClusteringP2P.v2 dataset
dataset_name: ArxivClusteringP2P.v2
dataset_family: ArxivClusteringP2P.v2
model_name: cohere-embed-v3-english
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-english
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
metrics:
v_measure: 0.5081042703542442
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusteringp2p.v2_cohere-embed-v3-multilingual_clustering
version: 2.04.11
display_name: ArxivClusteringP2P.v2_cohere-embed-v3-multilingual_clustering
description: cohere-embed-v3-multilingual run for ArxivClusteringP2P.v2 dataset
dataset_name: ArxivClusteringP2P.v2
dataset_family: ArxivClusteringP2P.v2
model_name: cohere-embed-v3-multilingual
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-multilingual
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
metrics:
v_measure: 0.5029184573976476
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusteringp2p.v2_text-embedding-3-large_clustering
version: 2.04.11
display_name: ArxivClusteringP2P.v2_text-embedding-3-large_clustering
description: text-embedding-3-large run for ArxivClusteringP2P.v2 dataset
dataset_name: ArxivClusteringP2P.v2
dataset_family: ArxivClusteringP2P.v2
model_name: text-embedding-3-large
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-large
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
metrics:
v_measure: 0.519053128352996
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusteringp2p.v2_text-embedding-3-small_clustering
version: 2.04.11
display_name: ArxivClusteringP2P.v2_text-embedding-3-small_clustering
description: text-embedding-3-small run for ArxivClusteringP2P.v2 dataset
dataset_name: ArxivClusteringP2P.v2
dataset_family: ArxivClusteringP2P.v2
model_name: text-embedding-3-small
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-small
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
metrics:
v_measure: 0.496692276507199
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusteringp2p.v2_text-embedding-ada-002_clustering
version: 2.04.11
display_name: ArxivClusteringP2P.v2_text-embedding-ada-002_clustering
description: text-embedding-ada-002 run for ArxivClusteringP2P.v2 dataset
dataset_name: ArxivClusteringP2P.v2
dataset_family: ArxivClusteringP2P.v2
model_name: text-embedding-ada-002
model_version: "2"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-ada-002
azure_latest_model_version: 2
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
metrics:
v_measure: 0.4794210912494528
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusterings2s_cohere-embed-v3-english_clustering
version: 2.04.11
display_name: ArxivClusteringS2S_cohere-embed-v3-english_clustering
description: cohere-embed-v3-english run for ArxivClusteringS2S dataset
dataset_name: ArxivClusteringS2S
dataset_family: ArxivClusteringS2S
model_name: cohere-embed-v3-english
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-english
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
metrics:
v_measure: 0.38872349524931893
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusterings2s_cohere-embed-v3-multilingual_clustering
version: 2.04.11
display_name: ArxivClusteringS2S_cohere-embed-v3-multilingual_clustering
description: cohere-embed-v3-multilingual run for ArxivClusteringS2S dataset
dataset_name: ArxivClusteringS2S
dataset_family: ArxivClusteringS2S
model_name: cohere-embed-v3-multilingual
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-multilingual
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
metrics:
v_measure: 0.3910885755785807
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusterings2s_text-embedding-3-large_clustering
version: 2.04.11
display_name: ArxivClusteringS2S_text-embedding-3-large_clustering
description: text-embedding-3-large run for ArxivClusteringS2S dataset
dataset_name: ArxivClusteringS2S
dataset_family: ArxivClusteringS2S
model_name: text-embedding-3-large
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-large
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
metrics:
v_measure: 0.4429783426306228
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusterings2s_text-embedding-3-small_clustering
version: 2.04.11
display_name: ArxivClusteringS2S_text-embedding-3-small_clustering
description: text-embedding-3-small run for ArxivClusteringS2S dataset
dataset_name: ArxivClusteringS2S
dataset_family: ArxivClusteringS2S
model_name: text-embedding-3-small
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-small
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
metrics:
v_measure: 0.3940951744128959
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: arxivclusterings2s_text-embedding-ada-002_clustering
version: 2.04.11
display_name: ArxivClusteringS2S_text-embedding-ada-002_clustering
description: text-embedding-ada-002 run for ArxivClusteringS2S dataset
dataset_name: ArxivClusteringS2S
dataset_family: ArxivClusteringS2S
model_name: text-embedding-ada-002
model_version: "2"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
tags:
evaluation_type: text_embeddings
task: clustering
primary_metric: v_measure
azure_registry_name: azure-openai
azure_model_name: text-embedding-ada-002
azure_latest_model_version: 2
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
metrics:
v_measure: 0.3719179506563676
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: banking77classification_cohere-embed-v3-english_classification
version: 2.04.11
display_name: Banking77Classification_cohere-embed-v3-english_classification
description: cohere-embed-v3-english run for Banking77Classification dataset
dataset_name: Banking77Classification
dataset_family: Banking77Classification
model_name: cohere-embed-v3-english
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/3
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-english
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-english/versions/1
metrics:
accuracy: 0.7934415584415586
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: banking77classification_cohere-embed-v3-multilingual_classification
version: 2.04.11
display_name: Banking77Classification_cohere-embed-v3-multilingual_classification
description: cohere-embed-v3-multilingual run for Banking77Classification dataset
dataset_name: Banking77Classification
dataset_family: Banking77Classification
model_name: cohere-embed-v3-multilingual
model_version: "3"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/3
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azureml-cohere
azure_model_name: Cohere-embed-v3-multilingual
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-embed-v3-multilingual/versions/1
metrics:
accuracy: 0.7934415584415585
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: banking77classification_text-embedding-3-large_classification
version: 2.04.11
display_name: Banking77Classification_text-embedding-3-large_classification
description: text-embedding-3-large run for Banking77Classification dataset
dataset_name: Banking77Classification
dataset_family: Banking77Classification
model_name: text-embedding-3-large
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-large
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-large/versions/1
metrics:
accuracy: 0.8572402597402597
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: banking77classification_text-embedding-3-small_classification
version: 2.04.11
display_name: Banking77Classification_text-embedding-3-small_classification
description: text-embedding-3-small run for Banking77Classification dataset
dataset_name: Banking77Classification
dataset_family: Banking77Classification
model_name: text-embedding-3-small
model_version: "1"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-3-small
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-3-small/versions/1
metrics:
accuracy: 0.8299025974025973
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,28 +0,0 @@
type: evaluationresult
name: banking77classification_text-embedding-ada-002_classification
version: 2.04.11
display_name: Banking77Classification_text-embedding-ada-002_classification
description: text-embedding-ada-002 run for Banking77Classification dataset
dataset_name: Banking77Classification
dataset_family: Banking77Classification
model_name: text-embedding-ada-002
model_version: "2"
model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
tags:
evaluation_type: text_embeddings
task: classification
primary_metric: accuracy
azure_registry_name: azure-openai
azure_model_name: text-embedding-ada-002
azure_latest_model_version: 2
azure_latest_model_asset_id: azureml://registries/azure-openai/models/text-embedding-ada-002/versions/2
metrics:
accuracy: 0.8053246753246753
properties: {}

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq__gpt-4-0125-preview__question_answering
version: 2.04.11
display_name: boolq__gpt-4-0125-Preview__chat_completion
description: Benchmark__gpt40125__hf_boolq__chat_completion
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-0125-Preview
model_version: "0125-Preview"
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/0125-Preview
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4/versions/0125-Preview
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4
azure_latest_model_version: turbo-2024-04-09
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
metrics:
accuracy: 0.904892966
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq__meta-llama-3_1-70b-instruct__question_answering
version: 2.04.11
display_name: boolq__Meta-Llama-3_1-70B-Instruct__chat_completion
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_boolq__chat_completion
dataset_family: boolq
dataset_name: boolq
model_name: Meta-Llama-3.1-70B-Instruct
model_version: "1"
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Meta-Llama-3.1-70B-Instruct
azure_latest_model_version: 3
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/3
metrics:
accuracy: 0.909785933
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq__meta-llama-3_1-8b-instruct__question_answering
version: 2.04.11
display_name: boolq__Meta-Llama-3_1-8B-Instruct__chat_completion
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_boolq__chat_completion
dataset_family: boolq
dataset_name: boolq
model_name: Meta-Llama-3.1-8B-Instruct
model_version: "1"
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Meta-Llama-3.1-8B-Instruct
azure_latest_model_version: 3
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/3
metrics:
accuracy: 0.868501529
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_cohere_command_r_plus_question_answering
version: 2.04.11
display_name: boolq_Cohere_command_r_plus_question_answering
description: Cohere-command-r-plus run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: Cohere-command-r-plus
model_version: "1"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-cohere
azure_model_name: Cohere-command-r-plus
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r-plus/versions/1
metrics:
accuracy: 0.909480122
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_cohere_command_r_question_answering
version: 2.04.11
display_name: boolq_Cohere_command_r_question_answering
description: Cohere-command-r run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: Cohere-command-r
model_version: "1"
model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-cohere
azure_model_name: Cohere-command-r
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml-cohere/models/Cohere-command-r/versions/1
metrics:
accuracy: 0.8819571865443425
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_databricks-dbrx-base_question_answering
version: 2.04.11
display_name: boolq_databricks-dbrx-base_question_answering
description: databricks-dbrx-base run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: databricks-dbrx-base
model_version: "1"
model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-restricted
azure_model_name: databricks-dbrx-base
azure_latest_model_version: 3
azure_latest_model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-base/versions/3
metrics:
accuracy: 0.9159021
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_databricks-dbrx-instruct_question_answering
version: 2.04.11
display_name: boolq_databricks-dbrx-instruct_question_answering
description: databricks-dbrx-instruct run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: databricks-dbrx-instruct
model_version: "1"
model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-restricted
azure_model_name: databricks-dbrx-instruct
azure_latest_model_version: 3
azure_latest_model_asset_id: azureml://registries/azureml-restricted/models/databricks-dbrx-instruct/versions/3
metrics:
accuracy: 0.9051988
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt-4-turbo-2024-04-09_chat_completion
version: 2.04.11
display_name: boolq_gpt-4-turbo-2024-04-09_chat_completion
description: boolq_gpt-4-turbo-2024-04-09_chat_completion
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-turbo-2024-04-09
model_version: "turbo-2024-04-09"
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4
azure_latest_model_version: turbo-2024-04-09
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
metrics:
accuracy: 0.9125382262996942
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt-4o_question_answering
version: 2.04.11
display_name: boolq_gpt-4o_question_answering
description: gpt-4o run for boolq
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4o-2024-05-13
model_version: "2024-05-13"
model_asset_id: azureml://registries/azure-openai/models/gpt-4o/versions/2024-05-13
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4o/versions/2024-05-13
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4o
azure_latest_model_version: 2024-08-06
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06
metrics:
accuracy: 0.908562691
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_35_turbo_0301_question_answering
version: 2.04.11
display_name: boolq_gpt_35_turbo_0301_question_answering
description: gpt-35-turbo-0301 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-35-turbo-0301
model_version: "0301"
model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0301
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0301
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-35-turbo
azure_latest_model_version: 0125
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0125
metrics:
accuracy: 0.867
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_35_turbo_0613_question_answering
version: 2.04.11
display_name: boolq_gpt_35_turbo_0613_question_answering
description: gpt-35-turbo-0613 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-35-turbo-0613
model_version: "0613"
model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0613
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0613
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-35-turbo
azure_latest_model_version: 0125
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-35-turbo/versions/0125
metrics:
accuracy: 0.864
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_4_0314_question_answering
version: 2.04.11
display_name: boolq_gpt_4_0314_question_answering
description: gpt-4-0314 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-0314
model_version: "0314"
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/4
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4/versions/4
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4
azure_latest_model_version: turbo-2024-04-09
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
metrics:
accuracy: 0.911
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_4_0613_question_answering
version: 2.04.11
display_name: boolq_gpt_4_0613_question_answering
description: gpt-4-0613 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-0613
model_version: "0613"
model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/0613
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4/versions/0613
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4
azure_latest_model_version: turbo-2024-04-09
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4/versions/turbo-2024-04-09
metrics:
accuracy: 0.912
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_4_32k_0314_question_answering
version: 2.04.11
display_name: boolq_gpt_4_32k_0314_question_answering
description: gpt-4-32k-0314 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-32k-0314
model_version: "0314"
model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0314
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4-32k/versions/0314
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4-32k
azure_latest_model_version: 0613
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
metrics:
accuracy: 0.913
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_gpt_4_32k_0613_question_answering
version: 2.04.11
display_name: boolq_gpt_4_32k_0613_question_answering
description: gpt-4-32k-0613 run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: gpt-4-32k-0613
model_version: "0613"
model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
relationships:
- relationshipType: Source
assetId: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azure-openai
azure_model_name: gpt-4-32k
azure_latest_model_version: 0613
azure_latest_model_asset_id: azureml://registries/azure-openai/models/gpt-4-32k/versions/0613
metrics:
accuracy: 0.911
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_13b_chat_question_answering
version: 2.04.11
display_name: boolq_llama_2_13b_chat_question_answering
description: llama-2-13b-chat run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-13b-chat
model_version: "12"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/12
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/12
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-13b-chat
azure_latest_model_version: 20
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/20
metrics:
accuracy: 0.801
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_13b_question_answering
version: 2.04.11
display_name: boolq_llama_2_13b_question_answering
description: llama-2-13b run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-13b
model_version: "12"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b/versions/12
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-13b/versions/12
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-13b
azure_latest_model_version: 23
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-13b/versions/23
metrics:
accuracy: 0.723
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_70b_chat_question_answering
version: 2.04.11
display_name: boolq_llama_2_70b_chat_question_answering
description: llama-2-70b-chat run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-70b-chat
model_version: "12"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/12
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/12
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-70b-chat
azure_latest_model_version: 20
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/20
metrics:
accuracy: 0.826
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_70b_question_answering
version: 2.04.11
display_name: boolq_llama_2_70b_question_answering
description: llama-2-70b run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-70b
model_version: "13"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b/versions/13
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-70b/versions/13
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-70b
azure_latest_model_version: 24
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-70b/versions/24
metrics:
accuracy: 0.853
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_7b_chat_question_answering
version: 2.04.11
display_name: boolq_llama_2_7b_chat_question_answering
description: llama-2-7b-chat run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-7b-chat
model_version: "14"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/14
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/14
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-7b-chat
azure_latest_model_version: 24
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/24
metrics:
accuracy: 0.771
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_llama_2_7b_question_answering
version: 2.04.11
display_name: boolq_llama_2_7b_question_answering
description: llama-2-7b run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: llama-2-7b
model_version: "12"
model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b/versions/12
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Llama-2-7b/versions/12
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Llama-2-7b
azure_latest_model_version: 22
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Llama-2-7b/versions/22
metrics:
accuracy: 0.628
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_meta-llama-3-70b-instruct_question_answering
version: 2.04.11
display_name: boolq_Meta-Llama-3-70B-Instruct_question_answering
description: Meta-Llama-3-70B-Instruct run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: Meta-Llama-3-70B-Instruct
model_version: "2"
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/2
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/2
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Meta-Llama-3-70B-Instruct
azure_latest_model_version: 8
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B-Instruct/versions/8
metrics:
accuracy: 0.9027522935779817
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Просмотреть файл

@ -1,35 +0,0 @@
type: evaluationresult
name: boolq_meta-llama-3-70b_question_answering
version: 2.04.11
display_name: boolq_Meta-Llama-3-70B_question_answering
description: Meta-Llama-3-70B run for boolq dataset
dataset_family: boolq
dataset_name: boolq
model_name: Meta-Llama-3-70B
model_version: "1"
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/1
tags:
evaluation_type: text_generation
task: question-answering
accuracy_metric_name: exact_match
azure_registry_name: azureml-meta
azure_model_name: Meta-Llama-3-70B
azure_latest_model_version: 6
azure_latest_model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3-70B/versions/6
metrics:
accuracy: 0.8917431192660551
properties:
n_shot: 5
evaluation_sampling_ratio: 1.0
evaluation_split: "validation"
fewshot_sampling_ratio: 1.0
fewshot_split: "train"

Просмотреть файл

@ -1,3 +0,0 @@
type: evaluationresult
spec: spec.yaml
categories: ["EvaluationResult"]

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше