Benchmark Assets for new models (#3261)
* GPT 4o, llama 3.1 and GPT 0125 assets * Removing old gpt 4o assets
This commit is contained in:
Родитель
ffa592987e
Коммит
57c6c7072a
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: boolq__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: boolq__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.904892966
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: boolq__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: boolq__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.909785933
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: boolq__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: boolq__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_boolq__chat_completion
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.868501529
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: boolq_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: boolq_gpt-4o_chat_completion
|
||||
description: boolq_gpt-4o_chat_completion
|
||||
name: boolq_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: boolq_gpt-4o_question_answering
|
||||
description: gpt-4o run for boolq
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9051987767584098
|
||||
accuracy: 0.908562691
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: boolq_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: boolq_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for boolq
|
||||
dataset_family: boolq
|
||||
dataset_name: boolq
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.920489297
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: gsm8k__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: gsm8k__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_gsm8k__chat_completion
|
||||
dataset_family: gsm8k
|
||||
dataset_name: gsm8k
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.93555724
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 8
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: gsm8k__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: gsm8k__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_gsm8k__chat_completion
|
||||
dataset_family: gsm8k
|
||||
dataset_name: gsm8k
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.946929492
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 8
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: gsm8k__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: gsm8k__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_gsm8k__chat_completion
|
||||
dataset_family: gsm8k
|
||||
dataset_name: gsm8k
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.843062926
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 8
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: gsm8k_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: gsm8k_gpt-4o_chat_completion
|
||||
description: gsm8k_gpt-4o_chat_completion
|
||||
name: gsm8k_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: gsm8k_gpt-4o_question_answering
|
||||
description: gpt-4o run for gsm8k
|
||||
dataset_family: gsm8k
|
||||
dataset_name: gsm8k
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9423805913570887
|
||||
accuracy: 0.945413192
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: gsm8k_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: gsm8k_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for gsm8k
|
||||
dataset_family: gsm8k
|
||||
dataset_name: gsm8k
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.966
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 8
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: hellaswag__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: hellaswag__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_hellaswag__chat_completion
|
||||
dataset_family: hellaswag
|
||||
dataset_name: hellaswag
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.923322047
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: hellaswag__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: hellaswag__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_hellaswag__chat_completion
|
||||
dataset_family: hellaswag
|
||||
dataset_name: hellaswag
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.908783111
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: hellaswag__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: hellaswag__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_hellaswag__chat_completion
|
||||
dataset_family: hellaswag
|
||||
dataset_name: hellaswag
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.796554471
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: hellaswag_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: hellaswag_gpt-4o_chat_completion
|
||||
description: hellaswag_gpt-4o_chat_completion
|
||||
name: hellaswag_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: hellaswag_gpt-4o_question_answering
|
||||
description: gpt-4o run for hellaswag
|
||||
dataset_family: hellaswag
|
||||
dataset_name: hellaswag
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8914558852818164
|
||||
accuracy: 0.948018323
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: hellaswag_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: hellaswag_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for hellaswag
|
||||
dataset_family: hellaswag
|
||||
dataset_name: hellaswag
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.928
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: human_eval__gpt-4-0125-preview__text_generation
|
||||
version: 2.12.08
|
||||
display_name: human_eval__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_openai_humaneval__chat_completion
|
||||
dataset_family: human_eval
|
||||
dataset_name: human_eval
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: text-generation
|
||||
accuracy_metric_name: pass@1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.87804878
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 0
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: None
|
||||
fewshot_split: "None"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: human_eval__meta-llama-3_1-70b-instruct__text_generation
|
||||
version: 2.12.08
|
||||
display_name: human_eval__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_openai_humaneval__chat_completion
|
||||
dataset_family: human_eval
|
||||
dataset_name: human_eval
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: text-generation
|
||||
accuracy_metric_name: pass@1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.786585366
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 0
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: None
|
||||
fewshot_split: "None"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: human_eval__meta-llama-3_1-8b-instruct__text_generation
|
||||
version: 2.12.08
|
||||
display_name: human_eval__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_openai_humaneval__chat_completion
|
||||
dataset_family: human_eval
|
||||
dataset_name: human_eval
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: text-generation
|
||||
accuracy_metric_name: pass@1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.682926829
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 0
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: None
|
||||
fewshot_split: "None"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: human_eval_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: human_eval_gpt-4o_chat_completion
|
||||
description: human_eval_gpt-4o_chat_completion
|
||||
name: human_eval_gpt-4o_text_generation
|
||||
version: 2.12.08
|
||||
display_name: human_eval_gpt-4o_text_generation
|
||||
description: gpt-4o run for human_eval
|
||||
dataset_family: human_eval
|
||||
dataset_name: human_eval
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: pass@1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9207317073170732
|
||||
accuracy: 0.920731707
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: human_eval_meta-llama-3_1-405b-instruct_text_generation
|
||||
version: 2.12.08
|
||||
display_name: human_eval_Meta-Llama-3_1-405B-Instruct_text_generation
|
||||
description: Meta-Llama-3.1-405B-Instruct run for human_eval
|
||||
dataset_family: human_eval
|
||||
dataset_name: human_eval
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: text-generation
|
||||
accuracy_metric_name: pass@1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.853658537
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 0
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: None
|
||||
fewshot_split: "None"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_humanities__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_humanities__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_humanities
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.788310308
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_humanities__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_humanities
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.794899044
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_humanities__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_humanities
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.636556854
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_humanities_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: mmlu_humanities_gpt-4o_chat_completion
|
||||
description: mmlu_humanities_gpt-4o_chat_completion
|
||||
name: mmlu_humanities_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_humanities_gpt-4o_question_answering
|
||||
description: gpt-4o run for mmlu_humanities
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_humanities
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8021253985122211
|
||||
accuracy: 0.810201913
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_humanities_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for mmlu_humanities
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_humanities
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.817215728
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_other__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_other__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_other
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.865786933
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_other__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_other__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_other
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.848728677
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_other__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_other__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_other
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.736723528
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_other_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: mmlu_other_gpt-4o_chat_completion
|
||||
description: mmlu_other_gpt-4o_chat_completion
|
||||
name: mmlu_other_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_other_gpt-4o_question_answering
|
||||
description: gpt-4o run for mmlu_other
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_other
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8715803025426456
|
||||
accuracy: 0.891857097
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_other_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_other_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for mmlu_other
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_other
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.878339234
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_social_sciences__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_social_sciences__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_social_sciences
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.90120247
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_social_sciences__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_social_sciences
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.876178096
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_social_sciences__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_social_sciences
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.765680858
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_social_sciences_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: mmlu_social_sciences_gpt-4o_chat_completion
|
||||
description: mmlu_social_sciences_gpt-4o_chat_completion
|
||||
name: mmlu_social_sciences_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_social_sciences_gpt-4o_question_answering
|
||||
description: gpt-4o run for mmlu_social_sciences
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_social_sciences
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.9129021774455639
|
||||
accuracy: 0.919077023
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_social_sciences_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for mmlu_social_sciences
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_social_sciences
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.900227494
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_stem__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_stem__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_stem
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.787821123
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_stem__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_stem__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_stem
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.769425944
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_stem__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_stem__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_stem
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.592134475
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_stem_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: mmlu_stem_gpt-4o_chat_completion
|
||||
description: mmlu_stem_gpt-4o_chat_completion
|
||||
name: mmlu_stem_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_stem_gpt-4o_question_answering
|
||||
description: gpt-4o run for mmlu_stem
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_stem
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.6955280685061845
|
||||
accuracy: 0.802410403
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: mmlu_stem_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: mmlu_stem_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for mmlu_stem
|
||||
dataset_family: mmlu
|
||||
dataset_name: mmlu_stem
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.836980653
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "test"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "dev"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: openbookqa__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: openbookqa__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_openbookqa__chat_completion
|
||||
dataset_family: openbookqa
|
||||
dataset_name: openbookqa
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.934
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 10
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: openbookqa__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: openbookqa__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_openbookqa__chat_completion
|
||||
dataset_family: openbookqa
|
||||
dataset_name: openbookqa
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.938
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 10
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: openbookqa__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: openbookqa__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_openbookqa__chat_completion
|
||||
dataset_family: openbookqa
|
||||
dataset_name: openbookqa
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.852
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 10
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: openbookqa_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: openbookqa_gpt-4o_chat_completion
|
||||
description: openbookqa_gpt-4o_chat_completion
|
||||
name: openbookqa_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: openbookqa_gpt-4o_question_answering
|
||||
description: gpt-4o run for openbookqa
|
||||
dataset_family: openbookqa
|
||||
dataset_name: openbookqa
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.882
|
||||
accuracy: 0.954
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: openbookqa_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: openbookqa_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for openbookqa
|
||||
dataset_family: openbookqa
|
||||
dataset_name: openbookqa
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.918
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 10
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 1.0
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: piqa__gpt-4-0125-preview__question_answering
|
||||
version: 2.12.08
|
||||
display_name: piqa__gpt-4-0125-Preview__chat_completion
|
||||
description: Benchmark__gpt40125__hf_piqa__chat_completion
|
||||
dataset_family: piqa
|
||||
dataset_name: piqa
|
||||
|
||||
model_name: gpt-4-0125-Preview
|
||||
model_version: "4"
|
||||
model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.936343852
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 0.3
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: piqa__meta-llama-3_1-70b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: piqa__Meta-Llama-3_1-70B-Instruct__chat_completion
|
||||
description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_piqa__chat_completion
|
||||
dataset_family: piqa
|
||||
dataset_name: piqa
|
||||
|
||||
model_name: Meta-Llama-3.1-70B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.880848749
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 0.3
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: piqa__meta-llama-3_1-8b-instruct__question_answering
|
||||
version: 2.12.08
|
||||
display_name: piqa__Meta-Llama-3_1-8B-Instruct__chat_completion
|
||||
description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_piqa__chat_completion
|
||||
dataset_family: piqa
|
||||
dataset_name: piqa
|
||||
|
||||
model_name: Meta-Llama-3.1-8B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.821001088
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 0.3
|
||||
fewshot_split: "train"
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -1,8 +1,8 @@
|
|||
type: evaluationresult
|
||||
name: piqa_gpt-4o_chat_completion
|
||||
version: 2.13.06
|
||||
display_name: piqa_gpt-4o_chat_completion
|
||||
description: piqa_gpt-4o_chat_completion
|
||||
name: piqa_gpt-4o_question_answering
|
||||
version: 2.12.08
|
||||
display_name: piqa_gpt-4o_question_answering
|
||||
description: gpt-4o run for piqa
|
||||
dataset_family: piqa
|
||||
dataset_name: piqa
|
||||
|
||||
|
@ -20,7 +20,7 @@ tags:
|
|||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.8443960826985855
|
||||
accuracy: 0.938520131
|
||||
|
||||
|
||||
properties:
|
|
@ -0,0 +1,3 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories: ["EvaluationResult"]
|
|
@ -0,0 +1,31 @@
|
|||
type: evaluationresult
|
||||
name: piqa_meta-llama-3_1-405b-instruct_question_answering
|
||||
version: 2.12.08
|
||||
display_name: piqa_Meta-Llama-3_1-405B-Instruct_question_answering
|
||||
description: Meta-Llama-3.1-405B-Instruct run for piqa
|
||||
dataset_family: piqa
|
||||
dataset_name: piqa
|
||||
|
||||
model_name: Meta-Llama-3.1-405B-Instruct
|
||||
model_version: "1"
|
||||
model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_generation
|
||||
task: question-answering
|
||||
accuracy_metric_name: exact_match
|
||||
|
||||
metrics:
|
||||
accuracy: 0.886289445
|
||||
|
||||
|
||||
properties:
|
||||
n_shot: 5
|
||||
evaluation_sampling_ratio: 1.0
|
||||
evaluation_split: "validation"
|
||||
fewshot_sampling_ratio: 0.3
|
||||
fewshot_split: "train"
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче