Benchmark Assets for new models (#3261)

* GPT 4o, llama 3.1 and GPT 0125 assets
* Removing old gpt 4o assets
2024-08-14 10:08:27 +05:30 · 2024-08-14 10:08:27 +05:30 · 57c6c7072a
--- a/assets/evaluation_results/boolq__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/boolq__gpt-4-0125-preview__question_answering/asset.yaml
--- a/assets/evaluation_results/boolq__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/boolq__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: boolq__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: boolq__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_boolq__chat_completion
+dataset_family: boolq
+dataset_name: boolq
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.904892966
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/boolq__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/boolq__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/boolq__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/boolq__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: boolq__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: boolq__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_boolq__chat_completion
+dataset_family: boolq
+dataset_name: boolq
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.909785933
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/boolq__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/boolq__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/boolq__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/boolq__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: boolq__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: boolq__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_boolq__chat_completion
+dataset_family: boolq
+dataset_name: boolq
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.868501529
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/human_eval_gpt-4o_chat_completion/asset.yaml
+++ b/assets/evaluation_results/human_eval_gpt-4o_chat_completion/asset.yaml
--- a/assets/evaluation_results/boolq_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/boolq_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: boolq_gpt-4o_chat_completion
-version: 2.13.06
-display_name: boolq_gpt-4o_chat_completion
-description: boolq_gpt-4o_chat_completion
+name: boolq_gpt-4o_question_answering
+version: 2.12.08
+display_name: boolq_gpt-4o_question_answering
+description: gpt-4o run for boolq
 dataset_family: boolq
 dataset_name: boolq

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.9051987767584098
+  accuracy: 0.908562691


 properties:
--- a/assets/evaluation_results/boolq_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/boolq_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
--- a/assets/evaluation_results/boolq_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/boolq_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: boolq_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: boolq_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for boolq
+dataset_family: boolq
+dataset_name: boolq
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.920489297
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/gsm8k__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/gsm8k__gpt-4-0125-preview__question_answering/asset.yaml
--- a/assets/evaluation_results/gsm8k__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/gsm8k__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: gsm8k__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: gsm8k__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_gsm8k__chat_completion
+dataset_family: gsm8k
+dataset_name: gsm8k
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.93555724
+
+
+properties:
+  n_shot: 8
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/gsm8k__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/gsm8k__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/gsm8k__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/gsm8k__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: gsm8k__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: gsm8k__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_gsm8k__chat_completion
+dataset_family: gsm8k
+dataset_name: gsm8k
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.946929492
+
+
+properties:
+  n_shot: 8
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/gsm8k__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/gsm8k__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/gsm8k__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/gsm8k__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: gsm8k__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: gsm8k__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_gsm8k__chat_completion
+dataset_family: gsm8k
+dataset_name: gsm8k
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.843062926
+
+
+properties:
+  n_shot: 8
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/openbookqa_gpt-4o_chat_completion/asset.yaml
+++ b/assets/evaluation_results/openbookqa_gpt-4o_chat_completion/asset.yaml
--- a/assets/evaluation_results/gsm8k_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/gsm8k_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: gsm8k_gpt-4o_chat_completion
-version: 2.13.06
-display_name: gsm8k_gpt-4o_chat_completion
-description: gsm8k_gpt-4o_chat_completion
+name: gsm8k_gpt-4o_question_answering
+version: 2.12.08
+display_name: gsm8k_gpt-4o_question_answering
+description: gpt-4o run for gsm8k
 dataset_family: gsm8k
 dataset_name: gsm8k

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.9423805913570887
+  accuracy: 0.945413192


 properties:
--- a/assets/evaluation_results/gsm8k_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/gsm8k_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
--- a/assets/evaluation_results/gsm8k_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/gsm8k_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: gsm8k_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: gsm8k_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for gsm8k
+dataset_family: gsm8k
+dataset_name: gsm8k
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.966
+
+
+properties:
+  n_shot: 8
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/hellaswag__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/hellaswag__gpt-4-0125-preview__question_answering/asset.yaml
--- a/assets/evaluation_results/hellaswag__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/hellaswag__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: hellaswag__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: hellaswag__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_hellaswag__chat_completion
+dataset_family: hellaswag
+dataset_name: hellaswag
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.923322047
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/hellaswag__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/hellaswag__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/hellaswag__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/hellaswag__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: hellaswag__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: hellaswag__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_hellaswag__chat_completion
+dataset_family: hellaswag
+dataset_name: hellaswag
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.908783111
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/hellaswag__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/hellaswag__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
--- a/assets/evaluation_results/hellaswag__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/hellaswag__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: hellaswag__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: hellaswag__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_hellaswag__chat_completion
+dataset_family: hellaswag
+dataset_name: hellaswag
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.796554471
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/truthfulqa_mc1_gpt-4o_chat_completion/asset.yaml
+++ b/assets/evaluation_results/truthfulqa_mc1_gpt-4o_chat_completion/asset.yaml
--- a/assets/evaluation_results/hellaswag_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/hellaswag_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: hellaswag_gpt-4o_chat_completion
-version: 2.13.06
-display_name: hellaswag_gpt-4o_chat_completion
-description: hellaswag_gpt-4o_chat_completion
+name: hellaswag_gpt-4o_question_answering
+version: 2.12.08
+display_name: hellaswag_gpt-4o_question_answering
+description: gpt-4o run for hellaswag
 dataset_family: hellaswag
 dataset_name: hellaswag

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.8914558852818164
+  accuracy: 0.948018323


 properties:
--- a/assets/evaluation_results/hellaswag_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/hellaswag_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
--- a/assets/evaluation_results/hellaswag_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/hellaswag_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: hellaswag_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: hellaswag_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for hellaswag
+dataset_family: hellaswag
+dataset_name: hellaswag
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.928
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/human_eval__gpt-4-0125-preview__text_generation/asset.yaml
+++ b/assets/evaluation_results/human_eval__gpt-4-0125-preview__text_generation/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/human_eval__gpt-4-0125-preview__text_generation/spec.yaml
+++ b/assets/evaluation_results/human_eval__gpt-4-0125-preview__text_generation/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: human_eval__gpt-4-0125-preview__text_generation
+version: 2.12.08
+display_name: human_eval__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_openai_humaneval__chat_completion
+dataset_family: human_eval
+dataset_name: human_eval
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: text-generation
+  accuracy_metric_name: pass@1
+
+metrics:
+  accuracy: 0.87804878
+
+
+properties:
+  n_shot: 0
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: "None"
+  fewshot_split: "None"
--- a/assets/evaluation_results/human_eval__meta-llama-3_1-70b-instruct__text_generation/asset.yaml
+++ b/assets/evaluation_results/human_eval__meta-llama-3_1-70b-instruct__text_generation/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/human_eval__meta-llama-3_1-70b-instruct__text_generation/spec.yaml
+++ b/assets/evaluation_results/human_eval__meta-llama-3_1-70b-instruct__text_generation/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: human_eval__meta-llama-3_1-70b-instruct__text_generation
+version: 2.12.08
+display_name: human_eval__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_openai_humaneval__chat_completion
+dataset_family: human_eval
+dataset_name: human_eval
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: text-generation
+  accuracy_metric_name: pass@1
+
+metrics:
+  accuracy: 0.786585366
+
+
+properties:
+  n_shot: 0
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: "None"
+  fewshot_split: "None"
--- a/assets/evaluation_results/human_eval__meta-llama-3_1-8b-instruct__text_generation/asset.yaml
+++ b/assets/evaluation_results/human_eval__meta-llama-3_1-8b-instruct__text_generation/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/human_eval__meta-llama-3_1-8b-instruct__text_generation/spec.yaml
+++ b/assets/evaluation_results/human_eval__meta-llama-3_1-8b-instruct__text_generation/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: human_eval__meta-llama-3_1-8b-instruct__text_generation
+version: 2.12.08
+display_name: human_eval__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_openai_humaneval__chat_completion
+dataset_family: human_eval
+dataset_name: human_eval
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: text-generation
+  accuracy_metric_name: pass@1
+
+metrics:
+  accuracy: 0.682926829
+
+
+properties:
+  n_shot: 0
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: "None"
+  fewshot_split: "None"
--- a/assets/evaluation_results/human_eval_gpt-4o_text_generation/asset.yaml
+++ b/assets/evaluation_results/human_eval_gpt-4o_text_generation/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/human_eval_gpt-4o_text_generation/spec.yaml
+++ b/assets/evaluation_results/human_eval_gpt-4o_text_generation/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: human_eval_gpt-4o_chat_completion
-version: 2.13.06
-display_name: human_eval_gpt-4o_chat_completion
-description: human_eval_gpt-4o_chat_completion
+name: human_eval_gpt-4o_text_generation
+version: 2.12.08
+display_name: human_eval_gpt-4o_text_generation
+description: gpt-4o run for human_eval
 dataset_family: human_eval
 dataset_name: human_eval

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: pass@1

 metrics:
-  accuracy: 0.9207317073170732
+  accuracy: 0.920731707


 properties:
--- a/assets/evaluation_results/human_eval_meta-llama-3_1-405b-instruct_text_generation/asset.yaml
+++ b/assets/evaluation_results/human_eval_meta-llama-3_1-405b-instruct_text_generation/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/human_eval_meta-llama-3_1-405b-instruct_text_generation/spec.yaml
+++ b/assets/evaluation_results/human_eval_meta-llama-3_1-405b-instruct_text_generation/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: human_eval_meta-llama-3_1-405b-instruct_text_generation
+version: 2.12.08
+display_name: human_eval_Meta-Llama-3_1-405B-Instruct_text_generation
+description: Meta-Llama-3.1-405B-Instruct run for human_eval
+dataset_family: human_eval
+dataset_name: human_eval
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: text-generation
+  accuracy_metric_name: pass@1
+
+metrics:
+  accuracy: 0.853658537
+
+
+properties:
+  n_shot: 0
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: "None"
+  fewshot_split: "None"
--- a/assets/evaluation_results/mmlu_humanities__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_humanities__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_humanities__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_humanities__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_humanities__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: mmlu_humanities__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_humanities
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.788310308
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_humanities__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_humanities__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_humanities
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.794899044
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_humanities__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_humanities__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_humanities
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.636556854
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_humanities_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_humanities_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_humanities_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_humanities_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: mmlu_humanities_gpt-4o_chat_completion
-version: 2.13.06
-display_name: mmlu_humanities_gpt-4o_chat_completion
-description: mmlu_humanities_gpt-4o_chat_completion
+name: mmlu_humanities_gpt-4o_question_answering
+version: 2.12.08
+display_name: mmlu_humanities_gpt-4o_question_answering
+description: gpt-4o run for mmlu_humanities
 dataset_family: mmlu
 dataset_name: mmlu_humanities

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.8021253985122211
+  accuracy: 0.810201913


 properties:
--- a/assets/evaluation_results/mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_humanities_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: mmlu_humanities_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for mmlu_humanities
+dataset_family: mmlu
+dataset_name: mmlu_humanities
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.817215728
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_other__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_other__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_other__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_other__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_other__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: mmlu_other__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_other
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.865786933
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_other__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_other__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_other__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_other__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_other__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_other__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_other
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.848728677
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_other__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_other__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_other__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_other__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_other__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_other__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_other
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.736723528
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_other_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_other_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_other_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_other_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: mmlu_other_gpt-4o_chat_completion
-version: 2.13.06
-display_name: mmlu_other_gpt-4o_chat_completion
-description: mmlu_other_gpt-4o_chat_completion
+name: mmlu_other_gpt-4o_question_answering
+version: 2.12.08
+display_name: mmlu_other_gpt-4o_question_answering
+description: gpt-4o run for mmlu_other
 dataset_family: mmlu
 dataset_name: mmlu_other

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.8715803025426456
+  accuracy: 0.891857097


 properties:
--- a/assets/evaluation_results/mmlu_other_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_other_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_other_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_other_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_other_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: mmlu_other_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for mmlu_other
+dataset_family: mmlu
+dataset_name: mmlu_other
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.878339234
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_social_sciences__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_social_sciences__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_social_sciences__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: mmlu_social_sciences__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_social_sciences
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.90120247
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_social_sciences__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_social_sciences__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_social_sciences
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.876178096
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_social_sciences__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_social_sciences__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_social_sciences
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.765680858
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_social_sciences_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_social_sciences_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: mmlu_social_sciences_gpt-4o_chat_completion
-version: 2.13.06
-display_name: mmlu_social_sciences_gpt-4o_chat_completion
-description: mmlu_social_sciences_gpt-4o_chat_completion
+name: mmlu_social_sciences_gpt-4o_question_answering
+version: 2.12.08
+display_name: mmlu_social_sciences_gpt-4o_question_answering
+description: gpt-4o run for mmlu_social_sciences
 dataset_family: mmlu
 dataset_name: mmlu_social_sciences

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.9129021774455639
+  accuracy: 0.919077023


 properties:
--- a/assets/evaluation_results/mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_social_sciences_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: mmlu_social_sciences_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for mmlu_social_sciences
+dataset_family: mmlu
+dataset_name: mmlu_social_sciences
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.900227494
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_stem__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_stem__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_stem__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_stem__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_stem__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: mmlu_stem__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_stem
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.787821123
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_stem__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_stem__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_stem__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_stem__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_stem__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_stem__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_stem
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.769425944
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_stem__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_stem__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_stem__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_stem__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_stem__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: mmlu_stem__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__remote_mmlu__chat_completion
+dataset_family: mmlu
+dataset_name: mmlu_stem
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.592134475
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/mmlu_stem_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_stem_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_stem_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_stem_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: mmlu_stem_gpt-4o_chat_completion
-version: 2.13.06
-display_name: mmlu_stem_gpt-4o_chat_completion
-description: mmlu_stem_gpt-4o_chat_completion
+name: mmlu_stem_gpt-4o_question_answering
+version: 2.12.08
+display_name: mmlu_stem_gpt-4o_question_answering
+description: gpt-4o run for mmlu_stem
 dataset_family: mmlu
 dataset_name: mmlu_stem

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.6955280685061845
+  accuracy: 0.802410403


 properties:
--- a/assets/evaluation_results/mmlu_stem_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/mmlu_stem_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/mmlu_stem_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/mmlu_stem_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: mmlu_stem_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: mmlu_stem_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for mmlu_stem
+dataset_family: mmlu
+dataset_name: mmlu_stem
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.836980653
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "test"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "dev"
--- a/assets/evaluation_results/openbookqa__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/openbookqa__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/openbookqa__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/openbookqa__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: openbookqa__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: openbookqa__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_openbookqa__chat_completion
+dataset_family: openbookqa
+dataset_name: openbookqa
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.934
+
+
+properties:
+  n_shot: 10
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/openbookqa__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/openbookqa__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/openbookqa__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/openbookqa__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: openbookqa__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: openbookqa__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_openbookqa__chat_completion
+dataset_family: openbookqa
+dataset_name: openbookqa
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.938
+
+
+properties:
+  n_shot: 10
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/openbookqa__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/openbookqa__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/openbookqa__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/openbookqa__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: openbookqa__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: openbookqa__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_openbookqa__chat_completion
+dataset_family: openbookqa
+dataset_name: openbookqa
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.852
+
+
+properties:
+  n_shot: 10
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/openbookqa_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/openbookqa_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/openbookqa_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/openbookqa_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: openbookqa_gpt-4o_chat_completion
-version: 2.13.06
-display_name: openbookqa_gpt-4o_chat_completion
-description: openbookqa_gpt-4o_chat_completion
+name: openbookqa_gpt-4o_question_answering
+version: 2.12.08
+display_name: openbookqa_gpt-4o_question_answering
+description: gpt-4o run for openbookqa
 dataset_family: openbookqa
 dataset_name: openbookqa

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.882
+  accuracy: 0.954


 properties:
--- a/assets/evaluation_results/openbookqa_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/openbookqa_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/openbookqa_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/openbookqa_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: openbookqa_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: openbookqa_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for openbookqa
+dataset_family: openbookqa
+dataset_name: openbookqa
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.918
+
+
+properties:
+  n_shot: 10
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 1.0
+  fewshot_split: "train"
--- a/assets/evaluation_results/piqa__gpt-4-0125-preview__question_answering/asset.yaml
+++ b/assets/evaluation_results/piqa__gpt-4-0125-preview__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/piqa__gpt-4-0125-preview__question_answering/spec.yaml
+++ b/assets/evaluation_results/piqa__gpt-4-0125-preview__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: piqa__gpt-4-0125-preview__question_answering
+version: 2.12.08
+display_name: piqa__gpt-4-0125-Preview__chat_completion
+description: Benchmark__gpt40125__hf_piqa__chat_completion
+dataset_family: piqa
+dataset_name: piqa
+
+model_name: gpt-4-0125-Preview
+model_version: "4"
+model_asset_id: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azure-openai/models/gpt-4-0125-Preview/versions/4
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.936343852
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 0.3
+  fewshot_split: "train"
--- a/assets/evaluation_results/piqa__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/piqa__meta-llama-3_1-70b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/piqa__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/piqa__meta-llama-3_1-70b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: piqa__meta-llama-3_1-70b-instruct__question_answering
+version: 2.12.08
+display_name: piqa__Meta-Llama-3_1-70B-Instruct__chat_completion
+description: Benchmark__Llama-3-1-70B-Instruct-bench__hf_piqa__chat_completion
+dataset_family: piqa
+dataset_name: piqa
+
+model_name: Meta-Llama-3.1-70B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-70B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.880848749
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 0.3
+  fewshot_split: "train"
--- a/assets/evaluation_results/piqa__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
+++ b/assets/evaluation_results/piqa__meta-llama-3_1-8b-instruct__question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/piqa__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
+++ b/assets/evaluation_results/piqa__meta-llama-3_1-8b-instruct__question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: piqa__meta-llama-3_1-8b-instruct__question_answering
+version: 2.12.08
+display_name: piqa__Meta-Llama-3_1-8B-Instruct__chat_completion
+description: Benchmark__meta-llama-3-1-8b-instruct-1__hf_piqa__chat_completion
+dataset_family: piqa
+dataset_name: piqa
+
+model_name: Meta-Llama-3.1-8B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-8B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.821001088
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 0.3
+  fewshot_split: "train"
--- a/assets/evaluation_results/piqa_gpt-4o_question_answering/asset.yaml
+++ b/assets/evaluation_results/piqa_gpt-4o_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/piqa_gpt-4o_question_answering/spec.yaml
+++ b/assets/evaluation_results/piqa_gpt-4o_question_answering/spec.yaml
@ -1,8 +1,8 @@
 type: evaluationresult
-name: piqa_gpt-4o_chat_completion
-version: 2.13.06
-display_name: piqa_gpt-4o_chat_completion
-description: piqa_gpt-4o_chat_completion
+name: piqa_gpt-4o_question_answering
+version: 2.12.08
+display_name: piqa_gpt-4o_question_answering
+description: gpt-4o run for piqa
 dataset_family: piqa
 dataset_name: piqa

@ -20,7 +20,7 @@ tags:
  accuracy_metric_name: exact_match

 metrics:
-  accuracy: 0.8443960826985855
+  accuracy: 0.938520131


 properties:
--- a/assets/evaluation_results/piqa_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
+++ b/assets/evaluation_results/piqa_meta-llama-3_1-405b-instruct_question_answering/asset.yaml
@ -0,0 +1,3 @@
+type: evaluationresult
+spec: spec.yaml
+categories: ["EvaluationResult"]
--- a/assets/evaluation_results/piqa_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
+++ b/assets/evaluation_results/piqa_meta-llama-3_1-405b-instruct_question_answering/spec.yaml
@ -0,0 +1,31 @@
+type: evaluationresult
+name: piqa_meta-llama-3_1-405b-instruct_question_answering
+version: 2.12.08
+display_name: piqa_Meta-Llama-3_1-405B-Instruct_question_answering
+description: Meta-Llama-3.1-405B-Instruct run for piqa
+dataset_family: piqa
+dataset_name: piqa
+
+model_name: Meta-Llama-3.1-405B-Instruct
+model_version: "1"
+model_asset_id: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct/versions/1
+
+tags:
+  evaluation_type: text_generation
+  task: question-answering
+  accuracy_metric_name: exact_match
+
+metrics:
+  accuracy: 0.886289445
+
+
+properties:
+  n_shot: 5
+  evaluation_sampling_ratio: 1.0
+  evaluation_split: "validation"
+  fewshot_sampling_ratio: 0.3
+  fewshot_split: "train"
--- a/Показать больше
+++ b/Показать больше