Fix the issue with supporting new evaluationresult asset types (#3544)

* Update the model evaluation environment with the modelling_llama patch.
* Add cost, quality, and performance tags for evaluationresult.
* Revert the changes for the transformers patch.
* Update the changelog and bump the version.
* Fix the issue with supporting new evaluationresult assets.
* Update the changelog file.
* Update the test cases.
* Update the test cases.
* Fix unit tests.
* Fix the changelog file.

---------

Co-authored-by: Chandra Sekhar Gupta Aravapalli <caravapalli@microsoft.com>
2024-11-04 08:34:59 +05:30 · 2024-11-04 08:34:59 +05:30 · 4f983518a6
--- a/scripts/azureml-assets/CHANGELOG.md
+++ b/scripts/azureml-assets/CHANGELOG.md
@ -1,7 +1,10 @@
 ## 1.17.0 (Unreleased)
 ### 🚀 New Features

+
+## 1.16.65 (2024-11-04)
 ### 🐛 Bugs Fixed
+- [#3544](https://github.com/Azure/azureml-assets/pull/3544) Fix validate assets for new evaluationresult asset tags


 ## 1.16.64 (2024-10-31)
--- a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
@ -5,7 +5,7 @@ evaluation_type:
  values:
    - text_generation
    - text_embeddings
-    - vision
    - text_cost
    - text_performance
    - text_quality
+    - vision
--- a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml
@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_cost`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - total_cost_per_1M_tokens
--- a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml
@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_performance`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - throughput_gtps_token_count
--- a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml
@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_quality`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - index_metric
--- a/scripts/azureml-assets/azureml/assets/validate_assets.py
+++ b/scripts/azureml-assets/azureml/assets/validate_assets.py
@ -1077,12 +1077,17 @@ def validate_assets(input_dirs: List[Path],
                asset_spec = asset_config._spec._yaml
                evaluation_type = asset_spec.get('tags', {}).get('evaluation_type', None)

-                if evaluation_type == 'text_generation':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_generation.yaml')
-                elif evaluation_type == 'text_embeddings':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_embeddings.yaml')
-                elif evaluation_type == 'vision':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_vision.yaml')
+                evaluation_tag_files = {
+                    'text_generation': 'evaluationresult/tag_values_text_generation.yaml',
+                    'text_embeddings': 'evaluationresult/tag_values_text_embeddings.yaml',
+                    'vision': 'evaluationresult/tag_values_vision.yaml',
+                    'text_quality': 'evaluationresult/tag_values_text_quality.yaml',
+                    'text_performance': 'evaluationresult/tag_values_text_performance.yaml',
+                    'text_cost': 'evaluationresult/tag_values_text_cost.yaml'
+                }
+
+                if evaluation_type in evaluation_tag_files:
+                    error_count += validate_tags(asset_config, evaluation_tag_files[evaluation_type])
                else:
                    _log_error(
                        asset_config.file_name_with_path,
--- a/scripts/azureml-assets/setup.py
+++ b/scripts/azureml-assets/setup.py
@ -7,7 +7,7 @@ from setuptools import setup, find_packages

 setup(
   name="azureml-assets",
-   version="1.16.64",
+   version="1.16.65",
   description="Utilities for publishing assets to Azure Machine Learning system registries.",
   author="Microsoft Corp",
   packages=find_packages(),
--- a/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml
@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: total_cost_per_1M_tokens
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
--- a/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml
@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: random_key
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
--- a/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml
@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: throughput_gtps_token_count
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
--- a/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml
@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
--- a/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml
@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: index_metric
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
--- a/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml
+++ b/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml
@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
--- a/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml
+++ b/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml
@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
--- a/test/test_validate_assets.py
+++ b/test/test_validate_assets.py
@ -51,6 +51,12 @@ MODEL_VALIDATION_RESULTS = Path("resources/model_validation_results")
        ("evaluationresult/text_generation_incorrect", False, True, None, False),
        ("evaluationresult/vision_correct", False, True, None, True),
        ("evaluationresult/vision_incorrect", False, True, None, False),
+        ("evaluationresult/text_cost_correct", False, True, None, True),
+        ("evaluationresult/text_cost_incorrect", False, True, None, False),
+        ("evaluationresult/text_quality_correct", False, True, None, True),
+        ("evaluationresult/text_quality_incorrect", False, True, None, False),
+        ("evaluationresult/text_performance_correct", False, True, None, True),
+        ("evaluationresult/text_performance_incorrect", False, True, None, False),
    ]
 )
 def test_validate_assets(test_subdir: str, check_images: bool, check_names: bool,