Fix the issue with supporting new evaluationresult asset types (#3544)

* updating model evaluation environment with modelling_llama patch.

* add cost, quality, performance tags for evaluationresult.

* reverting the changes for transformers patch.

* updating the changelog and bump up the version.

* fix the issue with supporting new evaluationresult assets.

* updated the change log file.

* updating the test cases.

* updating the test cases.

* fixing unit tests.

* fixing change log file.

---------

Co-authored-by: Chandra Sekhar Gupta Aravapalli <caravapalli@microsoft.com>
This commit is contained in:
Chandra Sekhar Gupta 2024-11-04 08:34:59 +05:30 коммит произвёл GitHub
Родитель 81f80b9b91
Коммит 4f983518a6
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
20 изменённых файлов: 326 добавлений и 8 удалений

Просмотреть файл

@@ -1,7 +1,10 @@
## 1.17.0 (Unreleased)
### 🚀 New Features
## 1.16.65 (2024-11-04)
### 🐛 Bugs Fixed
- [#3544](https://github.com/Azure/azureml-assets/pull/3544) Fix validate assets for new evaluationresult asset tags
## 1.16.64 (2024-10-31)

Просмотреть файл

@@ -5,7 +5,7 @@ evaluation_type:
values:
- text_generation
- text_embeddings
- vision
- text_cost
- text_performance
- text_quality
- vision

Просмотреть файл

@@ -0,0 +1,6 @@
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_cost`.
# Contract for the `index_metric_key` tag on text_cost evaluationresult assets:
index_metric_key:
# The tag must be present on every text_cost asset.
required: True
# Exactly one value is allowed (no list of keys).
allow_multiple: False
# The set of accepted values for the tag.
values:
- total_cost_per_1M_tokens

Просмотреть файл

@@ -0,0 +1,6 @@
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_performance`.
# Contract for the `index_metric_key` tag on text_performance evaluationresult assets:
index_metric_key:
# The tag must be present on every text_performance asset.
required: True
# Exactly one value is allowed (no list of keys).
allow_multiple: False
# The set of accepted values for the tag.
values:
- throughput_gtps_token_count

Просмотреть файл

@@ -0,0 +1,6 @@
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_quality`.
# Contract for the `index_metric_key` tag on text_quality evaluationresult assets:
index_metric_key:
# The tag must be present on every text_quality asset.
required: True
# Exactly one value is allowed (no list of keys).
allow_multiple: False
# The set of accepted values for the tag.
values:
- index_metric

Просмотреть файл

@@ -1077,12 +1077,17 @@ def validate_assets(input_dirs: List[Path],
asset_spec = asset_config._spec._yaml
evaluation_type = asset_spec.get('tags', {}).get('evaluation_type', None)
if evaluation_type == 'text_generation':
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_generation.yaml')
elif evaluation_type == 'text_embeddings':
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_embeddings.yaml')
elif evaluation_type == 'vision':
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_vision.yaml')
evaluation_tag_files = {
'text_generation': 'evaluationresult/tag_values_text_generation.yaml',
'text_embeddings': 'evaluationresult/tag_values_text_embeddings.yaml',
'vision': 'evaluationresult/tag_values_vision.yaml',
'text_quality': 'evaluationresult/tag_values_text_quality.yaml',
'text_performance': 'evaluationresult/tag_values_text_performance.yaml',
'text_cost': 'evaluationresult/tag_values_text_cost.yaml'
}
if evaluation_type in evaluation_tag_files:
error_count += validate_tags(asset_config, evaluation_tag_files[evaluation_type])
else:
_log_error(
asset_config.file_name_with_path,

Просмотреть файл

@@ -7,7 +7,7 @@ from setuptools import setup, find_packages
setup(
name="azureml-assets",
version="1.16.64",
version="1.16.65",
description="Utilities for publishing assets to Azure Machine Learning system registries.",
author="Microsoft Corp",
packages=find_packages(),

Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,37 @@
# Test fixture: evaluationresult spec for a text_cost evaluation.
# NOTE(review): index_metric_key matches the single value allowed by
# tag_values_text_cost.yaml, so this is presumably the positive ("correct") case — confirm against the test directory name.
type: evaluationresult
name: model1-16k_cost
version: 21.10.24
display_name: model1
description: Cost benchmark results for model1
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
dataset_family: synthetic
dataset_name: synthetic
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_cost
index_metric_key: total_cost_per_1M_tokens
metrics:
input_token_cost_per_1M_tokens: 3.0
output_token_cost_per_1M_tokens: 4.0
total_cost_per_1M_tokens: 3.25
properties:
deployment_category: azureml
disclaimer: Cost Calculation is indicative and may vary based on the actual usage
and configuration.
additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
currency: USD
input_output_token_ratio: '3:1'
input_token_weightage_per_million: 750000
output_token_weightage_per_million: 250000
deployment_type: global
region: eastus
input_meter_id: input_meter_id
output_meter_id: output_meter_id

Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,37 @@
# Test fixture: evaluationresult spec for a text_cost evaluation.
# NOTE(review): index_metric_key 'random_key' is NOT in the values allowed by
# tag_values_text_cost.yaml — presumably the negative ("incorrect") case that validation must reject.
type: evaluationresult
name: model1-16k_cost
version: 21.10.24
display_name: model1
description: Cost benchmark results for model1
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
dataset_family: synthetic
dataset_name: synthetic
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_cost
index_metric_key: random_key
metrics:
input_token_cost_per_1M_tokens: 3.0
output_token_cost_per_1M_tokens: 4.0
total_cost_per_1M_tokens: 3.25
properties:
deployment_category: azureml
disclaimer: Cost Calculation is indicative and may vary based on the actual usage
and configuration.
additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
currency: USD
input_output_token_ratio: '3:1'
input_token_weightage_per_million: 750000
output_token_weightage_per_million: 250000
deployment_type: global
region: eastus
input_meter_id: input_meter_id
output_meter_id: output_meter_id

Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,54 @@
# Test fixture: evaluationresult spec for a text_performance evaluation.
# NOTE(review): index_metric_key matches the single value allowed by
# tag_values_text_performance.yaml, so this is presumably the positive ("correct") case — confirm against the test directory name.
type: evaluationresult
name: synthetic_model_perf
version: 10.30.24
display_name: synthetic_model
description: Performance benchmark results for model on synthetic data
dataset_family: synthetic
dataset_name: synthetic
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_performance
index_metric_value: generated_tokens_per_sec
index_metric_key: throughput_gtps_token_count
azure_registry_name: azureml
azure_model_name: model1
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
metrics:
throughput_gtps_token_count: 25.3
throughput_ttps_token_count: 145.43
throughput_rps_request_count: 0.14
latency_p50_secs: 7.13
latency_p90_secs: 7.4
latency_p95_secs: 7.52
latency_p99_secs: 8.2
latency_mean_secs: 7.17
latency_ttft_secs: 1.37
time_between_tokens_secs: 0.29
index_metric: 25.3
properties:
deployment_category: azure_openai
deployment_type: standard
tokens_rate_limit: 30k
total_token_length_per_request: 1000
prompt_token_generated_token_ratio: '80:20'
input_prompt_tokens: 800
output_generated_tokens: 200
num_of_inference_requests: 2
num_of_inference_aggregations: 336
payload_task_type: chat_completion
num_parallel_inference_requests: '1'
stream: true
tokenizer: gpt-4-0314
region: uksouth

Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,54 @@
# Test fixture: evaluationresult spec for a text_performance evaluation.
# NOTE(review): index_metric_key 'random_key' is NOT in the values allowed by
# tag_values_text_performance.yaml — presumably the negative ("incorrect") case that validation must reject.
type: evaluationresult
name: synthetic_model_perf
version: 10.30.24
display_name: synthetic_model
description: Performance benchmark results for model on synthetic data
dataset_family: synthetic
dataset_name: synthetic
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_performance
index_metric_value: generated_tokens_per_sec
index_metric_key: random_key
azure_registry_name: azureml
azure_model_name: model1
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
metrics:
throughput_gtps_token_count: 25.3
throughput_ttps_token_count: 145.43
throughput_rps_request_count: 0.14
latency_p50_secs: 7.13
latency_p90_secs: 7.4
latency_p95_secs: 7.52
latency_p99_secs: 8.2
latency_mean_secs: 7.17
latency_ttft_secs: 1.37
time_between_tokens_secs: 0.29
index_metric: 25.3
properties:
deployment_category: azure_openai
deployment_type: standard
tokens_rate_limit: 30k
total_token_length_per_request: 1000
prompt_token_generated_token_ratio: '80:20'
input_prompt_tokens: 800
output_generated_tokens: 200
num_of_inference_requests: 2
num_of_inference_aggregations: 336
payload_task_type: chat_completion
num_parallel_inference_requests: '1'
stream: true
tokenizer: gpt-4-0314
region: uksouth

Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,37 @@
# Test fixture: evaluationresult spec for a text_quality evaluation.
# NOTE(review): index_metric_key matches the single value allowed by
# tag_values_text_quality.yaml, so this is presumably the positive ("correct") case — confirm against the test directory name.
type: evaluationresult
name: model1-0613_quality_index
version: 10.30.24
display_name: model1_quality_index
description: aggregated quality benchmark results for model1
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
dataset_family: aggregate
dataset_name: aggregate
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_quality
index_metric_key: index_metric
azure_registry_name: azureml
azure_model_name: model1
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
metrics:
accuracy: 0.873692
coherence: 4.882209
fluency: 4.924
GPTSimilarity: 3.916613
groundedness: 4.296203
relevance: 4.333895
index_metric: 0.85442
properties:
total_datasets: 15
Просмотреть файл

@@ -0,0 +1,5 @@
# Asset config: registers the adjacent spec.yaml as an evaluationresult asset
# under the EvaluationResult category (test fixture for validate_assets).
type: evaluationresult
spec: spec.yaml
categories:
- EvaluationResult

Просмотреть файл

@@ -0,0 +1,37 @@
# Test fixture: evaluationresult spec for a text_quality evaluation.
# NOTE(review): index_metric_key 'random_key' is NOT in the values allowed by
# tag_values_text_quality.yaml — presumably the negative ("incorrect") case that validation must reject.
type: evaluationresult
name: model1-0613_quality_index
version: 10.30.24
display_name: model1_quality_index
description: aggregated quality benchmark results for model1
model_name: model1
model_version: '1'
model_asset_id: azureml://registries/azureml/models/model1/versions/1
dataset_family: aggregate
dataset_name: aggregate
relationships:
- relationshipType: Source
assetId: azureml://registries/azureml/models/model1/versions/1
# Tags drive validation: evaluation_type selects the tag-values file to check against.
tags:
evaluation_type: text_quality
index_metric_key: random_key
azure_registry_name: azureml
azure_model_name: model1
azure_latest_model_version: 1
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
metrics:
accuracy: 0.873692
coherence: 4.882209
fluency: 4.924
GPTSimilarity: 3.916613
groundedness: 4.296203
relevance: 4.333895
index_metric: 0.85442
properties:
total_datasets: 15

Просмотреть файл

@@ -51,6 +51,12 @@ MODEL_VALIDATION_RESULTS = Path("resources/model_validation_results")
("evaluationresult/text_generation_incorrect", False, True, None, False),
("evaluationresult/vision_correct", False, True, None, True),
("evaluationresult/vision_incorrect", False, True, None, False),
("evaluationresult/text_cost_correct", False, True, None, True),
("evaluationresult/text_cost_incorrect", False, True, None, False),
("evaluationresult/text_quality_correct", False, True, None, True),
("evaluationresult/text_quality_incorrect", False, True, None, False),
("evaluationresult/text_performance_correct", False, True, None, True),
("evaluationresult/text_performance_incorrect", False, True, None, False),
]
)
def test_validate_assets(test_subdir: str, check_images: bool, check_names: bool,