Fix the issue with supporting new evaluationresult asset types (#3544)
* Update the model evaluation environment with the modelling_llama patch.
* Add cost, quality, and performance tags for evaluationresult.
* Revert the changes for the transformers patch.
* Update the changelog and bump the version.
* Fix the issue with supporting new evaluationresult assets.
* Update the changelog file.
* Update the test cases.
* Update the test cases.
* Fix unit tests.
* Fix the changelog file.
---------
Co-authored-by: Chandra Sekhar Gupta Aravapalli <caravapalli@microsoft.com>
This commit is contained in:
Родитель
81f80b9b91
Коммит
4f983518a6
|
@ -1,7 +1,10 @@
|
|||
## 1.17.0 (Unreleased)
|
||||
### 🚀 New Features
|
||||
|
||||
|
||||
## 1.16.65 (2024-11-04)
|
||||
### 🐛 Bugs Fixed
|
||||
- [#3544](https://github.com/Azure/azureml-assets/pull/3544) Fix `validate_assets` for new evaluationresult asset tags
|
||||
|
||||
|
||||
## 1.16.64 (2024-10-31)
|
||||
|
|
|
@ -5,7 +5,7 @@ evaluation_type:
|
|||
values:
|
||||
- text_generation
|
||||
- text_embeddings
|
||||
- vision
|
||||
- text_cost
|
||||
- text_performance
|
||||
- text_quality
|
||||
- vision
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_cost`.
|
||||
index_metric_key:
|
||||
required: True
|
||||
allow_multiple: False
|
||||
values:
|
||||
- total_cost_per_1M_tokens
|
|
@ -0,0 +1,6 @@
|
|||
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_performance`.
|
||||
index_metric_key:
|
||||
required: True
|
||||
allow_multiple: False
|
||||
values:
|
||||
- throughput_gtps_token_count
|
|
@ -0,0 +1,6 @@
|
|||
# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_quality`.
|
||||
index_metric_key:
|
||||
required: True
|
||||
allow_multiple: False
|
||||
values:
|
||||
- index_metric
|
|
@ -1077,12 +1077,17 @@ def validate_assets(input_dirs: List[Path],
|
|||
asset_spec = asset_config._spec._yaml
|
||||
evaluation_type = asset_spec.get('tags', {}).get('evaluation_type', None)
|
||||
|
||||
if evaluation_type == 'text_generation':
|
||||
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_generation.yaml')
|
||||
elif evaluation_type == 'text_embeddings':
|
||||
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_embeddings.yaml')
|
||||
elif evaluation_type == 'vision':
|
||||
error_count += validate_tags(asset_config, 'evaluationresult/tag_values_vision.yaml')
|
||||
evaluation_tag_files = {
|
||||
'text_generation': 'evaluationresult/tag_values_text_generation.yaml',
|
||||
'text_embeddings': 'evaluationresult/tag_values_text_embeddings.yaml',
|
||||
'vision': 'evaluationresult/tag_values_vision.yaml',
|
||||
'text_quality': 'evaluationresult/tag_values_text_quality.yaml',
|
||||
'text_performance': 'evaluationresult/tag_values_text_performance.yaml',
|
||||
'text_cost': 'evaluationresult/tag_values_text_cost.yaml'
|
||||
}
|
||||
|
||||
if evaluation_type in evaluation_tag_files:
|
||||
error_count += validate_tags(asset_config, evaluation_tag_files[evaluation_type])
|
||||
else:
|
||||
_log_error(
|
||||
asset_config.file_name_with_path,
|
||||
|
|
|
@ -7,7 +7,7 @@ from setuptools import setup, find_packages
|
|||
|
||||
setup(
|
||||
name="azureml-assets",
|
||||
version="1.16.64",
|
||||
version="1.16.65",
|
||||
description="Utilities for publishing assets to Azure Machine Learning system registries.",
|
||||
author="Microsoft Corp",
|
||||
packages=find_packages(),
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
type: evaluationresult
|
||||
name: model1-16k_cost
|
||||
version: 21.10.24
|
||||
display_name: model1
|
||||
description: Cost benchmark results for model1
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
dataset_family: synthetic
|
||||
dataset_name: synthetic
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
tags:
|
||||
evaluation_type: text_cost
|
||||
index_metric_key: total_cost_per_1M_tokens
|
||||
|
||||
metrics:
|
||||
input_token_cost_per_1M_tokens: 3.0
|
||||
output_token_cost_per_1M_tokens: 4.0
|
||||
total_cost_per_1M_tokens: 3.25
|
||||
|
||||
properties:
|
||||
deployment_category: azureml
|
||||
disclaimer: Cost Calculation is indicative and may vary based on the actual usage
|
||||
and configuration.
|
||||
additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
|
||||
currency: USD
|
||||
input_output_token_ratio: '3:1'
|
||||
input_token_weightage_per_million: 750000
|
||||
output_token_weightage_per_million: 250000
|
||||
deployment_type: global
|
||||
region: eastus
|
||||
input_meter_id: input_meter_id
|
||||
output_meter_id: output_meter_id
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
type: evaluationresult
|
||||
name: model1-16k_cost
|
||||
version: 21.10.24
|
||||
display_name: model1
|
||||
description: Cost benchmark results for model1
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
dataset_family: synthetic
|
||||
dataset_name: synthetic
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
tags:
|
||||
evaluation_type: text_cost
|
||||
index_metric_key: random_key
|
||||
|
||||
metrics:
|
||||
input_token_cost_per_1M_tokens: 3.0
|
||||
output_token_cost_per_1M_tokens: 4.0
|
||||
total_cost_per_1M_tokens: 3.25
|
||||
|
||||
properties:
|
||||
deployment_category: azureml
|
||||
disclaimer: Cost Calculation is indicative and may vary based on the actual usage
|
||||
and configuration.
|
||||
additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
|
||||
currency: USD
|
||||
input_output_token_ratio: '3:1'
|
||||
input_token_weightage_per_million: 750000
|
||||
output_token_weightage_per_million: 250000
|
||||
deployment_type: global
|
||||
region: eastus
|
||||
input_meter_id: input_meter_id
|
||||
output_meter_id: output_meter_id
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
type: evaluationresult
|
||||
name: synthetic_model_perf
|
||||
version: 10.30.24
|
||||
display_name: synthetic_model
|
||||
description: Performance benchmark results for model on synthetic data
|
||||
|
||||
dataset_family: synthetic
|
||||
dataset_name: synthetic
|
||||
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_performance
|
||||
index_metric_value: generated_tokens_per_sec
|
||||
index_metric_key: throughput_gtps_token_count
|
||||
azure_registry_name: azureml
|
||||
azure_model_name: model1
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
metrics:
|
||||
throughput_gtps_token_count: 25.3
|
||||
throughput_ttps_token_count: 145.43
|
||||
throughput_rps_request_count: 0.14
|
||||
latency_p50_secs: 7.13
|
||||
latency_p90_secs: 7.4
|
||||
latency_p95_secs: 7.52
|
||||
latency_p99_secs: 8.2
|
||||
latency_mean_secs: 7.17
|
||||
latency_ttft_secs: 1.37
|
||||
time_between_tokens_secs: 0.29
|
||||
index_metric: 25.3
|
||||
|
||||
properties:
|
||||
deployment_category: azure_openai
|
||||
deployment_type: standard
|
||||
tokens_rate_limit: 30k
|
||||
total_token_length_per_request: 1000
|
||||
prompt_token_generated_token_ratio: '80:20'
|
||||
input_prompt_tokens: 800
|
||||
output_generated_tokens: 200
|
||||
num_of_inference_requests: 2
|
||||
num_of_inference_aggregations: 336
|
||||
payload_task_type: chat_completion
|
||||
num_parallel_inference_requests: '1'
|
||||
stream: true
|
||||
tokenizer: gpt-4-0314
|
||||
region: uksouth
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
type: evaluationresult
|
||||
name: synthetic_model_perf
|
||||
version: 10.30.24
|
||||
display_name: synthetic_model
|
||||
description: Performance benchmark results for model on synthetic data
|
||||
|
||||
dataset_family: synthetic
|
||||
dataset_name: synthetic
|
||||
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_performance
|
||||
index_metric_value: generated_tokens_per_sec
|
||||
index_metric_key: random_key
|
||||
azure_registry_name: azureml
|
||||
azure_model_name: model1
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
metrics:
|
||||
throughput_gtps_token_count: 25.3
|
||||
throughput_ttps_token_count: 145.43
|
||||
throughput_rps_request_count: 0.14
|
||||
latency_p50_secs: 7.13
|
||||
latency_p90_secs: 7.4
|
||||
latency_p95_secs: 7.52
|
||||
latency_p99_secs: 8.2
|
||||
latency_mean_secs: 7.17
|
||||
latency_ttft_secs: 1.37
|
||||
time_between_tokens_secs: 0.29
|
||||
index_metric: 25.3
|
||||
|
||||
properties:
|
||||
deployment_category: azure_openai
|
||||
deployment_type: standard
|
||||
tokens_rate_limit: 30k
|
||||
total_token_length_per_request: 1000
|
||||
prompt_token_generated_token_ratio: '80:20'
|
||||
input_prompt_tokens: 800
|
||||
output_generated_tokens: 200
|
||||
num_of_inference_requests: 2
|
||||
num_of_inference_aggregations: 336
|
||||
payload_task_type: chat_completion
|
||||
num_parallel_inference_requests: '1'
|
||||
stream: true
|
||||
tokenizer: gpt-4-0314
|
||||
region: uksouth
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
type: evaluationresult
|
||||
name: model1-0613_quality_index
|
||||
version: 10.30.24
|
||||
display_name: model1_quality_index
|
||||
description: aggregated quality benchmark results for model1
|
||||
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
dataset_family: aggregate
|
||||
dataset_name: aggregate
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_quality
|
||||
index_metric_key: index_metric
|
||||
azure_registry_name: azureml
|
||||
azure_model_name: model1
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.873692
|
||||
coherence: 4.882209
|
||||
fluency: 4.924
|
||||
GPTSimilarity: 3.916613
|
||||
groundedness: 4.296203
|
||||
relevance: 4.333895
|
||||
index_metric: 0.85442
|
||||
|
||||
properties:
|
||||
total_datasets: 15
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
type: evaluationresult
|
||||
spec: spec.yaml
|
||||
categories:
|
||||
- EvaluationResult
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
type: evaluationresult
|
||||
name: model1-0613_quality_index
|
||||
version: 10.30.24
|
||||
display_name: model1_quality_index
|
||||
description: aggregated quality benchmark results for model1
|
||||
|
||||
model_name: model1
|
||||
model_version: '1'
|
||||
model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
dataset_family: aggregate
|
||||
dataset_name: aggregate
|
||||
|
||||
relationships:
|
||||
- relationshipType: Source
|
||||
assetId: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
tags:
|
||||
evaluation_type: text_quality
|
||||
index_metric_key: random_key
|
||||
azure_registry_name: azureml
|
||||
azure_model_name: model1
|
||||
azure_latest_model_version: 1
|
||||
azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
|
||||
|
||||
metrics:
|
||||
accuracy: 0.873692
|
||||
coherence: 4.882209
|
||||
fluency: 4.924
|
||||
GPTSimilarity: 3.916613
|
||||
groundedness: 4.296203
|
||||
relevance: 4.333895
|
||||
index_metric: 0.85442
|
||||
|
||||
properties:
|
||||
total_datasets: 15
|
||||
|
|
@ -51,6 +51,12 @@ MODEL_VALIDATION_RESULTS = Path("resources/model_validation_results")
|
|||
("evaluationresult/text_generation_incorrect", False, True, None, False),
|
||||
("evaluationresult/vision_correct", False, True, None, True),
|
||||
("evaluationresult/vision_incorrect", False, True, None, False),
|
||||
("evaluationresult/text_cost_correct", False, True, None, True),
|
||||
("evaluationresult/text_cost_incorrect", False, True, None, False),
|
||||
("evaluationresult/text_quality_correct", False, True, None, True),
|
||||
("evaluationresult/text_quality_incorrect", False, True, None, False),
|
||||
("evaluationresult/text_performance_correct", False, True, None, True),
|
||||
("evaluationresult/text_performance_incorrect", False, True, None, False),
|
||||
]
|
||||
)
|
||||
def test_validate_assets(test_subdir: str, check_images: bool, check_names: bool,
|
||||
|
|
Загрузка…
Ссылка в новой задаче