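# Onboards an AKS cluster to Azure Monitor managed Prometheus end to end: the resource
# group, the AKS cluster, an Azure Monitor workspace, the data collection endpoint (DCE),
# data collection rule (DCR) and rule association (DCRA), an Azure Managed Grafana
# instance wired to the workspace, and the default Prometheus recording rule groups.
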
resource " azurerm_resource_group " " rg " {
location = var . resource_group_location
name = " defaultPrometheusOnboardingResourceGroup "
}
resource " azurerm_kubernetes_cluster " " k8s " {
location = azurerm_resource_group . rg . location
name = var . cluster_name
resource_group_name = azurerm_resource_group . rg . name
  dns_prefix          = var.dns_prefix

  tags = {
    Environment = "Development"
  }

  default_node_pool {
    name       = "agentpool"
    vm_size    = "Standard_D2_v2"
    node_count = var.agent_count
  }

  monitor_metrics {
    annotations_allowed = var.metric_annotations_allowlist
    labels_allowed      = var.metric_labels_allowlist
  }

  network_profile {
    network_plugin    = "kubenet"
    load_balancer_sku = "standard"
  }

  identity {
    type = "SystemAssigned"
  }
}

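# Illustrative only: the input variables referenced above are assumed to be declared in a
# separate variables.tf roughly as follows (the names come from this file; the types are
# assumptions, not part of the original configuration):
#
#   variable "resource_group_location" { type = string }
#   variable "cluster_name" { type = string }
#   variable "dns_prefix" { type = string }
#   variable "agent_count" { type = number }
#   variable "metric_annotations_allowlist" { type = string }
#   variable "metric_labels_allowlist" { type = string }
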
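# Azure Monitor workspace that receives the Prometheus metrics forwarded by the data
# collection rule below.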
resource " azurerm_monitor_workspace " " amw " {
name = var . monitor_workspace_name
resource_group_name = azurerm_resource_group . rg . name
location = azurerm_resource_group . rg . location
}
resource " azurerm_monitor_data_collection_endpoint " " dce " {
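  # DCE names are limited to 44 characters. Wrapping the length in min() keeps the
  # substring call from failing when "MSProm-<location>-<cluster>" is shorter than 44
  # characters (short cluster names previously caused template validation errors).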
name = substr ( " MSProm- ${ azurerm_resource_group . rg . location } - ${ var . cluster_name } " , 0 , min ( 44 , length ( " MSProm- ${ azurerm_resource_group . rg . location } - ${ var . cluster_name } " ) ) )
2023-08-03 04:08:56 +03:00
resource_group_name = azurerm_resource_group . rg . name
location = azurerm_resource_group . rg . location
kind = " Linux "
2023-04-26 02:12:03 +03:00
}
# Logic to determine region mismatch
locals {
  dce_region_mismatch = var.cluster_region != var.amw_region
}

# Create another DCE if the regions don't match and is_private_cluster is true
resource "azurerm_monitor_data_collection_endpoint" "dce_mismatch" {
  count               = (local.dce_region_mismatch && var.is_private_cluster) ? 1 : 0
  name                = substr("MSProm-PL-${azurerm_resource_group.rg.location}-${var.cluster_name}", 0, min(44, length("MSProm-PL-${azurerm_resource_group.rg.location}-${var.cluster_name}")))
  resource_group_name = azurerm_resource_group.rg.name
  location            = var.cluster_region
  kind                = "Linux"
}

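# Illustrative only: the region-mismatch logic above assumes these additional inputs,
# shown here as a sketch (types are assumptions, not part of the original configuration):
#
#   variable "cluster_region" { type = string }
#   variable "amw_region" { type = string }
#   variable "is_private_cluster" { type = bool }
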
resource " azurerm_monitor_data_collection_rule " " dcr " {
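  # DCR names allow up to 64 characters; the same min() guard keeps the substring call
  # valid for short location/cluster name combinations.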
name = substr ( " MSProm- ${ azurerm_resource_group . rg . location } - ${ var . cluster_name } " , 0 , min ( 64 , length ( " MSProm- ${ azurerm_resource_group . rg . location } - ${ var . cluster_name } " ) ) )
2023-08-03 04:08:56 +03:00
resource_group_name = azurerm_resource_group . rg . name
location = azurerm_resource_group . rg . location
2023-04-26 02:12:03 +03:00
data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint . dce . id
2023-08-03 04:08:56 +03:00
kind = " Linux "
2023-04-26 02:12:03 +03:00
destinations {
monitor_account {
2023-08-03 04:08:56 +03:00
monitor_account_id = azurerm_monitor_workspace . amw . id
name = " MonitoringAccount1 "
2023-04-26 02:12:03 +03:00
}
}
data _ flow {
streams = [ " Microsoft-PrometheusMetrics " ]
destinations = [ " MonitoringAccount1 " ]
}
data _ sources {
prometheus_forwarder {
2023-08-03 04:08:56 +03:00
streams = [ " Microsoft-PrometheusMetrics " ]
name = " PrometheusDataSource "
2023-04-26 02:12:03 +03:00
}
}
description = " DCR for Azure Monitor Metrics Profile (Managed Prometheus) "
depends_on = [
azurerm_monitor_data_collection_endpoint . dce
]
}
resource " azurerm_monitor_data_collection_rule_association " " dcra " {
2023-08-03 04:08:56 +03:00
name = " MSProm- ${ azurerm_resource_group . rg . location } - ${ var . cluster_name } "
target_resource_id = azurerm_kubernetes_cluster . k8s . id
data_collection_rule_id = azurerm_monitor_data_collection_rule . dcr . id
description = " Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster. "
2023-05-26 20:05:34 +03:00
depends_on = [
azurerm_monitor_data_collection_rule . dcr
]
2023-04-26 02:12:03 +03:00
}
resource " azurerm_monitor_data_collection_rule_association " " dcra_mismatch " {
count = ( local . dce_region_mismatch && var . is_private_cluster ) ? 1 : 0
target_resource_id = azurerm_kubernetes_cluster . k8s . id
data_collection_endpoint_id = local . dce_region_mismatch ? azurerm_monitor_data_collection_endpoint . dce_mismatch [ 0 ] . id : azurerm_monitor_data_collection_endpoint . dce . id
description = " Association of data collection endpoint for private link clusters. Deleting this association will break the data collection for this AKS Cluster. "
depends_on = [
azurerm_monitor_data_collection_endpoint . dce
]
}
resource " azurerm_dashboard_grafana " " grafana " {
2023-08-03 04:08:56 +03:00
name = var . grafana_name
resource_group_name = azurerm_resource_group . rg . name
location = var . grafana_location
2024-10-16 02:00:45 +03:00
grafana_major_version = var . grafana_version
2023-04-26 02:12:03 +03:00
identity {
type = " SystemAssigned "
}
azure_monitor_workspace_integrations {
2023-08-03 04:08:56 +03:00
resource_id = azurerm_monitor_workspace . amw . id
2023-04-26 02:12:03 +03:00
}
}
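# Grants the Grafana managed identity the Monitoring Data Reader role on the Azure Monitor
# workspace (role definition b0d8363b-8ddd-447d-831f-62ca05bff136) so dashboards can query
# the Prometheus metrics.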
resource " azurerm_role_assignment " " datareaderrole " {
2023-08-03 04:08:56 +03:00
scope = azurerm_monitor_workspace . amw . id
role_definition_id = " /subscriptions/ ${ split ( " / " , azurerm_monitor_workspace . amw . id ) [ 2 ] } /providers/Microsoft.Authorization/roleDefinitions/b0d8363b-8ddd-447d-831f-62ca05bff136 "
2023-04-26 02:12:03 +03:00
principal_id = azurerm_dashboard_grafana . grafana . identity . 0 . principal_id
}
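# Default Prometheus recording rule groups (Linux node, Kubernetes, and Windows node rules),
# scoped to the Azure Monitor workspace and the AKS cluster and evaluated every minute.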
resource " azurerm_monitor_alert_prometheus_rule_group " " node_recording_rules_rule_group " {
name = " NodeRecordingRulesRuleGroup- ${ var . cluster_name } "
location = azurerm_resource_group . rg . location
resource_group_name = azurerm_resource_group . rg . name
cluster_name = var . cluster_name
description = " Node Recording Rules Rule Group "
rule_group_enabled = true
interval = " PT1M "
2023-09-28 00:37:22 +03:00
scopes = [ azurerm_monitor_workspace . amw . id , azurerm_kubernetes_cluster . k8s . id ]
2023-04-26 02:12:03 +03:00
2023-08-03 04:08:56 +03:00
rule {
enabled = true
record = " instance:node_num_cpu:sum "
expression = < < EOF
count without ( cpu , mode ) ( node_cpu_seconds_total { job =" node " , mode =" idle " } )
EOF
}
rule {
enabled = true
record = " instance:node_cpu_utilisation:rate5m "
expression = < < EOF
1 - avg without ( cpu ) ( sum without ( mode ) ( rate ( node_cpu_seconds_total { job =" node " , mode =~ " idle|iowait|steal " } [ 5 m ] ) ) )
EOF
}
rule {
enabled = true
record = " instance:node_load1_per_cpu:ratio "
expression = < < EOF
( node_load1 { job =" node " } / instance : node_num_cpu : sum { job =" node " } )
EOF
}
rule {
enabled = true
record = " instance:node_memory_utilisation:ratio "
expression = < < EOF
1 - ( ( node_memory_MemAvailable_bytes { job =" node " } or ( node_memory_Buffers_bytes { job =" node " } + node_memory_Cached_bytes { job =" node " } + node_memory_MemFree_bytes { job =" node " } + node_memory_Slab_bytes { job =" node " } ) ) / node_memory_MemTotal_bytes { job =" node " } )
EOF
}
rule {
enabled = true
record = " instance:node_vmstat_pgmajfault:rate5m "
expression = < < EOF
rate ( node_vmstat_pgmajfault { job =" node " } [ 5 m ] )
EOF
}
rule {
enabled = true
record = " instance_device:node_disk_io_time_seconds:rate5m "
expression = < < EOF
rate ( node_disk_io_time_seconds_total { job =" node " , device ! = " " } [ 5 m ] )
EOF
}
rule {
enabled = true
record = " instance_device:node_disk_io_time_weighted_seconds:rate5m "
expression = < < EOF
rate ( node_disk_io_time_weighted_seconds_total { job =" node " , device ! = " " } [ 5 m ] )
EOF
}
rule {
enabled = true
record = " instance:node_network_receive_bytes_excluding_lo:rate5m "
expression = < < EOF
sum without ( device ) ( rate ( node_network_receive_bytes_total { job =" node " , device ! = " lo " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " instance:node_network_transmit_bytes_excluding_lo:rate5m "
expression = < < EOF
sum without ( device ) ( rate ( node_network_transmit_bytes_total { job =" node " , device ! = " lo " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " instance:node_network_receive_drop_excluding_lo:rate5m "
expression = < < EOF
sum without ( device ) ( rate ( node_network_receive_drop_total { job =" node " , device ! = " lo " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " instance:node_network_transmit_drop_excluding_lo:rate5m "
expression = < < EOF
sum without ( device ) ( rate ( node_network_transmit_drop_total { job =" node " , device ! = " lo " } [ 5 m ] ) )
EOF
}
2023-04-26 02:12:03 +03:00
}
resource " azurerm_monitor_alert_prometheus_rule_group " " kubernetes_recording_rules_rule_group " {
name = " KubernetesRecordingRulesRuleGroup- ${ var . cluster_name } "
location = azurerm_resource_group . rg . location
resource_group_name = azurerm_resource_group . rg . name
cluster_name = var . cluster_name
description = " Kubernetes Recording Rules Rule Group "
rule_group_enabled = true
interval = " PT1M "
2023-09-28 00:37:22 +03:00
scopes = [ azurerm_monitor_workspace . amw . id , azurerm_kubernetes_cluster . k8s . id ]
2023-04-26 02:12:03 +03:00
2023-08-03 04:08:56 +03:00
rule {
enabled = true
record = " node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate "
expression = < < EOF
sum by ( cluster , namespace , pod , container ) ( irate ( container_cpu_usage_seconds_total { job =" cadvisor " , image ! = " " } [ 5 m ] ) ) * on ( cluster , namespace , pod ) group_left ( node ) topk by ( cluster , namespace , pod ) ( 1 , max by ( cluster , namespace , pod , node ) ( kube_pod_info { node ! = " " } ) )
EOF
}
rule {
enabled = true
record = " node_namespace_pod_container:container_memory_working_set_bytes "
expression = < < EOF
container_memory_working_set_bytes { job =" cadvisor " , image ! = " " } * on ( namespace , pod ) group_left ( node ) topk by ( namespace , pod ) ( 1 , max by ( namespace , pod , node ) ( kube_pod_info { node ! = " " } ) )
EOF
}
rule {
enabled = true
record = " node_namespace_pod_container:container_memory_rss "
expression = < < EOF
container_memory_rss { job =" cadvisor " , image ! = " " } * on ( namespace , pod ) group_left ( node ) topk by ( namespace , pod ) ( 1 , max by ( namespace , pod , node ) ( kube_pod_info { node ! = " " } ) )
EOF
}
rule {
enabled = true
record = " node_namespace_pod_container:container_memory_cache "
expression = < < EOF
container_memory_cache { job =" cadvisor " , image ! = " " } * on ( namespace , pod ) group_left ( node ) topk by ( namespace , pod ) ( 1 , max by ( namespace , pod , node ) ( kube_pod_info { node ! = " " } ) )
EOF
}
rule {
enabled = true
record = " node_namespace_pod_container:container_memory_swap "
expression = < < EOF
container_memory_swap { job =" cadvisor " , image ! = " " } * on ( namespace , pod ) group_left ( node ) topk by ( namespace , pod ) ( 1 , max by ( namespace , pod , node ) ( kube_pod_info { node ! = " " } ) )
EOF
}
rule {
enabled = true
record = " cluster:namespace:pod_memory:active:kube_pod_container_resource_requests "
expression = < < EOF
kube_pod_container_resource_requests { resource =" memory " , job =" kube-state-metrics " } * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) )
EOF
}
rule {
enabled = true
record = " namespace_memory:kube_pod_container_resource_requests:sum "
expression = < < EOF
sum by ( namespace , cluster ) ( sum by ( namespace , pod , cluster ) ( max by ( namespace , pod , container , cluster ) ( kube_pod_container_resource_requests { resource =" memory " , job =" kube-state-metrics " } ) * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) ) )
EOF
}
rule {
enabled = true
record = " cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests "
expression = < < EOF
kube_pod_container_resource_requests { resource =" cpu " , job =" kube-state-metrics " } * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) )
EOF
}
rule {
enabled = true
record = " namespace_cpu:kube_pod_container_resource_requests:sum "
expression = < < EOF
sum by ( namespace , cluster ) ( sum by ( namespace , pod , cluster ) ( max by ( namespace , pod , container , cluster ) ( kube_pod_container_resource_requests { resource =" cpu " , job =" kube-state-metrics " } ) * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) ) )
EOF
}
rule {
enabled = true
record = " cluster:namespace:pod_memory:active:kube_pod_container_resource_limits "
expression = < < EOF
kube_pod_container_resource_limits { resource =" memory " , job =" kube-state-metrics " } * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) )
EOF
}
rule {
enabled = true
record = " namespace_memory:kube_pod_container_resource_limits:sum "
expression = < < EOF
sum by ( namespace , cluster ) ( sum by ( namespace , pod , cluster ) ( max by ( namespace , pod , container , cluster ) ( kube_pod_container_resource_limits { resource =" memory " , job =" kube-state-metrics " } ) * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) ) )
EOF
}
rule {
enabled = true
record = " cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits "
expression = < < EOF
kube_pod_container_resource_limits { resource =" cpu " , job =" kube-state-metrics " } * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) )
EOF
}
rule {
enabled = true
record = " namespace_cpu:kube_pod_container_resource_limits:sum "
expression = < < EOF
sum by ( namespace , cluster ) ( sum by ( namespace , pod , cluster ) ( max by ( namespace , pod , container , cluster ) ( kube_pod_container_resource_limits { resource =" cpu " , job =" kube-state-metrics " } ) * on ( namespace , pod , cluster ) group_left ( ) max by ( namespace , pod , cluster ) ( kube_pod_status_phase { phase =~ " Pending|Running " } = = 1 ) ) )
EOF
}
rule {
enabled = true
record = " namespace_workload_pod:kube_pod_owner:relabel "
expression = < < EOF
max by ( cluster , namespace , workload , pod ) ( label_replace ( label_replace ( kube_pod_owner { job =" kube-state-metrics " , owner_kind =" ReplicaSet " } , " replicaset " , " $ 1 " , " owner_name " , " (.*) " ) * on ( replicaset , namespace ) group_left ( owner_name ) topk by ( replicaset , namespace ) ( 1 , max by ( replicaset , namespace , owner_name ) ( kube_replicaset_owner { job =" kube-state-metrics " } ) ) , " workload " , " $ 1 " , " owner_name " , " (.*) " ) )
EOF
labels = {
workload_type = " deployment "
}
}
rule {
enabled = true
record = " namespace_workload_pod:kube_pod_owner:relabel "
expression = < < EOF
max by ( cluster , namespace , workload , pod ) ( label_replace ( kube_pod_owner { job =" kube-state-metrics " , owner_kind =" DaemonSet " } , " workload " , " $ 1 " , " owner_name " , " (.*) " ) )
EOF
labels = {
workload_type = " daemonset "
}
}
rule {
enabled = true
record = " namespace_workload_pod:kube_pod_owner:relabel "
expression = < < EOF
max by ( cluster , namespace , workload , pod ) ( label_replace ( kube_pod_owner { job =" kube-state-metrics " , owner_kind =" StatefulSet " } , " workload " , " $ 1 " , " owner_name " , " (.*) " ) )
EOF
labels = {
workload_type = " statefulset "
}
}
rule {
enabled = true
record = " namespace_workload_pod:kube_pod_owner:relabel "
expression = < < EOF
max by ( cluster , namespace , workload , pod ) ( label_replace ( kube_pod_owner { job =" kube-state-metrics " , owner_kind =" Job " } , " workload " , " $ 1 " , " owner_name " , " (.*) " ) )
EOF
labels = {
workload_type = " job "
}
}
rule {
enabled = true
record = " :node_memory_MemAvailable_bytes:sum "
expression = < < EOF
sum ( node_memory_MemAvailable_bytes { job =" node " } or ( node_memory_Buffers_bytes { job =" node " } + node_memory_Cached_bytes { job =" node " } + node_memory_MemFree_bytes { job =" node " } + node_memory_Slab_bytes { job =" node " } ) ) by ( cluster )
EOF
}
rule {
enabled = true
record = " cluster:node_cpu:ratio_rate5m "
expression = < < EOF
sum ( rate ( node_cpu_seconds_total { job =" node " , mode ! = " idle " , mode ! = " iowait " , mode ! = " steal " } [ 5 m ] ) ) by ( cluster ) / count ( sum ( node_cpu_seconds_total { job =" node " } ) by ( cluster , instance , cpu ) ) by ( cluster )
EOF
}
2023-04-26 02:12:03 +03:00
}
resource " azurerm_monitor_alert_prometheus_rule_group " " node_and_kubernetes_recording_rules_rule_group_win " {
name = " NodeAndKubernetesRecordingRulesRuleGroup-Win- ${ var . cluster_name } "
location = azurerm_resource_group . rg . location
resource_group_name = azurerm_resource_group . rg . name
cluster_name = var . cluster_name
description = " Node and Kubernetes Recording Rules Rule Group for Windows Nodes "
rule_group_enabled = true
interval = " PT1M "
2023-09-28 00:37:22 +03:00
scopes = [ azurerm_monitor_workspace . amw . id , azurerm_kubernetes_cluster . k8s . id ]
2023-08-03 04:08:56 +03:00
rule {
enabled = true
record = " node:windows_node_filesystem_usage: "
expression = < < EOF
max by ( instance , volume ) ( ( windows_logical_disk_size_bytes { job =" windows-exporter " } - windows_logical_disk_free_bytes { job =" windows-exporter " } ) / windows_logical_disk_size_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " node:windows_node_filesystem_avail: "
expression = < < EOF
max by ( instance , volume ) ( windows_logical_disk_free_bytes { job =" windows-exporter " } / windows_logical_disk_size_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " :windows_node_net_utilisation:sum_irate "
expression = < < EOF
sum ( irate ( windows_net_bytes_total { job =" windows-exporter " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_net_utilisation:sum_irate "
expression = < < EOF
sum by ( instance ) ( ( irate ( windows_net_bytes_total { job =" windows-exporter " } [ 5 m ] ) ) )
EOF
}
rule {
enabled = true
record = " :windows_node_net_saturation:sum_irate "
expression = < < EOF
sum ( irate ( windows_net_packets_received_discarded_total { job =" windows-exporter " } [ 5 m ] ) ) + sum ( irate ( windows_net_packets_outbound_discarded_total { job =" windows-exporter " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_net_saturation:sum_irate "
expression = < < EOF
sum by ( instance ) ( ( irate ( windows_net_packets_received_discarded_total { job =" windows-exporter " } [ 5 m ] ) + irate ( windows_net_packets_outbound_discarded_total { job =" windows-exporter " } [ 5 m ] ) ) )
EOF
}
rule {
enabled = true
record = " windows_pod_container_available "
expression = < < EOF
windows_container_available { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " windows_container_total_runtime "
expression = < < EOF
windows_container_cpu_usage_seconds_total { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " windows_container_memory_usage "
expression = < < EOF
windows_container_memory_usage_commit_bytes { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " windows_container_private_working_set_usage "
expression = < < EOF
windows_container_memory_usage_private_working_set_bytes { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " windows_container_network_received_bytes_total "
expression = < < EOF
windows_container_network_receive_bytes_total { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " windows_container_network_transmitted_bytes_total "
expression = < < EOF
windows_container_network_transmit_bytes_total { job =" windows-exporter " , container_id ! = " " } * on ( container_id ) group_left ( container , pod , namespace ) max ( kube_pod_container_info { job =" kube-state-metrics " , container_id ! = " " } ) by ( container , container_id , pod , namespace )
EOF
}
rule {
enabled = true
record = " kube_pod_windows_container_resource_memory_request "
expression = < < EOF
max by ( namespace , pod , container ) ( kube_pod_container_resource_requests { resource =" memory " , job =" kube-state-metrics " } ) * on ( container , pod , namespace ) ( windows_pod_container_available )
EOF
}
rule {
enabled = true
record = " kube_pod_windows_container_resource_memory_limit "
expression = < < EOF
kube_pod_container_resource_limits { resource =" memory " , job =" kube-state-metrics " } * on ( container , pod , namespace ) ( windows_pod_container_available )
EOF
}
rule {
enabled = true
record = " kube_pod_windows_container_resource_cpu_cores_request "
expression = < < EOF
max by ( namespace , pod , container ) ( kube_pod_container_resource_requests { resource =" cpu " , job =" kube-state-metrics " } ) * on ( container , pod , namespace ) ( windows_pod_container_available )
EOF
}
rule {
enabled = true
record = " kube_pod_windows_container_resource_cpu_cores_limit "
expression = < < EOF
kube_pod_container_resource_limits { resource =" cpu " , job =" kube-state-metrics " } * on ( container , pod , namespace ) ( windows_pod_container_available )
EOF
}
rule {
enabled = true
record = " namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate "
expression = < < EOF
sum by ( namespace , pod , container ) ( rate ( windows_container_total_runtime { } [ 5 m ] ) )
EOF
}
2023-04-26 02:12:03 +03:00
}
resource " azurerm_monitor_alert_prometheus_rule_group " " node_recording_rules_rule_group_win " {
name = " NodeRecordingRulesRuleGroup-Win- ${ var . cluster_name } "
location = azurerm_resource_group . rg . location
resource_group_name = azurerm_resource_group . rg . name
cluster_name = var . cluster_name
description = " Node and Kubernetes Recording Rules Rule Group for Windows Nodes "
rule_group_enabled = true
interval = " PT1M "
2023-09-28 00:37:22 +03:00
scopes = [ azurerm_monitor_workspace . amw . id , azurerm_kubernetes_cluster . k8s . id ]
2023-04-26 02:12:03 +03:00
2023-08-03 04:08:56 +03:00
rule {
enabled = true
record = " node:windows_node:sum "
expression = < < EOF
count ( windows_system_system_up_time { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " node:windows_node_num_cpu:sum "
expression = < < EOF
count by ( instance ) ( sum by ( instance , core ) ( windows_cpu_time_total { job =" windows-exporter " } ) )
EOF
}
rule {
enabled = true
record = " :windows_node_cpu_utilisation:avg5m "
expression = < < EOF
1 - avg ( rate ( windows_cpu_time_total { job =" windows-exporter " , mode =" idle " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_cpu_utilisation:avg5m "
expression = < < EOF
1 - avg by ( instance ) ( rate ( windows_cpu_time_total { job =" windows-exporter " , mode =" idle " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " :windows_node_memory_utilisation: "
expression = < < EOF
1 - sum ( windows_memory_available_bytes { job =" windows-exporter " } ) / sum ( windows_os_visible_memory_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " :windows_node_memory_MemFreeCached_bytes:sum "
expression = < < EOF
sum ( windows_memory_available_bytes { job =" windows-exporter " } + windows_memory_cache_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_totalCached_bytes:sum "
expression = < < EOF
( windows_memory_cache_bytes { job =" windows-exporter " } + windows_memory_modified_page_list_bytes { job =" windows-exporter " } + windows_memory_standby_cache_core_bytes { job =" windows-exporter " } + windows_memory_standby_cache_normal_priority_bytes { job =" windows-exporter " } + windows_memory_standby_cache_reserve_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " :windows_node_memory_MemTotal_bytes:sum "
expression = < < EOF
sum ( windows_os_visible_memory_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_bytes_available:sum "
expression = < < EOF
sum by ( instance ) ( ( windows_memory_available_bytes { job =" windows-exporter " } ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_bytes_total:sum "
expression = < < EOF
sum by ( instance ) ( windows_os_visible_memory_bytes { job =" windows-exporter " } )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_utilisation:ratio "
expression = < < EOF
( node : windows_node_memory_bytes_total : sum - node : windows_node_memory_bytes_available : sum ) / scalar ( sum ( node : windows_node_memory_bytes_total : sum ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_utilisation: "
expression = < < EOF
1 - ( node : windows_node_memory_bytes_available : sum / node : windows_node_memory_bytes_total : sum )
EOF
}
rule {
enabled = true
record = " node:windows_node_memory_swap_io_pages:irate "
expression = < < EOF
irate ( windows_memory_swap_page_operations_total { job =" windows-exporter " } [ 5 m ] )
EOF
}
rule {
enabled = true
record = " :windows_node_disk_utilisation:avg_irate "
expression = < < EOF
avg ( irate ( windows_logical_disk_read_seconds_total { job =" windows-exporter " } [ 5 m ] ) + irate ( windows_logical_disk_write_seconds_total { job =" windows-exporter " } [ 5 m ] ) )
EOF
}
rule {
enabled = true
record = " node:windows_node_disk_utilisation:avg_irate "
expression = < < EOF
avg by ( instance ) ( ( irate ( windows_logical_disk_read_seconds_total { job =" windows-exporter " } [ 5 m ] ) + irate ( windows_logical_disk_write_seconds_total { job =" windows-exporter " } [ 5 m ] ) ) )
EOF
}
2023-04-26 02:12:03 +03:00
}