This commit is contained in:
Di Xu 2018-09-10 15:40:27 +08:00 коммит произвёл GitHub
Родитель 2ec8f9a6b3
Коммит b6e0784aa8
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
16 изменённых файлов: 193 добавлений и 83 удалений

Просмотреть файл

@ -12,6 +12,8 @@ data:
smtp_from: {{ alert_info['smtp_from'] }}
smtp_auth_username: {{ alert_info['smtp_auth_username'] }}
smtp_auth_password: {{ alert_info['smtp_auth_password'] }}
templates:
- '/etc/alertmanager/template/*.tmpl'
route:
receiver: pai-alert
group_wait: 30s
@ -22,4 +24,5 @@ data:
- name: 'pai-alert'
email_configs:
- to: {{ alert_info['alert_receiver'] }}
html: '{{ '{{' }} template "email.pai.html" . {{ '}}' }}'
{% endif %}

Просмотреть файл

@ -16,12 +16,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
{% set prom_info = clusterinfo['prometheusinfo'] %}
{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %}
{% set has_alert_manager = 'alerting' in prom_info %}
{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %}
{% set host = clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %}
{% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %}
{% else %}
{% set port = 9093 %}
{% endif %}
{% if has_alert_manager %}
apiVersion: apps/v1
kind: Deployment
metadata:
@ -45,17 +48,24 @@ spec:
args:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://{{host}}:{{port}}'
ports:
- name: alertmanager
containerPort: {{ port }}
volumeMounts:
- name: config-volume
mountPath: /etc/alertmanager
- name: templates-volume
mountPath: /etc/alertmanager/template
- name: alertmanager
mountPath: /alertmanager
volumes:
- name: config-volume
configMap:
name: alertmanager
- name: templates-volume
configMap:
name: alert-templates
- name: alertmanager
emptyDir: {}
{% endif %}

Просмотреть файл

@ -21,9 +21,10 @@
pushd $(dirname "$0") > /dev/null
{% if clusterinfo['prometheusinfo']['alerting'] %}
{% if 'alerting' in clusterinfo['prometheusinfo'] %}
kubectl create configmap alert-templates --from-file=../../../prometheus/alert-templates --dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $?
kubectl apply --overwrite=true -f alert-configmap.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager.yaml || exit $?
{% endif %}
popd > /dev/null
popd > /dev/null

Просмотреть файл

@ -19,8 +19,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
INSTANCES="deployment/alertmanager
INSTANCES="
deployment/alertmanager
configmap/alertmanager
configmap/alert-templates
"
for instance in ${INSTANCES}; do

Просмотреть файл

@ -16,7 +16,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
{% set prom_info = clusterinfo['prometheusinfo'] %}
{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %}
{% set has_alert_manager = 'alerting' in prom_info %}
{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %}
{% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %}
{% else %}
{% set port = 9093 %}
@ -33,7 +35,7 @@ data:
- "/etc/prometheus-alert/*.rules"
scrape_configs:
- job_name: 'node_exporter'
scrape_interval: {{ clusterinfo['prometheusinfo']['scrape_interval']|default(30) }}s
scrape_interval: {{ prom_info['scrape_interval']|default(30) }}s
kubernetes_sd_configs:
- api_server: '{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}'
role: node
@ -68,12 +70,10 @@ data:
- source_labels: [__meta_kubernetes_pod_label_app]
action: replace
target_label: pai_service_name
{% if clusterinfo['prometheusinfo']['alerting'] %}
{% if has_alert_manager %}
alerting:
alertmanagers:
- static_configs:
- targets:
{% for host in clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %}
- {{ host }}:{{ port }}
{% endfor %}
{% endif %}
- {{ prom_info['alerting']['alert-manager-hosts'] }}:{{ port }}
{% endif %}

Просмотреть файл

@ -15,6 +15,9 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
{% set prometheus_url = clusterinfo["prometheusinfo"]["prometheus_url"] %}
{% set prometheus_port = clusterinfo["prometheusinfo"]["prometheus_port"] %}
apiVersion: apps/v1
kind: Deployment
metadata:
@ -40,10 +43,11 @@ spec:
memory: "10Gi"
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--web.listen-address=0.0.0.0:{{clusterinfo['prometheusinfo']['prometheus_port']}}'
- '--web.listen-address=0.0.0.0:{{prometheus_port}}'
- '--web.external-url={{prometheus_url}}:{{prometheus_port}}'
ports:
- name: web
containerPort: {{clusterinfo['prometheusinfo']['prometheus_port']}}
containerPort: {{prometheus_port}}
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus

Просмотреть файл

@ -25,4 +25,5 @@ kubectl create configmap prometheus-alert --from-file=../../../prometheus/promet
kubectl apply --overwrite=true -f prometheus-configmap.yaml || exit $?
kubectl apply --overwrite=true -f prometheus-deployment.yaml || exit $?
popd > /dev/null
popd > /dev/null

Просмотреть файл

@ -19,7 +19,8 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
INSTANCES="deployment/prometheus-deployment
INSTANCES="
deployment/prometheus-deployment
configmap/prometheus-configmap
configmap/prometheus-alert
"

Просмотреть файл

@ -252,17 +252,9 @@ class paiObjectModel:
serviceDict["clusterinfo"]["prometheusinfo"]["node_exporter_port"] = \
serviceDict["clusterinfo"]["prometheusinfo"]["node-exporter-port"]
alert_manager_hosts = []
for host in self.rawData["clusterConfiguration"]["machine-list"]:
if host.get("alert-manager") is None or host["alert-manager"].lower() != "true":
continue
alert_manager_hosts.append(host["hostip"])
# template can check clusterinfo['prometheusinfo']['alerting'] to see if alert is enabled
if serviceDict["clusterinfo"]["prometheusinfo"].get("alerting") is not None:
serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = \
alert_manager_hosts
serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = self.getMasterIP()
# section

Просмотреть файл

@ -115,7 +115,7 @@ class serivce_management_start:
time.sleep(10)
except Exception as error:
self.logger.error("Some error occurs when starting service {0}".format(serv))
self.logger.exception("Some error occurs when starting service {0}".format(serv))
sys.exit(1)
self.done_dict[serv] = True

Просмотреть файл

@ -62,23 +62,25 @@ class service_template_generate:
# according to the "deploy-rules" in service.yaml config file
# Currently support "In" and "NotIn" rules or the combination of them.
def add_deploy_rule_to_yaml(self, str_src_yaml):
service_deploy_kind_list = ['DaemonSet', 'Deployment', 'StatefulSets', 'Pod']
config = yaml.load(str_src_yaml)
# judge whether it's a service deploy file, eg. exclude configmap
if 'kind' in config and config['kind'] in service_deploy_kind_list:
# Some services may not be configured to run. For example, when Alert Manager is not
# configured, alert-manager-deployment.yaml renders to nothing and hence config is None.
# In this case, return the original content.
if config is not None and 'kind' in config and config['kind'] in service_deploy_kind_list:
match_expressions_arr = []
deploy_rules = self.service_conf['deploy-rules']
for operator, label in deploy_rules.items():
match_expression = dict()
if operator.lower() == 'in':
if operator.lower() == 'in':
match_expression['operator'] = 'In'
if operator.lower() == 'notin':
match_expression['operator'] = 'NotIn'
match_expression['key'] = label
match_expression['values'] = ['true']
match_expressions_arr.append(match_expression)
@ -86,7 +88,6 @@ class service_template_generate:
config['spec']['template']['spec']['affinity'] = {'nodeAffinity': \
{'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': \
[{'matchExpressions': match_expressions_arr}]}}}
else:
logging.info("It is not a service deploy file! Only support " + str(service_deploy_kind_list))
return str_src_yaml
@ -127,8 +128,6 @@ class service_template_generate:
file_handler.write_generated_file(target_path, generated_template)
self.logger.info("The template file of service {0} is generated.".format(self.service_name))

Просмотреть файл

@ -0,0 +1,124 @@
{{ define "email.pai.html" }}
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
The MIT License (MIT)
Copyright (c) 2014 Mailgun
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
<html xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<head>
<meta name="viewport" content="width=device-width"/>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>{{ template "__subject" . }}</title>
</head>
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
<tr>
<td valign="top"></td>
<td width="600" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; display: block !important; max-width: 600px !important; clear: both !important; width: 100% !important; margin: 0 auto; padding: 0;" valign="top">
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: top; color: #fff; font-weight: 500; text-align: center; border-radius: 3px 3px 0 0; background-color: #E6522C; margin: 0; padding: 20px;" align="center" bgcolor="#E6522C" valign="top">
{{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
{{ .Name }}={{ .Value }}
{{ end }}
</td>
</tr>
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
<table width="100%" cellpadding="0" cellspacing="0">
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<a href="{{ template "__alertmanagerURL" . }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; text-transform: capitalize; background-color: #348eda; margin: 0; border-color: #348eda; border-style: solid; border-width: 10px 20px;">View in {{ template "__alertmanager" . }}</a>
</td>
</tr>
{{ if gt (len .Alerts.Firing) 0 }}
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong>[{{ .Alerts.Firing | len }}] Firing</strong>
</td>
</tr>
{{ end }}
{{ range .Alerts.Firing }}
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
{{ if .Annotations.summary }}
{{ .Annotations.summary }}
{{ else }}
<strong>Labels</strong><br/>
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
{{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br/>{{ end }}
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
{{ end }}
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br/>
</td>
</tr>
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong>[{{ .Alerts.Resolved | len }}] Resolved</strong>
</td>
</tr>
{{ end }}
{{ range .Alerts.Resolved }}
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
{{ if .Annotations.summary }}
{{ .Annotations.summary }}
{{ else }}
<strong>Labels</strong><br/>
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
{{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br/>{{ end }}
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
{{ end }}
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br/>
</td>
</tr>
{{ end }}
</table>
</td>
</tr>
</table>
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; clear: both; color: #999; margin: 0; padding: 20px;">
<table width="100%">
<tr>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; vertical-align: top; text-align: center; color: #999; margin: 0; padding: 0 0 20px;" align="center" valign="top"><a href="{{ .ExternalURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; color: #999; text-decoration: underline; margin: 0;">Sent by {{ template "__alertmanager" . }}</a></td>
</tr>
</table>
</div></div>
</td>
<td valign="top"></td>
</tr>
</table>
</body>
</html>
{{ end }}

Просмотреть файл

@ -4,10 +4,8 @@ release.
# Configuration
To enable Alert Manager, follow the following steps:
* select a node to deploy Alert Manager; both master and worker nodes can be used to run Alert Manager.
* in `cluster-configuration` file, set `alert-manager: "true"` for this node.
* configure Alert Manager by adding `alerting` fields under `prometheus` to services-configuration file.
To enable Alert Manager, configure it by adding the `alerting` fields under `prometheus`
in the services-configuration file.
Refer to example [`cluster-configuration`](../../cluster-configuration/cluster-configuration.yaml) and
[`service-configuration`](../../cluster-configuration/services-configuration.yaml) for more
@ -47,6 +45,9 @@ Following are these rule's triggering condition:
| PaiServicePodNotRunning | kubernetes indicate one of pai service pod is not in running status |
| PaiServicePodNotReady | kubernetes indicate one of pai service pod is not in ready status |
Our email template is similar to the original Alert Manager template, except that when an alert has a
`summary` annotation we render only that summary instead of the full list of labels and annotations.
This makes alert emails simpler to read and understand.
If you want to add more rules, please reference syntax
[here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
After adding rules, you should stop and start prometheus by using paictl

Просмотреть файл

@ -22,36 +22,24 @@ groups:
rules:
- alert: k8sApiServerNotOk
expr: k8s_api_server_count{error!="ok"} > 0
for: 1m
labels:
type: k8s_component
for: 5m
annotations:
summary: "api server in {{$labels.address}} is not ok"
description: "api server in {{$labels.address}} is {{$labels.error}}"
summary: "api server in {{$labels.address}} is {{$labels.error}}"
- alert: k8sEtcdNotOk
expr: k8s_etcd_count{error!="ok"} > 0
for: 1m
labels:
type: k8s_component
for: 5m
annotations:
summary: "etcd server in {{$labels.address}} is not ok"
description: "etcd server in {{$labels.address}} is {{$labels.error}}"
summary: "etcd server in {{$labels.address}} is {{$labels.error}}"
- alert: k8sKubeletNotOk
expr: k8s_kubelet_count{error!="ok"} > 0
for: 1m
labels:
type: k8s_component
for: 5m
annotations:
summary: "kubelet in {{$labels.address}} is not ok"
description: "kubelet in {{$labels.address}} is {{$labels.error}}"
summary: "kubelet in {{$labels.address}} is {{$labels.error}}"
- alert: k8sDockerDaemonNotOk
expr: docker_daemon_count{error!="ok"} > 0
for: 1m
labels:
type: docker_daemon
for: 5m
annotations:
summary: "docker daemon in {{$labels.ip}} is not ok"
description: "docker daemon in {{$labels.ip}} is {{$labels.error}}"
summary: "docker daemon in {{$labels.ip}} is {{$labels.error}}"

Просмотреть файл

@ -22,35 +22,24 @@ groups:
rules:
- alert: NodeFilesystemUsage
expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 80
for: 2m
labels:
type: node
for: 5m
annotations:
summary: "{{$labels.instance}}: High Filesystem usage detected"
description: "{{$labels.instance}}: Filesystem usage is above 80% (current value is: {{ $value }})"
summary: "FileSystem usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
- alert: NodeMemoryUsage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
for: 2m
labels:
type: node
for: 5m
annotations:
summary: "{{$labels.instance}}: High Memory usage detected"
description: "{{$labels.instance}}: Memory usage is above 80% (current value is: {{ $value }})"
summary: "Memory usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
- alert: NodeCPUUsage
expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
for: 2m
labels:
type: node
for: 5m
annotations:
summary: "{{$labels.instance}}: High CPU usage detected"
description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})"
summary: "CPU usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
- alert: NodeDiskPressure
expr: pai_node_count{disk_pressure="true"} > 1
for: 1m
labels:
type: node
for: 5m
annotations:
summary: "node {{$labels.instance}} is under disk pressure"
description: "node {{$labels.instance}} is under disk pressure"
summary: "{{$labels.instance}} is under disk pressure"

Просмотреть файл

@ -23,11 +23,8 @@ groups:
- alert: PaiServicePodNotRunning
expr: pai_pod_count{phase!="running"} > 0
for: 1m
labels:
type: pai_service
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected"
description: "{{$labels.name}} in {{$labels.host_ip}} is not running"
- alert: PaiServicePodNotReady
expr: pai_pod_count{phase="running", ready="false"} > 0
@ -36,7 +33,6 @@ groups:
type: pai_service
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
description: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
- alert: PaiServiceNotUp
expr: up != 1
@ -44,5 +40,4 @@ groups:
labels:
type: pai_service
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
description: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
summary: "{{$labels.job}} in {{$labels.instance}} not up detected"