зеркало из https://github.com/microsoft/pai.git
change email template (#1282)
This commit is contained in:
Родитель
2ec8f9a6b3
Коммит
b6e0784aa8
|
@ -12,6 +12,8 @@ data:
|
|||
smtp_from: {{ alert_info['smtp_from'] }}
|
||||
smtp_auth_username: {{ alert_info['smtp_auth_username'] }}
|
||||
smtp_auth_password: {{ alert_info['smtp_auth_password'] }}
|
||||
templates:
|
||||
- '/etc/alertmanager/template/*.tmpl'
|
||||
route:
|
||||
receiver: pai-alert
|
||||
group_wait: 30s
|
||||
|
@ -22,4 +24,5 @@ data:
|
|||
- name: 'pai-alert'
|
||||
email_configs:
|
||||
- to: {{ alert_info['alert_receiver'] }}
|
||||
html: '{{ '{{' }} template "email.pai.html" . {{ '}}' }}'
|
||||
{% endif %}
|
||||
|
|
|
@ -16,12 +16,15 @@
|
|||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
{% set prom_info = clusterinfo['prometheusinfo'] %}
|
||||
{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %}
|
||||
{% set has_alert_manager = 'alerting' in prom_info %}
|
||||
{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %}
|
||||
{% set host = clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %}
|
||||
{% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %}
|
||||
{% else %}
|
||||
{# Fallback branch: alerting may be enabled without alert_manager_port, so host must be defined here too (it is used in --web.external-url). #}
{% set host = prom_info['alerting']['alert-manager-hosts'] if has_alert_manager else '' %}
{% set port = 9093 %}
|
||||
{% endif %}
|
||||
|
||||
{% if has_alert_manager %}
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
|
@ -45,17 +48,24 @@ spec:
|
|||
args:
|
||||
- '--config.file=/etc/alertmanager/config.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.external-url=http://{{host}}:{{port}}'
|
||||
ports:
|
||||
- name: alertmanager
|
||||
containerPort: {{ port }}
|
||||
volumeMounts:
|
||||
- name: config-volume
|
||||
mountPath: /etc/alertmanager
|
||||
- name: templates-volume
|
||||
mountPath: /etc/alertmanager/template
|
||||
- name: alertmanager
|
||||
mountPath: /alertmanager
|
||||
volumes:
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: alertmanager
|
||||
- name: templates-volume
|
||||
configMap:
|
||||
name: alert-templates
|
||||
- name: alertmanager
|
||||
emptyDir: {}
|
||||
{% endif %}
|
||||
|
|
|
@ -21,9 +21,10 @@
|
|||
|
||||
pushd $(dirname "$0") > /dev/null
|
||||
|
||||
{% if clusterinfo['prometheusinfo']['alerting'] %}
|
||||
{% if 'alerting' in clusterinfo['prometheusinfo'] %}
|
||||
kubectl create configmap alert-templates --from-file=../../../prometheus/alert-templates --dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $?
|
||||
kubectl apply --overwrite=true -f alert-configmap.yaml || exit $?
|
||||
kubectl apply --overwrite=true -f alert-manager.yaml || exit $?
|
||||
{% endif %}
|
||||
|
||||
popd > /dev/null
|
||||
popd > /dev/null
|
||||
|
|
|
@ -19,8 +19,10 @@
|
|||
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
INSTANCES="deployment/alertmanager
|
||||
INSTANCES="
|
||||
deployment/alertmanager
|
||||
configmap/alertmanager
|
||||
configmap/alert-templates
|
||||
"
|
||||
|
||||
for instance in ${INSTANCES}; do
|
||||
|
|
|
@ -16,7 +16,9 @@
|
|||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
{% set prom_info = clusterinfo['prometheusinfo'] %}
|
||||
{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %}
|
||||
{% set has_alert_manager = 'alerting' in prom_info %}
|
||||
|
||||
{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %}
|
||||
{% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %}
|
||||
{% else %}
|
||||
{% set port = 9093 %}
|
||||
|
@ -33,7 +35,7 @@ data:
|
|||
- "/etc/prometheus-alert/*.rules"
|
||||
scrape_configs:
|
||||
- job_name: 'node_exporter'
|
||||
scrape_interval: {{ clusterinfo['prometheusinfo']['scrape_interval']|default(30) }}s
|
||||
scrape_interval: {{ prom_info['scrape_interval']|default(30) }}s
|
||||
kubernetes_sd_configs:
|
||||
- api_server: '{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}'
|
||||
role: node
|
||||
|
@ -68,12 +70,10 @@ data:
|
|||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: replace
|
||||
target_label: pai_service_name
|
||||
{% if clusterinfo['prometheusinfo']['alerting'] %}
|
||||
{% if has_alert_manager %}
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
{% for host in clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %}
|
||||
- {{ host }}:{{ port }}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
- {{ prom_info['alerting']['alert-manager-hosts'] }}:{{ port }}
|
||||
{% endif %}
|
||||
|
|
|
@ -15,6 +15,9 @@
|
|||
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
{% set prometheus_url = clusterinfo["prometheusinfo"]["prometheus_url"] %}
|
||||
{% set prometheus_port = clusterinfo["prometheusinfo"]["prometheus_port"] %}
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
|
@ -40,10 +43,11 @@ spec:
|
|||
memory: "10Gi"
|
||||
args:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--web.listen-address=0.0.0.0:{{clusterinfo['prometheusinfo']['prometheus_port']}}'
|
||||
- '--web.listen-address=0.0.0.0:{{prometheus_port}}'
|
||||
- '--web.external-url={{prometheus_url}}:{{prometheus_port}}'
|
||||
ports:
|
||||
- name: web
|
||||
containerPort: {{clusterinfo['prometheusinfo']['prometheus_port']}}
|
||||
containerPort: {{prometheus_port}}
|
||||
volumeMounts:
|
||||
- name: config-volume
|
||||
mountPath: /etc/prometheus
|
||||
|
|
|
@ -25,4 +25,5 @@ kubectl create configmap prometheus-alert --from-file=../../../prometheus/promet
|
|||
kubectl apply --overwrite=true -f prometheus-configmap.yaml || exit $?
|
||||
kubectl apply --overwrite=true -f prometheus-deployment.yaml || exit $?
|
||||
|
||||
popd > /dev/null
|
||||
|
||||
popd > /dev/null
|
||||
|
|
|
@ -19,7 +19,8 @@
|
|||
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
INSTANCES="deployment/prometheus-deployment
|
||||
INSTANCES="
|
||||
deployment/prometheus-deployment
|
||||
configmap/prometheus-configmap
|
||||
configmap/prometheus-alert
|
||||
"
|
||||
|
|
|
@ -252,17 +252,9 @@ class paiObjectModel:
|
|||
serviceDict["clusterinfo"]["prometheusinfo"]["node_exporter_port"] = \
|
||||
serviceDict["clusterinfo"]["prometheusinfo"]["node-exporter-port"]
|
||||
|
||||
alert_manager_hosts = []
|
||||
for host in self.rawData["clusterConfiguration"]["machine-list"]:
|
||||
if host.get("alert-manager") is None or host["alert-manager"].lower() != "true":
|
||||
continue
|
||||
|
||||
alert_manager_hosts.append(host["hostip"])
|
||||
|
||||
# template can check clusterinfo['prometheusinfo']['alerting'] to see if alert is enabled
|
||||
if serviceDict["clusterinfo"]["prometheusinfo"].get("alerting") is not None:
|
||||
serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = \
|
||||
alert_manager_hosts
|
||||
serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = self.getMasterIP()
|
||||
|
||||
# section
|
||||
|
||||
|
|
|
@ -115,7 +115,7 @@ class serivce_management_start:
|
|||
time.sleep(10)
|
||||
|
||||
except Exception as error:
|
||||
self.logger.error("Some error occurs when starting service {0}".format(serv))
|
||||
self.logger.exception("Some error occurs when starting service {0}".format(serv))
|
||||
sys.exit(1)
|
||||
|
||||
self.done_dict[serv] = True
|
||||
|
|
|
@ -62,23 +62,25 @@ class service_template_generate:
|
|||
# according to the "deploy-rules" in service.yaml config file
|
||||
# Currently support "In" and "NotIn" rules or the combination of them.
|
||||
def add_deploy_rule_to_yaml(self, str_src_yaml):
|
||||
|
||||
service_deploy_kind_list = ['DaemonSet', 'Deployment', 'StatefulSets', 'Pod']
|
||||
|
||||
|
||||
# Use safe_load: the generated manifest is plain YAML, and calling yaml.load()
# without an explicit Loader is deprecated (and rejected by newer PyYAML releases).
config = yaml.safe_load(str_src_yaml)
|
||||
|
||||
# judge whether it's a service deploy file, eg. exclude configmap
|
||||
if 'kind' in config and config['kind'] in service_deploy_kind_list:
|
||||
# Some services may not be configured to run; for example, when alert manager is not
|
||||
# configured, alert-manager-deployment.yaml contains nothing, and hence config is None.
|
||||
# In this case, return original content.
|
||||
if config is not None and 'kind' in config and config['kind'] in service_deploy_kind_list:
|
||||
match_expressions_arr = []
|
||||
|
||||
deploy_rules = self.service_conf['deploy-rules']
|
||||
for operator, label in deploy_rules.items():
|
||||
match_expression = dict()
|
||||
if operator.lower() == 'in':
|
||||
if operator.lower() == 'in':
|
||||
match_expression['operator'] = 'In'
|
||||
if operator.lower() == 'notin':
|
||||
match_expression['operator'] = 'NotIn'
|
||||
|
||||
|
||||
match_expression['key'] = label
|
||||
match_expression['values'] = ['true']
|
||||
match_expressions_arr.append(match_expression)
|
||||
|
@ -86,7 +88,6 @@ class service_template_generate:
|
|||
config['spec']['template']['spec']['affinity'] = {'nodeAffinity': \
|
||||
{'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': \
|
||||
[{'matchExpressions': match_expressions_arr}]}}}
|
||||
|
||||
else:
|
||||
logging.info("It is not a service deploy file! Only support " + str(service_deploy_kind_list))
|
||||
return str_src_yaml
|
||||
|
@ -127,8 +128,6 @@ class service_template_generate:
|
|||
|
||||
file_handler.write_generated_file(target_path, generated_template)
|
||||
|
||||
|
||||
|
||||
self.logger.info("The template file of service {0} is generated.".format(self.service_name))
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
{{ define "email.pai.html" }}
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<!--
|
||||
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
|
||||
|
||||
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2014 Mailgun
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
-->
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
|
||||
<head>
|
||||
<meta name="viewport" content="width=device-width"/>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<title>{{ template "__subject" . }}</title>
|
||||
|
||||
</head>
|
||||
|
||||
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
|
||||
|
||||
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
|
||||
<tr>
|
||||
<td valign="top"></td>
|
||||
<td width="600" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; display: block !important; max-width: 600px !important; clear: both !important; width: 100% !important; margin: 0 auto; padding: 0;" valign="top">
|
||||
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
|
||||
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: top; color: #fff; font-weight: 500; text-align: center; border-radius: 3px 3px 0 0; background-color: #E6522C; margin: 0; padding: 20px;" align="center" bgcolor="#E6522C" valign="top">
|
||||
{{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
|
||||
{{ .Name }}={{ .Value }}
|
||||
{{ end }}
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
|
||||
<table width="100%" cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
|
||||
<a href="{{ template "__alertmanagerURL" . }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; text-transform: capitalize; background-color: #348eda; margin: 0; border-color: #348eda; border-style: solid; border-width: 10px 20px;">View in {{ template "__alertmanager" . }}</a>
|
||||
</td>
|
||||
</tr>
|
||||
{{ if gt (len .Alerts.Firing) 0 }}
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
|
||||
<strong>[{{ .Alerts.Firing | len }}] Firing</strong>
|
||||
</td>
|
||||
</tr>
|
||||
{{ end }}
|
||||
{{ range .Alerts.Firing }}
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
|
||||
{{ if .Annotations.summary }}
|
||||
{{ .Annotations.summary }}
|
||||
{{ else }}
|
||||
<strong>Labels</strong><br/>
|
||||
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
|
||||
{{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br/>{{ end }}
|
||||
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
|
||||
{{ end }}
|
||||
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br/>
|
||||
</td>
|
||||
</tr>
|
||||
{{ end }}
|
||||
|
||||
{{ if gt (len .Alerts.Resolved) 0 }}
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
|
||||
<strong>[{{ .Alerts.Resolved | len }}] Resolved</strong>
|
||||
</td>
|
||||
</tr>
|
||||
{{ end }}
|
||||
{{ range .Alerts.Resolved }}
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
|
||||
{{ if .Annotations.summary }}
|
||||
{{ .Annotations.summary }}
|
||||
{{ else }}
|
||||
<strong>Labels</strong><br/>
|
||||
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
|
||||
{{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br/>{{ end }}
|
||||
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br/>{{ end }}
|
||||
{{ end }}
|
||||
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br/>
|
||||
</td>
|
||||
</tr>
|
||||
{{ end }}
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; clear: both; color: #999; margin: 0; padding: 20px;">
|
||||
<table width="100%">
|
||||
<tr>
|
||||
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; vertical-align: top; text-align: center; color: #999; margin: 0; padding: 0 0 20px;" align="center" valign="top"><a href="{{ .ExternalURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; color: #999; text-decoration: underline; margin: 0;">Sent by {{ template "__alertmanager" . }}</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
</div></div>
|
||||
</td>
|
||||
<td valign="top"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
{{ end }}
|
|
@ -4,10 +4,8 @@ release.
|
|||
|
||||
# Configuration
|
||||
|
||||
To enable Alert Manager, follow the following steps:
|
||||
* select a node to deploy Alert Manager; both master and worker nodes can be used as Alert Manager.
|
||||
* in `cluster-configuration` file, set `alert-manager: "true"` for this node.
|
||||
* configure Alert Manager by adding `alerting` fields under `prometheus` to services-configuration file.
|
||||
To enable Alert Manager, configure it by adding the `alerting` field under `prometheus`
|
||||
to the services-configuration file.
|
||||
|
||||
Refer to example [`cluster-configuration`](../../cluster-configuration/cluster-configuration.yaml) and
|
||||
[`service-configuration`](../../cluster-configuration/services-configuration.yaml) for more
|
||||
|
@ -47,6 +45,9 @@ Following are these rule's triggering condition:
|
|||
| PaiServicePodNotRunning | kubernetes indicate one of pai service pod is not in running status |
|
||||
| PaiServicePodNotReady | kubernetes indicate one of pai service pod is not in ready status |
|
||||
|
||||
Our email template is similar to the original Alert Manager's, except that we only render annotation.summary
|
||||
if the key exists. This makes alert emails simpler to read and understand.
|
||||
|
||||
If you want to add more rules, please reference syntax
|
||||
[here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
|
||||
After adding rules, you should stop and start prometheus by using paictl
|
||||
|
|
|
@ -22,36 +22,24 @@ groups:
|
|||
rules:
|
||||
- alert: k8sApiServerNotOk
|
||||
expr: k8s_api_server_count{error!="ok"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
type: k8s_component
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "api server in {{$labels.address}} is not ok"
|
||||
description: "api server in {{$labels.address}} is {{$labels.error}"
|
||||
summary: "api server in {{$labels.address}} is {{$labels.error}}"
|
||||
|
||||
- alert: k8sEtcdNotOk
|
||||
expr: k8s_etcd_count{error!="ok"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
type: k8s_component
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "etcd server in {{$labels.address}} is not ok"
|
||||
description: "etcd server in {{$labels.address}} is {{$labels.error}"
|
||||
summary: "etcd server in {{$labels.address}} is {{$labels.error}}"
|
||||
|
||||
- alert: k8sKubeletNotOk
|
||||
expr: k8s_kubelet_count{error!="ok"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
type: k8s_component
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "kubelet in {{$labels.address}} is not ok"
|
||||
description: "kubelet in {{$labels.address}} is {{$labels.error}"
|
||||
summary: "kubelet in {{$labels.address}} is {{$labels.error}}"
|
||||
|
||||
- alert: k8sDockerDaemonNotOk
|
||||
expr: docker_daemon_count{error!="ok"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
type: docker_daemon
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "docker daemon in {{$labels.ip}} is not ok"
|
||||
description: "docker daemon in {{$labels.ip}} is {{$labels.error}"
|
||||
summary: "docker daemon in {{$labels.ip}} is {{$labels.error}}"
|
||||
|
|
|
@ -22,35 +22,24 @@ groups:
|
|||
rules:
|
||||
- alert: NodeFilesystemUsage
|
||||
expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
type: node
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "{{$labels.instance}}: High Filesystem usage detected"
|
||||
description: "{{$labels.instance}}: Filesystem usage is above 80% (current value is: {{ $value }})"
|
||||
summary: "FileSystem usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
|
||||
|
||||
- alert: NodeMemoryUsage
|
||||
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
type: node
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "{{$labels.instance}}: High Memory usage detected"
|
||||
description: "{{$labels.instance}}: Memory usage is above 80% (current value is: {{ $value }})"
|
||||
summary: "Memory usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
|
||||
|
||||
- alert: NodeCPUUsage
|
||||
expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
type: node
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "{{$labels.instance}}: High CPU usage detected"
|
||||
description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})"
|
||||
summary: "CPU usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})"
|
||||
|
||||
- alert: NodeDiskPressure
|
||||
expr: pai_node_count{disk_pressure="true"} > 1
|
||||
for: 1m
|
||||
labels:
|
||||
type: node
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "node {{$labels.instance}} is under disk pressure"
|
||||
description: "node {{$labels.instance}} is under disk pressure"
|
||||
summary: "{{$labels.instance}} is under disk pressure"
|
||||
|
|
|
@ -23,11 +23,8 @@ groups:
|
|||
- alert: PaiServicePodNotRunning
|
||||
expr: pai_pod_count{phase!="running"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
type: pai_service
|
||||
annotations:
|
||||
summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected"
|
||||
description: "{{$labels.name}} in {{$labels.host_ip}} is not running"
|
||||
|
||||
- alert: PaiServicePodNotReady
|
||||
expr: pai_pod_count{phase="running", ready="false"} > 0
|
||||
|
@ -36,7 +33,6 @@ groups:
|
|||
type: pai_service
|
||||
annotations:
|
||||
summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
|
||||
description: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
|
||||
|
||||
- alert: PaiServiceNotUp
|
||||
expr: up != 1
|
||||
|
@ -44,5 +40,4 @@ groups:
|
|||
labels:
|
||||
type: pai_service
|
||||
annotations:
|
||||
summary: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
|
||||
description: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
|
||||
summary: "{{$labels.job}} in {{$labels.instance}} not up detected"
|
||||
|
|
Загрузка…
Ссылка в новой задаче