From b6e0784aa89f893fe8b575f522a25f2df7165426 Mon Sep 17 00:00:00 2001 From: Di Xu Date: Mon, 10 Sep 2018 15:40:27 +0800 Subject: [PATCH] change email template (#1282) --- .../alert-configmap.yaml.template | 3 + .../alert-manager/alert-manager.yaml.template | 12 +- .../bootstrap/alert-manager/start.sh.template | 5 +- .../bootstrap/alert-manager/stop.sh | 4 +- .../prometheus-configmap.yaml.template | 14 +- .../prometheus-deployment.yaml.template | 8 +- pai-management/bootstrap/prometheus/start.sh | 3 +- pai-management/bootstrap/prometheus/stop.sh | 3 +- .../clusterObjectModel/paiObjectModel.py | 10 +- .../paiService/service_management_start.py | 2 +- .../paiService/service_template_generate.py | 15 +-- prometheus/alert-templates/pai.tmpl | 124 ++++++++++++++++++ prometheus/doc/alert-manager.md | 9 +- prometheus/prometheus-alert/k8s.rules | 28 ++-- prometheus/prometheus-alert/node.rules | 29 ++-- .../prometheus-alert/pai-services.rules | 7 +- 16 files changed, 193 insertions(+), 83 deletions(-) create mode 100644 prometheus/alert-templates/pai.tmpl diff --git a/pai-management/bootstrap/alert-manager/alert-configmap.yaml.template b/pai-management/bootstrap/alert-manager/alert-configmap.yaml.template index 59da6187a..04b380770 100644 --- a/pai-management/bootstrap/alert-manager/alert-configmap.yaml.template +++ b/pai-management/bootstrap/alert-manager/alert-configmap.yaml.template @@ -12,6 +12,8 @@ data: smtp_from: {{ alert_info['smtp_from'] }} smtp_auth_username: {{ alert_info['smtp_auth_username'] }} smtp_auth_password: {{ alert_info['smtp_auth_password'] }} + templates: + - '/etc/alertmanager/template/*.tmpl' route: receiver: pai-alert group_wait: 30s @@ -22,4 +24,5 @@ data: - name: 'pai-alert' email_configs: - to: {{ alert_info['alert_receiver'] }} + html: '{{ '{{' }} template "email.pai.html" . 
{{ '}}' }}' {% endif %} diff --git a/pai-management/bootstrap/alert-manager/alert-manager.yaml.template b/pai-management/bootstrap/alert-manager/alert-manager.yaml.template index 97a51b97b..af0661db3 100644 --- a/pai-management/bootstrap/alert-manager/alert-manager.yaml.template +++ b/pai-management/bootstrap/alert-manager/alert-manager.yaml.template @@ -16,12 +16,15 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. {% set prom_info = clusterinfo['prometheusinfo'] %} -{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %} +{% set has_alert_manager = 'alerting' in prom_info %} +{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %} +{% set host = clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %} {% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %} {% else %} {% set port = 9093 %} {% endif %} +{% if has_alert_manager %} apiVersion: apps/v1 kind: Deployment metadata: @@ -45,17 +48,24 @@ spec: args: - '--config.file=/etc/alertmanager/config.yml' - '--storage.path=/alertmanager' + - '--web.external-url=http://{{host}}:{{port}}' ports: - name: alertmanager containerPort: {{ port }} volumeMounts: - name: config-volume mountPath: /etc/alertmanager + - name: templates-volume + mountPath: /etc/alertmanager/template - name: alertmanager mountPath: /alertmanager volumes: - name: config-volume configMap: name: alertmanager + - name: templates-volume + configMap: + name: alert-templates - name: alertmanager emptyDir: {} +{% endif %} diff --git a/pai-management/bootstrap/alert-manager/start.sh.template b/pai-management/bootstrap/alert-manager/start.sh.template index f1f5b863a..4fab98f3b 100644 --- a/pai-management/bootstrap/alert-manager/start.sh.template +++ b/pai-management/bootstrap/alert-manager/start.sh.template @@ -21,9 +21,10 @@ pushd $(dirname "$0") > /dev/null -{% if clusterinfo['prometheusinfo']['alerting'] %} +{% if 'alerting' in 
clusterinfo['prometheusinfo'] %} +kubectl create configmap alert-templates --from-file=../../../prometheus/alert-templates --dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $? kubectl apply --overwrite=true -f alert-configmap.yaml || exit $? kubectl apply --overwrite=true -f alert-manager.yaml || exit $? {% endif %} -popd > /dev/null \ No newline at end of file +popd > /dev/null diff --git a/pai-management/bootstrap/alert-manager/stop.sh b/pai-management/bootstrap/alert-manager/stop.sh index 6dffaadbe..897156735 100644 --- a/pai-management/bootstrap/alert-manager/stop.sh +++ b/pai-management/bootstrap/alert-manager/stop.sh @@ -19,8 +19,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -INSTANCES="deployment/alertmanager +INSTANCES=" +deployment/alertmanager configmap/alertmanager +configmap/alert-templates " for instance in ${INSTANCES}; do diff --git a/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template b/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template index 824ebf4db..338254871 100644 --- a/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template +++ b/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template @@ -16,7 +16,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
{% set prom_info = clusterinfo['prometheusinfo'] %} -{% if 'alerting' in prom_info and 'alert_manager_port' in prom_info['alerting'] %} +{% set has_alert_manager = 'alerting' in prom_info %} + +{% if has_alert_manager and 'alert_manager_port' in prom_info['alerting'] %} {% set port = clusterinfo['prometheusinfo']['alerting']['alert_manager_port'] %} {% else %} {% set port = 9093 %} @@ -33,7 +35,7 @@ data: - "/etc/prometheus-alert/*.rules" scrape_configs: - job_name: 'node_exporter' - scrape_interval: {{ clusterinfo['prometheusinfo']['scrape_interval']|default(30) }}s + scrape_interval: {{ prom_info['scrape_interval']|default(30) }}s kubernetes_sd_configs: - api_server: '{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}' role: node @@ -68,12 +70,10 @@ data: - source_labels: [__meta_kubernetes_pod_label_app] action: replace target_label: pai_service_name -{% if clusterinfo['prometheusinfo']['alerting'] %} +{% if has_alert_manager %} alerting: alertmanagers: - static_configs: - targets: - {% for host in clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %} - - {{ host }}:{{ port }} - {% endfor %} -{% endif %} \ No newline at end of file + - {{ prom_info['alerting']['alert-manager-hosts'] }}:{{ port }} +{% endif %} diff --git a/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template b/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template index b90618e98..ba36a903a 100644 --- a/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template +++ b/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template @@ -15,6 +15,9 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+{% set prometheus_url = clusterinfo["prometheusinfo"]["prometheus_url"] %} +{% set prometheus_port = clusterinfo["prometheusinfo"]["prometheus_port"] %} + apiVersion: apps/v1 kind: Deployment metadata: @@ -40,10 +43,11 @@ spec: memory: "10Gi" args: - '--config.file=/etc/prometheus/prometheus.yml' - - '--web.listen-address=0.0.0.0:{{clusterinfo['prometheusinfo']['prometheus_port']}}' + - '--web.listen-address=0.0.0.0:{{prometheus_port}}' + - '--web.external-url={{prometheus_url}}:{{prometheus_port}}' ports: - name: web - containerPort: {{clusterinfo['prometheusinfo']['prometheus_port']}} + containerPort: {{prometheus_port}} volumeMounts: - name: config-volume mountPath: /etc/prometheus diff --git a/pai-management/bootstrap/prometheus/start.sh b/pai-management/bootstrap/prometheus/start.sh index e6bfc9651..e86085d0f 100644 --- a/pai-management/bootstrap/prometheus/start.sh +++ b/pai-management/bootstrap/prometheus/start.sh @@ -25,4 +25,5 @@ kubectl create configmap prometheus-alert --from-file=../../../prometheus/promet kubectl apply --overwrite=true -f prometheus-configmap.yaml || exit $? kubectl apply --overwrite=true -f prometheus-deployment.yaml || exit $? -popd > /dev/null \ No newline at end of file + +popd > /dev/null diff --git a/pai-management/bootstrap/prometheus/stop.sh b/pai-management/bootstrap/prometheus/stop.sh index f61f3ee14..b0b2b5e74 100644 --- a/pai-management/bootstrap/prometheus/stop.sh +++ b/pai-management/bootstrap/prometheus/stop.sh @@ -19,7 +19,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-INSTANCES="deployment/prometheus-deployment +INSTANCES=" +deployment/prometheus-deployment configmap/prometheus-configmap configmap/prometheus-alert " diff --git a/pai-management/paiLibrary/clusterObjectModel/paiObjectModel.py b/pai-management/paiLibrary/clusterObjectModel/paiObjectModel.py index d6546d599..997f2ec03 100644 --- a/pai-management/paiLibrary/clusterObjectModel/paiObjectModel.py +++ b/pai-management/paiLibrary/clusterObjectModel/paiObjectModel.py @@ -252,17 +252,9 @@ class paiObjectModel: serviceDict["clusterinfo"]["prometheusinfo"]["node_exporter_port"] = \ serviceDict["clusterinfo"]["prometheusinfo"]["node-exporter-port"] - alert_manager_hosts = [] - for host in self.rawData["clusterConfiguration"]["machine-list"]: - if host.get("alert-manager") is None or host["alert-manager"].lower() != "true": - continue - - alert_manager_hosts.append(host["hostip"]) - # template can check clusterinfo['prometheusinfo']['alerting'] to see if alert is enabled if serviceDict["clusterinfo"]["prometheusinfo"].get("alerting") is not None: - serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = \ - alert_manager_hosts + serviceDict["clusterinfo"]["prometheusinfo"]["alerting"]["alert-manager-hosts"] = self.getMasterIP() # section diff --git a/pai-management/paiLibrary/paiService/service_management_start.py b/pai-management/paiLibrary/paiService/service_management_start.py index 22ece9f92..b53d4ec12 100644 --- a/pai-management/paiLibrary/paiService/service_management_start.py +++ b/pai-management/paiLibrary/paiService/service_management_start.py @@ -115,7 +115,7 @@ class serivce_management_start: time.sleep(10) except Exception as error: - self.logger.error("Some error occurs when starting service {0}".format(serv)) + self.logger.exception("Some error occurs when starting service {0}".format(serv)) sys.exit(1) self.done_dict[serv] = True diff --git a/pai-management/paiLibrary/paiService/service_template_generate.py 
b/pai-management/paiLibrary/paiService/service_template_generate.py index 6d65bf179..01c3a77d9 100644 --- a/pai-management/paiLibrary/paiService/service_template_generate.py +++ b/pai-management/paiLibrary/paiService/service_template_generate.py @@ -62,23 +62,25 @@ class service_template_generate: # according to the "deploy-rules" in service.yaml config file # Currently support "In" and "NotIn" rules or the combination of them. def add_deploy_rule_to_yaml(self, str_src_yaml): - service_deploy_kind_list = ['DaemonSet', 'Deployment', 'StatefulSets', 'Pod'] - + config = yaml.load(str_src_yaml) # judge whether it's a service deploy file, eg. exclude configmap - if 'kind' in config and config['kind'] in service_deploy_kind_list: + # Some service may not being configured to run, for example when alert manager is not + # configure, alert-manager-deployment.yaml contains nothing, and hence config is None. + # In this case, return original content. + if config is not None and 'kind' in config and config['kind'] in service_deploy_kind_list: match_expressions_arr = [] deploy_rules = self.service_conf['deploy-rules'] for operator, label in deploy_rules.items(): match_expression = dict() - if operator.lower() == 'in': + if operator.lower() == 'in': match_expression['operator'] = 'In' if operator.lower() == 'notin': match_expression['operator'] = 'NotIn' - + match_expression['key'] = label match_expression['values'] = ['true'] match_expressions_arr.append(match_expression) @@ -86,7 +88,6 @@ class service_template_generate: config['spec']['template']['spec']['affinity'] = {'nodeAffinity': \ {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': \ [{'matchExpressions': match_expressions_arr}]}}} - else: logging.info("It is not a service deploy file! 
Only support " + str(service_deploy_kind_list)) return str_src_yaml @@ -127,8 +128,6 @@ class service_template_generate: file_handler.write_generated_file(target_path, generated_template) - - self.logger.info("The template file of service {0} is generated.".format(self.service_name)) diff --git a/prometheus/alert-templates/pai.tmpl b/prometheus/alert-templates/pai.tmpl new file mode 100644 index 000000000..def6c1545 --- /dev/null +++ b/prometheus/alert-templates/pai.tmpl @@ -0,0 +1,124 @@ +{{ define "email.pai.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ {{ if .Annotations.summary }} + {{ .Annotations.summary }} + {{ else }} + Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ end }} + Source
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ {{ if .Annotations.summary }} + {{ .Annotations.summary }} + {{ else }} + Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ end }} + Source
+
+
+ +
+
+ + + +{{ end }} diff --git a/prometheus/doc/alert-manager.md b/prometheus/doc/alert-manager.md index 28f19ab2c..ed612fffb 100644 --- a/prometheus/doc/alert-manager.md +++ b/prometheus/doc/alert-manager.md @@ -4,10 +4,8 @@ release. # Configuration -To enable Alert Manager, follow the following steps: -* select a node to deploy Alert Manager, both master and work node can be used as Alter Manager. -* in `cluster-configuration` file, set `alert-manager: "true"` for this node. -* configure Alert Manager by adding `alerting` fields under `prometheus` to services-configuration file. +To enable Alert Manager, please configure Alert Manager by adding `alerting` fields under `prometheus` +to services-configuration file. Refer to example [`cluster-configuration`](../../cluster-configuration/cluster-configuration.yaml) and [`service-configuration`](../../cluster-configuration/services-configuration.yaml) for more @@ -47,6 +45,9 @@ Following are these rule's triggering condition: | PaiServicePodNotRunning | kubernetes indicate one of pai service pod is not in running status | | PaiServicePodNotReady | kubernetes indicate one of pai service pod is not in ready status | +Our email template is similar to original Alert Manager's, except We only render annotation.summary +if the key exist. This can make alert email simpler to read and understand. + If you want to add more rules, please reference syntax [here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). 
After adding rules, you should stop and start prometheus by using paictl diff --git a/prometheus/prometheus-alert/k8s.rules b/prometheus/prometheus-alert/k8s.rules index 8577baf87..be99b1c64 100644 --- a/prometheus/prometheus-alert/k8s.rules +++ b/prometheus/prometheus-alert/k8s.rules @@ -22,36 +22,24 @@ groups: rules: - alert: k8sApiServerNotOk expr: k8s_api_server_count{error!="ok"} > 0 - for: 1m - labels: - type: k8s_component + for: 5m annotations: - summary: "api server in {{$labels.address}} is not ok" - description: "api server in {{$labels.address}} is {{$labels.error}" + summary: "api server in {{$labels.address}} is {{$labels.error}}" - alert: k8sEtcdNotOk expr: k8s_etcd_count{error!="ok"} > 0 - for: 1m - labels: - type: k8s_component + for: 5m annotations: - summary: "etcd server in {{$labels.address}} is not ok" - description: "etcd server in {{$labels.address}} is {{$labels.error}" + summary: "etcd server in {{$labels.address}} is {{$labels.error}}" - alert: k8sKubeletNotOk expr: k8s_kubelet_count{error!="ok"} > 0 - for: 1m - labels: - type: k8s_component + for: 5m annotations: - summary: "kubelet in {{$labels.address}} is not ok" - description: "kubelet in {{$labels.address}} is {{$labels.error}" + summary: "kubelet in {{$labels.address}} is {{$labels.error}}" - alert: k8sDockerDaemonNotOk expr: docker_daemon_count{error!="ok"} > 0 - for: 1m - labels: - type: docker_daemon + for: 5m annotations: - summary: "docker daemon in {{$labels.ip}} is not ok" - description: "docker daemon in {{$labels.ip}} is {{$labels.error}" + summary: "docker daemon in {{$labels.ip}} is {{$labels.error}}" diff --git a/prometheus/prometheus-alert/node.rules b/prometheus/prometheus-alert/node.rules index 055357a39..57ca01677 100644 --- a/prometheus/prometheus-alert/node.rules +++ b/prometheus/prometheus-alert/node.rules @@ -22,35 +22,24 @@ groups: rules: - alert: NodeFilesystemUsage expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
> 80 - for: 2m - labels: - type: node + for: 5m annotations: - summary: "{{$labels.instance}}: High Filesystem usage detected" - description: "{{$labels.instance}}: Filesystem usage is above 80% (current value is: {{ $value }})" + summary: "FileSystem usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})" + - alert: NodeMemoryUsage expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80 - for: 2m - labels: - type: node + for: 5m annotations: - summary: "{{$labels.instance}}: High Memory usage detected" - description: "{{$labels.instance}}: Memory usage is above 80% (current value is: {{ $value }})" + summary: "Memory usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})" - alert: NodeCPUUsage expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80 - for: 2m - labels: - type: node + for: 5m annotations: - summary: "{{$labels.instance}}: High CPU usage detected" - description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})" + summary: "CPU usage in {{$labels.instance}} is above 80% (current value is: {{ $value }})" - alert: NodeDiskPressure expr: pai_node_count{disk_pressure="true"} > 1 - for: 1m - labels: - type: node + for: 5m annotations: - summary: "node {{$labels.instance}} is under disk pressure" - description: "node {{$labels.instance}} is under disk pressure" + summary: "{{$labels.instance}} is under disk pressure" diff --git a/prometheus/prometheus-alert/pai-services.rules b/prometheus/prometheus-alert/pai-services.rules index 57c88f322..7ac067797 100644 --- a/prometheus/prometheus-alert/pai-services.rules +++ b/prometheus/prometheus-alert/pai-services.rules @@ -23,11 +23,8 @@ groups: - alert: PaiServicePodNotRunning expr: pai_pod_count{phase!="running"} > 0 for: 1m - labels: - type: pai_service annotations: summary: "{{$labels.name}} in 
{{$labels.host_ip}} not running detected" - description: "{{$labels.name}} in {{$labels.host_ip}} is not running" - alert: PaiServicePodNotReady expr: pai_pod_count{phase="running", ready="false"} > 0 @@ -36,7 +33,6 @@ groups: type: pai_service annotations: summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected" - description: "{{$labels.name}} in {{$labels.host_ip}} not ready detected" - alert: PaiServiceNotUp expr: up != 1 @@ -44,5 +40,4 @@ groups: labels: type: pai_service annotations: - summary: "{{$labels.name}} in {{$labels.host_ip}} not up detected" - description: "{{$labels.name}} in {{$labels.host_ip}} not up detected" + summary: "{{$labels.job}} in {{$labels.instance}} not up detected"