This commit is contained in:
Serge Ohl 2020-02-17 15:56:33 -05:00
Родитель 4a2e3c1fb5
Коммит 0c4887ed24
25 изменённых файлов: 604 добавлений и 0 удалений

1
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
.terragrunt-cache

10
Dockerfile Normal file
Просмотреть файл

@ -0,0 +1,10 @@
# Image running the rules exporter on port 8000.
FROM python:alpine
WORKDIR /app
# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY ./app/requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY ./app .
EXPOSE 8000
CMD ["python", "/app/rules-exporter.py"]

28
README.md Normal file
Просмотреть файл

@ -0,0 +1,28 @@
# Rules exporter
The Prometheus alerting system does not report anything while an alert is not firing. This exporter fetches the rules configured on the [inventory website](https://inventory.internal.unity3d.com/alerts/rules) and exposes a new metric, in Prometheus format, that lists every selected rule as either up or firing.
## Rules
We use labels to select rules from the list. To be considered, a rule must have the following labels:
* **type**: cloud_health
* **service**: any value; it is used in the global dashboard to group health status by service
## Configuration
| Environment variable | Default Value | Required |
| --------------------- | --------------- | ---------- |
| INVENTORY_URL | None | yes |
| INVENTORY_TOKEN | None | yes |
| INVENTORY_ENV | None | yes |
| PROMETHEUS_URL | None | yes |
| EXPORTER_PORT | 8000 | no |
| LOG_LEVEL | info | no |
## Development
Log in on the [inventory website](https://inventory.internal.unity3d.com) and go to the [rules list](https://inventory.internal.unity3d.com/alerts/rules). When you inspect the page with your browser's developer tools, the network tab shows the bearer token sent with the API calls.

4
app/requirements.txt Normal file
Просмотреть файл

@ -0,0 +1,4 @@
click
prometheus_client
requests

268
app/rules-exporter.py Normal file
Просмотреть файл

@ -0,0 +1,268 @@
import logging
import math
import os
import sys
import time
import click
import requests
from prometheus_client import start_http_server, Gauge
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class RulesExporter():
    '''
    The RulesExporter collect rules and firing alerts to create a prometheus metric page.

    Args:
        url_inventory (str): unity inventory url
        url_promehteus (str): base url of the prometheus server queried for firing alerts
        token (str): bearer token sent with every api call
        env (str): environment ("test" is translated to "int" for the inventory queries)
        port (int): listen port of http server
        log_level (str): log level (debug, info, warning, error, critical)
        inventory_api_batch_size (int): size of batch in paginated api call
    '''
    __slots__ = [
        'url_inventory',
        'url_promehteus',
        'token',
        'env',
        'log',
        'port',
        'inventory_api_batch_size',
        'gauge'
    ]

    # (connect, read) timeout in seconds; without one a stalled api call would
    # block the refresh loop forever.
    _REQUEST_TIMEOUT = (10, 60)

    def __init__(self, url_inventory, url_promehteus, token, env, port=8000, log_level='info', inventory_api_batch_size=500):
        self._init_logging(log_level=log_level)
        self.log.info('Initialisation of the rules exporter')

        # Every mandatory setting is checked in a fixed order; the first
        # missing one stops the process with exit code 128.
        required = (
            (url_inventory, 'Inventory url not found'),
            (url_promehteus, 'Prometheus url not found'),
            (token, 'Parameters token not found'),
            (env, 'Parameters env not found'),
        )
        for value, message in required:
            if not value:
                self.log.critical(message)
                sys.exit(128)

        self.inventory_api_batch_size = inventory_api_batch_size
        self.url_inventory = url_inventory
        self.url_promehteus = url_promehteus
        # The "test" environment is sent to the inventory api as "int".
        self.env = 'int' if env == 'test' else env
        self.token = token
        self.port = int(port)
        self.gauge = Gauge(
            'CLOUD_ALERTS',
            'List of rules',
            ['env', 'alertname', 'service', 'type', 'alertstate'],
        )

    def start_http_server(self):
        '''
        Start the http server to publish prometheus metric page on the root,
        then refresh the metrics every 60 seconds. This method never returns.
        '''
        self.log.info(f'Starting the web server on port {self.port}')
        # Resolves to the module-level prometheus_client function, not this method.
        start_http_server(self.port)
        while True:
            self._process_request()
            time.sleep(60)

    def _init_logging(self, log_level='info'):
        '''
        Initialize the logging service and store the root logger in self.log.

        Args:
            log_level (str): log level (debug, info, warning, error, critical)
        '''
        level = self._get_log_level(log_level)
        logging.basicConfig(
            format='%(asctime)s-%(levelname)s: %(message)s',
            level=level)
        self.log = logging.getLogger()
        self.log.debug('Log level to debug')

    def _get_log_level(self, log_level):
        '''
        Get the log level in string and return the log level enum value.
        The lookup ignores case and surrounding spaces; unknown values fall
        back to INFO.

        Args:
            log_level (str): log level (debug, info, warning, error, critical)
        Return:
            int: log level
        '''
        levels = {
            'debug': logging.DEBUG,
            'info': logging.INFO,
            'warning': logging.WARNING,
            'error': logging.ERROR,
            'critical': logging.CRITICAL,
        }
        # str() keeps None and other non-string values on the INFO fallback.
        return levels.get(str(log_level).strip().lower(), logging.INFO)

    def _process_request(self):
        '''
        Get all data and generate the metrics page.
        '''
        self.log.info(f'Get rules on {self.url_inventory}')
        rules = self._get_rules()
        alerts = self._get_alertes_triggered()
        self._generate_metric(self._filter_rules(rules), alerts)

    def _filter_rules(self, rules):
        '''
        Get list of rules and return the rules selected by labels
        (type=cloud_health && service). Rules without labels are skipped.

        Args:
            rules (list): List of rules from the inventory api
        Return:
            list: List of rules filtered
        '''
        selected = []
        for rule in rules:
            # Missing or null labels mean the rule simply does not qualify.
            labels = rule.get('labels') or {}
            if labels.get('type') == 'cloud_health' and 'service' in labels:
                selected.append(rule)
        return selected

    def _get_rules(self):
        '''
        Get rules on the inventory api.

        Return:
            list: List of rules from the inventory api
        '''
        return self._get_request_paginated(
            self.url_inventory + '/api/inventory/alert-rules')

    def _get_alertes_triggered(self):
        '''
        Get alerts firing on the prometheus api.

        Return:
            list: List of all firing alerts from prometheus
        '''
        payload = {
            'dedup': 'true',
            'query': 'ALERTS{alertstate="firing"}'
        }
        response = self._get_request(
            self.url_promehteus + '/api/v1/query', payload)
        return response['data']['result']

    def _get_request_paginated(self, url):
        '''
        Make GET request with pagination.

        Args:
            url (str): full api url to reach
        Return:
            list: all items concatenated in one list
        '''
        page = 1
        result = []
        while True:
            self.log.debug(
                f'Get page {page} of {url} by batch of {self.inventory_api_batch_size}')
            payload = {
                'pageSize': self.inventory_api_batch_size,
                'page': page,
                'env': self.env
            }
            data = self._get_request(url, payload)
            result.extend(data['items'])
            # 'total' is the overall item count, so this is the last page number.
            if page >= math.ceil(data['total'] / self.inventory_api_batch_size):
                break
            page += 1
        return result

    def _get_request(self, url, payload):
        '''
        Make a GET request to an api with a payload.
        The process exits with code 1 when the api answers 401; any other
        error status raises through raise_for_status().

        Args:
            url (str): api url to reach
            payload (dict): query parameters to add in the request
        Return:
            dict: Json of the result
        '''
        header = {'Authorization': f'Bearer {self.token}'}
        retries = Retry(total=100, backoff_factor=5,
                        status_forcelist=[502, 503, 504])
        # The context manager closes the session (and its pooled connections)
        # even when the request raises.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retries)
            # Apply the same retry policy whatever the url scheme is.
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            response = session.get(url=url, headers=header, params=payload,
                                   timeout=self._REQUEST_TIMEOUT)
            if response.status_code == 401:
                self.log.error(
                    f'Authentification failed with token on {url}.')
                sys.exit(1)
            response.raise_for_status()
            return response.json()

    def _generate_metric(self, rules, alerts):
        '''
        Generate prometheus metrics collections with the rules and the alerts.
        Each (rule, env) pair gets one sample whose value is the number of
        firing alerts attached to it and whose alertstate label is "firing"
        or "up". The alerts list is not modified.

        Args:
            rules (list): list of all rules
            alerts (list): list of all alerts
        '''
        # Count firing alerts per rule id in one pass; alerts without a
        # ruleId label cannot be tied to a rule and are ignored.
        firing = {}
        for alert in alerts:
            rule_id = alert.get('metric', {}).get('ruleId')
            if rule_id is not None:
                firing[rule_id] = firing.get(rule_id, 0) + 1

        self.gauge.clear()
        for rule in rules:
            labels = rule.get('labels') or {}
            for env in rule['env']:
                # The inventory "int" environment is published as "test".
                if env == 'int':
                    env = 'test'
                # pop() keeps the historical behaviour: alerts are consumed by
                # the first env (or rule) that matches their rule id.
                # NOTE(review): a rule listed for several envs therefore shows
                # "firing" only on its first env - confirm this is intended.
                alerts_count = firing.pop(rule['id'], 0)
                state = 'firing' if alerts_count else 'up'
                self.gauge.labels(env, rule['alert'], labels.get('service'),
                                  labels.get('type'), state).set(alerts_count)
@click.command()
@click.option('--url-inventory', '-u', required=True, help='Base url to reach the inventory. env var INVENTORY_URL', envvar='INVENTORY_URL')
# '--url-prometheus' is the correctly spelled flag; the historical misspelling
# '--url-promehteus' stays accepted so existing invocations keep working. The
# explicit 'url_promehteus' name keeps the function parameter unchanged.
@click.option('--url-prometheus', '--url-promehteus', '-m', 'url_promehteus', required=True, help='Base url to reach the the prometheus. env var PROMETHEUS_URL', envvar='PROMETHEUS_URL')
@click.option('--token', '-t', required=True, help='Bear token to get authentificate to the inventory. env var INVENTORY_TOKEN', envvar='INVENTORY_TOKEN')
@click.option('--env', '-e', required=True, help='Environment for rules. env var INVENTORY_ENV', envvar='INVENTORY_ENV')
@click.option('--listen-port', '-p', default=8000, help='Port where the prometheus page will be publish (default=8000) or env var EXPORTER_PORT', envvar='EXPORTER_PORT')
@click.option('--log-level', '-l', default='info', help='Log level can be: debug, info, error (default=info) or env var LOG_LEVEL', envvar='LOG_LEVEL')
def rules_exporter(url_inventory, url_promehteus, env, token, listen_port, log_level):
    '''Start the rules exporter and serve the prometheus metric page.'''
    exporter = RulesExporter(url_inventory, url_promehteus, token, env,
                             port=listen_port, log_level=log_level)
    # Blocks forever: serves the metrics and refreshes them periodically.
    exporter.start_http_server()


if __name__ == '__main__':
    rules_exporter()

Просмотреть файл

@ -0,0 +1,22 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

Просмотреть файл

@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart to deploy the rules exporter from the inventory
name: rules-exporter
version: 0.1.0

Просмотреть файл

@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when it is set and falls back to .Chart.Name;
the result is truncated to 63 characters and a trailing "-" is removed.
*/}}
{{- define "rules-exporter.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
Order of precedence:
1. .Values.fullnameOverride when it is set.
2. The release name alone when it already contains the chart name (or nameOverride).
3. Otherwise "<release name>-<chart name or nameOverride>".
Every result has a trailing "-" removed after truncation.
*/}}
{{- define "rules-exporter.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
Renders "<chart name>-<chart version>" with every "+" replaced by "_",
truncated to 63 characters and with a trailing "-" removed.
*/}}
{{- define "rules-exporter.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}

Просмотреть файл

@ -0,0 +1,64 @@
# Deployment running the rules exporter container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "rules-exporter.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ include "rules-exporter.name" . }}
    helm.sh/chart: {{ include "rules-exporter.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "rules-exporter.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ include "rules-exporter.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
    spec:
      containers:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            # The token is read from a pre-existing secret, it is not created by this chart.
            - name: INVENTORY_TOKEN
              valueFrom:
                secretKeyRef:
                  name: prometheus-rules-exporter-secrets
                  key: inventory-token
            # Env values must be YAML strings; quoting keeps values such as
            # "true" or "123" from being rendered as booleans or numbers.
            - name: INVENTORY_URL
              value: {{ .Values.urlInventory | quote }}
            - name: INVENTORY_ENV
              value: {{ .Values.env | quote }}
            - name: PROMETHEUS_URL
              value: {{ .Values.urlPrometheus | quote }}
          ports:
            - name: http
              containerPort: 8000
              protocol: TCP
          # Both probes hit the metrics page served on the root path.
          livenessProbe:
            httpGet:
              path: /
              port: http
          readinessProbe:
            httpGet:
              path: /
              port: http
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}

Просмотреть файл

@ -0,0 +1,23 @@
# Service exposing the exporter pods on .Values.service.port.
apiVersion: v1
kind: Service
metadata:
name: {{ include "rules-exporter.fullname" . }}
namespace: {{ .Release.Namespace }}
annotations:
# NOTE(review): presumably consumed by an annotation-based Prometheus scrape
# configuration in the cluster - confirm against the Prometheus setup.
prometheus.io/scrape: "true"
prometheus.io/path: "/"
labels:
app.kubernetes.io/name: {{ include "rules-exporter.name" . }}
helm.sh/chart: {{ include "rules-exporter.chart" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
# "http" is the named container port declared by the deployment.
targetPort: http
protocol: TCP
name: http
# Same name/instance labels as the deployment's pod template.
selector:
app.kubernetes.io/name: {{ include "rules-exporter.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}

Просмотреть файл

@ -0,0 +1,2 @@
urlPrometheus: https://prometheus.prd.mon.corp.unity3d.com
env: prd

Просмотреть файл

@ -0,0 +1,2 @@
urlPrometheus: https://prometheus.stg.mon.corp.unity3d.com
env: stg

Просмотреть файл

@ -0,0 +1,2 @@
urlPrometheus: https://prometheus.test.mon.corp.unity3d.com
env: test

Просмотреть файл

@ -0,0 +1,34 @@
# Default values for rules-exporter.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
  repository: us.gcr.io/unity-cs-devops-gcr-prd/prometheus-rules-exporter
  tag: "latest"
  pullPolicy: Always
nameOverride: ""
fullnameOverride: ""
service:
  type: ClusterIP
  port: 8000
# Base url of the inventory, passed to the container as INVENTORY_URL.
urlInventory: https://inventory.internal.unity3d.com
# Base url of prometheus, passed as PROMETHEUS_URL; set by the per-environment values files.
urlPrometheus:
resources: {}
nodeSelector:
  product: devops
tolerations:
  - effect: NoSchedule
    key: product
    operator: Equal
    value: devops
affinity: {}
# Environment name passed as INVENTORY_ENV; set by the per-environment values files.
# It is a plain string: an empty map default would be rendered as "map[]".
env: ""

Просмотреть файл

@ -0,0 +1,3 @@
# State is kept in a GCS backend. The block is intentionally empty here;
# NOTE(review): the bucket/prefix settings are presumably injected by the
# terragrunt remote_state configuration - confirm.
terraform {
backend "gcs" {}
}

Просмотреть файл

@ -0,0 +1,12 @@
# Reads the inventory token entry from Vault, under
# <vault_base_path>/<env>/prometheus-rules-exporter/inventory-token.
data "vault_generic_secret" "secrets" {
path = "${var.vault_base_path}/${var.env}/prometheus-rules-exporter/inventory-token"
}
# Kubernetes secret holding every key/value pair of the Vault entry.
# NOTE(review): the Helm deployment reads the key "inventory-token" from a
# secret with this name, so the Vault entry presumably contains a field with
# that exact key - confirm.
resource "kubernetes_secret" "config" {
metadata {
name = "prometheus-rules-exporter-secrets"
namespace = var.namespace
}
data = data.vault_generic_secret.secrets.data
}

Просмотреть файл

@ -0,0 +1,11 @@
# No explicit settings: the kubernetes provider falls back to its default
# configuration sources.
provider "kubernetes" {
}
# Vault server the secrets are read from.
provider "vault" {
address = "https://vault.corp.unity3d.com"
}
# Google provider scoped to the project and region given as input variables.
provider "google" {
project = var.project
region = var.region
}

Просмотреть файл

@ -0,0 +1,21 @@
variable "project" {
  type        = string
  description = "Project ID to use."
}

# Referenced by the google provider (var.region); without this declaration
# terraform rejects the configuration with an undeclared-variable error.
# The default matches the region fallback used by the terragrunt configuration.
variable "region" {
  type        = string
  description = "Region used by the google provider."
  default     = "us-central1"
}

variable "env" {
  type        = string
  description = "The environment where we are deploying."
}

variable "vault_base_path" {
  type        = string
  description = "The base path of the vault secret."
  default     = "secret/common-devops"
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace where we will deploy the container"
  default     = "devops-monitoring"
}

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
project = "unity-cs-common-prd"
env = "prd"

Просмотреть файл

@ -0,0 +1,2 @@
project = "unity-cs-common-stg"
env = "stg"

Просмотреть файл

@ -0,0 +1,7 @@
# Points terragrunt at the module located in ../modules/<relative path of this
# directory>, resolved from the directory of the parent terragrunt file.
terraform {
source = "${get_parent_terragrunt_dir()}/../modules/${path_relative_to_include()}"
}
# Inherit the settings of the first terragrunt file found in a parent folder.
include {
path = find_in_parent_folders()
}

Просмотреть файл

@ -0,0 +1,2 @@
project = "unity-cs-common-test"
env = "test"

Просмотреть файл

@ -0,0 +1,29 @@
# Remote state stored in GCS. Bucket and project derive from TF_VAR_project
# (default "unity-cs-common-test"); the state prefix is
# <TF_VAR_region, default "us-central1">/<path relative to this file>.
remote_state {
backend = "gcs"
# Backend initialisation can be skipped by setting TG_DISABLE_INIT=true.
disable_init = tobool(get_env("TG_DISABLE_INIT", "false"))
config = {
bucket = "${get_env("TF_VAR_project", "unity-cs-common-test")}-terraform-tg"
prefix = "${get_env("TF_VAR_region", "us-central1")}/${path_relative_to_include()}"
project = "${get_env("TF_VAR_project", "unity-cs-common-test")}"
location = "us"
gcs_bucket_labels = {
business_unit = "cloudservices"
role = "terraform_state"
product = "devops_monitoring"
}
}
}
# Pass the variable files to every terraform command that accepts variables.
# The files are optional: default.tfvars, <env>.tfvars and <region>-<env>.tfvars,
# where env comes from TF_VAR_env (default "test") and region from
# TF_VAR_region (default "us-central1").
terraform {
extra_arguments "common_var" {
commands = get_terraform_commands_that_need_vars()
optional_var_files = [
"${get_terragrunt_dir()}/default.tfvars",
"${get_terragrunt_dir()}/${get_env("TF_VAR_env", "test")}.tfvars",
"${get_terragrunt_dir()}/${get_env("TF_VAR_region", "us-central1")}-${get_env("TF_VAR_env", "test")}.tfvars",
]
}
}

18
main.go Normal file
Просмотреть файл

@ -0,0 +1,18 @@
package main
import (
"fmt"
"net/http"
)
func home(w http.ResponseWriter, req *http.Request) {
fmt.Fprintf(w, "Home\n")
}
// main registers the home handler on "/" and serves HTTP on port 8090.
// ListenAndServe only returns on failure (for example when the port is
// already in use), so its error is surfaced instead of being dropped and
// letting the process exit silently with status 0.
func main() {
	http.HandleFunc("/", home)
	if err := http.ListenAndServe(":8090", nil); err != nil {
		panic(err)
	}
}