diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..34a775e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.terragrunt-cache \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b6e54c4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:alpine + +WORKDIR /app +COPY ./app . + +RUN pip install -r requirements.txt + +EXPOSE 8000 + +CMD [ "python", "/app/rules-exporter.py"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4f35220 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# Rules exporter + +The Prometheus alerting system does not return anything for an alert that is not firing. This exporter fetches the rules configured on the [inventory website](https://inventory.internal.unity3d.com/alerts/rules) and exposes a metric, in Prometheus format, that lists every rule as either up or firing. + + +## Rules + +We use labels to select rules in the list. To be considered, a rule must have the following labels: + +* **type**: cloud_health +* **service**: any value; it is used in the global dashboard to group health status by service + +## Configuration + +| Environment variable | Default Value | Required | +| --------------------- | --------------- | ---------- | +| INVENTORY_URL | None | yes | +| INVENTORY_TOKEN | None | yes | +| INVENTORY_ENV | None | yes | +| EXPORTER_PORT | 8000 | no | +| LOG_LEVEL | info | no | + + +## Development + +Log in to the [inventory website](https://inventory.internal.unity3d.com) and go to the [rules list](https://inventory.internal.unity3d.com/alerts/rules). 
class RulesExporter():
    '''
    Publish the state of the inventory alert rules as a prometheus metric.

    Rules are read from the inventory api, firing alerts are read from the
    prometheus api, and every selected rule is exposed through the
    ``CLOUD_ALERTS`` gauge with an ``alertstate`` label of ``up`` or
    ``firing``.

    Args:
        url_inventory (str): base url of the unity inventory
        url_promehteus (str): base url of the prometheus server
        token (str): bearer token sent with every api call
        env (str): environment of the rules (``test`` is queried as ``int``)
        port (int): listen port of the http server
        log_level (str): log level (debug, info, error)
        inventory_api_batch_size (int): page size of paginated api calls
    '''
    __slots__ = [
        'url_inventory',
        'url_promehteus',
        'token',
        'env',
        'log',
        'port',
        'inventory_api_batch_size',
        'gauge'
    ]

    # Seconds to wait for an api answer. Without a timeout a stalled
    # connection would block the refresh loop forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, url_inventory, url_promehteus, token, env, port=8000, log_level='info', inventory_api_batch_size=500):
        self._init_logging(log_level=log_level)
        self.log.info('Initialisation of the rules exporter')

        # Every mandatory setting is checked in order; the first missing one
        # is logged and stops the process with exit code 128.
        required = (
            (url_inventory, 'Inventory url not found'),
            (url_promehteus, 'Prometheus url not found'),
            (token, 'Parameters token not found'),
            (env, 'Parameters env not found'),
        )
        for value, message in required:
            if not value:
                self.log.critical(message)
                sys.exit(128)

        self.inventory_api_batch_size = inventory_api_batch_size
        self.url_inventory = url_inventory
        self.url_promehteus = url_promehteus

        # The "test" environment is sent to the inventory api as "int";
        # _generate_metric maps it back when labelling the samples.
        self.env = 'int' if env == 'test' else env

        self.token = token
        self.port = int(port)
        self.gauge = Gauge(
            'CLOUD_ALERTS',
            'List of rules',
            ['env', 'alertname', 'service', 'type', 'alertstate'],
        )

    def start_http_server(self):
        '''
        Start the http server publishing the prometheus metric page, then
        refresh the metrics every 60 seconds. This method never returns.
        '''
        self.log.info(f'Starting the web server on port {self.port}')
        start_http_server(self.port)
        while True:
            self._process_request()
            time.sleep(60)

    def _init_logging(self, log_level='info'):
        '''
        Initialize the logging service and store the root logger in self.log.

        Args:
            log_level (str): log level (debug, info, error)
        '''
        logging.basicConfig(
            format='%(asctime)s-%(levelname)s: %(message)s',
            level=self._get_log_level(log_level))
        self.log = logging.getLogger()
        self.log.debug('Log level to debug')

    def _get_log_level(self, log_level):
        '''
        Translate a log level name into the matching logging constant.

        The lookup ignores case and surrounding whitespace, so "DEBUG" from
        an environment variable behaves like "debug". Unknown or non-string
        values fall back to logging.INFO.

        Args:
            log_level (str): log level (debug, info, error)

        Return:
            int: log level
        '''
        levels = {
            'debug': logging.DEBUG,
            'info': logging.INFO,
            'error': logging.ERROR
        }
        if not isinstance(log_level, str):
            return logging.INFO
        return levels.get(log_level.strip().lower(), logging.INFO)

    def _process_request(self):
        '''
        Fetch rules and firing alerts, then regenerate the metrics.
        '''
        self.log.info(f'Get rules on {self.url_inventory}')
        rules = self._get_rules()
        alerts = self._get_alertes_triggered()
        self._generate_metric(self._filter_rules(rules), alerts)

    def _filter_rules(self, rules):
        '''
        Keep the rules labelled type=cloud_health that also carry a service
        label. Rules whose labels are missing or null are skipped.

        Args:
            rules (list): List of rules from the inventory api

        Return:
            list: List of rules filtered
        '''
        selected = []
        for rule in rules:
            labels = rule.get('labels') or {}
            if labels.get('type') == 'cloud_health' and 'service' in labels:
                selected.append(rule)
        return selected

    def _get_rules(self):
        '''
        Get every alert rule from the inventory api.

        Return:
            list: List of rules from the inventory api
        '''
        return self._get_request_paginated(
            self.url_inventory + '/api/inventory/alert-rules')

    def _get_alertes_triggered(self):
        '''
        Get the alerts currently firing from the prometheus api.

        Return:
            list: content of data.result in the prometheus query answer
        '''
        payload = {
            'dedup': 'true',
            'query': 'ALERTS{alertstate="firing"}'
        }
        response = self._get_request(
            self.url_promehteus + '/api/v1/query', payload)
        return response['data']['result']

    def _get_request_paginated(self, url):
        '''
        Make GET requests page after page and concatenate the items.

        Paging stops once the page number reaches
        ceil(total / inventory_api_batch_size), or earlier when a page comes
        back empty.

        Args:
            url (str): full api url to reach

        Return:
            list: all items of all pages in one list
        '''
        batch_size = self.inventory_api_batch_size
        page = 1
        result = []

        while True:
            self.log.debug(
                f'Get page {page} of {url} by batch of {batch_size}')
            payload = {
                'pageSize': batch_size,
                'page': page,
                'env': self.env
            }

            data = self._get_request(url, payload)
            items = data['items']
            result.extend(items)

            last_page = math.ceil(data['total'] / batch_size)
            # An empty page means there is nothing left to read even when the
            # reported total says otherwise.
            if page >= last_page or not items:
                break
            page += 1
        return result

    def _get_request(self, url, payload):
        '''
        Make an authenticated GET request and decode the JSON answer.

        Answers with status 502, 503 and 504 are retried with a back-off.
        A 401 answer is logged and stops the process with exit code 1.

        Args:
            url (str): api url to reach
            payload (dict): query parameters to add to the request

        Return:
            dict: Json of the result

        Raises:
            requests.HTTPError: the api answered with another error status
        '''
        header = {'Authorization': f'Bearer {self.token}'}
        retries = Retry(total=100, backoff_factor=5,
                        status_forcelist=[502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)

        # The context manager closes the session (and its pooled
        # connections) on every exit path, including sys.exit below.
        with requests.Session() as session:
            session.mount('https://', adapter)
            session.mount('http://', adapter)

            response = session.get(url=url, headers=header, params=payload,
                                   timeout=self.REQUEST_TIMEOUT)

            if response.status_code == 401:
                self.log.error(
                    f'Authentification failed with token on {url}.')
                sys.exit(1)

            response.raise_for_status()
            return response.json()

    def _generate_metric(self, rules, alerts):
        '''
        Rebuild the gauge samples from the rules and the firing alerts.

        Each (rule, env) pair gets one sample: state "firing" with the number
        of matching alerts as value, or state "up" with value 0. Alerts
        without a ruleId label are ignored. The alerts list is not modified.

        Args:
            rules (list): list of the selected rules
            alerts (list): list of the firing alerts
        '''
        # Count the firing alerts per rule id once instead of rescanning the
        # whole alert list for every rule and environment.
        firing_by_rule = {}
        for alert in alerts:
            rule_id = alert.get('metric', {}).get('ruleId')
            if rule_id is not None:
                firing_by_rule[rule_id] = firing_by_rule.get(rule_id, 0) + 1

        self.gauge.clear()
        for rule in rules:
            for env in rule['env']:
                if env == "int":
                    env = "test"
                # pop() keeps the historical behaviour: matching alerts are
                # consumed by the first env (and first rule) that claims them.
                # NOTE(review): later envs of the same rule therefore always
                # report "up" - confirm this is intended.
                alerts_count = firing_by_rule.pop(rule['id'], 0)
                state = 'firing' if alerts_count else 'up'
                self.gauge.labels(env, rule['alert'], rule['labels'].get('service'),
                                  rule['labels'].get('type'), state).set(alerts_count)
+# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deploy/helm/rules-exporter/Chart.yaml b/deploy/helm/rules-exporter/Chart.yaml new file mode 100644 index 0000000..4a0cec3 --- /dev/null +++ b/deploy/helm/rules-exporter/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +appVersion: "1.0" +description: A Helm chart to deploy the rules exporter from the inventory +name: rules-exporter +version: 0.1.0 diff --git a/deploy/helm/rules-exporter/templates/_helpers.tpl b/deploy/helm/rules-exporter/templates/_helpers.tpl new file mode 100644 index 0000000..424f137 --- /dev/null +++ b/deploy/helm/rules-exporter/templates/_helpers.tpl @@ -0,0 +1,32 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "rules-exporter.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "rules-exporter.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "rules-exporter.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/helm/rules-exporter/templates/deployment.yaml b/deploy/helm/rules-exporter/templates/deployment.yaml new file mode 100644 index 0000000..142360f --- /dev/null +++ b/deploy/helm/rules-exporter/templates/deployment.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "rules-exporter.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "rules-exporter.name" . }} + helm.sh/chart: {{ include "rules-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "rules-exporter.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "rules-exporter.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: INVENTORY_TOKEN + valueFrom: + secretKeyRef: + name: prometheus-rules-exporter-secrets + key: inventory-token + - name: INVENTORY_URL + value: {{ .Values.urlInventory }} + - name: INVENTORY_ENV + value: {{ .Values.env }} + - name: PROMETHEUS_URL + value: {{ .Values.urlPrometheus }} + ports: + - name: http + containerPort: 8000 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: / + port: http + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/deploy/helm/rules-exporter/templates/service.yaml b/deploy/helm/rules-exporter/templates/service.yaml new file mode 100644 index 0000000..2a24503 --- /dev/null +++ b/deploy/helm/rules-exporter/templates/service.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "rules-exporter.fullname" . }} + namespace: {{ .Release.Namespace }} + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/" + labels: + app.kubernetes.io/name: {{ include "rules-exporter.name" . }} + helm.sh/chart: {{ include "rules-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: {{ include "rules-exporter.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} diff --git a/deploy/helm/rules-exporter/values-prd.yaml b/deploy/helm/rules-exporter/values-prd.yaml new file mode 100644 index 0000000..aac56ba --- /dev/null +++ b/deploy/helm/rules-exporter/values-prd.yaml @@ -0,0 +1,2 @@ +urlPrometheus: https://prometheus.prd.mon.corp.unity3d.com +env: prd \ No newline at end of file diff --git a/deploy/helm/rules-exporter/values-stg.yaml b/deploy/helm/rules-exporter/values-stg.yaml new file mode 100644 index 0000000..df20410 --- /dev/null +++ b/deploy/helm/rules-exporter/values-stg.yaml @@ -0,0 +1,2 @@ +urlPrometheus: https://prometheus.stg.mon.corp.unity3d.com +env: stg \ No newline at end of file diff --git a/deploy/helm/rules-exporter/values-test.yaml b/deploy/helm/rules-exporter/values-test.yaml new file mode 100644 index 0000000..3fd5e83 --- /dev/null +++ b/deploy/helm/rules-exporter/values-test.yaml @@ -0,0 +1,2 @@ +urlPrometheus: https://prometheus.test.mon.corp.unity3d.com +env: test \ No newline at end of file diff --git a/deploy/helm/rules-exporter/values.yaml b/deploy/helm/rules-exporter/values.yaml new file mode 100644 index 0000000..4f69124 --- /dev/null +++ b/deploy/helm/rules-exporter/values.yaml @@ -0,0 +1,34 @@ +# Default values for rules-exporter. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+replicaCount: 1 + +image: + repository: us.gcr.io/unity-cs-devops-gcr-prd/prometheus-rules-exporter + tag: "latest" + pullPolicy: Always + +nameOverride: "" +fullnameOverride: "" + +service: + type: ClusterIP + port: 8000 + +urlInventory: https://inventory.internal.unity3d.com +urlPrometheus: + +resources: {} + +nodeSelector: + product: devops + +tolerations: + - effect: NoSchedule + key: product + operator: Equal + value: devops + +affinity: {} + +env: {} \ No newline at end of file diff --git a/deploy/modules/secrets/backend.tf b/deploy/modules/secrets/backend.tf new file mode 100644 index 0000000..2770f71 --- /dev/null +++ b/deploy/modules/secrets/backend.tf @@ -0,0 +1,3 @@ +terraform { + backend "gcs" {} +} diff --git a/deploy/modules/secrets/main.tf b/deploy/modules/secrets/main.tf new file mode 100644 index 0000000..7dba739 --- /dev/null +++ b/deploy/modules/secrets/main.tf @@ -0,0 +1,12 @@ +data "vault_generic_secret" "secrets" { + path = "${var.vault_base_path}/${var.env}/prometheus-rules-exporter/inventory-token" +} + +resource "kubernetes_secret" "config" { + metadata { + name = "prometheus-rules-exporter-secrets" + namespace = var.namespace + } + + data = data.vault_generic_secret.secrets.data +} diff --git a/deploy/modules/secrets/provider.tf b/deploy/modules/secrets/provider.tf new file mode 100644 index 0000000..26be886 --- /dev/null +++ b/deploy/modules/secrets/provider.tf @@ -0,0 +1,11 @@ +provider "kubernetes" { +} + +provider "vault" { + address = "https://vault.corp.unity3d.com" +} + +provider "google" { + project = var.project + region = var.region +} diff --git a/deploy/modules/secrets/variables.tf b/deploy/modules/secrets/variables.tf new file mode 100644 index 0000000..3f599c2 --- /dev/null +++ b/deploy/modules/secrets/variables.tf @@ -0,0 +1,21 @@ +variable "project" { + type = string + description = "Project ID to use." +} + +variable "env" { + type = string + description = "The environment where we are deploying." 
+} + +variable "vault_base_path" { + type = string + description = "The base path of the vault secret." + default = "secret/common-devops" +} + +variable "namespace" { + type = string + description = "Kubernetes namespace where we will deploy the container" + default = "devops-monitoring" +} diff --git a/deploy/terragrunt/secrets/default.tfvars b/deploy/terragrunt/secrets/default.tfvars new file mode 100644 index 0000000..e69de29 diff --git a/deploy/terragrunt/secrets/prd.tfvars b/deploy/terragrunt/secrets/prd.tfvars new file mode 100644 index 0000000..215c699 --- /dev/null +++ b/deploy/terragrunt/secrets/prd.tfvars @@ -0,0 +1,2 @@ +project = "unity-cs-common-prd" +env = "prd" diff --git a/deploy/terragrunt/secrets/stg.tfvars b/deploy/terragrunt/secrets/stg.tfvars new file mode 100644 index 0000000..6da8082 --- /dev/null +++ b/deploy/terragrunt/secrets/stg.tfvars @@ -0,0 +1,2 @@ +project = "unity-cs-common-stg" +env = "stg" diff --git a/deploy/terragrunt/secrets/terragrunt.hcl b/deploy/terragrunt/secrets/terragrunt.hcl new file mode 100644 index 0000000..ca22bd2 --- /dev/null +++ b/deploy/terragrunt/secrets/terragrunt.hcl @@ -0,0 +1,7 @@ +terraform { + source = "${get_parent_terragrunt_dir()}/../modules/${path_relative_to_include()}" +} + +include { + path = find_in_parent_folders() +} diff --git a/deploy/terragrunt/secrets/test.tfvars b/deploy/terragrunt/secrets/test.tfvars new file mode 100644 index 0000000..8d7bf33 --- /dev/null +++ b/deploy/terragrunt/secrets/test.tfvars @@ -0,0 +1,2 @@ +project = "unity-cs-common-test" +env = "test" diff --git a/deploy/terragrunt/terragrunt.hcl b/deploy/terragrunt/terragrunt.hcl new file mode 100644 index 0000000..8b83616 --- /dev/null +++ b/deploy/terragrunt/terragrunt.hcl @@ -0,0 +1,29 @@ +remote_state { + backend = "gcs" + disable_init = tobool(get_env("TG_DISABLE_INIT", "false")) + + config = { + bucket = "${get_env("TF_VAR_project", "unity-cs-common-test")}-terraform-tg" + prefix = "${get_env("TF_VAR_region", 
// home answers the request with a plain-text "Home" line.
//
// main registers it on "/", which net/http treats as the catch-all
// pattern, so it serves every path.
func home(w http.ResponseWriter, _ *http.Request) {
	fmt.Fprint(w, "Home\n")
}

// main registers the handler and serves HTTP on port 8090.
//
// ListenAndServe only returns when the server cannot start or stops
// unexpectedly, and its error is always non-nil. The error is surfaced
// instead of dropped, so a failure such as "address already in use" no
// longer exits silently with status 0.
func main() {
	http.HandleFunc("/", home)

	if err := http.ListenAndServe(":8090", nil); err != nil {
		panic(fmt.Errorf("http server on :8090 stopped: %v", err))
	}
}