From 33bbea849ecd2744a3b078b5fb7d0f9d0b912e59 Mon Sep 17 00:00:00 2001 From: Eric Wiseblatt Date: Tue, 7 Feb 2017 01:56:43 +0000 Subject: [PATCH] Install scripts for monitoring support. --- config/default-spinnaker-local.yml | 2 +- config/spinnaker.yml | 2 +- .../datadog/KitchenSinkTimeboard.json} | 317 +++- .../config/datadog/MinimalTimeboard.json | 299 ++++ .../SpecificApplicationTimeboard.json} | 294 ++-- .../config/datadog/install.sh | 52 + .../prometheus/KitchenSinkDashboard.json} | 698 ++++++-- .../config/prometheus/MachineDashboard.json | 585 +++++++ .../config/prometheus/MinimalDashboard.json | 1256 +++++++++++++++ .../SpecificApplicationDashboard.json} | 1431 +++++++++-------- .../config/prometheus/install.sh | 80 + .../config/prometheus/node_exporter.conf | 2 + .../config/prometheus/prometheus.conf | 6 + .../prometheus/spinnaker-prometheus.yml | 33 + .../config/stackdriver/MinimalDashboard.json | 324 ++++ .../config/stackdriver/install.sh | 31 + .../create_install_tar.sh | 23 +- .../install_monitoring.sh | 204 +++ .../stackdriver_handlers.py | 115 ++ .../stackdriver_service.py | 15 +- install/first_google_boot.sh | 14 + 21 files changed, 4708 insertions(+), 1075 deletions(-) rename google/stackdriver_monitoring/{SampleDatadogTimeboard.json => config/datadog/KitchenSinkTimeboard.json} (83%) create mode 100644 google/stackdriver_monitoring/config/datadog/MinimalTimeboard.json rename google/stackdriver_monitoring/{MinimalDatadogTimeboard.json => config/datadog/SpecificApplicationTimeboard.json} (57%) create mode 100755 google/stackdriver_monitoring/config/datadog/install.sh rename google/stackdriver_monitoring/{SamplePrometheusGrafanaDashboard.json => config/prometheus/KitchenSinkDashboard.json} (82%) create mode 100644 google/stackdriver_monitoring/config/prometheus/MachineDashboard.json create mode 100644 google/stackdriver_monitoring/config/prometheus/MinimalDashboard.json rename google/stackdriver_monitoring/{MinimalPrometheusGrafanaDashboard.json => 
config/prometheus/SpecificApplicationDashboard.json} (69%) create mode 100755 google/stackdriver_monitoring/config/prometheus/install.sh create mode 100644 google/stackdriver_monitoring/config/prometheus/node_exporter.conf create mode 100644 google/stackdriver_monitoring/config/prometheus/prometheus.conf create mode 100644 google/stackdriver_monitoring/config/prometheus/spinnaker-prometheus.yml create mode 100644 google/stackdriver_monitoring/config/stackdriver/MinimalDashboard.json create mode 100755 google/stackdriver_monitoring/config/stackdriver/install.sh create mode 100755 google/stackdriver_monitoring/install_monitoring.sh diff --git a/config/default-spinnaker-local.yml b/config/default-spinnaker-local.yml index 6df495e..56267df 100644 --- a/config/default-spinnaker-local.yml +++ b/config/default-spinnaker-local.yml @@ -231,7 +231,7 @@ services: spectator: webEndpoint: - enabled: false + enabled: true stackdriver: enabled: false diff --git a/config/spinnaker.yml b/config/spinnaker.yml index 7636002..bb16842 100644 --- a/config/spinnaker.yml +++ b/config/spinnaker.yml @@ -238,7 +238,7 @@ services: spectator: webEndpoint: - enabled: false + enabled: true stackdriver: enabled: ${SPINNAKER_STACKDRIVER_ENABLED:false} diff --git a/google/stackdriver_monitoring/SampleDatadogTimeboard.json b/google/stackdriver_monitoring/config/datadog/KitchenSinkTimeboard.json similarity index 83% rename from google/stackdriver_monitoring/SampleDatadogTimeboard.json rename to google/stackdriver_monitoring/config/datadog/KitchenSinkTimeboard.json index 949e6c8..0f0dacf 100644 --- a/google/stackdriver_monitoring/SampleDatadogTimeboard.json +++ b/google/stackdriver_monitoring/config/datadog/KitchenSinkTimeboard.json @@ -309,6 +309,36 @@ }, "title": "Avg Clouddriver Controller Invocation Time (ms per minute)" }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.operations_count{success:true} by {operationtype})", + "aggregator": "avg", + 
"conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Successful Operations (per minute)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.operations_count{success:false} by {operationtype})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Failed Operations (per minute)" + }, { "definition": { "viz": "timeseries", @@ -422,28 +452,99 @@ "viz": "timeseries", "requests": [ { - "q": "sum:orca.threadpool.activeCount{*}", + "q": "diff(sum:orca.task.invocations{executiontype:orchestration,status:running} by {taskname})", "aggregator": "avg", "conditional_formats": [], - "type": "line" + "type": "bars" } - ] + ], + "autoscale": true }, - "title": "Active Orca Threads (per minute)" + "title": "Active Orchestrations (orca)" }, { "definition": { "viz": "timeseries", "requests": [ { - "q": "per_minute(sum:igor.controller.invocations_totalTime{*} by {method}) / 1000000 / per_minute(avg:igor.controller.invocations_count{*} by {method})", + "q": "diff(sum:orca.task.invocations{executiontype:orchestration,status:succeeded} by {taskname})", "aggregator": "avg", "conditional_formats": [], - "type": "line" + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "- diff(sum:orca.task.invocations{executiontype:orchestration,status:terminal} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } } - ] + ], + "autoscale": true }, - "title": "Igor Controller Invocation Time (ms per minute)" + "title": "Completed Orchestrations (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{status:running,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Active Pipelines (orca)" + }, + { + 
"definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{status:succeeded,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "- diff(sum:orca.task.invocations{status:terminal,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } + } + ], + "autoscale": true + }, + "title": "Completed Pipelines (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "sum:orca.threadpool.activeCount{*} by {id}", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Active Orca Threads (per minute)" }, { "definition": { @@ -460,6 +561,20 @@ }, "title": "Last known Orca Active Threads" }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "per_minute(sum:igor.controller.invocations_totalTime{*} by {method}) / 1000000 / per_minute(avg:igor.controller.invocations_count{*} by {method})", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + } + ] + }, + "title": "Igor Controller Invocation Time (ms per minute)" + }, { "definition": { "viz": "timeseries", @@ -545,24 +660,31 @@ "viz": "timeseries", "requests": [ { - "q": "per_minute(sum:rosco.bakes{*} by {success})", + "q": "diff(sum:rosco.bakesActive{*})", "aggregator": "avg", - "style": { - "palette": "cool" - }, - "type": "bars", - "conditional_formats": [] + "conditional_formats": [], + "type": "line" }, { - "q": "per_minute(sum:rosco.bakes.local{*} by {active})", + "q": "diff(sum:rosco.bakesRequested{*} by {flavor})", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "diff(sum:rosco.bakesCompleted_count{success:false} by {region})", + "conditional_formats": [], + "type": "bars", "style": { "palette": "warm" - }, - 
"type": "bars" + } } - ] + ], + "autoscale": true }, - "title": "Rosco Bakes (local warm, per minute)" + "title": "Rosco Bake Activity" }, { "definition": { @@ -817,7 +939,7 @@ "viz": "timeseries", "requests": [ { - "q": " - diff(sum:clouddriver.google.api_count{success:false} by {api,scope}.as_rate()), diff(sum:clouddriver.google.api_count{success:true} by {api,scope}.as_rate())", + "q": "- diff(sum:clouddriver.google.api_count{success:false} by {api,scope}.as_rate()), diff(sum:clouddriver.google.api_count{success:true} by {api,scope}.as_rate())", "aggregator": "avg", "style": { "palette": "dog_classic" @@ -828,7 +950,7 @@ ], "autoscale": true }, - "title": "Google API Call Rate (per minute, lines by scope, bars by api)" + "title": "Google API Call Rate" }, { "definition": { @@ -856,7 +978,7 @@ "q": "per_minute(sum:clouddriver.google.api_totalTime{*} by {api,scope}) / 1000000 / per_minute(sum:clouddriver.google.api_count{*} by {api,scope}.as_count())", "aggregator": "avg", "conditional_formats": [], - "type": "bars" + "type": "line" } ], "autoscale": true @@ -868,20 +990,58 @@ "viz": "timeseries", "requests": [ { - "q": "diff(sum:clouddriver.google.operationWaits_count{*} by {basephase})", + "q": "per_minute(sum:clouddriver.google.batchExecute_count{*} by {context})", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + } + ], + "autoscale": true + }, + "title": "Google Batch Count (per minute)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "per_minute(sum:clouddriver.google.batchExecute_totalTime{*} by {context}) / 1000000 / per_minute(sum:clouddriver.google.batchExecute_count{*} by {context})", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + } + ], + "autoscale": true + }, + "title": "Batch Call Latency (ms per call minute)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "per_minute(sum:clouddriver.google.batchSize{*} by {context})", + "aggregator": "avg", + 
"conditional_formats": [], + "type": "line" + } + ], + "autoscale": true + }, + "title": "Batch Call Size (per minute)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.google.operationWaits_count{status:done} by {basephase,scope})", "aggregator": "avg", "conditional_formats": [], "type": "bars", "style": { - "palette": "cool" + "palette": "dog_classic" } - }, - { - "q": "diff(sum:clouddriver.google.operationWaits_count{*} by {scope})", - "style": { - "palette": "warm" - }, - "type": "line" } ], "autoscale": true, @@ -891,7 +1051,53 @@ } } }, - "title": "Google Waiting Operations (diff, bars by phase, lines by scope)" + "title": "Successful Google Operations (clouddriver)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.google.operationWaits_count{!status:done} by {basephase,scope})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "dog_classic" + } + } + ], + "autoscale": true, + "yaxis": { + "filter": { + "below": 0.01 + } + } + }, + "title": "Failed Google Operations (clouddriver)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.google.operationWaitRequests{*} by {basephase,scope})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "dog_classic" + } + } + ], + "autoscale": true, + "yaxis": { + "filter": { + "below": 0.01 + } + } + }, + "title": "Google Operations Started (clouddriver)" }, { "definition": { @@ -928,36 +1134,6 @@ }, "title": "Google Operation Waits by Phase" }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "per_minute(sum:clouddriver.onDemand_total_totalTime{*} by {ondemandtype}) / 1000000000 / per_minute(sum:clouddriver.onDemand_total_count{*} by {ondemandtype})", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": 
"Clouddriver OnDemand Invocation Time (ms per call per minute)" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "diff(sum:clouddriver.operations_count{*} by {operationtype})", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": "Clouddriver Operations (type per minute)" - }, { "definition": { "viz": "timeseries", @@ -972,23 +1148,8 @@ "autoscale": true }, "title": "Spectator Time Series Streams" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "diff(sum:clouddriver.google.batchExecute_count{*} by {context})", - "aggregator": "avg", - "conditional_formats": [], - "type": "area" - } - ], - "autoscale": true - }, - "title": "Google Batch Execution Count" } ], "description": "Contains graphs of various metrics within Spinnaker to illustrate what is available.", - "title": "FullSpinnaker" + "title": "Spinnaker Kitchen Sink" } \ No newline at end of file diff --git a/google/stackdriver_monitoring/config/datadog/MinimalTimeboard.json b/google/stackdriver_monitoring/config/datadog/MinimalTimeboard.json new file mode 100644 index 0000000..23eee6d --- /dev/null +++ b/google/stackdriver_monitoring/config/datadog/MinimalTimeboard.json @@ -0,0 +1,299 @@ +{ + "read_only": false, + "graphs": [ + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:gate.hystrix.rollingCountShortCircuited{*} by {metricgroup})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + }, + { + "q": "diff(sum:igor.hystrix.rollingCountShortCircuited{*} by {metricgroup})", + "type": "bars" + }, + { + "q": "diff(sum:front50.hystrix.rollingCountShortCircuited{*} by {metricgroup})", + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Hystrix Short Circuited (global)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:front50.hystrix.countExceptionsThrown{*})", + "aggregator": "avg", + 
"conditional_formats": [], + "type": "line" + }, + { + "q": "diff(sum:igor.hystrix.countExceptionsThrown{*}.as_count())", + "type": "bars" + }, + { + "q": "diff(sum:gate.hystrix.countExceptionsThrown{*})", + "type": "line" + } + ], + "autoscale": true + }, + "title": "Hystrix Exceptions (global)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{executiontype:orchestration,status:running} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Active Orchestrations (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{executiontype:orchestration,status:succeeded} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "- diff(sum:orca.task.invocations{executiontype:orchestration,status:terminal} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } + } + ], + "autoscale": true + }, + "title": "Completed Orchestrations (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{status:running,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Active Pipelines (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{status:succeeded,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": " - diff(sum:orca.task.invocations{status:terminal,executiontype:pipeline} by {taskname})", + "style": { + "palette": "warm" + }, + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Completed Pipelines 
(orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.operations_count{success:true} by {operationtype}), - diff(sum:clouddriver.operations_count{!success:true} by {operationtype})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Active Threads (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:echo.pipelines.triggered{*} by {application})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Pipelines Triggered (echo)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:rosco.bakesActive{*})", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + }, + { + "q": "diff(sum:rosco.bakesRequested{*} by {flavor})", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "diff(sum:rosco.bakesCompleted_count{success:false} by {region})", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } + } + ], + "autoscale": true + }, + "title": "Bake Activity (rosco)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "sum:front50.storageServiceSupport.cacheSize{*} by {objecttype}", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Item Cache Size (front50)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.operations_count{success:true} by {operationtype})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Successful Operations (clouddriver)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:clouddriver.operations_count{success:false} by {operationtype})", + "aggregator": "avg", + 
"conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "Failed Operations (clouddriver)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "avg:system.load.1{*}", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + }, + { + "q": "avg:system.mem.used{*} / avg:system.mem.total{*}", + "type": "line" + } + ], + "autoscale": true + }, + "title": "System Load and Pct Memory Used" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "sum:front50.jvm.memory.used{*}", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + }, + { + "q": "sum:clouddriver.jvm.memory.used{*}", + "type": "line" + }, + { + "q": "sum:orca.jvm.memory.used{*}", + "type": "line" + }, + { + "q": "sum:gate.jvm.memory.used{*}", + "type": "line" + }, + { + "q": "sum:igor.jvm.memory.used{*}", + "type": "line" + }, + { + "q": "sum:rosco.jvm.memory.used{*}", + "type": "line" + }, + { + "q": "sum:echo.jvm.memory.used{*}", + "type": "line" + } + ], + "autoscale": true + }, + "title": "Microservice JVM Memory Used" + } + ], + "description": "A bare-bones dashboard for monitoring a Spinnaker deployment", + "title": "Minimal Spinnaker" +} \ No newline at end of file diff --git a/google/stackdriver_monitoring/MinimalDatadogTimeboard.json b/google/stackdriver_monitoring/config/datadog/SpecificApplicationTimeboard.json similarity index 57% rename from google/stackdriver_monitoring/MinimalDatadogTimeboard.json rename to google/stackdriver_monitoring/config/datadog/SpecificApplicationTimeboard.json index 7cedcbf..4b64e92 100644 --- a/google/stackdriver_monitoring/MinimalDatadogTimeboard.json +++ b/google/stackdriver_monitoring/config/datadog/SpecificApplicationTimeboard.json @@ -1,6 +1,139 @@ { "read_only": false, "graphs": [ + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{$SourceApplication,executiontype:orchestration,status:running} by 
{taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "$SourceApplication Active Orchestrations (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{$SourceApplication,executiontype:orchestration,status:running} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "$SourceApplication Active Orchestrations (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{$SourceApplication,status:running,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "$SourceApplication Active Pipelines (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:orca.task.invocations{$SourceApplication,status:succeeded,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "- diff(sum:orca.task.invocations{$SourceApplication,status:terminal,executiontype:pipeline} by {taskname})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } + } + ], + "autoscale": true + }, + "title": "$SourceApplication Completed Pipelines (orca)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:echo.pipelines.triggered{$Application} by {name})", + "aggregator": "avg", + "conditional_formats": [], + "type": "bars" + } + ], + "autoscale": true + }, + "title": "$Application Pipelines Triggered (echo)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "diff(sum:rosco.bakesActive{*})", + "aggregator": "avg", + "conditional_formats": [], + "type": "line" + }, + { + "q": 
"diff(sum:rosco.bakesRequested{*} by {flavor})", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "cool" + } + }, + { + "q": "diff(sum:rosco.bakesCompleted_count{success:false} by {region})", + "conditional_formats": [], + "type": "bars", + "style": { + "palette": "warm" + } + } + ], + "autoscale": true + }, + "title": "Global Bake Activity (rosco)" + }, + { + "definition": { + "viz": "timeseries", + "requests": [ + { + "q": "per_minute(sum:rosco.bakesCompleted_totalTime{*} by {region}) / 1000000000 / per_minute(sum:rosco.bakesCompleted_count{*} by {region}) / 60", + "aggregator": "avg", + "conditional_formats": [], + "type": "area" + } + ], + "autoscale": true + }, + "title": "Global Bake Completion Time Minutes (rosco)" + }, { "definition": { "viz": "timeseries", @@ -22,7 +155,7 @@ ], "autoscale": true }, - "title": "Hystrix Short Circuited" + "title": "Hystrix Short Circuited (global)" }, { "definition": { @@ -35,8 +168,8 @@ "type": "line" }, { - "q": "diff(sum:igor.hystrix.countExceptionsThrown{*})", - "type": "line" + "q": "diff(sum:igor.hystrix.countExceptionsThrown{*}.as_count())", + "type": "bars" }, { "q": "diff(sum:gate.hystrix.countExceptionsThrown{*})", @@ -45,156 +178,9 @@ ], "autoscale": true }, - "title": "Hystrix Exceptions" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "diff(sum:clouddriver.operations_count{success:true} by {operationtype}), - diff(sum:clouddriver.operations_count{!success:true} by {operationtype})", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": "Clouddriver Operations" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "avg:system.load.1{*}", - "aggregator": "avg", - "conditional_formats": [], - "type": "line" - } - ], - "autoscale": true - }, - "title": "This Slot Is Reserved For Future Use" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": 
"diff(sum:clouddriver.operations_count{success:true} by {operationtype}), - diff(sum:clouddriver.operations_count{!success:true} by {operationtype})", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": "Active Orca Threads" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "diff(sum:echo.pipelines.triggered{*} by {application})", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": "Pipelines Triggered" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "sum:rosco.bakesActive{*}", - "conditional_formats": [], - "type": "line" - } - ], - "autoscale": true - }, - "title": "Bakes in Progress" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "sum:front50.storageServiceSupport.cacheSize{*} by {objecttype}", - "aggregator": "avg", - "conditional_formats": [], - "type": "bars" - } - ], - "autoscale": true - }, - "title": "Front50 Item Cache Sizes" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "avg:system.load.1{*}", - "aggregator": "avg", - "conditional_formats": [], - "type": "line" - }, - { - "q": "avg:system.mem.used{*} / avg:system.mem.total{*}", - "type": "line" - } - ], - "autoscale": true - }, - "title": "System Load and Pct Memory Used" - }, - { - "definition": { - "viz": "timeseries", - "requests": [ - { - "q": "sum:front50.jvm.memory.used{*}", - "aggregator": "avg", - "conditional_formats": [], - "type": "line" - }, - { - "q": "sum:clouddriver.jvm.memory.used{*}", - "type": "line" - }, - { - "q": "sum:orca.jvm.memory.used{*}", - "type": "line" - }, - { - "q": "sum:gate.jvm.memory.used{*}", - "type": "line" - }, - { - "q": "sum:igor.jvm.memory.used{*}", - "type": "line" - }, - { - "q": "sum:rosco.jvm.memory.used{*}", - "type": "line" - }, - { - "q": "sum:echo.jvm.memory.used{*}", - "type": "line" - } - ], - "autoscale": true - }, - 
"title": "Microservice JVM Memory Used" + "title": "Hystrix Exceptions Thrown (global)" } ], - "description": "A bare-bones dashboard for monitoring a Spinnaker deployment", - "title": "MinimalSpinnaker" -} \ No newline at end of file + "description": "Templated dashboard to show details for a specific Application", + "title": "Specific Spinnaker Application" +} diff --git a/google/stackdriver_monitoring/config/datadog/install.sh b/google/stackdriver_monitoring/config/datadog/install.sh new file mode 100755 index 0000000..63a006f --- /dev/null +++ b/google/stackdriver_monitoring/config/datadog/install.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SOURCE_DIR=$(dirname $0) +HAVE_KEYS=0 + +function prompt_if_unset() { + local name=$1 + local tmp + while [[ "${!name}" == "" ]]; do + read -e -p "ENTER $name: " tmp + eval ${name}=$tmp + done +} + +prompt_if_unset DATADOG_API_KEY +prompt_if_unset DATADOG_APP_KEY + +environ_file=$(readlink -f "${SOURCE_DIR}/../../environ") +echo "Storing keys into $environ_file" +if [[ ! 
-f "${environ_file}" ]]; then + sudo touch "${environ_file}" +fi +sudo chmod 600 "${environ_file}" +sudo cat >> "$environ_file" <{{AWSErrorCode}}", "metric": "clouddriver:aws:request:httpRequestTime__count", @@ -2758,7 +3086,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "AWS Errors", + "title": "AWS Errors (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -2777,7 +3105,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -2785,7 +3113,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -2833,7 +3161,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(delta(clouddriver:aws:request:httpRequestTime__count{serviceName=\"AmazonEC2\", error=\"false\"}[1m])) by (requestType, serviceName), \"requestType\", \"$1\", \"requestType\", \"(.*)Request\")", + "expr": "label_replace(sum(idelta(clouddriver:aws:request:httpRequestTime__count{serviceName=\"AmazonEC2\", error=\"false\"}[$SamplePeriod])) by (requestType, serviceName), \"requestType\", \"$1\", \"requestType\", \"(.*)Request\")", "intervalFactor": 2, "legendFormat": "{{requestType}}", "metric": "clouddriver:aws:request:httpRequestTime__count", @@ -2844,7 +3172,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "AWS EC2 Requests", + "title": "AWS EC2 Requests (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -2863,7 +3191,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -2871,7 +3199,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -2907,7 +3235,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(clouddriver:aws:request:httpRequestTime__count{serviceName!=\"AmazonEC2\", error=\"false\"}[1m])) by (requestType, serviceName)", + "expr": "sum(idelta(clouddriver:aws:request:httpRequestTime__count{serviceName!=\"AmazonEC2\", error=\"false\"}[$SamplePeriod])) 
by (requestType, serviceName)", "intervalFactor": 2, "legendFormat": "{{requestType}}({{serviceName}})", "metric": "clouddriver:aws:request:httpRequestTime__count", @@ -2918,7 +3246,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "AWS Requests (non EC2)", + "title": "AWS Requests (non EC2) (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -2937,7 +3265,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -2945,7 +3273,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -2993,7 +3321,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(clouddriver:aws:request:httpRequestTime__totalTime{serviceName=\"AmazonEC2\"}[1m])) by (requestType, serviceName) / 1000000 / sum(delta(clouddriver:aws:request:httpRequestTime__count{serviceName=\"AmazonEC2\", error=\"false\"}[1m])) by (requestType, serviceName)", + "expr": "sum(rate(clouddriver:aws:request:httpRequestTime__totalTime{serviceName=\"AmazonEC2\"}[$SamplePeriod])) by (requestType, serviceName) / 1000000 / sum(rate(clouddriver:aws:request:httpRequestTime__count{serviceName=\"AmazonEC2\", error=\"false\"}[$SamplePeriod])) by (requestType, serviceName)", "intervalFactor": 2, "legendFormat": "{{requestType}}", "metric": "", @@ -3004,7 +3332,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "AWS EC2 Request Latency", + "title": "AWS EC2 Request Latency (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3067,7 +3395,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(delta(clouddriver:aws:request:httpRequestTime__totalTime{serviceName!=\"AmazonEC2\"}[1m])) by (requestType, serviceName) / 1000000 / sum(delta(clouddriver:aws:request:httpRequestTime__count{serviceName!=\"AmazonEC2\", error=\"false\"}[1m])) by (requestType, serviceName), \"requestType\", \"$1\", \"requestType\", \"(.*)Request\")", + "expr":
"label_replace(sum(rate(clouddriver:aws:request:httpRequestTime__totalTime{serviceName!=\"AmazonEC2\"}[$SamplePeriod])) by (requestType, serviceName) / 1000000 / sum(rate(clouddriver:aws:request:httpRequestTime__count{serviceName!=\"AmazonEC2\", error=\"false\"}[$SamplePeriod])) by (requestType, serviceName), \"requestType\", \"$1\", \"requestType\", \"(.*)Request\")", "intervalFactor": 2, "legendFormat": "{{requestType}}", "metric": "", @@ -3078,7 +3406,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "AWS Non-EC2 Request Latency", + "title": "AWS Non-EC2 Request Latency (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3153,7 +3481,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(clouddriver:google:operationWaits__count{status=\"DONE\"}[30s])) by (basePhase)", + "expr": "sum(idelta(clouddriver:google:operationWaits__count{status=\"DONE\"}[$SamplePeriod])) by (basePhase)", "hide": false, "intervalFactor": 2, "legendFormat": "{{basePhase}}", @@ -3165,7 +3493,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Successful Google Operations", + "title": "Successful Google Operations (clouddriver)", "tooltip": { "shared": false, "sort": 0, @@ -3186,7 +3514,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3194,7 +3522,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3230,7 +3558,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(idelta(clouddriver:google:operationWaits__count{status!=\"DONE\"}[1m])) by (basePhase, scope) ", + "expr": "sum(idelta(clouddriver:google:operationWaits__count{status!=\"DONE\"}[$SamplePeriod])) by (basePhase, scope) ", "intervalFactor": 2, "legendFormat": "{{scope}}/{{basePhase}}", "metric": "", @@ -3241,7 +3569,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Failed Google Operations", + "title": "Failed Google Operations (clouddriver)", "tooltip": { 
"shared": true, "sort": 0, @@ -3260,7 +3588,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3268,7 +3596,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3316,7 +3644,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(idelta(clouddriver:google:operationWaitRequests[1m])) by (basePhase, scope)", + "expr": "sum(idelta(clouddriver:google:operationWaitRequests[$SamplePeriod])) by (basePhase, scope)", "intervalFactor": 2, "legendFormat": "{{scope}}/{{basePhase}}", "metric": "clouddriver:google:operationWaitRequests", @@ -3327,7 +3655,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google Operations Started", + "title": "Google Operations Started (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3346,7 +3674,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3354,7 +3682,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3390,7 +3718,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(clouddriver:google:operationWaits__totalTime[1m])) by (basePhase, scope) / 1000000000 / sum(rate(clouddriver:google:operationWaits__count[1m])) by (basePhase, scope) ", + "expr": "sum(rate(clouddriver:google:operationWaits__totalTime[$SamplePeriod])) by (basePhase, scope) / 1000000000 / sum(rate(clouddriver:google:operationWaits__count[$SamplePeriod])) by (basePhase, scope) ", "hide": false, "intervalFactor": 2, "legendFormat": "{{scope}}/{{basePhase}}", @@ -3420,7 +3748,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google Operation Wait Until Done Time", + "title": "Google Operation Wait Until Done Time (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3495,7 +3823,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(delta(clouddriver:google:api__count[1m])) by (api), \"api\", \"$1\", \"api\", 
\"compute.(.*)\")", + "expr": "label_replace(sum(idelta(clouddriver:google:api__count[$SamplePeriod])) by (api), \"api\", \"$1\", \"api\", \"compute.(.*)\")", "hide": false, "intervalFactor": 2, "legendFormat": "{{api}}", @@ -3507,7 +3835,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google API Call Count", + "title": "Google API Call Count (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3527,7 +3855,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3535,7 +3863,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3571,7 +3899,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(rate(clouddriver:google:api__totalTime{success=\"true\"}[1m])) by (api) / 1000000 / sum(rate(clouddriver:google:api__count{success=\"true\"}[1m])) by (api), \"api\", \"$1\", \"api\", \"compute.(.*)\")", + "expr": "label_replace(sum(rate(clouddriver:google:api__totalTime{success=\"true\"}[$SamplePeriod])) by (api) / 1000000 / sum(rate(clouddriver:google:api__count{success=\"true\"}[$SamplePeriod])) by (api), \"api\", \"$1\", \"api\", \"compute.(.*)\")", "intervalFactor": 2, "legendFormat": "{{api}}", "metric": "clouddriver:google:api__count", @@ -3582,7 +3910,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google API Call Latency", + "title": "Google API Call Latency (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3645,7 +3973,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(idelta(clouddriver:google:api__count{success=\"false\"}[1m])) by (api), \"api\", \"$1\", \"api\", \"compute.(.*)\")", + "expr": "label_replace(sum(idelta(clouddriver:google:api__count{success=\"false\"}[$SamplePeriod])) by (api), \"api\", \"$1\", \"api\", \"compute.(.*)\")", "intervalFactor": 2, "legendFormat": "{{api}}", "metric": "clouddriver:google:api__count", @@ -3656,7 +3984,7 @@ "thresholds": [], "timeFrom": 
null, "timeShift": null, - "title": "Google API Failures", + "title": "Google API Failures (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3675,7 +4003,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3683,7 +4011,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3731,7 +4059,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(idelta(clouddriver:google:batchSize[1m])) by (context), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", + "expr": "label_replace(sum(idelta(clouddriver:google:batchSize[$SamplePeriod])) by (context), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", "intervalFactor": 2, "legendFormat": "{{context}}", "metric": "clouddriver:google:batchSize", @@ -3742,7 +4070,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Clouddriver Google API Batch Size", + "title": "Google API Batch Size (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3761,7 +4089,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3769,7 +4097,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3805,7 +4133,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum(delta(clouddriver:google:batchExecute__count[1m])) by (context), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", + "expr": "label_replace(sum(idelta(clouddriver:google:batchExecute__count[$SamplePeriod])) by (context), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", "intervalFactor": 2, "legendFormat": "{{context}}", "metric": "clouddriver:google:batchExecute__count", @@ -3816,7 +4144,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Clouddriver Google API Batch Count", + "title": "Google API Batch Count (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3835,7 +4163,7 @@ "label": null, 
"logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -3843,7 +4171,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -3879,7 +4207,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(rate(clouddriver:google:batchExecute__totalTime[1m]) / 1000000 / rate(clouddriver:google:batchExecute__count[1m]), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", + "expr": "label_replace(rate(clouddriver:google:batchExecute__totalTime[$SamplePeriod]) / 1000000 / rate(clouddriver:google:batchExecute__count[$SamplePeriod]), \"context\", \"$1$2\", \"context\", \"(.*)Caching(.*)\")", "intervalFactor": 2, "legendFormat": "{{context}}", "metric": "", @@ -3890,7 +4218,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Clouddriver Google API Batch Latency", + "title": "Google API Batch Latency (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3965,7 +4293,7 @@ "steppedLine": false, "targets": [ { - "expr": "delta(clouddriver:google:safeRetry__count[1m])", + "expr": "idelta(clouddriver:google:safeRetry__count[$SamplePeriod])", "hide": false, "intervalFactor": 2, "legendFormat": "{{operation}}({{phase}})", @@ -3977,7 +4305,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google API Safe Retries", + "title": "Google API Safe Retries (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -3996,7 +4324,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -4004,7 +4332,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true } ] @@ -4040,7 +4368,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(clouddriver:google:safeRetry__totalTime[1m])) by (operation, phase) / 100000000000000 / sum(rate(clouddriver:google:safeRetry__count[1m])) by (operation, phase)", + "expr": "sum(rate(clouddriver:google:safeRetry__totalTime[$SamplePeriod])) by (operation, phase) / 1000000 
/ sum(rate(clouddriver:google:safeRetry__count[$SamplePeriod])) by (operation, phase)", "intervalFactor": 2, "legendFormat": "{{operation}}({{phase}})", "metric": "clouddriver:google:safeRetry__count", @@ -4051,7 +4379,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Google API Safe Retry Latency", + "title": "Google API Safe Retry Latency (clouddriver)", "tooltip": { "shared": true, "sort": 0, @@ -4090,24 +4418,56 @@ "showTitle": false, "title": "Google Safe Retry", "titleSize": "h6" - }, - { - "collapse": false, - "height": 250, - "panels": [], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "Sample Period", + "name": "SamplePeriod", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + } + ], + "query": "1m,5m,10m,15m,30m", + "refresh": 2, + "type": "interval" + } + ] }, "time": { "from": "now-1h", @@ -4139,6 +4499,6 @@ ] }, "timezone": "browser", - "title": "Spinnaker", - "version": 9 -} \ No newline at end of file + "title": "Spinnaker Kitchen Sink", + "version": 1 +} diff --git a/google/stackdriver_monitoring/config/prometheus/MachineDashboard.json b/google/stackdriver_monitoring/config/prometheus/MachineDashboard.json new file mode 100644 index 0000000..99b152c --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/MachineDashboard.json @@ -0,0 +1,585 @@ +{ + "__inputs": [ + { + "name": "DS_SPINNAKER", + "label": "Spinnaker", + 
"description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 254, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu0", + "yaxis": 1 + }, + { + "alias": "All", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode!=\"idle\"}[$SamplePeriod])) / sum(rate(node_cpu[$SamplePeriod])) ", + "hide": false, + "intervalFactor": 2, + "legendFormat": "All", + "metric": "", + "refId": "A", + "step": 30 + }, + { + "expr": "sum(rate(node_cpu{mode!=\"idle\"}[$SamplePeriod])) by (cpu)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{cpu}}", + "metric": "node_cpu", + "refId": "B", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Dirty", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemFree ", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Free", + "metric": "", + "refId": "B", + "step": 30 + }, + { + "expr": "(node_memory_MemTotal - node_memory_Committed_AS) ", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Uncommitted", + "metric": "node_memory_Committed_AS", + "refId": "G", + "step": 30 + }, + { + "expr": "node_memory_Dirty", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Dirty", + "metric": "", + "refId": "F", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System Memory Available", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 255, + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu0", + "yaxis": 1 + } + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Ip_InReceives[$SamplePeriod]) ", + "hide": false, + "intervalFactor": 2, + "legendFormat": "In Packets", + "metric": "", + "refId": "A", + "step": 120 + }, + { + "expr": "rate(node_netstat_Ip_OutRequests[$SamplePeriod])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Out Packets", + "metric": "", + "refId": "B", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Networking", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "READ", + "yaxis": 2 + } + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "delta(node_disk_bytes_written[$SamplePeriod])", + 
"intervalFactor": 2, + "legendFormat": "WRITE", + "metric": "node_disk_bytes_written", + "refId": "A", + "step": 60 + }, + { + "expr": "delta(node_disk_bytes_read[$SamplePeriod])", + "intervalFactor": 2, + "legendFormat": "READ", + "metric": "node_disk_bytes_read", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "READ", + "yaxis": 2 + } + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_free{mountpoint!~\"/run.*\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}", + "metric": "", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk Available", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "decbytes", + "label": null, + "logBase": 
1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "Sample Period", + "name": "SamplePeriod", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + } + ], + "query": "1m,5m,10m,15m,30m", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Machine Stats", + "version": 1 +} diff --git a/google/stackdriver_monitoring/config/prometheus/MinimalDashboard.json b/google/stackdriver_monitoring/config/prometheus/MinimalDashboard.json new file mode 100644 index 0000000..b969815 --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/MinimalDashboard.json @@ -0,0 +1,1256 @@ +{ + "__inputs": [ + { + "name": "DS_SPINNAKER", + "label": "Spinnaker", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": 
"Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": 200, + "panels": [ + { + "aliasColors": {}, + "bars": true, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(front50:hystrix:countShortCircuited) by (metricGroup, metricType)", + "intervalFactor": 2, + "legendFormat": "front50/{{metricGroup}}({{metricType}})", + "metric": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(gate:hystrix:countShortCircuited) by (metricGroup, metricType)", + "intervalFactor": 2, + "legendFormat": "gate/{{metricGroup}}({{metricType}})", + "metric": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(igor:hystrix:countShortCircuited) by (metricGroup, metricType)", + "intervalFactor": 2, + "legendFormat": "igor/{{metricGroup}}({{metricType}})", + "metric": "", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Hystrix Short Circuited", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + 
"aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 37, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "idelta(front50:hystrix:countExceptionsThrown[1m])", + "intervalFactor": 2, + "legendFormat": "front50/{{metricType}}({{metricGroup}})", + "metric": "front50:hystrix:countExceptionsThrown", + "refId": "A", + "step": 10 + }, + { + "expr": "idelta(gate:hystrix:countExceptionsThrown[1m])", + "intervalFactor": 2, + "legendFormat": "gate/{{metricType}}({{metricGroup}})", + "metric": "gate:hystrix:countExceptionsThrown", + "refId": "B", + "step": 10 + }, + { + "expr": "idelta(igor:hystrix:countExceptionsThrown[1m])", + "intervalFactor": 2, + "legendFormat": "igor/{{metricType}}({{metricGroup}})", + "metric": "igor:hystrix:countExceptionsThrown", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Hystrix Exceptions Thrown", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Hystrix Error Signals", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 262, + "panels": [ + { + "aliasColors": {}, + "bars": 
false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 55, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{executionType=\"Orchestration\", isComplete=\"false\"}[1m])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Orchestrations (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 58, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{executionType=\"Pipeline\", isComplete=\"false\"}[1m])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": 
"{{taskName}}", + "metric": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Pipelines (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Orca Active", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 56, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{executionType=\"Orchestration\",isComplete=\"true\", status=\"SUCCEEDED\"}[1m])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "orca:task:invocations", + "refId": "C", + "step": 10 + }, + { + "expr": "-1 * sum(idelta(orca:task:invocations{executionType=\"Orchestration\",isComplete=\"true\", status!=\"SUCCEEDED\"}[1m])) by (taskName)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ERR {{taskName}}", + "metric": "orca:task:invocations", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "Orchestrations Completed (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 59, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{executionType=\"Pipeline\",isComplete=\"true\", status=\"SUCCEEDED\"}[1m])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "orca:task:invocations", + "refId": "C", + "step": 10 + }, + { + "expr": "-1 * sum(idelta(orca:task:invocations{executionType=\"Pipeline\",isComplete=\"true\", status!=\"SUCCEEDED\"}[1m])) by (taskName)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ERR {{taskName}}", + "metric": "orca:task:invocations", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pipelines Completed (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true 
+ }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Orca Completed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 269, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(orca:threadpool:activeCount) by (id)", + "intervalFactor": 2, + "legendFormat": "{{id}}", + "metric": "orca:threadpool:activeCount", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Threads (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 38, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(echo:pipelines:triggered[1m])) by (application, name)", + "intervalFactor": 2, + "legendFormat": "{{name}}({{application}})", + "metric": "echo:pipelines:triggered", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pipelines Triggered (echo)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Threads and Pipelines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 41, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rosco:bakesActive)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active", + "metric": "rosco:bakesActive", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(idelta(rosco:bakesRequested[1m])) by (flavor)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Request({{flavor}})", + "metric": "rosco:bakes", + "refId": "B", + "step": 10 + }, + { + "expr": "-1 * 
sum(idelta(rosco:bakesCompleted__count{success=\"false\"}[1m])) by (region)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Failed {{region}}", + "metric": "bakesC", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Bake Activity (rosco)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "-5", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "-5", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "decimals": null, + "fill": 1, + "id": 20, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(front50:storageServiceSupport:cacheSize) by (objectType)", + "intervalFactor": 2, + "legendFormat": "{{objectType}}", + "metric": "front50:storageServiceSupport:cacheSize", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Item Cache Size (front50)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Bakes and Items", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 282, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(sum(idelta(clouddriver:operations__count{success=\"true\"}[1m])) by (OperationType), \"OperationType\", \"$1\", \"OperationType\", \"(.*)AtomicOperation\")", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{OperationType}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Successful Operations (clouddriver)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 42, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + 
"nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(sum(idelta(clouddriver:operations__count{success!=\"true\"}[1m])) by (OperationType), \"OperationType\", \"$1\", \"OperationType\", \"(.*)AtomicOperation\")", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{OperationType}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Failed Operations (clouddriver)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 291, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "drop_common_labels(clouddriver:jvm:memory:used{memtype=\"HEAP\"}) / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Clouddriver({{id}}/{{memtype}})", + "metric": "clouddriver:jvm:memory:used", + "refId": "A", + "step": 4 + }, + { + "expr": 
"echo:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Echo({{id}}/{{memtype}})", + "metric": "echo:jvm:memory:used", + "refId": "B", + "step": 4 + }, + { + "expr": "fiat:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Fiat({{id}}/{{memtype}})", + "metric": "fiat:jvm:memory:used", + "refId": "C", + "step": 4 + }, + { + "expr": "front50:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Front50({{id}}/{{memtype}})", + "metric": "front50:jvm:memory:used", + "refId": "D", + "step": 4 + }, + { + "expr": "gate:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Gate({{id}}/{{memtype}})", + "metric": "gate:jvm:memory:used", + "refId": "E", + "step": 4 + }, + { + "expr": "igor:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Igor({{id}}/{{memtype}})", + "metric": "igor:jvm:memory:used", + "refId": "F", + "step": 4 + }, + { + "expr": "orca:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Orca({{id}}/{{memtype}})", + "metric": "orca:jvm:memory:used", + "refId": "G", + "step": 4 + }, + { + "expr": "rosco:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", + "intervalFactor": 2, + "legendFormat": "Rosco({{id}}/{{memtype}})", + "metric": "rosco:jvm:memory:used", + "refId": "H", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "JVM Memory Usage (Megabytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], 
+ "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "JVM Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Spinnaker Minimalist", + "version": 1 +} diff --git a/google/stackdriver_monitoring/MinimalPrometheusGrafanaDashboard.json b/google/stackdriver_monitoring/config/prometheus/SpecificApplicationDashboard.json similarity index 69% rename from google/stackdriver_monitoring/MinimalPrometheusGrafanaDashboard.json rename to google/stackdriver_monitoring/config/prometheus/SpecificApplicationDashboard.json index aa90f57..ff33569 100644 --- a/google/stackdriver_monitoring/MinimalPrometheusGrafanaDashboard.json +++ b/google/stackdriver_monitoring/config/prometheus/SpecificApplicationDashboard.json @@ -42,7 +42,606 @@ "rows": [ { "collapse": false, - "height": 200, + "height": 276, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 55, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Orchestration\", isComplete=\"false\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "intervalFactor": 
2, + "legendFormat": "{{taskName}}", + "metric": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$Application Active Orchestrations (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 56, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Orchestration\",isComplete=\"true\", status=\"SUCCEEDED\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "orca:task:invocations", + "refId": "C", + "step": 10 + }, + { + "expr": "-1 * sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Orchestration\",isComplete=\"true\", status!=\"SUCCEEDED\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ERR {{taskName}}", + "metric": "orca:task:invocations", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$Application Orchestrations Completed (orca)", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Orca Orchestrations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 217, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 58, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Pipeline\", isComplete=\"false\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$Application Active Pipelines (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, +
"bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 59, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Pipeline\",isComplete=\"true\", status=\"SUCCEEDED\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{taskName}}", + "metric": "orca:task:invocations", + "refId": "C", + "step": 10 + }, + { + "expr": "-1 * sum(idelta(orca:task:invocations{sourceApplication=~\"$Application\", executionType=\"Pipeline\",isComplete=\"true\", status!=\"SUCCEEDED\"}[$SamplePeriod])) by (taskName)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ERR {{taskName}}", + "metric": "orca:task:invocations", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$Application Pipelines Completed (orca)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Orca Pipelines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 235, + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 38, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(idelta(echo:pipelines:triggered{application=~\"$Application\"}[$SamplePeriod])) by (name, application)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}({{application}})", + "metric": "echo:pipelines:triggered", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$Application Pipelines Triggered (echo)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 41, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rosco:bakesActive)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active", + "metric": "rosco:bakesActive", + 
"refId": "A", + "step": 20 + }, + { + "expr": "sum(idelta(rosco:bakesRequested[$SamplePeriod])) by (flavor)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Request({{flavor}})", + "metric": "rosco:bakesRequested", + "refId": "B", + "step": 20 + }, + { + "expr": "-1 * sum(idelta(rosco:bakesCompleted__count{success=\"false\"}[$SamplePeriod])) by (region)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Failed {{region}}", + "metric": "rosco:bakesCompleted__count", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Bake Requests and Failures (global)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 14, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rosco:bakesCompleted__totalTime[$SamplePeriod]) / 1000000000) by (region) / sum(rate(rosco:bakesCompleted__count[$SamplePeriod])) by (region)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{region}}", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Avg Bake Time (global)", + "tooltip": { + "shared": true, +
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Bakes", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 217, "panels": [ { "aliasColors": {}, @@ -70,7 +669,7 @@ "points": false, "renderer": "flot", "seriesOverrides": [], - "span": 6, + "span": 4, "stack": true, "steppedLine": false, "targets": [ @@ -80,7 +679,7 @@ "legendFormat": "front50/{{metricGroup}}({{metricType}})", "metric": "", "refId": "A", - "step": 10 + "step": 20 }, { "expr": "sum(gate:hystrix:countShortCircuited) by (metricGroup, metricType)", @@ -88,7 +687,7 @@ "legendFormat": "gate/{{metricGroup}}({{metricType}})", "metric": "", "refId": "B", - "step": 10 + "step": 20 }, { "expr": "sum(igor:hystrix:countShortCircuited) by (metricGroup, metricType)", @@ -96,13 +695,13 @@ "legendFormat": "igor/{{metricGroup}}({{metricType}})", "metric": "", "refId": "C", - "step": 10 + "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Hystrix Short Circuited", + "title": "Hystrix Short Circuited (global)", "tooltip": { "shared": true, "sort": 0, @@ -160,12 +759,12 @@ "points": false, "renderer": "flot", "seriesOverrides": [], - "span": 6, + "span": 5, "stack": false, "steppedLine": false, "targets": [ { - "expr": "delta(front50:hystrix:countExceptionsThrown[1m])", + "expr": "idelta(front50:hystrix:countExceptionsThrown[$SamplePeriod])", "intervalFactor": 2, "legendFormat": "front50/{{metricType}}({{metricGroup}})", "metric": "front50:hystrix:countExceptionsThrown", @@ -173,7 +772,7 @@ "step": 10 }, { - "expr": 
"delta(gate:hystrix:countExceptionsThrown[1m])", + "expr": "idelta(gate:hystrix:countExceptionsThrown[$SamplePeriod])", "intervalFactor": 2, "legendFormat": "gate/{{metricType}}({{metricGroup}})", "metric": "gate:hystrix:countExceptionsThrown", @@ -181,7 +780,7 @@ "step": 10 }, { - "expr": "delta(igor:hystrix:countExceptionsThrown[1m])", + "expr": "idelta(igor:hystrix:countExceptionsThrown[$SamplePeriod])", "intervalFactor": 2, "legendFormat": "igor/{{metricType}}({{metricGroup}})", "metric": "igor:hystrix:countExceptionsThrown", @@ -192,7 +791,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Hystrix Exceptions Thrown", + "title": "Hystrix Exceptions Thrown (global)", "tooltip": { "shared": true, "sort": 0, @@ -205,6 +804,97 @@ "show": true, "values": [] }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": true, + "datasource": "${DS_SPINNAKER}", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(front50:hystrix:rollingCountFailure) by (metricGroup, metricType)", + "hide": true, + "intervalFactor": 2, + "legendFormat": "front50({{metricGroup}}/{{metricType}})", + "metric": "", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(gate:hystrix:rollingCountFailure) by (metricGroup, metricType)", + "hide": true, + "intervalFactor": 2, + "legendFormat": "gate({{metricGroup}}/{{metricType}})", + "metric": "", + "refId": "B", + "step": 2 + }, + { + 
"expr": "sum(igor:hystrix:rollingCountFailure) by (metricGroup, metricType)", + "hide": true, + "intervalFactor": 2, + "legendFormat": "igor({{metricGroup}}/{{metricType}})", + "metric": "", + "refId": "C", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Hystrix Rolling Count Failure (global)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, "yaxes": [ { "format": "short", @@ -231,657 +921,80 @@ "showTitle": false, "title": "Hystrix Error Signals", "titleSize": "h6" - }, - { - "collapse": false, - "height": 282, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "label_replace(sum(increase(clouddriver:operations__count{success=\"true\"}[1m])) by (OperationType), \"OperationType\", \"$1\", \"OperationType\", \"(.*)AtomicOperation\")", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{OperationType}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Successful Operations", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": 
"short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 42, - "legend": { - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "label_replace(sum(increase(clouddriver:operations__count{success!=\"true\"}[1m])) by (OperationType), \"OperationType\", \"$1\", \"OperationType\", \"(.*)AtomicOperation\")", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{OperationType}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Failed Operations", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Operations", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 269, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - 
"percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(orca:threadpool:activeCount) by (id)", - "intervalFactor": 2, - "legendFormat": "{{id}}", - "metric": "orca:threadpool:activeCount", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Orca Active Threads", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 38, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(delta(echo:pipelines:triggered[1m])) by (application, name)", - "intervalFactor": 2, - "legendFormat": "{{name}}({{application}})", - "metric": "echo:pipelines:triggered", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(delta(echo:pipelines:triggered[1m])) by (application, name)", - "intervalFactor": 2, - "legendFormat": "{{name}}({{application}})", - "metric": "echo:pipelines:triggered", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pipelines Triggered", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Threads and Pipelines", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 250, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 41, - "legend": { - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rosco:bakesActive)", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Active", - "metric": "rosco:bakesActive", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(idelta(rosco:bakeRequests[1m])) by (flavor)", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Request({{flavor}})", - "metric": "rosco:bakeRequests", - "refId": "B", - "step": 10 - }, - { - "expr": "-1 * sum(idelta(rosco:bakesCompleted{success=\"false\"}[1m])) by (region)", - "intervalFactor": 2, - "legendFormat": "Failed {{region}}", - "refId": "C", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Bake Requests", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - 
"label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "decimals": null, - "fill": 1, - "id": 20, - "legend": { - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(front50:storageServiceSupport:cacheSize) by (objectType)", - "intervalFactor": 2, - "legendFormat": "{{objectType}}", - "metric": "front50:storageServiceSupport:cacheSize", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Item Cache Size", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Bakes and Items", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 291, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_SPINNAKER}", - "fill": 1, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - 
"nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "drop_common_labels(clouddriver:jvm:memory:used{memtype=\"HEAP\"}) / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Clouddriver({{id}}/{{memtype}})", - "metric": "clouddriver:jvm:memory:used", - "refId": "A", - "step": 4 - }, - { - "expr": "echo:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Echo({{id}}/{{memtype}})", - "metric": "echo:jvm:memory:used", - "refId": "B", - "step": 4 - }, - { - "expr": "fiat:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Fiat({{id}}/{{memtype}})", - "metric": "fiat:jvm:memory:used", - "refId": "C", - "step": 4 - }, - { - "expr": "front50:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Front50({{id}}/{{memtype}})", - "metric": "front50:jvm:memory:used", - "refId": "D", - "step": 4 - }, - { - "expr": "gate:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Gate({{id}}/{{memtype}})", - "metric": "gate:jvm:memory:used", - "refId": "E", - "step": 4 - }, - { - "expr": "igor:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Igor({{id}}/{{memtype}})", - "metric": "igor:jvm:memory:used", - "refId": "F", - "step": 4 - }, - { - "expr": "orca:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Orca({{id}}/{{memtype}})", - "metric": "orca:jvm:memory:used", - "refId": "G", - "step": 4 - }, - { - "expr": "rosco:jvm:memory:used{memtype=\"HEAP\"} / (1024 * 1024)", - "intervalFactor": 2, - "legendFormat": "Rosco({{id}}/{{memtype}})", - "metric": "rosco:jvm:memory:used", - "refId": "H", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": 
null, - "title": "JVM Memory Usage (Megabytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "JVM Memory", - "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "Sample Period", + "name": "SamplePeriod", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + } + ], + "query": "1m,5m,10m,15m,30m", + "refresh": 2, + "type": "interval" + }, + { + "allValue": ".*", + "current": { + "tags": [], + "text": "All", + "value": "$__all" + }, + "datasource": "Spinnaker", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "Application", + "options": [], + "query": "orca:task:invocations", + "refresh": 1, + "regex": "/sourceApplication=\"([^\"]+)/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] }, "time": { "from": "now-1h", @@ -913,6 +1026,6 @@ ] }, "timezone": "browser", - "title": "Minimal Spinnaker", - "version": 3 -} \ No newline at end of file + "title": "Specific Spinnaker Application Details", + "version": 1 +} diff --git 
a/google/stackdriver_monitoring/config/prometheus/install.sh b/google/stackdriver_monitoring/config/prometheus/install.sh new file mode 100755 index 0000000..ea00df9 --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/install.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +PROMETHEUS_VERSION=prometheus-1.5.0.linux-amd64 +PROMETHEUS_PORT=9090 +GRAFANA_PORT=3000 +CONFIG_DIR=$(readlink -f `dirname $0`) +cd /opt + +# Install Prometheus +curl -L -o /tmp/prometheus.gz \ + https://github.com/prometheus/prometheus/releases/download/v1.5.0/prometheus-1.5.0.linux-amd64.tar.gz +sudo tar xzf /tmp/prometheus.gz -C /opt +rm /tmp/prometheus.gz + +curl -L -o /tmp/node_exporter.gz \ + https://github.com/prometheus/node_exporter/releases/download/v0.13.0/node_exporter-0.13.0.linux-amd64.tar.gz +sudo tar xzf /tmp/node_exporter.gz -C /opt/prometheus-1.5.0.linux-amd64 +sudo ln -s /opt/prometheus-1.5.0.linux-amd64/node_exporter-0.13.0.linux-amd64/node_exporter /usr/bin/node_exporter +rm /tmp/node_exporter.gz + +sudo cp $CONFIG_DIR/spinnaker-prometheus.yml prometheus-1.5.0.linux-amd64 +sudo cp $CONFIG_DIR/prometheus.conf /etc/init/prometheus.conf +sudo cp $CONFIG_DIR/node_exporter.conf /etc/init/node_exporter.conf + + +# Install Grafana +cd /tmp +wget https://grafanarel.s3.amazonaws.com/builds/grafana_4.1.1-1484211277_amd64.deb +sudo apt-get install -y adduser libfontconfig +sudo dpkg -i 
grafana_4.1.1-1484211277_amd64.deb +sudo update-rc.d grafana-server defaults +rm grafana_4.1.1-1484211277_amd64.deb + + +# Startup +echo "Starting Prometheus" +sudo service node_exporter start +sudo service prometheus start +sudo service grafana-server start + +TRIES=0 +until nc -z localhost $GRAFANA_PORT || [[ $TRIES -gt 5 ]]; do + sleep 1 + let TRIES+=1 +done + +echo "Adding datasource" +PAYLOAD="{'name':'Spinnaker','type':'prometheus','url':'http://localhost:${PROMETHEUS_PORT}','access':'direct','isDefault':true}" +curl -u admin:admin http://localhost:${GRAFANA_PORT}/api/datasources \ + -H "Content-Type: application/json" \ + -X POST \ + -d "${PAYLOAD//\'/\"}" + +for dashboard in ${CONFIG_DIR}/*Dashboard.json; do + echo "Installing $(basename $dashboard)" + x=$(sed -e "/\"__inputs\"/,/],/d" \ + -e "/\"__requires\"/,/],/d" \ + -e "s/\${DS_SPINNAKER\}/Spinnaker/g" < "$dashboard") + temp_file=$(mktemp) + echo "{ \"dashboard\": $x }" > $temp_file + curl -u admin:admin http://localhost:${GRAFANA_PORT}/api/dashboards/import \ + -H "Content-Type: application/json" \ + -X POST \ + -d @${temp_file} + rm -f $temp_file +done diff --git a/google/stackdriver_monitoring/config/prometheus/node_exporter.conf b/google/stackdriver_monitoring/config/prometheus/node_exporter.conf new file mode 100644 index 0000000..bf1b32d --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/node_exporter.conf @@ -0,0 +1,2 @@ +start on filesystem or runlevel [2345] +exec /usr/bin/node_exporter diff --git a/google/stackdriver_monitoring/config/prometheus/prometheus.conf b/google/stackdriver_monitoring/config/prometheus/prometheus.conf new file mode 100644 index 0000000..ebfb5bc --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/prometheus.conf @@ -0,0 +1,6 @@ +start on filesystem or runlevel [2345] + +exec /opt/prometheus-1.5.0.linux-amd64/prometheus \ + -config.file /opt/prometheus-1.5.0.linux-amd64/spinnaker-prometheus.yml \ + -storage.local.path 
/opt/prometheus-1.5.0.linux-amd64/data \ + > /var/log/prometheus.log 2>&1 diff --git a/google/stackdriver_monitoring/config/prometheus/spinnaker-prometheus.yml b/google/stackdriver_monitoring/config/prometheus/spinnaker-prometheus.yml new file mode 100644 index 0000000..cc4d822 --- /dev/null +++ b/google/stackdriver_monitoring/config/prometheus/spinnaker-prometheus.yml @@ -0,0 +1,33 @@ +# my global config +global: + scrape_interval: 15s + evaluation_interval: 15s + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'codelab-monitor' + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first.rules" + # - "second.rules" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + - job_name: 'spinnaker' + static_configs: + - targets: ['localhost:8008'] + metrics_path: '/prometheus_metrics' + honor_labels: true + + - job_name: 'node' + static_configs: + - targets: ['localhost:9100'] + +# - job_name: 'prometheus' +# static_configs: +# - targets: ['localhost:9090'] + diff --git a/google/stackdriver_monitoring/config/stackdriver/MinimalDashboard.json b/google/stackdriver_monitoring/config/stackdriver/MinimalDashboard.json new file mode 100644 index 0000000..8f40b62 --- /dev/null +++ b/google/stackdriver_monitoring/config/stackdriver/MinimalDashboard.json @@ -0,0 +1,324 @@ +{ + "displayName": "Minimal Spinnaker Dashboard", + "version": 14, + "root": { + "gridLayout": { + "widgets": [ + { + "title": "Hystrix Short Circuited (global)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/front50/hystrix.countShortCircuited\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": 
"metric.type=\"custom.googleapis.com/spinnaker/gate/hystrix.countShortCircuited\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/igor/hystrix.countShortCircuited\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Hystrix Exceptions Thrown (global)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/front50/hystrix.countExceptionsThrown\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/gate/hystrix.countExceptionsThrown\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/igor/hystrix.countExceptionsThrown\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Active Orchestrations (orca)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.isComplete=\"false\" AND metric.label.executionType=\"Orchestration\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Active Pipelines (orca)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.isComplete=\"false\" AND metric.label.executionType=\"Pipeline\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Completed Orchestrations (orca)", + "xyChart": { + "dataSets": [ + { + 
"timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.status=\"SUCCEEDED\" AND metric.label.executionType=\"Orchestration\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.status!=\"SUCCEEDED\" AND metric.label.executionType=\"Orchestration\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Completed Pipelines (orca)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.status=\"SUCCEEDED\" AND metric.label.executionType=\"Pipeline\"", + "perSeriesAligner": "ALIGN_DELTA" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/task.invocations\" AND metric.label.status!=\"SUCCEEDED\" AND metric.label.executionType=\"Pipeline\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Active Threads (orca)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/threadpool.activeCount\"" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Pipelines Triggered (echo)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/echo/pipelines.triggered\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Bake Activity (rosco)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": 
"metric.type=\"custom.googleapis.com/spinnaker/rosco/bakesActive\"" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Cached Items (front50)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/front50/storageServiceSupport.cacheSize\"" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Successful Operations (clouddriver)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/clouddriver/operations__count\" AND metric.label.success=\"true\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "Failed Operations (clouddriver)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/clouddriver/operations__count\" AND metric.label.success=\"false\"", + "perSeriesAligner": "ALIGN_DELTA" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + }, + { + "title": "JVM Memory (global)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/clouddriver/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/echo/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/fiat/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/front50/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": 
"metric.type=\"custom.googleapis.com/spinnaker/gate/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/igor/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/orca/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"custom.googleapis.com/spinnaker/rosco/jvm.memory.used\" AND metric.label.memtype=\"HEAP\"" + } + } + ], + "constantLines": [ + {} + ], + "options": {}, + "y1Axis": {}, + "xAxis": {} + } + } + ] + } + } +} diff --git a/google/stackdriver_monitoring/config/stackdriver/install.sh b/google/stackdriver_monitoring/config/stackdriver/install.sh new file mode 100755 index 0000000..2bb1dea --- /dev/null +++ b/google/stackdriver_monitoring/config/stackdriver/install.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +echo "See https://cloud.google.com/monitoring/agent/install-agent" +echo "The agent is optional (and only available on GCP and AWS)" + +if [[ -z $STACKDRIVER_API_KEY ]]; then + # Remove this once API is no longer whitelisted. + echo "You need a STACKDRIVER_API_KEY to use this installer." 
+ exit -1 +fi + +for dashboard in *Dashboard.json; do + google/stackdriver_monitoring/spinnaker_metric_tool.sh \ + upload_stackdriver_dashboard --dashboard ${dashboard} \ + "$@" +done +--credentials_path=$HOME/.spinnaker/google-credentials.json --dashboard xyz --update diff --git a/google/stackdriver_monitoring/create_install_tar.sh b/google/stackdriver_monitoring/create_install_tar.sh index cbc6dce..f413fdb 100755 --- a/google/stackdriver_monitoring/create_install_tar.sh +++ b/google/stackdriver_monitoring/create_install_tar.sh @@ -30,37 +30,36 @@ function make_spinnaker_monitor_zip() { local zip_file="$TEMP_DIR/monitor_spinnaker.zip" cd "$SOURCE_DIR" - zip -r "$zip_file" `ls *.py | grep -v _test.py` + zip -qr "$zip_file" `ls *.py | grep -v _test.py` cp spinnaker_metric_tool.py $TEMP_DIR/__main__.py cd $TEMP_DIR - zip $zip_file __main__.py + zip -q $zip_file __main__.py rm -f __main__.py cd "$BUILD_DIR/spinnaker" - zip -r $zip_file pylib - - cd "$BUILD_DIR/citest" - zip -r $zip_file citest + zip -qr $zip_file pylib } function make_install_tar() { - local tar_file="$TEMP_DIR/install.tz" + local tar_file="$1" local staging_dir="$TEMP_DIR/monitor_spinnaker" mkdir $staging_dir cd "$SOURCE_DIR" - cp *.json README.md $TEMP_DIR/monitor_spinnaker.zip $staging_dir + cp -pr install_monitoring.sh config README.md $TEMP_DIR/monitor_spinnaker.zip $staging_dir cat requirements.txt | grep -v mock > $staging_dir/requirements.txt cd $TEMP_DIR - tar czf $tar_file monitor_spinnaker + if [[ "$tar_file" == *.tz || "$tar_file" == *.tar.gz ]]; then + tar czf $tar_file monitor_spinnaker + else + tar cf $tar_file monitor_spinnaker + fi } make_spinnaker_monitor_zip -make_install_tar +make_install_tar "$TARGET_PATH" -cp "$TEMP_DIR/install.tz" "$TARGET_PATH" -rm -rf $TEMP_DIR echo "WROTE $TARGET_PATH" diff --git a/google/stackdriver_monitoring/install_monitoring.sh b/google/stackdriver_monitoring/install_monitoring.sh new file mode 100755 index 0000000..1f1b4ba --- /dev/null +++ 
b/google/stackdriver_monitoring/install_monitoring.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +SOURCE_DIR=$(readlink -f `dirname $0`) +COMMAND_LINE_FLAGS=("$@") +USE_DATADOG=false +USE_PROMETHEUS=false +USE_STACKDRIVER=false +PROVIDERS="" +EXTRA_ARGS="" + +function print_usage() { + cat <<-EOF + `basename $0`: + \ + * \ + * \ + * + + is one or more of: + --datadog + Install and configure a Datadog agent. + Spinnaker's metric monitoring tool will publish metrics to Datadog. + You will be prompted for your API and APP keys unless you define + environment variables DATADOG_APP_KEY and DATADOG_API_KEY. + + --prometheus + Install and configure Prometheus and Grafana Dashboard. + Spinnaker's metric monitoring tool will publish metrics to Prometheus. + + --stackdriver + Spinnaker's metric monitoring tool will publish metrics to Stackdriver. + You may also need --credentials_path= + + + zero or more of: + --port=8008 + The port number to use for the embedded HTTP server within the monitor. + + --period=60 + Number of seconds between pollings of microservices. + + --service_hosts + + A comma-delimited list of hostnames (or IPs) to poll by default. + The default is localhost. + Make this empty "" to not poll any services by default. 
+ + + are in the form --=* where: + is one of clouddriver, echo, fiat, front50, gate, igor, rosco + is a comma-delimited list of either a or : + If only a is provided, then the dfeault port will be used. + + An empty list will disable polling on the service entirely. + A value of "*" refers to all the --service_hosts. + The default for each of the services is "*". + + + are zero or more of: + --credentials_path= + If using --stackdriver, the path for the Google Credentials to use. + The default will be the application default credentials. +EOF +} + + +function process_args() { + while [[ $# > 0 ]] + do + local key="$1" + shift + case $key in + --datadog) + USE_DATADOG=true + PROVIDERS="$PROVIDERS --datadog" + ;; + + --prometheus) + USE_PROMETHEUS=true + PROVIDERS="$PROVIDERS --prometheus" + ;; + + --stackdriver) + USE_STACKDRIVER=true + PROVIDERS="$PROVIDERS --stackdriver" + ;; + + --help|-h) + print_usage + exit 1 + ;; + + *) + ;; # ignore + + esac + done +} + + +function install_dependencies() { + apt-get update + apt-get install python-pip python-dev -y + pip install -r $SOURCE_DIR/requirements.txt +} + + +function install_metric_services() { + if [[ "$USE_DATADOG" == "true" ]]; then + $SOURCE_DIR/config/datadog/install.sh + fi + if [[ "$USE_PROMETHEUS" == "true" ]]; then + $SOURCE_DIR/config/prometheus/install.sh + fi + if [[ "$USE_STACKDRIVER" == "true" ]]; then + local credentials="" + for arg in ${COMMAND_LINE_FLAGS[@]}; do + if [[ $arg = --credentials_path=* ]]; then + credentials=$arg + fi + done + $SOURCE_DIR/config/stackdriver/install.sh $credentials + fi +} + + +function write_startup_script() { + cat <<-EOF > "$SOURCE_DIR/monitor_spinnaker.sh" + #!/bin/bash + + set -o allexport + if [[ -f /etc/default/spinnaker ]]; then + source /etc/default/spinnaker + fi + if [[ -f "$SOURCE_DIR/environ" ]]; then + source "$SOURCE_DIR/environ" + fi + set +o allexport + + PYTHONWARNINGS=once \ + python "$SOURCE_DIR/monitor_spinnaker.zip" \ + monitor $@ "\$@" +EOF + 
chmod 755 "$SOURCE_DIR/monitor_spinnaker.sh" +} + + +function write_upstart_script() { + cat <<-EOF > /etc/init/monitor_spinnaker.conf + start on filesystem or runlevel [2345] + + exec $SOURCE_DIR/monitor_spinnaker.sh > /var/log/monitor_spinnaker.log 2>&1 +EOF + chmod 644 /etc/init/monitor_spinnaker.conf +} + + +process_args "${COMMAND_LINE_FLAGS[@]}" +if [[ "$PROVIDERS" == "" ]]; then + print_usage + echo "" + echo "ERROR: No options were provided." + exit -1 +fi + + +if [[ `/usr/bin/id -u` -ne 0 ]]; then + echo "$0 must be executed with root permissions; exiting" + exit 1 +fi + +install_dependencies +install_metric_services +write_startup_script "${COMMAND_LINE_FLAGS[@]}" +write_upstart_script + +echo "Starting to monitor Spinnaker services..." +service monitor_spinnaker start + +cat < 0 ]] do @@ -382,6 +394,8 @@ extract_spinnaker_credentials echo "$STATUS_PREFIX Configuring Spinnaker" $SPINNAKER_INSTALL_DIR/scripts/reconfigure_spinnaker.sh +do_experimental_startup + # Replace this first time boot with the normal startup script # that just starts spinnaker (and its dependencies) without configuring anymore.