add metric for tracking failure to start the controller-runtime manager (#1860)
Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>
This commit is contained in:
Родитель
b77f715274
Коммит
ae8a11c7c8
|
@ -1260,7 +1260,8 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
|
|||
if err := manager.Start(ctx); err != nil {
|
||||
logger.Errorf("[Azure CNS] Failed to start request controller: %v", err)
|
||||
// retry to start the request controller
|
||||
// todo: add a CNS metric to count # of failures
|
||||
// inc the managerStartFailures metric for failure tracking
|
||||
managerStartFailures.Inc()
|
||||
} else {
|
||||
logger.Printf("exiting NodeNetworkConfig reconciler")
|
||||
return
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"sigs.k8s.io/controller-runtime/pkg/metrics"
|
||||
)
|
||||
|
||||
// managerStartFailures is a monotonic counter which tracks the number of times the controller-runtime
|
||||
// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
|
||||
// of increase over a period of time. A positive rate of change indicates that the CNS is actively
|
||||
// failing and retrying.
|
||||
var managerStartFailures = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Name: "manager_start_failures_total",
|
||||
Help: "Number of times the controller-runtime manager failed to start.",
|
||||
},
|
||||
)
|
||||
|
||||
func init() {
|
||||
metrics.Registry.MustRegister(
|
||||
managerStartFailures,
|
||||
)
|
||||
}
|
Загрузка…
Ссылка в новой задаче