add metric for tracking failure to start the controller-runtime manager (#1860)

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>
This commit is contained in:
Evan Baker 2023-03-21 11:20:26 -05:00 коммит произвёл GitHub
Родитель b77f715274
Коммит ae8a11c7c8
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файлов: 25 добавлений и 1 удалений

Просмотреть файл

@@ -1260,7 +1260,8 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
if err := manager.Start(ctx); err != nil { if err := manager.Start(ctx); err != nil {
logger.Errorf("[Azure CNS] Failed to start request controller: %v", err) logger.Errorf("[Azure CNS] Failed to start request controller: %v", err)
// retry to start the request controller // retry to start the request controller
// todo: add a CNS metric to count # of failures // inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else { } else {
logger.Printf("exiting NodeNetworkConfig reconciler") logger.Printf("exiting NodeNetworkConfig reconciler")
return return

23
cns/service/metrics.go Normal file
Просмотреть файл

@@ -0,0 +1,23 @@
package main
import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
// managerStartFailures counts how many times the controller-runtime manager has failed to start.
// It is a monotonic counter, so alerting should be driven by its rate of increase over a window
// of time rather than by its absolute value: a positive rate of change indicates that CNS is
// actively failing and retrying.
var managerStartFailures = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "manager_start_failures_total",
	Help: "Number of times the controller-runtime manager failed to start.",
})
// init adds the metrics declared in this file to the controller-runtime metrics registry
// during package initialization.
func init() {
	metrics.Registry.MustRegister(managerStartFailures)
}