add summary metric and dummy cache for reporting

This commit is contained in:
Mangirdas Judeikis 2020-08-06 12:21:55 +01:00
Родитель 01f13f2565
Коммит 32dfd1aaa2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: AA071F630E926BBD
15 изменённых файлов: 257 добавлений и 43 удаления

Просмотреть файл

@ -119,6 +119,10 @@ func (t ProvisioningState) IsTerminal() bool {
return ProvisioningStateFailed == t || ProvisioningStateSucceeded == t
}
// String returns the provisioning state converted to a plain string. Its
// signature matches fmt.Stringer.
func (t ProvisioningState) String() string {
return string(t)
}
// ClusterProfile represents a cluster profile.
type ClusterProfile struct {
MissingFields

Просмотреть файл

@ -35,7 +35,7 @@ func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error {
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "arooperator.conditions",
"status": c.Status,

Просмотреть файл

@ -0,0 +1,40 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
configv1 "github.com/openshift/api/config/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// getClusterVersion returns the ClusterVersion object named "version",
// fetching it through mon.configcli on first use and serving it from
// mon.cache.cv afterwards.
//
// Only a successful response is cached. Writing the client's result into the
// cache before checking err could leave a non-nil placeholder object there
// (typed clients may return one alongside an error), which would make every
// later call hand back that placeholder with a nil error. On failure the cache
// is left untouched so the next call retries, and the returned object is nil.
func (mon *Monitor) getClusterVersion() (*configv1.ClusterVersion, error) {
	if mon.cache.cv != nil {
		return mon.cache.cv, nil
	}

	cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	mon.cache.cv = cv
	return cv, nil
}
// listClusterOperators returns the list of ClusterOperator objects, fetching
// it through mon.configcli on first use and serving it from mon.cache.cos
// afterwards.
//
// Only a successful response is cached. Writing the client's result into the
// cache before checking err could leave a non-nil, empty list there (typed
// clients may return one alongside an error), which would make every later
// call hand back that empty list with a nil error. On failure the cache is
// left untouched so the next call retries, and the returned list is nil.
func (mon *Monitor) listClusterOperators() (*configv1.ClusterOperatorList, error) {
	if mon.cache.cos != nil {
		return mon.cache.cos, nil
	}

	cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	mon.cache.cos = cos
	return cos, nil
}
// listNodes returns the list of Node objects, fetching it through mon.cli on
// first use and serving it from mon.cache.ns afterwards.
//
// Only a successful response is cached. Writing the client's result into the
// cache before checking err could leave a non-nil, empty list there (typed
// clients may return one alongside an error), which would make every later
// call hand back that empty list with a nil error. On failure the cache is
// left untouched so the next call retries, and the returned list is nil.
func (mon *Monitor) listNodes() (*v1.NodeList, error) {
	if mon.cache.ns != nil {
		return mon.cache.ns, nil
	}

	ns, err := mon.cli.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	mon.cache.ns = ns
	return ns, nil
}

Просмотреть файл

@ -10,9 +10,11 @@ import (
"runtime"
"github.com/Azure/go-autorest/autorest/azure"
configv1 "github.com/openshift/api/config/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned"
mcoclient "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
"github.com/Azure/ARO-RP/pkg/api"
@ -23,9 +25,9 @@ import (
)
type Monitor struct {
env env.Interface
log *logrus.Entry
logMessages bool
env env.Interface
log *logrus.Entry
hourlyRun bool
oc *api.OpenShiftCluster
dims map[string]string
@ -35,9 +37,16 @@ type Monitor struct {
mcocli mcoclient.Interface
m metrics.Interface
arocli aroclient.AroV1alpha1Interface
// access below only via the helper functions in cache.go
cache struct {
cos *configv1.ClusterOperatorList
cv *configv1.ClusterVersion
ns *v1.NodeList
}
}
func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface, logMessages bool) (*Monitor, error) {
func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface, hourlyRun bool) (*Monitor, error) {
r, err := azure.ParseResourceID(oc.ID)
if err != nil {
return nil, err
@ -84,9 +93,9 @@ func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *a
}
return &Monitor{
env: env,
log: log,
logMessages: logMessages,
env: env,
log: log,
hourlyRun: hourlyRun,
oc: oc,
dims: dims,
@ -126,6 +135,7 @@ func (mon *Monitor) Monitor(ctx context.Context) {
mon.emitPodConditions,
mon.emitReplicasetStatuses,
mon.emitStatefulsetStatuses,
mon.emitSummary,
mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable
} {
err = f(ctx)

Просмотреть файл

@ -8,7 +8,6 @@ import (
configv1 "github.com/openshift/api/config/v1"
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
type clusterOperatorConditionsIgnoreStruct struct {
@ -35,11 +34,10 @@ var clusterOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]
}
func (mon *Monitor) emitClusterOperatorConditions(ctx context.Context) error {
cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
cos, err := mon.listClusterOperators()
if err != nil {
return err
}
mon.emitGauge("clusteroperator.count", int64(len(cos.Items)), nil)
for _, co := range cos.Items {
@ -54,7 +52,7 @@ func (mon *Monitor) emitClusterOperatorConditions(ctx context.Context) error {
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "clusteroperator.conditions",
"name": co.Name,

Просмотреть файл

@ -5,17 +5,15 @@ package cluster
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func (mon *Monitor) emitClusterOperatorVersions(ctx context.Context) error {
cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
cv, err := mon.getClusterVersion()
if err != nil {
return err
}
cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
cos, err := mon.listClusterOperators()
if err != nil {
return err
}

Просмотреть файл

@ -8,7 +8,6 @@ import (
configv1 "github.com/openshift/api/config/v1"
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
var clusterVersionConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{
@ -19,7 +18,7 @@ var clusterVersionConditionsExpected = map[configv1.ClusterStatusConditionType]c
}
func (mon *Monitor) emitClusterVersionConditions(ctx context.Context) error {
cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
cv, err := mon.getClusterVersion()
if err != nil {
return err
}
@ -28,13 +27,12 @@ func (mon *Monitor) emitClusterVersionConditions(ctx context.Context) error {
if c.Status == clusterVersionConditionsExpected[c.Type] {
continue
}
mon.emitGauge("clusterversion.conditions", 1, map[string]string{
"status": string(c.Status),
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "clusterversion.conditions",
"status": c.Status,

Просмотреть файл

@ -7,28 +7,15 @@ import (
"context"
configv1 "github.com/openshift/api/config/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func (mon *Monitor) emitClusterVersions(ctx context.Context) error {
cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
cv, err := mon.getClusterVersion()
if err != nil {
return err
}
// Find the actual current cluster state. The history is ordered by most
// recent first, so find the latest "Completed" status to get current
// cluster version
var actualVersion string
for _, history := range cv.Status.History {
if history.State == configv1.CompletedUpdate {
actualVersion = history.Version
break
}
}
mon.emitGauge("cluster.versions", 1, map[string]string{
"actualVersion": actualVersion,
"actualVersion": actualVersion(cv),
"desiredVersion": desiredVersion(cv),
"resourceProviderVersion": mon.oc.Properties.ProvisionedBy,
})
@ -36,6 +23,18 @@ func (mon *Monitor) emitClusterVersions(ctx context.Context) error {
return nil
}
// actualVersion reports the version the cluster is actually running. The
// update history is ordered most recent first, so the first entry whose state
// is "Completed" is the current version. An empty string is returned when the
// history holds no completed update.
func actualVersion(cv *configv1.ClusterVersion) string {
	entries := cv.Status.History
	for i := range entries {
		if entries[i].State != configv1.CompletedUpdate {
			continue
		}
		return entries[i].Version
	}

	return ""
}
func desiredVersion(cv *configv1.ClusterVersion) string {
if cv.Spec.DesiredUpdate != nil &&
cv.Spec.DesiredUpdate.Version != "" {

Просмотреть файл

@ -38,7 +38,7 @@ func (mon *Monitor) emitMachineConfigPoolConditions(ctx context.Context) error {
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "machineconfigpool.conditions",
"name": mcp.Name,

Просмотреть файл

@ -8,7 +8,6 @@ import (
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
var nodeConditionsExpected = map[v1.NodeConditionType]v1.ConditionStatus{
@ -19,7 +18,7 @@ var nodeConditionsExpected = map[v1.NodeConditionType]v1.ConditionStatus{
}
func (mon *Monitor) emitNodeConditions(ctx context.Context) error {
ns, err := mon.cli.CoreV1().Nodes().List(metav1.ListOptions{})
ns, err := mon.listNodes()
if err != nil {
return err
}
@ -38,7 +37,7 @@ func (mon *Monitor) emitNodeConditions(ctx context.Context) error {
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "node.conditions",
"name": n.Name,
@ -48,6 +47,12 @@ func (mon *Monitor) emitNodeConditions(ctx context.Context) error {
}).Print()
}
}
mon.emitGauge("node.kubelet.version", 1, map[string]string{
"name": n.Name,
"kubeletVersion": n.Status.NodeInfo.KubeletVersion,
})
}
return nil

Просмотреть файл

@ -29,6 +29,9 @@ func TestEmitNodeConditions(t *testing.T) {
Status: corev1.ConditionTrue,
},
},
NodeInfo: corev1.NodeSystemInfo{
KubeletVersion: "v1.17.1+9d33dd3",
},
},
}, &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
@ -41,6 +44,9 @@ func TestEmitNodeConditions(t *testing.T) {
Status: corev1.ConditionFalse,
},
},
NodeInfo: corev1.NodeSystemInfo{
KubeletVersion: "v1.17.1+9d33dd3",
},
},
})
@ -66,6 +72,15 @@ func TestEmitNodeConditions(t *testing.T) {
"type": "Ready",
})
m.EXPECT().EmitGauge("node.kubelet.version", int64(1), map[string]string{
"name": "aro-master-0",
"kubeletVersion": "v1.17.1+9d33dd3",
})
m.EXPECT().EmitGauge("node.kubelet.version", int64(1), map[string]string{
"name": "aro-master-1",
"kubeletVersion": "v1.17.1+9d33dd3",
})
err := mon.emitNodeConditions(ctx)
if err != nil {
t.Fatal(err)

Просмотреть файл

@ -55,7 +55,7 @@ func (mon *Monitor) _emitPodConditions(ps *v1.PodList) {
"type": string(c.Type),
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "pod.conditions",
"name": p.Name,
@ -91,7 +91,7 @@ func (mon *Monitor) _emitPodContainerStatuses(ps *v1.PodList) {
"reason": cs.State.Waiting.Reason,
})
if mon.logMessages {
if mon.hourlyRun {
mon.log.WithFields(logrus.Fields{
"metric": "pod.containerstatuses",
"name": p.Name,

Просмотреть файл

@ -0,0 +1,52 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
)
// Node label keys checked by emitSummary: a node is counted as a master or a
// worker when the corresponding key is present in its labels, whatever the
// label's value.
const (
masterRoleLabel = "node-role.kubernetes.io/master"
workerRoleLabel = "node-role.kubernetes.io/worker"
)
// emitSummary emits one combined "cluster.summary" gauge so that the state of
// all clusters can be reported on from a single dashboard. The gauge carries
// the actual and desired cluster versions, the number of master and worker
// nodes, and the cluster's provisioning state as dimensions.
//
// Nothing is emitted unless this is an hourly run. A node carrying both role
// labels is counted in both totals. Errors from fetching the cluster version
// or the node list are returned unchanged.
func (mon *Monitor) emitSummary(ctx context.Context) error {
	if !mon.hourlyRun {
		return nil
	}

	cv, err := mon.getClusterVersion()
	if err != nil {
		return err
	}

	ns, err := mon.listNodes()
	if err != nil {
		return err
	}

	// Role membership is decided by label presence only; values are ignored.
	var masters, workers int
	for i := range ns.Items {
		labels := ns.Items[i].Labels
		if _, isMaster := labels[masterRoleLabel]; isMaster {
			masters++
		}
		if _, isWorker := labels[workerRoleLabel]; isWorker {
			workers++
		}
	}

	dims := map[string]string{
		"actualVersion":     actualVersion(cv),
		"desiredVersion":    desiredVersion(cv),
		"masterCount":       strconv.Itoa(masters),
		"workerCount":       strconv.Itoa(workers),
		"provisioningState": mon.oc.Properties.ProvisioningState.String(),
	}
	mon.emitGauge("cluster.summary", 1, dims)

	return nil
}

Просмотреть файл

@ -0,0 +1,95 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"testing"
"github.com/golang/mock/gomock"
configv1 "github.com/openshift/api/config/v1"
configfake "github.com/openshift/client-go/config/clientset/versioned/fake"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
"github.com/Azure/ARO-RP/pkg/api"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitSummary(t *testing.T) {
ctx := context.Background()
configcli := configfake.NewSimpleClientset(&configv1.ClusterVersion{
ObjectMeta: metav1.ObjectMeta{
Name: "version",
},
Status: configv1.ClusterVersionStatus{
Desired: configv1.Update{
Version: "4.3.3",
},
History: []configv1.UpdateHistory{
{
State: configv1.CompletedUpdate,
Version: "4.3.0",
},
},
},
})
cli := fake.NewSimpleClientset(&corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "aro-master-0",
Labels: map[string]string{
masterRoleLabel: "",
},
},
},
&corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "aro-node-1",
Labels: map[string]string{
workerRoleLabel: "",
},
},
},
&corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "aro-node-2",
Labels: map[string]string{
workerRoleLabel: "",
},
},
})
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
configcli: configcli,
cli: cli,
m: m,
oc: &api.OpenShiftCluster{
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
},
},
hourlyRun: true,
}
m.EXPECT().EmitGauge("cluster.summary", int64(1), map[string]string{
"actualVersion": "4.3.0",
"desiredVersion": "4.3.3",
"masterCount": "1",
"workerCount": "2",
"provisioningState": mon.oc.Properties.ProvisioningState.String(),
})
err := mon.emitSummary(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -187,11 +187,11 @@ out:
}
// workOne checks the API server health of a cluster
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, logMessages bool) {
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, hourlyRun bool) {
ctx, cancel := context.WithTimeout(ctx, 50*time.Second)
defer cancel()
c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm, logMessages)
c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm, hourlyRun)
if err != nil {
log.Error(err)
return