From 32dfd1aaa213f014c03b79f8563cbd0064bcd7d2 Mon Sep 17 00:00:00 2001 From: Mangirdas Judeikis Date: Thu, 6 Aug 2020 12:21:55 +0100 Subject: [PATCH] add summary metric and dummy cache for reporting --- pkg/api/openshiftcluster.go | 4 + pkg/monitor/cluster/arooperatorconditions.go | 2 +- pkg/monitor/cluster/cache.go | 40 ++++++++ pkg/monitor/cluster/cluster.go | 24 +++-- .../cluster/clusteroperatorconditions.go | 6 +- .../cluster/clusteroperatorversions.go | 6 +- .../cluster/clusterversionconditions.go | 6 +- pkg/monitor/cluster/clusterversions.go | 29 +++--- .../cluster/machineconfigpoolconditions.go | 2 +- pkg/monitor/cluster/nodeconditions.go | 11 ++- pkg/monitor/cluster/nodeconditions_test.go | 15 +++ pkg/monitor/cluster/podconditions.go | 4 +- pkg/monitor/cluster/summary.go | 52 ++++++++++ pkg/monitor/cluster/summary_test.go | 95 +++++++++++++++++++ pkg/monitor/worker.go | 4 +- 15 files changed, 257 insertions(+), 43 deletions(-) create mode 100644 pkg/monitor/cluster/cache.go create mode 100644 pkg/monitor/cluster/summary.go create mode 100644 pkg/monitor/cluster/summary_test.go diff --git a/pkg/api/openshiftcluster.go b/pkg/api/openshiftcluster.go index 40524d350..6d88fe70c 100644 --- a/pkg/api/openshiftcluster.go +++ b/pkg/api/openshiftcluster.go @@ -119,6 +119,10 @@ func (t ProvisioningState) IsTerminal() bool { return ProvisioningStateFailed == t || ProvisioningStateSucceeded == t } +func (t ProvisioningState) String() string { + return string(t) +} + // ClusterProfile represents a cluster profile. 
type ClusterProfile struct { MissingFields diff --git a/pkg/monitor/cluster/arooperatorconditions.go b/pkg/monitor/cluster/arooperatorconditions.go index ee552e6b6..2d30d0e54 100644 --- a/pkg/monitor/cluster/arooperatorconditions.go +++ b/pkg/monitor/cluster/arooperatorconditions.go @@ -35,7 +35,7 @@ func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error { "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "arooperator.conditions", "status": c.Status, diff --git a/pkg/monitor/cluster/cache.go b/pkg/monitor/cluster/cache.go new file mode 100644 index 000000000..8b22cc572 --- /dev/null +++ b/pkg/monitor/cluster/cache.go @@ -0,0 +1,40 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + configv1 "github.com/openshift/api/config/v1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func (mon *Monitor) getClusterVersion() (*configv1.ClusterVersion, error) { + if mon.cache.cv != nil { + return mon.cache.cv, nil + } + + var err error + mon.cache.cv, err = mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{}) + return mon.cache.cv, err +} + +func (mon *Monitor) listClusterOperators() (*configv1.ClusterOperatorList, error) { + if mon.cache.cos != nil { + return mon.cache.cos, nil + } + + var err error + mon.cache.cos, err = mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{}) + return mon.cache.cos, err +} + +func (mon *Monitor) listNodes() (*v1.NodeList, error) { + if mon.cache.ns != nil { + return mon.cache.ns, nil + } + + var err error + mon.cache.ns, err = mon.cli.CoreV1().Nodes().List(metav1.ListOptions{}) + return mon.cache.ns, err +} diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go index c6cc8471b..5e3b2ec7a 100644 --- a/pkg/monitor/cluster/cluster.go +++ b/pkg/monitor/cluster/cluster.go @@ -10,9 +10,11 @@ import ( 
"runtime" "github.com/Azure/go-autorest/autorest/azure" + configv1 "github.com/openshift/api/config/v1" configclient "github.com/openshift/client-go/config/clientset/versioned" mcoclient "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned" "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" "github.com/Azure/ARO-RP/pkg/api" @@ -23,9 +25,9 @@ import ( ) type Monitor struct { - env env.Interface - log *logrus.Entry - logMessages bool + env env.Interface + log *logrus.Entry + hourlyRun bool oc *api.OpenShiftCluster dims map[string]string @@ -35,9 +37,16 @@ type Monitor struct { mcocli mcoclient.Interface m metrics.Interface arocli aroclient.AroV1alpha1Interface + + // access below only via the helper functions in cache.go + cache struct { + cos *configv1.ClusterOperatorList + cv *configv1.ClusterVersion + ns *v1.NodeList + } } -func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface, logMessages bool) (*Monitor, error) { +func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface, hourlyRun bool) (*Monitor, error) { r, err := azure.ParseResourceID(oc.ID) if err != nil { return nil, err @@ -84,9 +93,9 @@ func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *a } return &Monitor{ - env: env, - log: log, - logMessages: logMessages, + env: env, + log: log, + hourlyRun: hourlyRun, oc: oc, dims: dims, @@ -126,6 +135,7 @@ func (mon *Monitor) Monitor(ctx context.Context) { mon.emitPodConditions, mon.emitReplicasetStatuses, mon.emitStatefulsetStatuses, + mon.emitSummary, mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable } { err = f(ctx) diff --git a/pkg/monitor/cluster/clusteroperatorconditions.go b/pkg/monitor/cluster/clusteroperatorconditions.go index 05b03481a..6a3f129ac 100644 --- 
a/pkg/monitor/cluster/clusteroperatorconditions.go +++ b/pkg/monitor/cluster/clusteroperatorconditions.go @@ -8,7 +8,6 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/sirupsen/logrus" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) type clusterOperatorConditionsIgnoreStruct struct { @@ -35,11 +34,10 @@ var clusterOperatorConditionsExpected = map[configv1.ClusterStatusConditionType] } func (mon *Monitor) emitClusterOperatorConditions(ctx context.Context) error { - cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{}) + cos, err := mon.listClusterOperators() if err != nil { return err } - mon.emitGauge("clusteroperator.count", int64(len(cos.Items)), nil) for _, co := range cos.Items { @@ -54,7 +52,7 @@ func (mon *Monitor) emitClusterOperatorConditions(ctx context.Context) error { "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "clusteroperator.conditions", "name": co.Name, diff --git a/pkg/monitor/cluster/clusteroperatorversions.go b/pkg/monitor/cluster/clusteroperatorversions.go index 5b6f34431..d539b1530 100644 --- a/pkg/monitor/cluster/clusteroperatorversions.go +++ b/pkg/monitor/cluster/clusteroperatorversions.go @@ -5,17 +5,15 @@ package cluster import ( "context" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func (mon *Monitor) emitClusterOperatorVersions(ctx context.Context) error { - cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{}) + cv, err := mon.getClusterVersion() if err != nil { return err } - cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{}) + cos, err := mon.listClusterOperators() if err != nil { return err } diff --git a/pkg/monitor/cluster/clusterversionconditions.go b/pkg/monitor/cluster/clusterversionconditions.go index 0b3ef5e0d..a8ae07bcb 100644 --- a/pkg/monitor/cluster/clusterversionconditions.go +++ 
b/pkg/monitor/cluster/clusterversionconditions.go @@ -8,7 +8,6 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/sirupsen/logrus" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var clusterVersionConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{ @@ -19,7 +18,7 @@ var clusterVersionConditionsExpected = map[configv1.ClusterStatusConditionType]c } func (mon *Monitor) emitClusterVersionConditions(ctx context.Context) error { - cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{}) + cv, err := mon.getClusterVersion() if err != nil { return err } @@ -28,13 +27,12 @@ func (mon *Monitor) emitClusterVersionConditions(ctx context.Context) error { if c.Status == clusterVersionConditionsExpected[c.Type] { continue } - mon.emitGauge("clusterversion.conditions", 1, map[string]string{ "status": string(c.Status), "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "clusterversion.conditions", "status": c.Status, diff --git a/pkg/monitor/cluster/clusterversions.go b/pkg/monitor/cluster/clusterversions.go index ef9d7e1d2..1e4019b23 100644 --- a/pkg/monitor/cluster/clusterversions.go +++ b/pkg/monitor/cluster/clusterversions.go @@ -7,28 +7,15 @@ import ( "context" configv1 "github.com/openshift/api/config/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func (mon *Monitor) emitClusterVersions(ctx context.Context) error { - cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{}) + cv, err := mon.getClusterVersion() if err != nil { return err } - - // Find the actual current cluster state. 
The history is ordered by most - // recent first, so find the latest "Completed" status to get current - // cluster version - var actualVersion string - for _, history := range cv.Status.History { - if history.State == configv1.CompletedUpdate { - actualVersion = history.Version - break - } - } - mon.emitGauge("cluster.versions", 1, map[string]string{ - "actualVersion": actualVersion, + "actualVersion": actualVersion(cv), "desiredVersion": desiredVersion(cv), "resourceProviderVersion": mon.oc.Properties.ProvisionedBy, }) @@ -36,6 +23,18 @@ func (mon *Monitor) emitClusterVersions(ctx context.Context) error { return nil } +// actualVersion returns the cluster's actual current version. The history is +// ordered most recent first, so the first "Completed" entry is the current +// cluster version; "" is returned when no update has completed +func actualVersion(cv *configv1.ClusterVersion) string { + for _, history := range cv.Status.History { + if history.State == configv1.CompletedUpdate { + return history.Version + } + } + return "" +} + func desiredVersion(cv *configv1.ClusterVersion) string { if cv.Spec.DesiredUpdate != nil && cv.Spec.DesiredUpdate.Version != "" { diff --git a/pkg/monitor/cluster/machineconfigpoolconditions.go b/pkg/monitor/cluster/machineconfigpoolconditions.go index daee42912..19d2b6d26 100644 --- a/pkg/monitor/cluster/machineconfigpoolconditions.go +++ b/pkg/monitor/cluster/machineconfigpoolconditions.go @@ -38,7 +38,7 @@ func (mon *Monitor) emitMachineConfigPoolConditions(ctx context.Context) error { "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "machineconfigpool.conditions", "name": mcp.Name, diff --git a/pkg/monitor/cluster/nodeconditions.go b/pkg/monitor/cluster/nodeconditions.go index 6d15b2db5..6fcb712f2 100644 --- a/pkg/monitor/cluster/nodeconditions.go +++ b/pkg/monitor/cluster/nodeconditions.go @@ -8,7 +8,6 @@ import ( "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" - metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" ) var nodeConditionsExpected = map[v1.NodeConditionType]v1.ConditionStatus{ @@ -19,7 +18,7 @@ var nodeConditionsExpected = map[v1.NodeConditionType]v1.ConditionStatus{ } func (mon *Monitor) emitNodeConditions(ctx context.Context) error { - ns, err := mon.cli.CoreV1().Nodes().List(metav1.ListOptions{}) + ns, err := mon.listNodes() if err != nil { return err } @@ -38,7 +37,7 @@ func (mon *Monitor) emitNodeConditions(ctx context.Context) error { "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "node.conditions", "name": n.Name, @@ -48,6 +47,12 @@ func (mon *Monitor) emitNodeConditions(ctx context.Context) error { }).Print() } } + + mon.emitGauge("node.kubelet.version", 1, map[string]string{ + "name": n.Name, + "kubeletVersion": n.Status.NodeInfo.KubeletVersion, + }) + } return nil diff --git a/pkg/monitor/cluster/nodeconditions_test.go b/pkg/monitor/cluster/nodeconditions_test.go index e9cf737b0..c5deadb5b 100644 --- a/pkg/monitor/cluster/nodeconditions_test.go +++ b/pkg/monitor/cluster/nodeconditions_test.go @@ -29,6 +29,9 @@ func TestEmitNodeConditions(t *testing.T) { Status: corev1.ConditionTrue, }, }, + NodeInfo: corev1.NodeSystemInfo{ + KubeletVersion: "v1.17.1+9d33dd3", + }, }, }, &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -41,6 +44,9 @@ func TestEmitNodeConditions(t *testing.T) { Status: corev1.ConditionFalse, }, }, + NodeInfo: corev1.NodeSystemInfo{ + KubeletVersion: "v1.17.1+9d33dd3", + }, }, }) @@ -66,6 +72,15 @@ func TestEmitNodeConditions(t *testing.T) { "type": "Ready", }) + m.EXPECT().EmitGauge("node.kubelet.version", int64(1), map[string]string{ + "name": "aro-master-0", + "kubeletVersion": "v1.17.1+9d33dd3", + }) + m.EXPECT().EmitGauge("node.kubelet.version", int64(1), map[string]string{ + "name": "aro-master-1", + "kubeletVersion": "v1.17.1+9d33dd3", + }) + err := mon.emitNodeConditions(ctx) if err != nil { t.Fatal(err) diff --git 
a/pkg/monitor/cluster/podconditions.go b/pkg/monitor/cluster/podconditions.go index 8522463cb..8d7affd8c 100644 --- a/pkg/monitor/cluster/podconditions.go +++ b/pkg/monitor/cluster/podconditions.go @@ -55,7 +55,7 @@ func (mon *Monitor) _emitPodConditions(ps *v1.PodList) { "type": string(c.Type), }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "pod.conditions", "name": p.Name, @@ -91,7 +91,7 @@ func (mon *Monitor) _emitPodContainerStatuses(ps *v1.PodList) { "reason": cs.State.Waiting.Reason, }) - if mon.logMessages { + if mon.hourlyRun { mon.log.WithFields(logrus.Fields{ "metric": "pod.containerstatuses", "name": p.Name, diff --git a/pkg/monitor/cluster/summary.go b/pkg/monitor/cluster/summary.go new file mode 100644 index 000000000..e2801f630 --- /dev/null +++ b/pkg/monitor/cluster/summary.go @@ -0,0 +1,52 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "strconv" +) + +const ( + masterRoleLabel = "node-role.kubernetes.io/master" + workerRoleLabel = "node-role.kubernetes.io/worker" +) + +// emitSummary emits a joined metric so that the state of all clusters can be +// reported in a single dashboard +func (mon *Monitor) emitSummary(ctx context.Context) error { + if !mon.hourlyRun { + return nil + } + + cv, err := mon.getClusterVersion() + if err != nil { + return err + } + + ns, err := mon.listNodes() + if err != nil { + return err + } + + var masterCount, workerCount int + for _, node := range ns.Items { + if _, ok := node.Labels[masterRoleLabel]; ok { + masterCount++ + } + if _, ok := node.Labels[workerRoleLabel]; ok { + workerCount++ + } + } + + mon.emitGauge("cluster.summary", 1, map[string]string{ + "actualVersion": actualVersion(cv), + "desiredVersion": desiredVersion(cv), + "masterCount": strconv.Itoa(masterCount), + "workerCount": strconv.Itoa(workerCount), + "provisioningState": mon.oc.Properties.ProvisioningState.String(), + }) + + 
return nil +} diff --git a/pkg/monitor/cluster/summary_test.go b/pkg/monitor/cluster/summary_test.go new file mode 100644 index 000000000..76f5c9105 --- /dev/null +++ b/pkg/monitor/cluster/summary_test.go @@ -0,0 +1,95 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "testing" + + "github.com/golang/mock/gomock" + configv1 "github.com/openshift/api/config/v1" + configfake "github.com/openshift/client-go/config/clientset/versioned/fake" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + + "github.com/Azure/ARO-RP/pkg/api" + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" +) + +func TestEmitSummary(t *testing.T) { + ctx := context.Background() + + configcli := configfake.NewSimpleClientset(&configv1.ClusterVersion{ + ObjectMeta: metav1.ObjectMeta{ + Name: "version", + }, + Status: configv1.ClusterVersionStatus{ + Desired: configv1.Update{ + Version: "4.3.3", + }, + History: []configv1.UpdateHistory{ + { + State: configv1.CompletedUpdate, + Version: "4.3.0", + }, + }, + }, + }) + + cli := fake.NewSimpleClientset(&corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aro-master-0", + Labels: map[string]string{ + masterRoleLabel: "", + }, + }, + }, + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aro-node-1", + Labels: map[string]string{ + workerRoleLabel: "", + }, + }, + }, + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "aro-node-2", + Labels: map[string]string{ + workerRoleLabel: "", + }, + }, + }) + + controller := gomock.NewController(t) + defer controller.Finish() + + m := mock_metrics.NewMockInterface(controller) + + mon := &Monitor{ + configcli: configcli, + cli: cli, + m: m, + oc: &api.OpenShiftCluster{ + Properties: api.OpenShiftClusterProperties{ + ProvisioningState: api.ProvisioningStateSucceeded, + }, + }, + hourlyRun: true, + } + + m.EXPECT().EmitGauge("cluster.summary", 
int64(1), map[string]string{ + "actualVersion": "4.3.0", + "desiredVersion": "4.3.3", + "masterCount": "1", + "workerCount": "2", + "provisioningState": mon.oc.Properties.ProvisioningState.String(), + }) + + err := mon.emitSummary(ctx) + if err != nil { + t.Fatal(err) + } +} diff --git a/pkg/monitor/worker.go b/pkg/monitor/worker.go index 0a1fbc568..f95fa7094 100644 --- a/pkg/monitor/worker.go +++ b/pkg/monitor/worker.go @@ -187,11 +187,11 @@ out: } // workOne checks the API server health of a cluster -func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, logMessages bool) { +func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, hourlyRun bool) { ctx, cancel := context.WithTimeout(ctx, 50*time.Second) defer cancel() - c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm, logMessages) + c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm, hourlyRun) if err != nil { log.Error(err) return