Merge pull request #533 from mjudeikis/cluster.monitor

add more cluster monitoring
This commit is contained in:
Jim Minter 2020-04-21 16:10:41 -05:00 коммит произвёл GitHub
Родитель aed35c028a b463e3ffea
Коммит b3976e064c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
29 изменённых файлов: 1115 добавлений и 217 удалений

Просмотреть файл

@ -297,35 +297,12 @@ func TestValidateAdminKubernetesObjectsNonCustomer(t *testing.T) {
name string
wantErr string
}{
{
test: "valid openshift-ns namespace",
groupKind: "Valid-kind.openshift.io",
namespace: "openshift-ns",
name: "Valid-NAME-01",
},
{
test: "valid openshift namespace",
groupKind: "Valid-kind.openshift.io",
namespace: "openshift",
name: "Valid-NAME-01",
},
{
test: "valid kube-ns namespace",
groupKind: "Valid-kind.openshift.io",
namespace: "kube-ns",
name: "Valid-NAME-01",
},
{
test: "valid default namespace",
groupKind: "Valid-kind.openshift.io",
namespace: "default",
name: "Valid-NAME-01",
},
{
test: "valid empty namespace",
groupKind: "Valid-kind.openshift.io",
name: "Valid-NAME-01",
},
{
test: "invalid customer namespace",
groupKind: "Valid-kind.openshift.io",

Просмотреть файл

@ -14,6 +14,7 @@ import (
"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/database/cosmosdb"
pkgnamespace "github.com/Azure/ARO-RP/pkg/util/namespace"
)
func validateTerminalProvisioningState(state api.ProvisioningState) error {
@ -84,11 +85,7 @@ func validateAdminJmespathFilter(filter string) (*jmespath.JMESPath, error) {
var rxKubernetesString = regexp.MustCompile(`(?i)^[-a-z0-9.]{0,255}$`)
func validateAdminKubernetesObjectsNonCustomer(method, groupKind, namespace, name string) error {
if namespace != "" &&
namespace != "default" &&
namespace != "openshift" &&
!strings.HasPrefix(string(namespace), "kube-") &&
!strings.HasPrefix(string(namespace), "openshift-") {
if !pkgnamespace.IsOpenShift(namespace) {
return api.NewCloudError(http.StatusForbidden, api.CloudErrorCodeForbidden, "", "Access to the provided namespace '%s' is forbidden.", namespace)
}

Просмотреть файл

@ -22,8 +22,9 @@ import (
)
type Monitor struct {
env env.Interface
log *logrus.Entry
env env.Interface
log *logrus.Entry
logMessages bool
oc *api.OpenShiftCluster
dims map[string]string
@ -34,7 +35,7 @@ type Monitor struct {
m metrics.Interface
}
func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface) (*Monitor, error) {
func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *api.OpenShiftCluster, m metrics.Interface, logMessages bool) (*Monitor, error) {
r, err := azure.ParseResourceID(oc.ID)
if err != nil {
return nil, err
@ -76,8 +77,9 @@ func NewMonitor(ctx context.Context, env env.Interface, log *logrus.Entry, oc *a
}
return &Monitor{
env: env,
log: log,
env: env,
log: log,
logMessages: logMessages,
oc: oc,
dims: dims,
@ -103,12 +105,18 @@ func (mon *Monitor) Monitor(ctx context.Context) {
return
}
for _, f := range []func(ctx context.Context) error{
mon.emitClusterOperatorsMetrics,
mon.emitClusterVersionMetrics,
mon.emitNodesMetrics,
for _, f := range []func(context.Context) error{
mon.emitClusterOperatorConditions,
mon.emitClusterOperatorVersions,
mon.emitClusterVersions,
mon.emitDaemonsetStatuses,
mon.emitDeploymentStatuses,
mon.emitMachineConfigPoolConditions,
mon.emitNodeConditions,
mon.emitPodConditions,
mon.emitPrometheusAlerts,
mon.emitMachineConfigPoolMetrics,
mon.emitReplicasetStatuses,
mon.emitStatefulsetStatuses,
} {
err = f(ctx)
if err != nil {

Просмотреть файл

@ -0,0 +1,80 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
configv1 "github.com/openshift/api/config/v1"
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// clusterOperatorConditionsIgnoreStruct is the key type of
// clusterOperatorConditionsIgnore: one condition type/status pair reported by
// one named cluster operator.
type clusterOperatorConditionsIgnoreStruct struct {
Name string
Type configv1.ClusterStatusConditionType
Status configv1.ConditionStatus
}
// clusterOperatorConditionsIgnore contains the list of failures we know we can
// ignore for now. clusterOperatorConditionIsExpected treats every
// (operator name, condition type, status) combination found here as expected.
var clusterOperatorConditionsIgnore = map[clusterOperatorConditionsIgnoreStruct]struct{}{
{"insights", "Disabled", configv1.ConditionFalse}: {},
{"insights", "Disabled", configv1.ConditionTrue}: {},
{"openshift-controller-manager", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-ca", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-catalog-apiserver", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
}
// clusterOperatorConditionsExpected maps a cluster operator condition type to
// the status that clusterOperatorConditionIsExpected accepts for it. Looking
// up a type that is not listed yields the empty status.
var clusterOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{
configv1.OperatorAvailable: configv1.ConditionTrue,
configv1.OperatorDegraded: configv1.ConditionFalse,
configv1.OperatorProgressing: configv1.ConditionFalse,
configv1.OperatorUpgradeable: configv1.ConditionTrue,
}
// emitClusterOperatorConditions lists all ClusterOperators and emits one
// "clusteroperator.conditions" gauge per status condition that
// clusterOperatorConditionIsExpected rejects. When mon.logMessages is set, the
// same fields plus the condition's message are written to the log as well.
func (mon *Monitor) emitClusterOperatorConditions(ctx context.Context) error {
	operators, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for i := range operators.Items {
		operator := &operators.Items[i]

		for j := range operator.Status.Conditions {
			cond := &operator.Status.Conditions[j]

			// Only conditions that deviate from the expected state are
			// reported.
			if !clusterOperatorConditionIsExpected(operator, cond) {
				mon.emitGauge("clusteroperator.conditions", 1, map[string]string{
					"name":   operator.Name,
					"status": string(cond.Status),
					"type":   string(cond.Type),
				})

				if !mon.logMessages {
					continue
				}

				fields := logrus.Fields{
					"metric":  "clusteroperator.conditions",
					"name":    operator.Name,
					"status":  cond.Status,
					"type":    cond.Type,
					"message": cond.Message,
				}
				mon.log.WithFields(fields).Print()
			}
		}
	}

	return nil
}
// clusterOperatorConditionIsExpected reports whether condition c of cluster
// operator co needs no reporting: either the (name, type, status) combination
// is listed in clusterOperatorConditionsIgnore, or the status equals the entry
// for c.Type in clusterOperatorConditionsExpected.
func clusterOperatorConditionIsExpected(co *configv1.ClusterOperator, c *configv1.ClusterOperatorStatusCondition) bool {
	key := clusterOperatorConditionsIgnoreStruct{
		Name:   co.Name,
		Type:   c.Type,
		Status: c.Status,
	}
	_, ignored := clusterOperatorConditionsIgnore[key]

	return ignored || c.Status == clusterOperatorConditionsExpected[c.Type]
}

Просмотреть файл

@ -15,7 +15,7 @@ import (
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitClusterOperatorsMetrics(t *testing.T) {
func TestEmitClusterOperatorConditions(t *testing.T) {
ctx := context.Background()
configcli := fake.NewSimpleClientset(&configv1.ClusterOperator{
@ -28,10 +28,22 @@ func TestEmitClusterOperatorsMetrics(t *testing.T) {
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
},
{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionTrue,
},
{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionFalse,
},
{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionTrue,
},
{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionFalse,
},
{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionTrue,
@ -41,22 +53,12 @@ func TestEmitClusterOperatorsMetrics(t *testing.T) {
Status: configv1.ConditionFalse,
},
{
Type: "dummy",
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionTrue,
},
},
Versions: []configv1.OperandVersion{
{
Name: "dummy",
Version: "4.3.2",
},
{
Name: "operator",
Version: "4.3.1",
},
{
Name: "operator",
Version: "4.3.0",
Type: "dummy",
Status: configv1.ConditionTrue,
},
},
},
@ -72,32 +74,37 @@ func TestEmitClusterOperatorsMetrics(t *testing.T) {
m: m,
}
m.EXPECT().EmitGauge("clusteroperators.conditions.count", int64(1), map[string]string{
"clusteroperator": "console",
"condition": "NotAvailable",
m.EXPECT().EmitGauge("clusteroperator.conditions", int64(1), map[string]string{
"name": "console",
"type": "Available",
"status": "False",
})
m.EXPECT().EmitGauge("clusteroperators.conditions.count", int64(1), map[string]string{
"clusteroperator": "console",
"condition": "Degraded",
m.EXPECT().EmitGauge("clusteroperator.conditions", int64(1), map[string]string{
"name": "console",
"type": "Degraded",
"status": "True",
})
m.EXPECT().EmitGauge("clusteroperators.conditions.count", int64(1), map[string]string{
"clusteroperator": "console",
"condition": "Progressing",
m.EXPECT().EmitGauge("clusteroperator.conditions", int64(1), map[string]string{
"name": "console",
"type": "Progressing",
"status": "True",
})
m.EXPECT().EmitGauge("clusteroperators.conditions.count", int64(1), map[string]string{
"clusteroperator": "console",
"condition": "NotUpgradeable",
m.EXPECT().EmitGauge("clusteroperator.conditions", int64(1), map[string]string{
"name": "console",
"type": "Upgradeable",
"status": "False",
})
m.EXPECT().EmitGauge("clusteroperators.version", int64(1), map[string]string{
"clusteroperator": "console",
"version": "4.3.1",
m.EXPECT().EmitGauge("clusteroperator.conditions", int64(1), map[string]string{
"name": "console",
"type": "dummy",
"status": "True",
})
err := mon.emitClusterOperatorsMetrics(ctx)
err := mon.emitClusterOperatorConditions(ctx)
if err != nil {
t.Fatal(err)
}

Просмотреть файл

@ -1,67 +0,0 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
configv1 "github.com/openshift/api/config/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// clusterOperatorsConditionsWhitelist is the set of condition types that
// emitClusterOperatorsMetrics considers; conditions of any other type are
// skipped.
var clusterOperatorsConditionsWhitelist = map[configv1.ClusterStatusConditionType]struct{}{
configv1.OperatorAvailable: {},
configv1.OperatorDegraded: {},
configv1.OperatorProgressing: {},
configv1.OperatorUpgradeable: {},
}
// clusterOperatorsNotConditions is the set of condition types that
// emitClusterOperatorsMetrics reports when their status is False, under the
// type name prefixed with "Not". Types outside this set are reported when
// their status is True.
var clusterOperatorsNotConditions = map[configv1.ClusterStatusConditionType]struct{}{
configv1.OperatorAvailable: {},
configv1.OperatorUpgradeable: {},
}
// emitClusterOperatorsMetrics lists all ClusterOperators and, for each one,
// emits:
//   - a "clusteroperators.conditions.count" gauge for every whitelisted
//     condition in an unhealthy state (False for the types in
//     clusterOperatorsNotConditions, reported as "Not<Type>"; True for the
//     remaining whitelisted types), and
//   - a "clusteroperators.version" gauge for the first operand version named
//     "operator".
func (mon *Monitor) emitClusterOperatorsMetrics(ctx context.Context) error {
	operators, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for _, operator := range operators.Items {
		for _, cond := range operator.Status.Conditions {
			if _, listed := clusterOperatorsConditionsWhitelist[cond.Type]; !listed {
				continue
			}

			_, negated := clusterOperatorsNotConditions[cond.Type]

			switch {
			case negated && cond.Status == configv1.ConditionFalse:
				mon.emitGauge("clusteroperators.conditions.count", 1, map[string]string{
					"clusteroperator": operator.Name,
					"condition":       "Not" + string(cond.Type),
				})
			case !negated && cond.Status == configv1.ConditionTrue:
				mon.emitGauge("clusteroperators.conditions.count", 1, map[string]string{
					"clusteroperator": operator.Name,
					"condition":       string(cond.Type),
				})
			}
		}

		// Report only the first "operator" entry, then stop scanning.
		for _, operand := range operator.Status.Versions {
			if operand.Name != "operator" {
				continue
			}

			mon.emitGauge("clusteroperators.version", 1, map[string]string{
				"clusteroperator": operator.Name,
				"version":         operand.Version,
			})
			break
		}
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,41 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// emitClusterOperatorVersions emits a "clusteroperator.versions" gauge for
// every cluster operator operand version named "operator" whose version
// differs from the cluster's desired version (as computed by desiredVersion
// from the "version" ClusterVersion object). Operators already at the desired
// version produce no metric.
//
// It returns the error from fetching the ClusterVersion or from listing the
// ClusterOperators, if any.
func (mon *Monitor) emitClusterOperatorVersions(ctx context.Context) error {
	cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
	if err != nil {
		return err
	}

	cos, err := mon.configcli.ConfigV1().ClusterOperators().List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	// The desired version is the same for every operator, so resolve it once
	// instead of once per operand version.
	want := desiredVersion(cv)

	for _, co := range cos.Items {
		for _, v := range co.Status.Versions {
			// Only the "operator" operand is compared; other operands are
			// skipped.
			if v.Name != "operator" || v.Version == want {
				continue
			}

			mon.emitGauge("clusteroperator.versions", 1, map[string]string{
				"name":    co.Name,
				"version": v.Version,
			})
		}
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,69 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"testing"
"github.com/golang/mock/gomock"
configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/client-go/config/clientset/versioned/fake"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitClusterOperatorVersion(t *testing.T) {
ctx := context.Background()
configcli := fake.NewSimpleClientset(
&configv1.ClusterOperator{
ObjectMeta: metav1.ObjectMeta{
Name: "console",
},
Status: configv1.ClusterOperatorStatus{
Versions: []configv1.OperandVersion{
{
Name: "operator",
Version: "4.3.0",
},
{
Name: "operator-good", // no metrics exected
Version: "4.3.1",
},
},
},
},
&configv1.ClusterVersion{
ObjectMeta: metav1.ObjectMeta{
Name: "version",
},
Status: configv1.ClusterVersionStatus{
Desired: configv1.Update{
Version: "4.3.1",
},
},
})
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
configcli: configcli,
m: m,
}
m.EXPECT().EmitGauge("clusteroperator.versions", int64(1), map[string]string{
"name": "console",
"version": "4.3.0",
})
err := mon.emitClusterOperatorVersions(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -10,18 +10,12 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func (mon *Monitor) emitClusterVersionMetrics(ctx context.Context) error {
func (mon *Monitor) emitClusterVersions(ctx context.Context) error {
cv, err := mon.configcli.ConfigV1().ClusterVersions().Get("version", metav1.GetOptions{})
if err != nil {
return err
}
desiredVersion := cv.Status.Desired.Version
if cv.Spec.DesiredUpdate != nil &&
cv.Spec.DesiredUpdate.Version != "" {
desiredVersion = cv.Spec.DesiredUpdate.Version
}
// Find the actual current cluster state. The history is ordered by most
// recent first, so find the latest "Completed" status to get current
// cluster version
@ -33,10 +27,19 @@ func (mon *Monitor) emitClusterVersionMetrics(ctx context.Context) error {
}
}
mon.emitGauge("cluster.version", 1, map[string]string{
mon.emitGauge("cluster.versions", 1, map[string]string{
"actualVersion": actualVersion,
"desiredVersion": desiredVersion,
"desiredVersion": desiredVersion(cv),
})
return nil
}
func desiredVersion(cv *configv1.ClusterVersion) string {
if cv.Spec.DesiredUpdate != nil &&
cv.Spec.DesiredUpdate.Version != "" {
return cv.Spec.DesiredUpdate.Version
}
return cv.Status.Desired.Version
}

Просмотреть файл

@ -15,7 +15,7 @@ import (
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitClusterVersionMetrics(t *testing.T) {
func TestEmitClusterVersion(t *testing.T) {
ctx := context.Background()
for _, tt := range []struct {
@ -86,12 +86,12 @@ func TestEmitClusterVersionMetrics(t *testing.T) {
m: m,
}
m.EXPECT().EmitGauge("cluster.version", int64(1), map[string]string{
m.EXPECT().EmitGauge("cluster.versions", int64(1), map[string]string{
"actualVersion": tt.wantActualVersion,
"desiredVersion": tt.wantDesiredVersion,
})
err := mon.emitClusterVersionMetrics(ctx)
err := mon.emitClusterVersions(ctx)
if err != nil {
t.Fatal(err)
}

Просмотреть файл

@ -0,0 +1,39 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/Azure/ARO-RP/pkg/util/namespace"
)
// emitDaemonsetStatuses lists DaemonSets across all namespaces and emits a
// "daemonset.statuses" gauge for each one that lives in a namespace accepted
// by namespace.IsOpenShift and whose NumberAvailable differs from its
// DesiredNumberScheduled.
func (mon *Monitor) emitDaemonsetStatuses(ctx context.Context) error {
	list, err := mon.cli.AppsV1().DaemonSets("").List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for i := range list.Items {
		daemonset := &list.Items[i]
		desired := daemonset.Status.DesiredNumberScheduled
		available := daemonset.Status.NumberAvailable

		if !namespace.IsOpenShift(daemonset.Namespace) || desired == available {
			continue
		}

		dims := map[string]string{
			"desiredNumberScheduled": strconv.FormatInt(int64(desired), 10),
			"name":                   daemonset.Name,
			"namespace":              daemonset.Namespace,
			"numberAvailable":        strconv.FormatInt(int64(available), 10),
		}
		mon.emitGauge("daemonset.statuses", 1, dims)
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,74 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
"testing"
"github.com/golang/mock/gomock"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitDaemonsetStatuses(t *testing.T) {
ctx := context.Background()
cli := fake.NewSimpleClientset(
&appsv1.DaemonSet{ // metrics expected
ObjectMeta: metav1.ObjectMeta{
Name: "name1",
Namespace: "openshift",
},
Status: appsv1.DaemonSetStatus{
DesiredNumberScheduled: 2,
NumberAvailable: 1,
},
}, &appsv1.DaemonSet{ // no metric expected
ObjectMeta: metav1.ObjectMeta{
Name: "name2",
Namespace: "openshift",
},
Status: appsv1.DaemonSetStatus{
DesiredNumberScheduled: 2,
NumberAvailable: 2,
},
}, &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{ // no metric expected -customer
Name: "name2",
Namespace: "customer",
},
Status: appsv1.DaemonSetStatus{
DesiredNumberScheduled: 2,
NumberAvailable: 1,
},
},
)
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
cli: cli,
m: m,
}
m.EXPECT().EmitGauge("daemonset.statuses", int64(1), map[string]string{
"desiredNumberScheduled": strconv.Itoa(2),
"name": "name1",
"namespace": "openshift",
"numberAvailable": strconv.Itoa(1),
})
err := mon.emitDaemonsetStatuses(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -0,0 +1,39 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/Azure/ARO-RP/pkg/util/namespace"
)
// emitDeploymentStatuses lists Deployments across all namespaces and emits a
// "deployment.statuses" gauge for each one that lives in a namespace accepted
// by namespace.IsOpenShift and whose status reports a different number of
// AvailableReplicas than Replicas.
func (mon *Monitor) emitDeploymentStatuses(ctx context.Context) error {
	list, err := mon.cli.AppsV1().Deployments("").List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for i := range list.Items {
		deployment := &list.Items[i]
		replicas := deployment.Status.Replicas
		available := deployment.Status.AvailableReplicas

		if !namespace.IsOpenShift(deployment.Namespace) || replicas == available {
			continue
		}

		dims := map[string]string{
			"availableReplicas": strconv.FormatInt(int64(available), 10),
			"name":              deployment.Name,
			"namespace":         deployment.Namespace,
			"replicas":          strconv.FormatInt(int64(replicas), 10),
		}
		mon.emitGauge("deployment.statuses", 1, dims)
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,75 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
"testing"
"github.com/golang/mock/gomock"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitDeploymentStatuses(t *testing.T) {
ctx := context.Background()
cli := fake.NewSimpleClientset(
&appsv1.Deployment{ // metrics expected
ObjectMeta: metav1.ObjectMeta{
Name: "name1",
Namespace: "openshift",
},
Status: appsv1.DeploymentStatus{
Replicas: 2,
AvailableReplicas: 1,
},
}, &appsv1.Deployment{ // no metric expected
ObjectMeta: metav1.ObjectMeta{
Name: "name2",
Namespace: "openshift",
},
Status: appsv1.DeploymentStatus{
Replicas: 2,
AvailableReplicas: 2,
},
}, &appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{ // no metric expected -customer
Name: "name2",
Namespace: "customer",
},
Status: appsv1.DeploymentStatus{
Replicas: 2,
AvailableReplicas: 1,
},
},
)
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
cli: cli,
m: m,
}
m.EXPECT().EmitGauge("deployment.statuses", int64(1), map[string]string{
"availableReplicas": strconv.Itoa(1),
"name": "name1",
"namespace": "openshift",
"replicas": strconv.Itoa(2),
})
err := mon.emitDeploymentStatuses(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -7,6 +7,7 @@ import (
"context"
v1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@ -19,7 +20,7 @@ var machineConfigPoolConditionsExpected = map[v1.MachineConfigPoolConditionType]
v1.MachineConfigPoolUpdating: corev1.ConditionFalse,
}
func (mon *Monitor) emitMachineConfigPoolMetrics(ctx context.Context) error {
func (mon *Monitor) emitMachineConfigPoolConditions(ctx context.Context) error {
mcps, err := mon.mcocli.MachineconfigurationV1().MachineConfigPools().List(metav1.ListOptions{})
if err != nil {
return err
@ -31,11 +32,21 @@ func (mon *Monitor) emitMachineConfigPoolMetrics(ctx context.Context) error {
continue
}
mon.emitGauge("machineconfigpools.conditions", 1, map[string]string{
mon.emitGauge("machineconfigpool.conditions", 1, map[string]string{
"name": mcp.Name,
"type": string(c.Type),
"status": string(c.Status),
"type": string(c.Type),
})
if mon.logMessages {
mon.log.WithFields(logrus.Fields{
"metric": "machineconfigpool.conditions",
"name": mcp.Name,
"status": c.Status,
"type": c.Type,
"message": c.Message,
}).Print()
}
}
}

Просмотреть файл

@ -16,7 +16,7 @@ import (
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitMachineConfigPoolMetrics(t *testing.T) {
func TestEmitMachineConfigPoolConditions(t *testing.T) {
ctx := context.Background()
mcocli := fake.NewSimpleClientset(&v1.MachineConfigPool{
@ -59,37 +59,37 @@ func TestEmitMachineConfigPoolMetrics(t *testing.T) {
m: m,
}
m.EXPECT().EmitGauge("machineconfigpools.conditions", int64(1), map[string]string{
m.EXPECT().EmitGauge("machineconfigpool.conditions", int64(1), map[string]string{
"name": "machine-config-pool",
"type": "Degraded",
"status": "True",
})
m.EXPECT().EmitGauge("machineconfigpools.conditions", int64(1), map[string]string{
m.EXPECT().EmitGauge("machineconfigpool.conditions", int64(1), map[string]string{
"name": "machine-config-pool",
"type": "NodeDegraded",
"status": "True",
})
m.EXPECT().EmitGauge("machineconfigpools.conditions", int64(1), map[string]string{
m.EXPECT().EmitGauge("machineconfigpool.conditions", int64(1), map[string]string{
"name": "machine-config-pool",
"type": "RenderDegraded",
"status": "True",
})
m.EXPECT().EmitGauge("machineconfigpools.conditions", int64(1), map[string]string{
m.EXPECT().EmitGauge("machineconfigpool.conditions", int64(1), map[string]string{
"name": "machine-config-pool",
"type": "Updated",
"status": "False",
})
m.EXPECT().EmitGauge("machineconfigpools.conditions", int64(1), map[string]string{
m.EXPECT().EmitGauge("machineconfigpool.conditions", int64(1), map[string]string{
"name": "machine-config-pool",
"type": "Updating",
"status": "True",
})
err := mon.emitMachineConfigPoolMetrics(ctx)
err := mon.emitMachineConfigPoolConditions(ctx)
if err != nil {
t.Fatal(err)
}

Просмотреть файл

@ -0,0 +1,54 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// nodeConditionsExpected maps a node condition type to the status that
// emitNodeConditions treats as healthy; a condition whose status equals its
// entry here is not reported.
var nodeConditionsExpected = map[v1.NodeConditionType]v1.ConditionStatus{
v1.NodeDiskPressure: v1.ConditionFalse,
v1.NodeMemoryPressure: v1.ConditionFalse,
v1.NodePIDPressure: v1.ConditionFalse,
v1.NodeReady: v1.ConditionTrue,
}
// emitNodeConditions lists all nodes, emits their total as the "nodes.count"
// gauge, and emits one "node.conditions" gauge per node condition whose
// status differs from its entry in nodeConditionsExpected. When
// mon.logMessages is set, the same fields plus the condition's message are
// written to the log as well.
func (mon *Monitor) emitNodeConditions(ctx context.Context) error {
	nodes, err := mon.cli.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	mon.emitGauge("nodes.count", int64(len(nodes.Items)), nil)

	for i := range nodes.Items {
		node := &nodes.Items[i]

		for j := range node.Status.Conditions {
			cond := &node.Status.Conditions[j]

			if nodeConditionsExpected[cond.Type] == cond.Status {
				continue
			}

			mon.emitGauge("node.conditions", 1, map[string]string{
				"name":   node.Name,
				"status": string(cond.Status),
				"type":   string(cond.Type),
			})

			if !mon.logMessages {
				continue
			}

			fields := logrus.Fields{
				"metric":  "node.conditions",
				"name":    node.Name,
				"status":  cond.Status,
				"type":    cond.Type,
				"message": cond.Message,
			}
			mon.log.WithFields(fields).Print()
		}
	}

	return nil
}

Просмотреть файл

@ -15,7 +15,7 @@ import (
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitNodesMetrics(t *testing.T) {
func TestEmitNodeConditions(t *testing.T) {
ctx := context.Background()
cli := fake.NewSimpleClientset(&corev1.Node{
@ -55,14 +55,18 @@ func TestEmitNodesMetrics(t *testing.T) {
}
m.EXPECT().EmitGauge("nodes.count", int64(2), map[string]string{})
m.EXPECT().EmitGauge("nodes.conditions.count", int64(1), map[string]string{
"condition": "NotReady",
m.EXPECT().EmitGauge("node.conditions", int64(1), map[string]string{
"name": "aro-master-0",
"status": "True",
"type": "MemoryPressure",
})
m.EXPECT().EmitGauge("nodes.conditions.count", int64(1), map[string]string{
"condition": "MemoryPressure",
m.EXPECT().EmitGauge("node.conditions", int64(1), map[string]string{
"name": "aro-master-1",
"status": "False",
"type": "Ready",
})
err := mon.emitNodesMetrics(ctx)
err := mon.emitNodeConditions(ctx)
if err != nil {
t.Fatal(err)
}

Просмотреть файл

@ -1,51 +0,0 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// nodesNotConditions is the set of node condition types that emitNodesMetrics
// counts under the type name prefixed with "Not" whenever their status is
// anything other than True.
var nodesNotConditions = map[corev1.NodeConditionType]struct{}{
corev1.NodeReady: {},
}
// emitNodesMetrics lists all nodes, emits their total as the "nodes.count"
// gauge, and emits one "nodes.conditions.count" gauge per unhealthy condition
// name, carrying the number of nodes currently in that state.
func (mon *Monitor) emitNodesMetrics(ctx context.Context) error {
	nodes, err := mon.cli.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	mon.emitGauge("nodes.count", int64(len(nodes.Items)), nil)

	unhealthy := map[string]int64{}

	for _, node := range nodes.Items {
		for _, cond := range node.Status.Conditions {
			// By default a condition is healthy when False and is counted
			// under its own name. Types in nodesNotConditions are healthy
			// when True and are counted under "Not<Type>". Any other status,
			// including Unknown, counts as unhealthy, so no extra timeseries
			// is needed for Unknown.
			label, healthy := string(cond.Type), corev1.ConditionFalse
			if _, negated := nodesNotConditions[cond.Type]; negated {
				label, healthy = "Not"+label, corev1.ConditionTrue
			}

			if cond.Status != healthy {
				unhealthy[label]++
			}
		}
	}

	for label, total := range unhealthy {
		mon.emitGauge("nodes.conditions.count", total, map[string]string{
			"condition": label,
		})
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,106 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/Azure/ARO-RP/pkg/util/namespace"
)
// podConditionsExpected maps a pod condition type to the status that
// _emitPodConditions treats as healthy; a condition whose status equals its
// entry here is not reported.
var podConditionsExpected = map[v1.PodConditionType]v1.ConditionStatus{
v1.ContainersReady: v1.ConditionTrue,
v1.PodInitialized: v1.ConditionTrue,
v1.PodScheduled: v1.ConditionTrue,
v1.PodReady: v1.ConditionTrue,
}
// emitPodConditions lists pods across all namespaces and passes the result to
// _emitPodConditions and _emitPodContainerStatuses. It returns the listing
// error, if any, in which case nothing is emitted.
func (mon *Monitor) emitPodConditions(ctx context.Context) error {
	// A single listing is shared by both emitters so pods are fetched only
	// once.
	pods, err := mon.cli.CoreV1().Pods("").List(metav1.ListOptions{})
	if err == nil {
		mon._emitPodConditions(pods)
		mon._emitPodContainerStatuses(pods)
	}

	return err
}
// _emitPodConditions emits one "pod.conditions" gauge per pod condition whose
// status differs from its entry in podConditionsExpected. Pods outside the
// namespaces accepted by namespace.IsOpenShift and pods in the Succeeded
// phase are skipped. When mon.logMessages is set, the same fields plus the
// condition's message are written to the log as well.
func (mon *Monitor) _emitPodConditions(ps *v1.PodList) {
	for i := range ps.Items {
		pod := &ps.Items[i]

		if !namespace.IsOpenShift(pod.Namespace) || pod.Status.Phase == v1.PodSucceeded {
			continue
		}

		for j := range pod.Status.Conditions {
			cond := &pod.Status.Conditions[j]

			if podConditionsExpected[cond.Type] == cond.Status {
				continue
			}

			mon.emitGauge("pod.conditions", 1, map[string]string{
				"name":      pod.Name,
				"namespace": pod.Namespace,
				"status":    string(cond.Status),
				"type":      string(cond.Type),
			})

			if !mon.logMessages {
				continue
			}

			fields := logrus.Fields{
				"metric":    "pod.conditions",
				"name":      pod.Name,
				"namespace": pod.Namespace,
				"status":    cond.Status,
				"type":      cond.Type,
				"message":   cond.Message,
			}
			mon.log.WithFields(fields).Print()
		}
	}
}
// _emitPodContainerStatuses emits one "pod.containerstatuses" gauge per
// container that is in the Waiting state, tagged with the waiting reason.
// Pods outside the namespaces accepted by namespace.IsOpenShift and pods in
// the Succeeded phase are skipped. When mon.logMessages is set, the same
// fields plus the waiting message are written to the log as well.
func (mon *Monitor) _emitPodContainerStatuses(ps *v1.PodList) {
	for i := range ps.Items {
		pod := &ps.Items[i]

		if !namespace.IsOpenShift(pod.Namespace) || pod.Status.Phase == v1.PodSucceeded {
			continue
		}

		for j := range pod.Status.ContainerStatuses {
			status := &pod.Status.ContainerStatuses[j]

			waiting := status.State.Waiting
			if waiting == nil {
				continue
			}

			mon.emitGauge("pod.containerstatuses", 1, map[string]string{
				"name":          pod.Name,
				"namespace":     pod.Namespace,
				"containername": status.Name,
				"reason":        waiting.Reason,
			})

			if !mon.logMessages {
				continue
			}

			fields := logrus.Fields{
				"metric":        "pod.containerstatuses",
				"name":          pod.Name,
				"namespace":     pod.Namespace,
				"containername": status.Name,
				"reason":        waiting.Reason,
				"message":       waiting.Message,
			}
			mon.log.WithFields(fields).Print()
		}
	}
}

Просмотреть файл

@ -0,0 +1,131 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"testing"
"github.com/golang/mock/gomock"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
// TestEmitPodConditions checks that _emitPodConditions emits one
// "pod.conditions" gauge for each condition whose status is False, and none
// for the Ready condition reported as True.
func TestEmitPodConditions(t *testing.T) {
	cli := fake.NewSimpleClientset(
		&corev1.Pod{ // metrics expected
			ObjectMeta: metav1.ObjectMeta{
				Name:      "name",
				Namespace: "openshift",
			},
			Status: corev1.PodStatus{
				Conditions: []corev1.PodCondition{
					{
						Type:   corev1.PodReady,
						Status: corev1.ConditionFalse,
					},
					{
						Type:   corev1.PodInitialized,
						Status: corev1.ConditionFalse,
					},
					{
						Type:   corev1.PodScheduled,
						Status: corev1.ConditionFalse,
					},
					{
						Type:   corev1.ContainersReady,
						Status: corev1.ConditionFalse,
					},
					{
						Type:   corev1.PodReady, // expected status: no metric
						Status: corev1.ConditionTrue,
					},
				},
			},
		},
	)

	controller := gomock.NewController(t)
	defer controller.Finish()

	m := mock_metrics.NewMockInterface(controller)

	mon := &Monitor{
		cli: cli,
		m:   m,
	}

	m.EXPECT().EmitGauge("pod.conditions", int64(1), map[string]string{
		"name":      "name",
		"namespace": "openshift",
		"status":    "False",
		"type":      "ContainersReady",
	})
	m.EXPECT().EmitGauge("pod.conditions", int64(1), map[string]string{
		"name":      "name",
		"namespace": "openshift",
		"status":    "False",
		"type":      "Initialized",
	})
	m.EXPECT().EmitGauge("pod.conditions", int64(1), map[string]string{
		"name":      "name",
		"namespace": "openshift",
		"status":    "False",
		"type":      "PodScheduled",
	})
	m.EXPECT().EmitGauge("pod.conditions", int64(1), map[string]string{
		"name":      "name",
		"namespace": "openshift",
		"status":    "False",
		"type":      "Ready",
	})

	// Fail with the listing error itself rather than letting a nil pod list
	// reach _emitPodConditions.
	ps, err := cli.CoreV1().Pods("").List(metav1.ListOptions{})
	if err != nil {
		t.Fatal(err)
	}

	mon._emitPodConditions(ps)
}
// TestEmitPodContainerStatuses checks that _emitPodContainerStatuses emits a
// "pod.containerstatuses" gauge, tagged with the waiting reason, for a
// container in the Waiting state.
func TestEmitPodContainerStatuses(t *testing.T) {
	cli := fake.NewSimpleClientset(
		&corev1.Pod{ // metrics expected
			ObjectMeta: metav1.ObjectMeta{
				Name:      "name",
				Namespace: "openshift",
			},
			Status: corev1.PodStatus{
				ContainerStatuses: []corev1.ContainerStatus{
					{
						Name: "containername",
						State: corev1.ContainerState{
							Waiting: &corev1.ContainerStateWaiting{
								Reason: "ImagePullBackOff",
							},
						},
					},
				},
			},
		},
	)

	controller := gomock.NewController(t)
	defer controller.Finish()

	m := mock_metrics.NewMockInterface(controller)

	mon := &Monitor{
		cli: cli,
		m:   m,
	}

	m.EXPECT().EmitGauge("pod.containerstatuses", int64(1), map[string]string{
		"name":          "name",
		"namespace":     "openshift",
		"containername": "containername",
		"reason":        "ImagePullBackOff",
	})

	// Fail with the listing error itself rather than letting a nil pod list
	// reach _emitPodContainerStatuses.
	ps, err := cli.CoreV1().Pods("").List(metav1.ListOptions{})
	if err != nil {
		t.Fatal(err)
	}

	mon._emitPodContainerStatuses(ps)
}

Просмотреть файл

@ -13,6 +13,7 @@ import (
"github.com/prometheus/common/model"
"github.com/Azure/ARO-RP/pkg/util/namespace"
"github.com/Azure/ARO-RP/pkg/util/portforward"
)
@ -66,6 +67,10 @@ func (mon *Monitor) emitPrometheusAlerts(ctx context.Context) error {
}{}
for _, alert := range alerts {
if !namespace.IsOpenShift(string(alert.Labels["namespace"])) {
continue
}
if strings.HasPrefix(alert.Name(), "UsingDeprecatedAPI") {
continue
}

Просмотреть файл

@ -0,0 +1,39 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/Azure/ARO-RP/pkg/util/namespace"
)
// emitReplicasetStatuses lists ReplicaSets across all namespaces and emits a
// "replicaset.statuses" gauge for each one that lives in a namespace accepted
// by namespace.IsOpenShift and whose status reports a different number of
// AvailableReplicas than Replicas.
func (mon *Monitor) emitReplicasetStatuses(ctx context.Context) error {
	list, err := mon.cli.AppsV1().ReplicaSets("").List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for i := range list.Items {
		replicaset := &list.Items[i]
		replicas := replicaset.Status.Replicas
		available := replicaset.Status.AvailableReplicas

		if !namespace.IsOpenShift(replicaset.Namespace) || replicas == available {
			continue
		}

		dims := map[string]string{
			"availableReplicas": strconv.FormatInt(int64(available), 10),
			"name":              replicaset.Name,
			"namespace":         replicaset.Namespace,
			"replicas":          strconv.FormatInt(int64(replicas), 10),
		}
		mon.emitGauge("replicaset.statuses", 1, dims)
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,75 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
"testing"
"github.com/golang/mock/gomock"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitReplicasetStatuses(t *testing.T) {
ctx := context.Background()
cli := fake.NewSimpleClientset(
&appsv1.ReplicaSet{ // metrics expected
ObjectMeta: metav1.ObjectMeta{
Name: "name1",
Namespace: "openshift",
},
Status: appsv1.ReplicaSetStatus{
Replicas: 2,
AvailableReplicas: 1,
},
}, &appsv1.ReplicaSet{ // no metric expected
ObjectMeta: metav1.ObjectMeta{
Name: "name2",
Namespace: "openshift",
},
Status: appsv1.ReplicaSetStatus{
Replicas: 2,
AvailableReplicas: 2,
},
}, &appsv1.ReplicaSet{
ObjectMeta: metav1.ObjectMeta{ // no metric expected -customer
Name: "name2",
Namespace: "customer",
},
Status: appsv1.ReplicaSetStatus{
Replicas: 2,
AvailableReplicas: 1,
},
},
)
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
cli: cli,
m: m,
}
m.EXPECT().EmitGauge("replicaset.statuses", int64(1), map[string]string{
"availableReplicas": strconv.Itoa(1),
"name": "name1",
"namespace": "openshift",
"replicas": strconv.Itoa(2),
})
err := mon.emitReplicasetStatuses(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -0,0 +1,39 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/Azure/ARO-RP/pkg/util/namespace"
)
// emitStatefulsetStatuses lists StatefulSets using an empty namespace argument
// and emits a "statefulset.statuses" gauge of value 1 for every StatefulSet
// that lives in an OpenShift namespace and whose Status.ReadyReplicas differs
// from its Status.Replicas. The error from the list call, if any, is returned
// unchanged. ctx is accepted but not used by the list call.
func (mon *Monitor) emitStatefulsetStatuses(ctx context.Context) error {
	list, err := mon.cli.AppsV1().StatefulSets("").List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	for i := range list.Items {
		ss := &list.Items[i]

		// skip customer namespaces and statefulsets whose replicas are all ready
		if !namespace.IsOpenShift(ss.Namespace) || ss.Status.Replicas == ss.Status.ReadyReplicas {
			continue
		}

		dims := map[string]string{
			"name":          ss.Name,
			"namespace":     ss.Namespace,
			"replicas":      strconv.Itoa(int(ss.Status.Replicas)),
			"readyReplicas": strconv.Itoa(int(ss.Status.ReadyReplicas)),
		}
		mon.emitGauge("statefulset.statuses", 1, dims)
	}

	return nil
}

Просмотреть файл

@ -0,0 +1,75 @@
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"strconv"
"testing"
"github.com/golang/mock/gomock"
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)
func TestEmitStatefulsetStatuses(t *testing.T) {
ctx := context.Background()
cli := fake.NewSimpleClientset(
&appsv1.StatefulSet{ // metrics expected
ObjectMeta: metav1.ObjectMeta{
Name: "name1",
Namespace: "openshift",
},
Status: appsv1.StatefulSetStatus{
Replicas: 2,
ReadyReplicas: 1,
},
}, &appsv1.StatefulSet{ // no metric expected
ObjectMeta: metav1.ObjectMeta{
Name: "name2",
Namespace: "openshift",
},
Status: appsv1.StatefulSetStatus{
Replicas: 2,
ReadyReplicas: 2,
},
}, &appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{ // no metric expected -customer
Name: "name2",
Namespace: "customer",
},
Status: appsv1.StatefulSetStatus{
Replicas: 2,
ReadyReplicas: 1,
},
},
)
controller := gomock.NewController(t)
defer controller.Finish()
m := mock_metrics.NewMockInterface(controller)
mon := &Monitor{
cli: cli,
m: m,
}
m.EXPECT().EmitGauge("statefulset.statuses", int64(1), map[string]string{
"name": "name1",
"namespace": "openshift",
"replicas": strconv.Itoa(2),
"readyReplicas": strconv.Itoa(1),
})
err := mon.emitStatefulsetStatuses(ctx)
if err != nil {
t.Fatal(err)
}
}

Просмотреть файл

@ -111,6 +111,8 @@ func (mon *monitor) worker(stop <-chan struct{}, delay time.Duration, id string)
t := time.NewTicker(time.Minute)
defer t.Stop()
h := time.Now().Hour()
out:
for {
mon.mu.RLock()
@ -121,27 +123,31 @@ out:
break
}
newh := time.Now().Hour()
// TODO: later can modify here to poll once per N minutes and re-issue
// cached metrics in the remaining minutes
mon.workOne(context.Background(), log, v.doc)
mon.workOne(context.Background(), log, v.doc, newh != h)
select {
case <-t.C:
case <-stop:
break out
}
h = newh
}
log.Debug("stopping monitoring")
}
// workOne checks the API server health of a cluster
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument) {
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, logMessages bool) {
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm)
c, err := cluster.NewMonitor(ctx, mon.env, log, doc.OpenShiftCluster, mon.clusterm, logMessages)
if err != nil {
log.Error(err)
return

Просмотреть файл

@ -0,0 +1,17 @@
package namespace
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"strings"
)
// IsOpenShift reports whether ns is treated as an OpenShift managed
// namespace: the empty string, "default", "openshift", or any name starting
// with "kube-" or "openshift-".
func IsOpenShift(ns string) bool {
	// exact matches
	switch ns {
	case "", "default", "openshift":
		return true
	}

	// reserved prefixes
	if strings.HasPrefix(ns, "kube-") {
		return true
	}
	return strings.HasPrefix(ns, "openshift-")
}

Просмотреть файл

@ -0,0 +1,45 @@
package namespace
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"testing"
)
// TestIsOpenShift runs IsOpenShift over a table of namespaces and checks the
// result against the expected value. A case with no namespace field exercises
// the empty string; a case with no want field expects false.
func TestIsOpenShift(t *testing.T) {
	for _, tt := range []struct {
		namespace string
		want      bool
	}{
		{
			want: true,
		},
		{
			namespace: "openshift-ns",
			want:      true,
		},
		{
			namespace: "openshift",
			want:      true,
		},
		{
			namespace: "kube-ns",
			want:      true,
		},
		{
			namespace: "default",
			want:      true,
		},
		{
			namespace: "customer",
		},
		{
			// prefix match requires the trailing dash
			namespace: "kube",
		},
		{
			// prefix match requires the trailing dash
			namespace: "openshiftns",
		},
	} {
		t.Run(tt.namespace, func(t *testing.T) {
			got := IsOpenShift(tt.namespace)
			if got != tt.want {
				// report input, actual and expected so a failure is diagnosable on its own
				t.Errorf("IsOpenShift(%q) = %v, want %v", tt.namespace, got, tt.want)
			}
		})
	}
}