From 40022dab8634ae696f930fcd560288f947753f1b Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Mon, 22 May 2023 12:01:09 -0500 Subject: [PATCH] add retry to nnc update during scaledown (#1970) * add retry to nnc update during scaledown Signed-off-by: Evan Baker * test for panic in pool monitor Signed-off-by: Evan Baker --------- Signed-off-by: Evan Baker --- cns/ipampool/monitor.go | 20 ++++++++--- cns/ipampool/monitor_test.go | 69 ++++++++++++++++++++++++++++-------- go.mod | 3 +- go.sum | 6 ++-- 4 files changed, 75 insertions(+), 23 deletions(-) diff --git a/cns/ipampool/monitor.go b/cns/ipampool/monitor.go index 0a35660fb..b3a5cfb7e 100644 --- a/cns/ipampool/monitor.go +++ b/cns/ipampool/monitor.go @@ -13,6 +13,7 @@ import ( "github.com/Azure/azure-container-networking/cns/types" "github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1" "github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha" + "github.com/avast/retry-go/v4" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" ) @@ -334,10 +335,20 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i tempNNCSpec.RequestedIPCount -= int64(len(pendingIPAddresses)) logger.Printf("[ipam-pool-monitor] Decreasing pool size, pool %+v, spec %+v", state, tempNNCSpec) - _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec) - if err != nil { - // caller will retry to update the CRD again - return errors.Wrap(err, "executing UpdateSpec with NNC client") + attempts := 0 + if err := retry.Do(func() error { + attempts++ + _, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec) + if err != nil { + // caller will retry to update the CRD again + logger.Printf("failed to update NNC spec attempt #%d, err: %v", attempts, err) + return errors.Wrap(err, "executing UpdateSpec with NNC client") + } + logger.Printf("successfully updated NNC spec attempt #%d", attempts) + return nil + }, retry.Attempts(5), retry.DelayType(retry.BackOffDelay)); err != nil { //nolint:gomnd // ignore retry magic number + logger.Errorf("all attempts failed to update NNC during scale-down, state is corrupt: %v", err) + panic(err) } logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec) @@ -350,7 +361,6 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i // clear the updatingPendingIpsNotInUse, as we have Updated the CRD logger.Printf("[ipam-pool-monitor] cleaning the updatingPendingIpsNotInUse, existing length %d", pm.metastate.notInUseCount) pm.metastate.notInUseCount = 0 - return nil } diff --git a/cns/ipampool/monitor_test.go b/cns/ipampool/monitor_test.go index 0980631d5..2c71bb7f8 100644 --- a/cns/ipampool/monitor_test.go +++ b/cns/ipampool/monitor_test.go @@ -2,6 +2,7 @@ package ipampool import ( "context" + "errors" "testing" "time" @@ -21,6 +22,12 @@ func (f *fakeNodeNetworkConfigUpdater) UpdateSpec(ctx context.Context, spec *v1a return f.nnc, nil } +type fakeNodeNetworkConfigUpdaterFunc func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) + +func (f fakeNodeNetworkConfigUpdaterFunc) UpdateSpec(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) { + return f(ctx, spec) +} + type directUpdatePoolMonitor struct { m *Monitor cns.IPAMPoolMonitor @@ -45,7 +52,7 @@ type testState struct { totalIPs int64 } -func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) { +func initFakes(state testState, nnccli nodeNetworkConfigSpecUpdater) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) { logger.InitLogger("testlogs", 0, 0, "./") scalarUnits := v1alpha.Scaler{ @@ -61,8 +68,11 @@ func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControlle } fakecns := fakes.NewHTTPServiceFake() fakerc := fakes.NewRequestControllerFake(fakecns, scalarUnits, subnetaddresspace, state.totalIPs) + if nnccli == nil { + nnccli = &fakeNodeNetworkConfigUpdater{fakerc.NNC} + } - poolmonitor := NewMonitor(fakecns, &fakeNodeNetworkConfigUpdater{fakerc.NNC}, nil, &Options{RefreshDelay: 100 * time.Second}) + poolmonitor := NewMonitor(fakecns, nnccli, nil, &Options{RefreshDelay: 100 * time.Second}) poolmonitor.metastate = metaState{ batch: state.batch, max: state.max, @@ -127,7 +137,7 @@ func TestPoolSizeIncrease(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - fakecns, fakerc, poolmonitor := initFakes(tt.in) + fakecns, fakerc, poolmonitor := initFakes(tt.in, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state @@ -162,7 +172,7 @@ func TestPoolIncreaseDoesntChangeWhenIncreaseIsAlreadyInProgress(t *testing.T) { max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state @@ -201,7 +211,7 @@ func TestPoolSizeIncreaseIdempotency(t *testing.T) { max: 30, } - _, fakerc, poolmonitor := initFakes(initState) + _, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state @@ -227,7 +237,7 @@ func TestPoolIncreasePastNodeLimit(t *testing.T) { max: 30, } - _, fakerc, poolmonitor := initFakes(initState) + _, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state @@ -247,7 +257,7 @@ func TestPoolIncreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) { max: 30, } - _, fakerc, poolmonitor := initFakes(initState) + _, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state @@ -267,7 +277,7 @@ func TestIncreaseWithPendingRelease(t *testing.T) { max: 250, pendingRelease: 16, } - _, rc, mon := initFakes(initState) + _, rc, mon := initFakes(initState, nil) assert.NoError(t, rc.Reconcile(true)) assert.NoError(t, mon.reconcile(context.Background())) assert.Equal(t, int64(32), mon.spec.RequestedIPCount) @@ -333,7 +343,7 @@ func TestPoolDecrease(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - fakecns, fakerc, poolmonitor := initFakes(tt.in) + fakecns, fakerc, poolmonitor := initFakes(tt.in, nil) assert.NoError(t, fakerc.Reconcile(true)) // Decrease the number of allocated IPs down to target. This may trigger a scale down. @@ -374,7 +384,7 @@ func TestPoolSizeDecreaseWhenDecreaseHasAlreadyBeenRequested(t *testing.T) { max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // Pool monitor does nothing, as the current number of IPs falls in the threshold @@ -413,7 +423,7 @@ func TestDecreaseAndIncreaseToSameCount(t *testing.T) { max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) assert.NoError(t, poolmonitor.reconcile(context.Background())) assert.EqualValues(t, 20, poolmonitor.spec.RequestedIPCount) @@ -458,7 +468,7 @@ func TestPoolSizeDecreaseToReallyLow(t *testing.T) { max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // Pool monitor does nothing, as the current number of IPs falls in the threshold @@ -499,7 +509,7 @@ func TestDecreaseAfterNodeLimitReached(t *testing.T) { releaseThresholdPercent: 150, max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) assert.NoError(t, poolmonitor.reconcile(context.Background())) @@ -524,7 +534,7 @@ func TestDecreaseWithPendingRelease(t *testing.T) { totalIPs: 64, max: 250, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) fakerc.NNC.Spec.RequestedIPCount = 48 assert.NoError(t, fakerc.Reconcile(true)) @@ -547,6 +557,35 @@ func TestDecreaseWithPendingRelease(t *testing.T) { assert.Len(t, poolmonitor.spec.IPsNotInUse, int(initState.batch)+int(initState.pendingRelease)) } +func TestDecreaseWithAPIServerFailure(t *testing.T) { + initState := testState{ + batch: 16, + assigned: 46, + allocated: 64, + pendingRelease: 0, + requestThresholdPercent: 50, + releaseThresholdPercent: 150, + totalIPs: 64, + max: 250, + } + var errNNCCLi fakeNodeNetworkConfigUpdaterFunc = func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) { + return nil, errors.New("fake APIServer failure") //nolint:goerr113 // this is a fake error + } + + fakecns, fakerc, poolmonitor := initFakes(initState, errNNCCLi) + fakerc.NNC.Spec.RequestedIPCount = initState.totalIPs + assert.NoError(t, fakerc.Reconcile(true)) + + assert.NoError(t, poolmonitor.reconcile(context.Background())) + + // release some IPs + assert.NoError(t, fakecns.SetNumberOfAssignedIPs(40)) + // check that the pool monitor panics if it is not able to publish the updated NNC + assert.Panics(t, func() { + _ = poolmonitor.reconcile(context.Background()) + }) +} + func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) { initState := testState{ batch: 31, @@ -557,7 +596,7 @@ func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) { max: 30, } - fakecns, fakerc, poolmonitor := initFakes(initState) + fakecns, fakerc, poolmonitor := initFakes(initState, nil) assert.NoError(t, fakerc.Reconcile(true)) // When poolmonitor reconcile is called, trigger increase and cache goal state diff --git a/go.mod b/go.mod index f1ef4e4ea..6822c1183 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/Microsoft/go-winio v0.4.17 github.com/Microsoft/hcsshim v0.8.23 github.com/avast/retry-go/v3 v3.1.1 + github.com/avast/retry-go/v4 v4.3.4 github.com/billgraziano/dpapi v0.4.0 github.com/containernetworking/cni v1.1.2 github.com/docker/libnetwork v0.8.0-dev.2.0.20210525090646-64b7a4574d14 @@ -30,7 +31,7 @@ require ( github.com/spf13/cobra v1.6.1 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.14.0 - github.com/stretchr/testify v1.8.1 + github.com/stretchr/testify v1.8.2 go.uber.org/zap v1.24.0 golang.org/x/sys v0.6.0 google.golang.org/grpc v1.52.0 diff --git a/go.sum b/go.sum index 110a06de4..0b988e7d9 100644 --- a/go.sum +++ b/go.sum @@ -89,6 +89,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/avast/retry-go/v3 v3.1.1 h1:49Scxf4v8PmiQ/nY0aY3p0hDueqSmc7++cBbtiDGu2g= github.com/avast/retry-go/v3 v3.1.1/go.mod h1:6cXRK369RpzFL3UQGqIUp9Q7GDrams+KsYWrfNA1/nQ= +github.com/avast/retry-go/v4 v4.3.4 h1:pHLkL7jvCvP317I8Ge+Km2Yhntv3SdkJm7uekkqbKhM= +github.com/avast/retry-go/v4 v4.3.4/go.mod h1:rv+Nla6Vk3/ilU0H51VHddWHiwimzX66yZ0JT6T+UvE= github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= @@ -741,8 +743,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.1 h1:jyEFiXpy21Wm81FBN71l9VoMMV8H8jG+qIK3GCpY6Qs= github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=