add retry to nnc update during scaledown (#1970)

* add retry to nnc update during scaledown

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>

* test for panic in pool monitor

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>

---------

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>
This commit is contained in:
Evan Baker 2023-05-22 12:01:09 -05:00 коммит произвёл GitHub
Родитель 92abf0306b
Коммит 40022dab86
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 75 добавлений и 23 удалений

Просмотреть файл

@ -13,6 +13,7 @@ import (
"github.com/Azure/azure-container-networking/cns/types"
"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
"github.com/avast/retry-go/v4"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)
@ -334,10 +335,20 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
tempNNCSpec.RequestedIPCount -= int64(len(pendingIPAddresses))
logger.Printf("[ipam-pool-monitor] Decreasing pool size, pool %+v, spec %+v", state, tempNNCSpec)
_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
if err != nil {
// caller will retry to update the CRD again
return errors.Wrap(err, "executing UpdateSpec with NNC client")
attempts := 0
if err := retry.Do(func() error {
attempts++
_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
if err != nil {
// caller will retry to update the CRD again
logger.Printf("failed to update NNC spec attempt #%d, err: %v", attempts, err)
return errors.Wrap(err, "executing UpdateSpec with NNC client")
}
logger.Printf("successfully updated NNC spec attempt #%d", attempts)
return nil
}, retry.Attempts(5), retry.DelayType(retry.BackOffDelay)); err != nil { //nolint:gomnd // ignore retry magic number
logger.Errorf("all attempts failed to update NNC during scale-down, state is corrupt: %v", err)
panic(err)
}
logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@ -350,7 +361,6 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
// clear the updatingPendingIpsNotInUse, as we have Updated the CRD
logger.Printf("[ipam-pool-monitor] cleaning the updatingPendingIpsNotInUse, existing length %d", pm.metastate.notInUseCount)
pm.metastate.notInUseCount = 0
return nil
}

Просмотреть файл

@ -2,6 +2,7 @@ package ipampool
import (
"context"
"errors"
"testing"
"time"
@ -21,6 +22,12 @@ func (f *fakeNodeNetworkConfigUpdater) UpdateSpec(ctx context.Context, spec *v1a
return f.nnc, nil
}
// fakeNodeNetworkConfigUpdaterFunc adapts a bare function to the NNC spec
// updater interface expected by initFakes (http.HandlerFunc-style adapter),
// letting tests inject arbitrary UpdateSpec behavior such as forced failures.
type fakeNodeNetworkConfigUpdaterFunc func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error)
// UpdateSpec satisfies the updater interface by invoking the wrapped function.
func (f fakeNodeNetworkConfigUpdaterFunc) UpdateSpec(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
	return f(ctx, spec)
}
type directUpdatePoolMonitor struct {
m *Monitor
cns.IPAMPoolMonitor
@ -45,7 +52,7 @@ type testState struct {
totalIPs int64
}
func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
func initFakes(state testState, nnccli nodeNetworkConfigSpecUpdater) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
logger.InitLogger("testlogs", 0, 0, "./")
scalarUnits := v1alpha.Scaler{
@ -61,8 +68,11 @@ func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControlle
}
fakecns := fakes.NewHTTPServiceFake()
fakerc := fakes.NewRequestControllerFake(fakecns, scalarUnits, subnetaddresspace, state.totalIPs)
if nnccli == nil {
nnccli = &fakeNodeNetworkConfigUpdater{fakerc.NNC}
}
poolmonitor := NewMonitor(fakecns, &fakeNodeNetworkConfigUpdater{fakerc.NNC}, nil, &Options{RefreshDelay: 100 * time.Second})
poolmonitor := NewMonitor(fakecns, nnccli, nil, &Options{RefreshDelay: 100 * time.Second})
poolmonitor.metastate = metaState{
batch: state.batch,
max: state.max,
@ -127,7 +137,7 @@ func TestPoolSizeIncrease(t *testing.T) {
for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
fakecns, fakerc, poolmonitor := initFakes(tt.in)
fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state
@ -162,7 +172,7 @@ func TestPoolIncreaseDoesntChangeWhenIncreaseIsAlreadyInProgress(t *testing.T) {
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state
@ -201,7 +211,7 @@ func TestPoolSizeIncreaseIdempotency(t *testing.T) {
max: 30,
}
_, fakerc, poolmonitor := initFakes(initState)
_, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state
@ -227,7 +237,7 @@ func TestPoolIncreasePastNodeLimit(t *testing.T) {
max: 30,
}
_, fakerc, poolmonitor := initFakes(initState)
_, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state
@ -247,7 +257,7 @@ func TestPoolIncreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
max: 30,
}
_, fakerc, poolmonitor := initFakes(initState)
_, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state
@ -267,7 +277,7 @@ func TestIncreaseWithPendingRelease(t *testing.T) {
max: 250,
pendingRelease: 16,
}
_, rc, mon := initFakes(initState)
_, rc, mon := initFakes(initState, nil)
assert.NoError(t, rc.Reconcile(true))
assert.NoError(t, mon.reconcile(context.Background()))
assert.Equal(t, int64(32), mon.spec.RequestedIPCount)
@ -333,7 +343,7 @@ func TestPoolDecrease(t *testing.T) {
for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
fakecns, fakerc, poolmonitor := initFakes(tt.in)
fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
assert.NoError(t, fakerc.Reconcile(true))
// Decrease the number of allocated IPs down to target. This may trigger a scale down.
@ -374,7 +384,7 @@ func TestPoolSizeDecreaseWhenDecreaseHasAlreadyBeenRequested(t *testing.T) {
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// Pool monitor does nothing, as the current number of IPs falls in the threshold
@ -413,7 +423,7 @@ func TestDecreaseAndIncreaseToSameCount(t *testing.T) {
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
assert.NoError(t, poolmonitor.reconcile(context.Background()))
assert.EqualValues(t, 20, poolmonitor.spec.RequestedIPCount)
@ -458,7 +468,7 @@ func TestPoolSizeDecreaseToReallyLow(t *testing.T) {
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// Pool monitor does nothing, as the current number of IPs falls in the threshold
@ -499,7 +509,7 @@ func TestDecreaseAfterNodeLimitReached(t *testing.T) {
releaseThresholdPercent: 150,
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
assert.NoError(t, poolmonitor.reconcile(context.Background()))
@ -524,7 +534,7 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
totalIPs: 64,
max: 250,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
fakerc.NNC.Spec.RequestedIPCount = 48
assert.NoError(t, fakerc.Reconcile(true))
@ -547,6 +557,35 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
assert.Len(t, poolmonitor.spec.IPsNotInUse, int(initState.batch)+int(initState.pendingRelease))
}
// TestDecreaseWithAPIServerFailure verifies that the pool monitor panics
// during a scale-down when every attempt to publish the updated NNC spec
// fails: at that point the IPs have been marked pending-release locally but
// the API server never saw the change, so local state is unrecoverable.
func TestDecreaseWithAPIServerFailure(t *testing.T) {
	state := testState{
		batch:                   16,
		assigned:                46,
		allocated:               64,
		pendingRelease:          0,
		requestThresholdPercent: 50,
		releaseThresholdPercent: 150,
		totalIPs:                64,
		max:                     250,
	}

	// NNC client that simulates an unreachable API server on every call.
	failingNNCClient := fakeNodeNetworkConfigUpdaterFunc(func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
		return nil, errors.New("fake APIServer failure") //nolint:goerr113 // this is a fake error
	})

	cns, rc, monitor := initFakes(state, failingNNCClient)
	rc.NNC.Spec.RequestedIPCount = state.totalIPs
	assert.NoError(t, rc.Reconcile(true))
	assert.NoError(t, monitor.reconcile(context.Background()))

	// Drop assigned IPs low enough to trigger a scale-down on the next pass.
	assert.NoError(t, cns.SetNumberOfAssignedIPs(40))

	// With the NNC update permanently failing, reconcile must panic rather
	// than continue with corrupt state.
	assert.Panics(t, func() {
		_ = monitor.reconcile(context.Background())
	})
}
func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
initState := testState{
batch: 31,
@ -557,7 +596,7 @@ func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
max: 30,
}
fakecns, fakerc, poolmonitor := initFakes(initState)
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
assert.NoError(t, fakerc.Reconcile(true))
// When poolmonitor reconcile is called, trigger increase and cache goal state

3
go.mod
Просмотреть файл

@ -10,6 +10,7 @@ require (
github.com/Microsoft/go-winio v0.4.17
github.com/Microsoft/hcsshim v0.8.23
github.com/avast/retry-go/v3 v3.1.1
github.com/avast/retry-go/v4 v4.3.4
github.com/billgraziano/dpapi v0.4.0
github.com/containernetworking/cni v1.1.2
github.com/docker/libnetwork v0.8.0-dev.2.0.20210525090646-64b7a4574d14
@ -30,7 +31,7 @@ require (
github.com/spf13/cobra v1.6.1
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.14.0
github.com/stretchr/testify v1.8.1
github.com/stretchr/testify v1.8.2
go.uber.org/zap v1.24.0
golang.org/x/sys v0.6.0
google.golang.org/grpc v1.52.0

6
go.sum
Просмотреть файл

@ -89,6 +89,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/avast/retry-go/v3 v3.1.1 h1:49Scxf4v8PmiQ/nY0aY3p0hDueqSmc7++cBbtiDGu2g=
github.com/avast/retry-go/v3 v3.1.1/go.mod h1:6cXRK369RpzFL3UQGqIUp9Q7GDrams+KsYWrfNA1/nQ=
github.com/avast/retry-go/v4 v4.3.4 h1:pHLkL7jvCvP317I8Ge+Km2Yhntv3SdkJm7uekkqbKhM=
github.com/avast/retry-go/v4 v4.3.4/go.mod h1:rv+Nla6Vk3/ilU0H51VHddWHiwimzX66yZ0JT6T+UvE=
github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
@ -741,8 +743,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/subosito/gotenv v1.4.1 h1:jyEFiXpy21Wm81FBN71l9VoMMV8H8jG+qIK3GCpY6Qs=
github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0=
github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=