add retry to nnc update during scaledown (#1970)

* add retry to nnc update during scaledown

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>

* test for panic in pool monitor

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>

---------

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>

Parent: 92abf0306b
Commit: 40022dab86
@@ -13,6 +13,7 @@ import (
 	"github.com/Azure/azure-container-networking/cns/types"
 	"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
 	"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
+	"github.com/avast/retry-go/v4"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
 )
@@ -334,10 +335,20 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 	tempNNCSpec.RequestedIPCount -= int64(len(pendingIPAddresses))
 	logger.Printf("[ipam-pool-monitor] Decreasing pool size, pool %+v, spec %+v", state, tempNNCSpec)

-	_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
-	if err != nil {
-		// caller will retry to update the CRD again
-		return errors.Wrap(err, "executing UpdateSpec with NNC client")
+	attempts := 0
+	if err := retry.Do(func() error {
+		attempts++
+		_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
+		if err != nil {
+			// caller will retry to update the CRD again
+			logger.Printf("failed to update NNC spec attempt #%d, err: %v", attempts, err)
+			return errors.Wrap(err, "executing UpdateSpec with NNC client")
+		}
+		logger.Printf("successfully updated NNC spec attempt #%d", attempts)
+		return nil
+	}, retry.Attempts(5), retry.DelayType(retry.BackOffDelay)); err != nil { //nolint:gomnd // ignore retry magic number
+		logger.Errorf("all attempts failed to update NNC during scale-down, state is corrupt: %v", err)
+		panic(err)
 	}

 	logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -350,7 +361,6 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
 	// clear the updatingPendingIpsNotInUse, as we have Updated the CRD
 	logger.Printf("[ipam-pool-monitor] cleaning the updatingPendingIpsNotInUse, existing length %d", pm.metastate.notInUseCount)
 	pm.metastate.notInUseCount = 0
-
 	return nil
 }
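For reference, the new error handling is built on retry-go/v4's `retry.Do`, which invokes a closure until it succeeds or the attempt budget is spent, sleeping between attempts according to the configured delay strategy (`retry.BackOffDelay` doubles the delay after each failure). A minimal, self-contained sketch of the pattern — the flaky operation here is a placeholder, not code from this repo:

```go
package main

import (
	"fmt"

	"github.com/avast/retry-go/v4"
)

func main() {
	attempts := 0
	err := retry.Do(
		func() error {
			attempts++
			// Placeholder for the real operation, e.g. publishing an updated spec.
			if attempts < 3 {
				return fmt.Errorf("transient failure on attempt %d", attempts)
			}
			return nil
		},
		retry.Attempts(5),                   // give up after 5 tries
		retry.DelayType(retry.BackOffDelay), // exponential backoff between tries
	)
	fmt.Printf("attempts=%d err=%v\n", attempts, err) // attempts=3 err=<nil>
}
```

Note the deliberate change in failure mode: the old code returned the wrapped error and left the retry to the caller, whereas the new code panics once all five attempts fail. By that point the IPs have already been marked not-in-use locally, so an NNC spec that never reaches the API server would leave CNS and the CRD out of sync (the "state is corrupt" log line); panicking presumably lets the process restart and reconcile from a clean state.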
@@ -2,6 +2,7 @@ package ipampool

 import (
 	"context"
+	"errors"
 	"testing"
 	"time"
@@ -21,6 +22,12 @@ func (f *fakeNodeNetworkConfigUpdater) UpdateSpec(ctx context.Context, spec *v1a
 	return f.nnc, nil
 }

+type fakeNodeNetworkConfigUpdaterFunc func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error)
+
+func (f fakeNodeNetworkConfigUpdaterFunc) UpdateSpec(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
+	return f(ctx, spec)
+}
+
 type directUpdatePoolMonitor struct {
 	m *Monitor
 	cns.IPAMPoolMonitor
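The new `fakeNodeNetworkConfigUpdaterFunc` is the classic Go function-adapter idiom (the same shape as `net/http`'s `http.HandlerFunc`): a named function type whose method simply calls the function, so any closure with the matching signature satisfies the interface. A generic sketch of the idiom — the `Updater`/`UpdaterFunc` names are hypothetical stand-ins:

```go
package main

import (
	"errors"
	"fmt"
)

// Updater is a one-method interface, analogous to nodeNetworkConfigSpecUpdater.
type Updater interface {
	Update(spec string) (string, error)
}

// UpdaterFunc adapts a bare function to Updater, just as
// fakeNodeNetworkConfigUpdaterFunc adapts a closure to UpdateSpec.
type UpdaterFunc func(spec string) (string, error)

func (f UpdaterFunc) Update(spec string) (string, error) { return f(spec) }

func main() {
	// An inline failing stub, handy for injecting errors in tests.
	var failing Updater = UpdaterFunc(func(spec string) (string, error) {
		return "", errors.New("injected failure")
	})
	_, err := failing.Update("spec-v1")
	fmt.Println(err) // injected failure
}
```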
@@ -45,7 +52,7 @@ type testState struct {
 	totalIPs int64
 }

-func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
+func initFakes(state testState, nnccli nodeNetworkConfigSpecUpdater) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
 	logger.InitLogger("testlogs", 0, 0, "./")

 	scalarUnits := v1alpha.Scaler{
@@ -61,8 +68,11 @@ func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
 	}
 	fakecns := fakes.NewHTTPServiceFake()
 	fakerc := fakes.NewRequestControllerFake(fakecns, scalarUnits, subnetaddresspace, state.totalIPs)
+	if nnccli == nil {
+		nnccli = &fakeNodeNetworkConfigUpdater{fakerc.NNC}
+	}

-	poolmonitor := NewMonitor(fakecns, &fakeNodeNetworkConfigUpdater{fakerc.NNC}, nil, &Options{RefreshDelay: 100 * time.Second})
+	poolmonitor := NewMonitor(fakecns, nnccli, nil, &Options{RefreshDelay: 100 * time.Second})
 	poolmonitor.metastate = metaState{
 		batch: state.batch,
 		max:   state.max,
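The reworked `initFakes` follows a common nil-means-default injection pattern: callers that don't care pass `nil` and get the standard fake, while a test that needs to simulate failure supplies its own updater. A small sketch of the pattern under hypothetical names:

```go
package main

import "fmt"

// Clock is a hypothetical dependency used to illustrate the pattern.
type Clock interface{ Now() string }

type realClock struct{}

func (realClock) Now() string { return "real time" }

// newService defaults the dependency when the caller passes nil,
// mirroring how initFakes falls back to fakeNodeNetworkConfigUpdater.
func newService(c Clock) Clock {
	if c == nil {
		c = realClock{}
	}
	return c
}

func main() {
	fmt.Println(newService(nil).Now()) // real time
}
```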
@@ -127,7 +137,7 @@ func TestPoolSizeIncrease(t *testing.T) {
 	for _, tt := range tests {
 		tt := tt
 		t.Run(tt.name, func(t *testing.T) {
-			fakecns, fakerc, poolmonitor := initFakes(tt.in)
+			fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
 			assert.NoError(t, fakerc.Reconcile(true))

 			// When poolmonitor reconcile is called, trigger increase and cache goal state

@@ -162,7 +172,7 @@ func TestPoolIncreaseDoesntChangeWhenIncreaseIsAlreadyInProgress(t *testing.T) {
 		max: 30,
 	}

-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// When poolmonitor reconcile is called, trigger increase and cache goal state

@@ -201,7 +211,7 @@ func TestPoolSizeIncreaseIdempotency(t *testing.T) {
 		max: 30,
 	}

-	_, fakerc, poolmonitor := initFakes(initState)
+	_, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// When poolmonitor reconcile is called, trigger increase and cache goal state

@@ -227,7 +237,7 @@ func TestPoolIncreasePastNodeLimit(t *testing.T) {
 		max: 30,
 	}

-	_, fakerc, poolmonitor := initFakes(initState)
+	_, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// When poolmonitor reconcile is called, trigger increase and cache goal state

@@ -247,7 +257,7 @@ func TestPoolIncreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
 		max: 30,
 	}

-	_, fakerc, poolmonitor := initFakes(initState)
+	_, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// When poolmonitor reconcile is called, trigger increase and cache goal state

@@ -267,7 +277,7 @@ func TestIncreaseWithPendingRelease(t *testing.T) {
 		max:            250,
 		pendingRelease: 16,
 	}
-	_, rc, mon := initFakes(initState)
+	_, rc, mon := initFakes(initState, nil)
 	assert.NoError(t, rc.Reconcile(true))
 	assert.NoError(t, mon.reconcile(context.Background()))
 	assert.Equal(t, int64(32), mon.spec.RequestedIPCount)

@@ -333,7 +343,7 @@ func TestPoolDecrease(t *testing.T) {
 	for _, tt := range tests {
 		tt := tt
 		t.Run(tt.name, func(t *testing.T) {
-			fakecns, fakerc, poolmonitor := initFakes(tt.in)
+			fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
 			assert.NoError(t, fakerc.Reconcile(true))

 			// Decrease the number of allocated IPs down to target. This may trigger a scale down.

@@ -374,7 +384,7 @@ func TestPoolSizeDecreaseWhenDecreaseHasAlreadyBeenRequested(t *testing.T) {
 		max: 30,
 	}

-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// Pool monitor does nothing, as the current number of IPs falls in the threshold

@@ -413,7 +423,7 @@ func TestDecreaseAndIncreaseToSameCount(t *testing.T) {
 		max: 30,
 	}

-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))
 	assert.NoError(t, poolmonitor.reconcile(context.Background()))
 	assert.EqualValues(t, 20, poolmonitor.spec.RequestedIPCount)

@@ -458,7 +468,7 @@ func TestPoolSizeDecreaseToReallyLow(t *testing.T) {
 		max: 30,
 	}

-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// Pool monitor does nothing, as the current number of IPs falls in the threshold

@@ -499,7 +509,7 @@ func TestDecreaseAfterNodeLimitReached(t *testing.T) {
 		releaseThresholdPercent: 150,
 		max:                     30,
 	}
-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	assert.NoError(t, poolmonitor.reconcile(context.Background()))

@@ -524,7 +534,7 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
 		totalIPs: 64,
 		max:      250,
 	}
-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	fakerc.NNC.Spec.RequestedIPCount = 48
 	assert.NoError(t, fakerc.Reconcile(true))

@@ -547,6 +557,35 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
 	assert.Len(t, poolmonitor.spec.IPsNotInUse, int(initState.batch)+int(initState.pendingRelease))
 }

+func TestDecreaseWithAPIServerFailure(t *testing.T) {
+	initState := testState{
+		batch:                   16,
+		assigned:                46,
+		allocated:               64,
+		pendingRelease:          0,
+		requestThresholdPercent: 50,
+		releaseThresholdPercent: 150,
+		totalIPs:                64,
+		max:                     250,
+	}
+	var errNNCCLi fakeNodeNetworkConfigUpdaterFunc = func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
+		return nil, errors.New("fake APIServer failure") //nolint:goerr113 // this is a fake error
+	}
+
+	fakecns, fakerc, poolmonitor := initFakes(initState, errNNCCLi)
+	fakerc.NNC.Spec.RequestedIPCount = initState.totalIPs
+	assert.NoError(t, fakerc.Reconcile(true))
+
+	assert.NoError(t, poolmonitor.reconcile(context.Background()))
+
+	// release some IPs
+	assert.NoError(t, fakecns.SetNumberOfAssignedIPs(40))
+	// check that the pool monitor panics if it is not able to publish the updated NNC
+	assert.Panics(t, func() {
+		_ = poolmonitor.reconcile(context.Background())
+	})
+}
+
 func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
 	initState := testState{
 		batch: 31,

@@ -557,7 +596,7 @@ func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
 		max: 30,
 	}

-	fakecns, fakerc, poolmonitor := initFakes(initState)
+	fakecns, fakerc, poolmonitor := initFakes(initState, nil)
 	assert.NoError(t, fakerc.Reconcile(true))

 	// When poolmonitor reconcile is called, trigger increase and cache goal state
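The new test's key assertion is testify's `assert.Panics`, which runs the supplied func and fails the test if it returns without panicking. A self-contained sketch of that assertion style — `mustPublish` is a placeholder for the panicking scale-down path, not code from this repo:

```go
package main

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// mustPublish stands in for a publish step that panics when every retry fails.
func mustPublish(ok bool) {
	if !ok {
		panic("all attempts failed to update NNC")
	}
}

func TestMustPublish(t *testing.T) {
	// Fails the test unless the func panics.
	assert.Panics(t, func() { mustPublish(false) })
	// The inverse: fails the test if the func does panic.
	assert.NotPanics(t, func() { mustPublish(true) })
}
```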
go.mod

@@ -10,6 +10,7 @@ require (
 	github.com/Microsoft/go-winio v0.4.17
 	github.com/Microsoft/hcsshim v0.8.23
 	github.com/avast/retry-go/v3 v3.1.1
+	github.com/avast/retry-go/v4 v4.3.4
 	github.com/billgraziano/dpapi v0.4.0
 	github.com/containernetworking/cni v1.1.2
 	github.com/docker/libnetwork v0.8.0-dev.2.0.20210525090646-64b7a4574d14

@@ -30,7 +31,7 @@ require (
 	github.com/spf13/cobra v1.6.1
 	github.com/spf13/pflag v1.0.5
 	github.com/spf13/viper v1.14.0
-	github.com/stretchr/testify v1.8.1
+	github.com/stretchr/testify v1.8.2
 	go.uber.org/zap v1.24.0
 	golang.org/x/sys v0.6.0
 	google.golang.org/grpc v1.52.0
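Note that `github.com/avast/retry-go/v3` stays in the require block alongside the newly added `/v4`. Under Go's semantic import versioning, each major version is a distinct module path, so the two can coexist while other packages in the repo continue to use v3; a file that needed both would simply import them under different names:

```go
import (
	retryv3 "github.com/avast/retry-go/v3"
	retryv4 "github.com/avast/retry-go/v4"
)
```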
go.sum

@@ -89,6 +89,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
 github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
 github.com/avast/retry-go/v3 v3.1.1 h1:49Scxf4v8PmiQ/nY0aY3p0hDueqSmc7++cBbtiDGu2g=
 github.com/avast/retry-go/v3 v3.1.1/go.mod h1:6cXRK369RpzFL3UQGqIUp9Q7GDrams+KsYWrfNA1/nQ=
+github.com/avast/retry-go/v4 v4.3.4 h1:pHLkL7jvCvP317I8Ge+Km2Yhntv3SdkJm7uekkqbKhM=
+github.com/avast/retry-go/v4 v4.3.4/go.mod h1:rv+Nla6Vk3/ilU0H51VHddWHiwimzX66yZ0JT6T+UvE=
 github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0=
 github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
 github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=

@@ -741,8 +743,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
+github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/subosito/gotenv v1.4.1 h1:jyEFiXpy21Wm81FBN71l9VoMMV8H8jG+qIK3GCpY6Qs=
 github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0=
 github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=