CNS: Fix IPAM scaler down to handle incremental decrease (#731)

neaggarwMS 2020-11-16 16:47:28 -08:00 committed by GitHub
Parent f0907b4e82
Commit 28207fc493
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 112 additions and 20 deletions

View file

@@ -135,17 +135,19 @@ func (ipm *IPStateManager) MarkIPsAsPending(numberOfIPsToMark int) (map[string]c
     defer ipm.Unlock()
     var (
-        err            error
-        pendingRelease []cns.IPConfigurationStatus
+        err error
     )
+    pendingRelease := make(map[string]cns.IPConfigurationStatus)
     defer func() {
         // if there was an error, and not all ip's have been freed, restore state
         if err != nil && len(pendingRelease) != numberOfIPsToMark {
-            for i := range pendingRelease {
-                ipm.AvailableIPIDStack.Push(pendingRelease[i].ID)
-                ipm.AvailableIPConfigState[pendingRelease[i].ID] = pendingRelease[i]
-                delete(ipm.PendingReleaseIPConfigState, pendingRelease[i].ID)
+            for uuid, ipState := range pendingRelease {
+                ipState.State = cns.Available
+                ipm.AvailableIPIDStack.Push(pendingRelease[uuid].ID)
+                ipm.AvailableIPConfigState[pendingRelease[uuid].ID] = ipState
+                delete(ipm.PendingReleaseIPConfigState, pendingRelease[uuid].ID)
             }
         }
     }()
@@ -157,7 +159,10 @@ func (ipm *IPStateManager) MarkIPsAsPending(numberOfIPsToMark int) (map[string]c
         }
         // add all pending release to a slice
-        pendingRelease = append(pendingRelease, ipm.AvailableIPConfigState[id])
+        ipConfig := ipm.AvailableIPConfigState[id]
+        ipConfig.State = cns.PendingRelease
+        pendingRelease[id] = ipConfig
         delete(ipm.AvailableIPConfigState, id)
     }
@@ -166,7 +171,7 @@ func (ipm *IPStateManager) MarkIPsAsPending(numberOfIPsToMark int) (map[string]c
         ipm.PendingReleaseIPConfigState[pendingRelease[i].ID] = pendingRelease[i]
     }
-    return ipm.PendingReleaseIPConfigState, nil
+    return pendingRelease, nil
 }
 type HTTPServiceFake struct {
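
To make the new bookkeeping easier to follow outside the diff, here is a minimal, self-contained sketch of the map-based pending-release flow these hunks introduce. The types and fields are simplified stand-ins (there is no ID stack here, and ipConfig is not the real cns.IPConfigurationStatus); only the shape of the logic mirrors the change: IPs move into a map[string] of pending-release entries, their State flips to pending release, only the IPs moved by this call are returned, and a failed run restores both the state field and the available map.

package main

import (
	"errors"
	"fmt"
)

// Simplified stand-in for cns.IPConfigurationStatus.
type ipConfig struct {
	ID    string
	State string
}

const (
	stateAvailable      = "Available"
	statePendingRelease = "PendingRelease"
)

type ipStateManager struct {
	available      map[string]ipConfig // IPs free to hand out to pods
	pendingRelease map[string]ipConfig // IPs marked to be released back to DNC
}

// markIPsAsPending moves n IPs from available to pendingRelease and returns
// only the IPs moved by this call, mirroring the new `return pendingRelease, nil`.
// On partial failure it rolls back, flipping State back to Available as the new
// deferred cleanup does.
func (m *ipStateManager) markIPsAsPending(n int) (map[string]ipConfig, error) {
	moved := make(map[string]ipConfig)
	for id, cfg := range m.available {
		if len(moved) == n {
			break
		}
		cfg.State = statePendingRelease
		moved[id] = cfg
		m.pendingRelease[id] = cfg
		delete(m.available, id)
	}
	if len(moved) != n {
		for id, cfg := range moved { // restore state on partial failure
			cfg.State = stateAvailable
			m.available[id] = cfg
			delete(m.pendingRelease, id)
		}
		return nil, errors.New("not enough available IPs to mark as pending")
	}
	return moved, nil
}

func main() {
	m := &ipStateManager{
		available: map[string]ipConfig{
			"uuid-1": {ID: "uuid-1", State: stateAvailable},
			"uuid-2": {ID: "uuid-2", State: stateAvailable},
		},
		pendingRelease: make(map[string]ipConfig),
	}
	moved, err := m.markIPsAsPending(1)
	fmt.Println(len(moved), err, len(m.available), len(m.pendingRelease)) // 1 <nil> 1 1
}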

View file

@@ -107,12 +107,12 @@ func (pm *CNSIPAMPoolMonitor) increasePoolSize() error {
     pm.cachedNNC.Spec.RequestedIPCount += pm.scalarUnits.BatchSize
     // pass nil map to CNStoCRDSpec because we don't want to modify the to be deleted ipconfigs
-    pm.cachedNNC.Spec, err = MarkIPsAsPendingInCRD(nil, pm.cachedNNC.Spec.RequestedIPCount)
+    pm.cachedNNC.Spec, err = pm.createNNCSpecForCRD(false)
     if err != nil {
         return err
     }
-    logger.Printf("[ipam-pool-monitor] Increasing pool size, Current Pool Size: %v, Requested IP Count: %v, Pods with IP's:%v", len(pm.cns.GetPodIPConfigState()), pm.cachedNNC.Spec.RequestedIPCount, len(pm.cns.GetAllocatedIPConfigs()))
+    logger.Printf("[ipam-pool-monitor] Increasing pool size, Current Pool Size: %v, Requested IP Count: %v, Pods with IP's:%v, ToBeDeleted Count: %v", len(pm.cns.GetPodIPConfigState()), pm.cachedNNC.Spec.RequestedIPCount, len(pm.cns.GetAllocatedIPConfigs()), len(pm.cachedNNC.Spec.IPsNotInUse))
     return pm.rc.UpdateCRDSpec(context.Background(), pm.cachedNNC.Spec)
 }
@@ -124,18 +124,20 @@ func (pm *CNSIPAMPoolMonitor) decreasePoolSize() error {
     pm.cachedNNC.Spec.RequestedIPCount -= pm.scalarUnits.BatchSize
     // mark n number of IP's as pending
-    pendingIPAddresses, err := pm.cns.MarkIPsAsPending(int(pm.scalarUnits.BatchSize))
+    pendingIpAddresses, err := pm.cns.MarkIPsAsPending(int(pm.scalarUnits.BatchSize))
     if err != nil {
         return err
     }
+    logger.Printf("[ipam-pool-monitor] Updated Requested count %v, Releasing ips: %+v", pm.cachedNNC.Spec.RequestedIPCount, pendingIpAddresses)
     // convert the pending IP addresses to a spec
-    pm.cachedNNC.Spec, err = MarkIPsAsPendingInCRD(pendingIPAddresses, pm.cachedNNC.Spec.RequestedIPCount)
+    pm.cachedNNC.Spec, err = pm.createNNCSpecForCRD(false)
     if err != nil {
         return err
     }
+    pm.pendingRelease = true
-    logger.Printf("[ipam-pool-monitor] Decreasing pool size, Current Pool Size: %v, Requested IP Count: %v, Pods with IP's: %v", len(pm.cns.GetPodIPConfigState()), pm.cachedNNC.Spec.RequestedIPCount, len(pm.cns.GetAllocatedIPConfigs()))
+    logger.Printf("[ipam-pool-monitor] Decreasing pool size, Current Pool Size: %v, Requested IP Count: %v, Pods with IP's: %v, ToBeDeleted Count: %v", len(pm.cns.GetPodIPConfigState()), pm.cachedNNC.Spec.RequestedIPCount, len(pm.cns.GetAllocatedIPConfigs()), len(pm.cachedNNC.Spec.IPsNotInUse))
     return pm.rc.UpdateCRDSpec(context.Background(), pm.cachedNNC.Spec)
 }
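
Before moving on to cleanPendingRelease, here is roughly what the incremental decrease amounts to once the hunks above are combined (illustrative types only; this is not the real nnc.NodeNetworkConfigSpec or pool monitor): each decrease step shrinks the requested count by one batch, marks one more batch as pending in CNS, and rebuilds IPsNotInUse from everything still pending, so a second batch adds to the list instead of replacing the first.

package main

import "fmt"

// Illustrative stand-in for the CRD spec; not the real nnc.NodeNetworkConfigSpec.
type spec struct {
	RequestedIPCount int64
	IPsNotInUse      []string
}

func main() {
	const batchSize = 10
	var pendingInCNS []string // every IP CNS currently holds in PendingRelease
	s := spec{RequestedIPCount: 30}

	for step := 1; step <= 2; step++ {
		// decreasePoolSize: shrink the request by one batch ...
		s.RequestedIPCount -= batchSize
		// ... mark one more batch as pending in CNS ...
		for i := 0; i < batchSize; i++ {
			pendingInCNS = append(pendingInCNS, fmt.Sprintf("uuid-%d-%d", step, i))
		}
		// ... and rebuild IPsNotInUse from everything pending, not just this batch.
		s.IPsNotInUse = append([]string(nil), pendingInCNS...)
		fmt.Printf("after step %d: requested=%d notInUse=%d\n", step, s.RequestedIPCount, len(s.IPsNotInUse))
	}
	// Prints: requested=20 notInUse=10, then requested=10 notInUse=20.
}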
@@ -146,7 +148,7 @@ func (pm *CNSIPAMPoolMonitor) cleanPendingRelease() error {
     defer pm.mu.Unlock()
     var err error
-    pm.cachedNNC.Spec, err = MarkIPsAsPendingInCRD(nil, pm.cachedNNC.Spec.RequestedIPCount)
+    pm.cachedNNC.Spec, err = pm.createNNCSpecForCRD(true)
     if err != nil {
         logger.Printf("[ipam-pool-monitor] Failed to translate ")
     }
@@ -156,19 +158,22 @@ func (pm *CNSIPAMPoolMonitor) cleanPendingRelease() error {
 }
-// CNSToCRDSpec translates CNS's map of Ips to be released and requested ip count into a CRD Spec
-func MarkIPsAsPendingInCRD(toBeDeletedSecondaryIPConfigs map[string]cns.IPConfigurationStatus, ipCount int64) (nnc.NodeNetworkConfigSpec, error) {
+func (pm *CNSIPAMPoolMonitor) createNNCSpecForCRD(resetNotInUseList bool) (nnc.NodeNetworkConfigSpec, error) {
     var (
         spec nnc.NodeNetworkConfigSpec
-        uuid string
     )
-    spec.RequestedIPCount = ipCount
+    // Update the count from cached spec
+    spec.RequestedIPCount = pm.cachedNNC.Spec.RequestedIPCount
-    if toBeDeletedSecondaryIPConfigs == nil {
+    // Discard the ToBeDeleted list if requested. This happens if DNC has cleaned up the pending ips and CNS has also updated its state.
+    if resetNotInUseList == true {
         spec.IPsNotInUse = make([]string, 0)
     } else {
-        for uuid = range toBeDeletedSecondaryIPConfigs {
-            spec.IPsNotInUse = append(spec.IPsNotInUse, uuid)
+        // Get All Pending IPs from CNS and populate it again.
+        pendingIps := pm.cns.GetPendingReleaseIPConfigs()
+        for _, pendingIp := range pendingIps {
+            spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIp.ID)
         }
     }
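
A condensed view of what the new createNNCSpecForCRD does, using simplified stand-in types (the real code uses nnc.NodeNetworkConfigSpec, cns.IPConfigurationStatus, and pm.cns.GetPendingReleaseIPConfigs()): carry over the cached requested count, then either clear IPsNotInUse (after DNC has cleaned up) or repopulate it from every IP CNS still reports as pending release.

package main

import "fmt"

// Simplified stand-ins; the real types are nnc.NodeNetworkConfigSpec and cns.IPConfigurationStatus.
type nncSpec struct {
	RequestedIPCount int64
	IPsNotInUse      []string
}

type ipStatus struct{ ID string }

type poolMonitor struct {
	cachedRequestedIPCount int64
	pendingReleaseIPs      func() []ipStatus // stands in for pm.cns.GetPendingReleaseIPConfigs()
}

// createSpec mirrors the shape of createNNCSpecForCRD in the hunk above.
func (pm *poolMonitor) createSpec(resetNotInUseList bool) nncSpec {
	s := nncSpec{RequestedIPCount: pm.cachedRequestedIPCount}
	if resetNotInUseList {
		// DNC has already processed the released IPs; start with an empty list.
		s.IPsNotInUse = make([]string, 0)
		return s
	}
	// Otherwise repopulate the list from every IP still pending release in CNS.
	for _, ip := range pm.pendingReleaseIPs() {
		s.IPsNotInUse = append(s.IPsNotInUse, ip.ID)
	}
	return s
}

func main() {
	pm := &poolMonitor{
		cachedRequestedIPCount: 20,
		pendingReleaseIPs: func() []ipStatus {
			return []ipStatus{{ID: "uuid-1"}, {ID: "uuid-2"}}
		},
	}
	fmt.Println(pm.createSpec(false)) // {20 [uuid-1 uuid-2]}
	fmt.Println(pm.createSpec(true))  // {20 []}
}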

View file

@@ -313,3 +313,85 @@ func TestPoolSizeDecreaseWhenDecreaseHasAlreadyBeenRequested(t *testing.T) {
         t.Fatalf("Expected IP's not in use to be 0 after reconcile, expected %v, actual %v", (initialIPConfigCount - batchSize), len(poolmonitor.cachedNNC.Spec.IPsNotInUse))
     }
 }
+func TestPoolSizeDecreaseToReallyLow(t *testing.T) {
+    var (
+        batchSize               = 10
+        initialIPConfigCount    = 30
+        requestThresholdPercent = 30
+        releaseThresholdPercent = 100
+    )
+    fakecns, fakerc, poolmonitor := initFakes(batchSize, initialIPConfigCount, requestThresholdPercent, releaseThresholdPercent)
+    log.Printf("Min free IP's %v", poolmonitor.MinimumFreeIps)
+    log.Printf("Max free IP %v", poolmonitor.MaximumFreeIps)
+    // initial pool count is 30, set 23 of them to be allocated
+    err := fakecns.SetNumberOfAllocatedIPs(23)
+    if err != nil {
+        t.Error(err)
+    }
+    // Pool monitor does nothing, as the current number of IP's falls within the thresholds
+    err = poolmonitor.Reconcile()
+    if err != nil {
+        t.Errorf("Expected pool monitor to not fail after CNS set number of allocated IP's %v", err)
+    }
+    // Now drop the allocated count to really low, say 3. This should trigger release in 2 batches
+    err = fakecns.SetNumberOfAllocatedIPs(3)
+    if err != nil {
+        t.Error(err)
+    }
+    // Pool monitor should release a batch, as the free IP count now exceeds the release threshold
+    t.Logf("Reconcile after allocated count drops from 23 -> 3, expected free count = 10")
+    err = poolmonitor.Reconcile()
+    if err != nil {
+        t.Errorf("Expected pool monitor to not fail after CNS set number of allocated IP's %v", err)
+    }
+    // Ensure the spec now marks one batch of IP's as not in use
+    if len(poolmonitor.cachedNNC.Spec.IPsNotInUse) != batchSize {
+        t.Fatalf("Expected IP's not in use is not correct, expected %v, actual %v", batchSize, len(poolmonitor.cachedNNC.Spec.IPsNotInUse))
+    }
+    // Ensure the requested IP count is now one batch size smaller than the initial IP count
+    if poolmonitor.cachedNNC.Spec.RequestedIPCount != int64(initialIPConfigCount-batchSize) {
+        t.Fatalf("Expected pool size to be one batch size smaller after reconcile, expected %v, actual %v", (initialIPConfigCount - batchSize), poolmonitor.cachedNNC.Spec.RequestedIPCount)
+    }
+    // Reconcile again, it should release the second batch
+    t.Logf("Reconcile again - 2, expected free count = 20")
+    err = poolmonitor.Reconcile()
+    if err != nil {
+        t.Errorf("Expected pool monitor to not fail after CNS set number of allocated IP's %v", err)
+    }
+    // Ensure the spec now marks two batches of IP's as not in use
+    if len(poolmonitor.cachedNNC.Spec.IPsNotInUse) != batchSize*2 {
+        t.Fatalf("Expected IP's not in use is not correct, expected %v, actual %v", batchSize*2, len(poolmonitor.cachedNNC.Spec.IPsNotInUse))
+    }
+    // Ensure the requested IP count is now two batch sizes smaller than the initial IP count
+    if poolmonitor.cachedNNC.Spec.RequestedIPCount != int64(initialIPConfigCount-(batchSize*2)) {
+        t.Fatalf("Expected pool size to be two batch sizes smaller after reconcile, expected %v, actual %v", (initialIPConfigCount - (batchSize * 2)), poolmonitor.cachedNNC.Spec.RequestedIPCount)
+    }
+    t.Logf("Update Request Controller")
+    err = fakerc.Reconcile()
+    if err != nil {
+        t.Error(err)
+    }
+    err = poolmonitor.Reconcile()
+    if err != nil {
+        t.Errorf("Expected no pool monitor failure after request controller reconcile: %v", err)
+    }
+    // Ensure the spec doesn't have any IPsNotInUse after the request controller has reconciled
+    if len(poolmonitor.cachedNNC.Spec.IPsNotInUse) != 0 {
+        t.Fatalf("Expected IP's not in use to be 0 after reconcile, expected %v, actual %v", 0, len(poolmonitor.cachedNNC.Spec.IPsNotInUse))
+    }
+}
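
As a sanity check on the numbers in this test (the threshold formulas below are assumptions inferred from the "Min free IP's" / "Max free IP" log lines; the actual derivation lives in the pool monitor and is not part of this diff): with batchSize=10, requestThresholdPercent=30 and releaseThresholdPercent=100, the monitor aims for at least 3 and at most 10 free IPs, so dropping allocations from 23 to 3 leaves 27 free and takes two one-batch decreases, 30 -> 20 -> 10, which is what the two Reconcile calls assert.

package main

import "fmt"

func main() {
	const (
		batchSize               = 10
		requestThresholdPercent = 30
		releaseThresholdPercent = 100
	)
	// Assumed formulas matching the Min/Max free IP logs printed by the test.
	minFreeIPs := batchSize * requestThresholdPercent / 100 // 3
	maxFreeIPs := batchSize * releaseThresholdPercent / 100 // 10

	requested, allocated := 30, 3
	steps := 0
	// One batch-sized decrease per reconcile while free IPs exceed the release threshold.
	for requested-allocated > maxFreeIPs {
		requested -= batchSize
		steps++
	}
	fmt.Println(minFreeIPs, maxFreeIPs, requested, steps) // 3 10 10 2
}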