fix: [Linux] [NPM] handle #2977 for netpols without cidrs (#2990)

* fix: [Linux] [NPM] handle #2977 for netpols without cidrs

Signed-off-by: Hunter Gregory <42728408+huntergregory@users.noreply.github.com>

* fix: lock and no need to track policy key

Signed-off-by: Hunter Gregory <42728408+huntergregory@users.noreply.github.com>

* style: remove dead code

Signed-off-by: Hunter Gregory <42728408+huntergregory@users.noreply.github.com>

---------

Signed-off-by: Hunter Gregory <42728408+huntergregory@users.noreply.github.com>
This commit is contained in:
Hunter Gregory 2024-09-11 12:40:16 -07:00 коммит произвёл GitHub
Родитель 47b4329b06
Коммит 60ddd3531a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 62 добавлений и 32 удалений

2
.gitignore поставляемый
Просмотреть файл

@ -1,3 +1,5 @@
vendor/
# Binaries
out/*
output/*

Просмотреть файл

@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"strings"
"sync"
"time"
"github.com/Azure/azure-container-networking/common"
@ -18,12 +19,12 @@ import (
const (
reconcileDuration = time.Duration(5 * time.Minute)
contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolCIDRPrecaution = "ADD-NETPOL-CIDR-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolPrecaution = "ADD-NETPOL-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
)
var (
@ -48,6 +49,11 @@ type Config struct {
*policies.PolicyManagerCfg
}
type removePolicyInfo struct {
sync.Mutex
previousRemovePolicyIPSetsFailed bool
}
type DataPlane struct {
*Config
applyInBackground bool
@ -64,7 +70,10 @@ type DataPlane struct {
endpointQuery *endpointQuery
applyInfo *applyInfo
netPolQueue *netPolQueue
stopChannel <-chan struct{}
// removePolicyInfo tracks when a policy was removed yet had ApplyIPSet failures.
// This field is only relevant for Linux.
removePolicyInfo removePolicyInfo
stopChannel <-chan struct{}
}
func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChannel <-chan struct{}) (*DataPlane, error) {
@ -335,6 +344,9 @@ func (dp *DataPlane) applyDataPlaneNow(context string) error {
}
klog.Infof("[DataPlane] [ApplyDataPlane] [%s] finished applying ipsets", context)
// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)
if dp.applyInBackground {
dp.applyInfo.Lock()
dp.applyInfo.numBatches = 0
@ -472,26 +484,17 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
}
if !util.IsWindowsDP() {
for _, netPol := range netPols {
if !(netPol.HasCIDRRules() && dp.ipsetMgr.PreviousApplyFailed()) {
continue
}
if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a CIDR policy which is removed then readded", contextAddNetPolCIDRPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
break
}
if dp.hadRemovePolicyFailure() {
if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a policy which is removed then readded", contextAddNetPolPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
} else {
// prevent #2977
if err := dp.applyDataPlaneNow(contextAddNetPolCIDRPrecaution); err != nil {
if err := dp.applyDataPlaneNow(contextAddNetPolPrecaution); err != nil {
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}
break
}
}
@ -531,6 +534,9 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
klog.Infof("[DataPlane] [%s] finished applying ipsets", contextAddNetPolBootup)
// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)
dp.applyInfo.numBatches = 0
}
@ -627,7 +633,16 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error {
return err
}
return dp.applyDataPlaneNow(contextApplyDP)
if err := dp.applyDataPlaneNow(contextDelNetPol); err != nil {
// Failed to apply IPSets while removing this policy.
// Consider this removepolicy call a failure until apply IPSets is successful.
// Related to #2977
klog.Info("[DataPlane] remove policy has failed to apply ipsets. setting remove policy failure")
dp.setRemovePolicyFailure(true)
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}
return nil
}
// UpdatePolicy takes in updated policy object, calculates the delta and applies changes
@ -749,3 +764,23 @@ func (dp *DataPlane) deleteIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, n
}
return nil
}
func (dp *DataPlane) setRemovePolicyFailure(failed bool) {
if util.IsWindowsDP() {
return
}
dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
dp.removePolicyInfo.previousRemovePolicyIPSetsFailed = failed
}
func (dp *DataPlane) hadRemovePolicyFailure() bool {
if util.IsWindowsDP() {
return false
}
dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
return dp.removePolicyInfo.previousRemovePolicyIPSetsFailed
}

Просмотреть файл

@ -79,13 +79,6 @@ func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetMana
}
}
// PreviousApplyFailed is only relevant for Linux right now since Windows doesn't track failures
func (iMgr *IPSetManager) PreviousApplyFailed() bool {
iMgr.Lock()
defer iMgr.Unlock()
return iMgr.consecutiveApplyFailures > 0
}
/*
Reconcile removes empty/unreferenced sets from the cache.
For ApplyAllIPSets mode, those sets are added to the toDeleteCache.