fix: use cached ctrlruntime client in IPAM pool monitor (#2043)

Signed-off-by: Evan Baker <rbtr@users.noreply.github.com>
This commit is contained in:
Evan Baker 2023-07-21 11:35:39 -05:00 committed by GitHub
Parent 6325924bf1
Commit 97fdf81f89
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 21 deletions
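Background for the change, as a minimal sketch rather than the CNS code itself (the node name and kubeconfig lookup are illustrative): controller-runtime offers a direct client, which performs a live API server request on every read, and the Manager's client, which serves reads from the shared informer cache but is only usable once the Manager has started. This commit moves the IPAM pool monitor from the former to the latter.

```go
// Minimal sketch, not the CNS code: contrasting a direct controller-runtime
// client with the Manager's cached client. "example-node" and the kubeconfig
// lookup are illustrative.
package main

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	cfg := ctrl.GetConfigOrDie()

	// Direct client: every Get/List is a live round trip to the API server.
	// Usable immediately, but expensive on hot paths like the pool monitor.
	direct, err := client.New(cfg, client.Options{})
	if err != nil {
		panic(err)
	}

	// Manager's client: reads are served from the shared informer cache,
	// which is cheap, but the cache only fills once mgr.Start is running.
	mgr, err := ctrl.NewManager(cfg, ctrl.Options{})
	if err != nil {
		panic(err)
	}
	cached := mgr.GetClient()

	ctx := context.Background()
	node := &corev1.Node{}
	_ = direct.Get(ctx, types.NamespacedName{Name: "example-node"}, node) // fine before Start

	// cached must not be relied on yet; the pool monitor satisfies this by not
	// touching its client until the NNC reconciler (which runs under the
	// Manager) pushes it an update.
	_ = cached

	_ = mgr.Start(ctx) // blocks; informer caches sync here
}
```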

View File

@@ -485,6 +485,10 @@ func (service *HTTPRestService) handleDebugPodContext(w http.ResponseWriter, r *
func (service *HTTPRestService) handleDebugRestData(w http.ResponseWriter, r *http.Request) {
service.RLock()
defer service.RUnlock()
if service.IPAMPoolMonitor == nil {
http.Error(w, "not ready", http.StatusServiceUnavailable)
return
}
resp := GetHTTPServiceDataResponse{
HTTPRestServiceData: HTTPRestServiceData{
PodIPIDByPodInterfaceKey: service.PodIPIDByPodInterfaceKey,

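Because the pool monitor is now constructed later in startup (only after the Manager is built), the debug endpoint above can be hit while service.IPAMPoolMonitor is still nil; the new guard returns 503 instead of panicking. A hypothetical stand-alone sketch of the same pattern (the Server type and Status method are illustrative, not the CNS types):

```go
// Hypothetical sketch of the "503 until ready" guard added above; Server and
// its PoolMonitor field are illustrative, not the CNS types.
package main

import (
	"log"
	"net/http"
	"sync"
)

type Server struct {
	sync.RWMutex
	PoolMonitor interface{ Status() string } // wired up late in startup, may still be nil
}

func (s *Server) handleDebug(w http.ResponseWriter, r *http.Request) {
	s.RLock()
	defer s.RUnlock()
	// Fail fast with 503 instead of dereferencing a nil dependency that is
	// only constructed after the Manager is built.
	if s.PoolMonitor == nil {
		http.Error(w, "not ready", http.StatusServiceUnavailable)
		return
	}
	_, _ = w.Write([]byte(s.PoolMonitor.Status()))
}

func main() {
	s := &Server{} // PoolMonitor intentionally left nil to exercise the guard
	http.HandleFunc("/debug", s.handleDebug)
	log.Fatal(http.ListenAndServe("localhost:8080", nil))
}
```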
View File

@@ -1147,20 +1147,12 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
if err != nil {
return errors.Wrap(err, "failed to create ctrl client")
}
nnccli := nodenetworkconfig.NewClient(directcli)
directnnccli := nodenetworkconfig.NewClient(directcli)
if err != nil {
return errors.Wrap(err, "failed to create NNC client")
}
// TODO(rbtr): nodename and namespace should be in the cns config
scopedcli := nncctrl.NewScopedClient(nnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})
clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)
// initialize the ipam pool monitor
poolOpts := ipampool.Options{
RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
}
poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, scopedcli, clusterSubnetStateChan, &poolOpts)
httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor
directscopedcli := nncctrl.NewScopedClient(directnnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})
logger.Printf("Reconciling initial CNS state")
// apiserver nnc might not be registered or api server might be down and crashloop backoff puts us outside of 5-10 minutes we have for
@@ -1170,7 +1162,7 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
err = retry.Do(func() error {
attempt++
logger.Printf("reconciling initial CNS state attempt: %d", attempt)
err = reconcileInitialCNSState(ctx, scopedcli, httpRestServiceImplementation, podInfoByIPProvider)
err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
if err != nil {
logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
}
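The retry loop above bounds how long CNS spends reconciling its initial state before giving up. The retry helper's import and options are not shown in this hunk; assuming the widely used github.com/avast/retry-go/v4 (the option values below are assumptions, not taken from the repo), the pattern looks roughly like this:

```go
// Rough sketch of the retry pattern above, assuming github.com/avast/retry-go/v4;
// the attempt count, delay, and reconcile body are assumptions, not the repo's values.
package main

import (
	"errors"
	"log"
	"time"

	retry "github.com/avast/retry-go/v4"
)

func reconcileInitialState() error {
	// stand-in for reconcileInitialCNSState: fails while the apiserver or the
	// NNC CRD is not ready yet.
	return errors.New("apiserver not ready")
}

func main() {
	attempt := 0
	err := retry.Do(
		func() error {
			attempt++
			log.Printf("reconciling initial CNS state attempt: %d", attempt)
			return reconcileInitialState()
		},
		retry.Attempts(10),                // bounded so startup fails within the window
		retry.Delay(500*time.Millisecond), // assumed delay
		retry.DelayType(retry.FixedDelay),
	)
	if err != nil {
		log.Fatalf("failed to reconcile initial CNS state after %d attempts: %v", attempt, err)
	}
	log.Printf("reconciled initial CNS state after %d attempts", attempt)
}
```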
@@ -1181,16 +1173,6 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
}
logger.Printf("reconciled initial CNS state after %d attempts", attempt)
// start the pool Monitor before the Reconciler, since it needs to be ready to receive an
// NodeNetworkConfig update by the time the Reconciler tries to send it.
go func() {
logger.Printf("Starting IPAM Pool Monitor")
if e := poolMonitor.Start(ctx); e != nil {
logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
}
}()
logger.Printf("initialized and started IPAM pool monitor")
// the nodeScopedCache sets Selector options on the Manager cache which are used
// to perform *server-side* filtering of the cached objects. This is very important
// for high node/pod count clusters, as it keeps us from watching objects at the
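The nodeScopedCache mentioned above is the repo's own cache builder; as a rough illustration of server-side cache filtering in general (assuming controller-runtime v0.15+ where manager.Options exposes a Cache field, and using Pods filtered by spec.nodeName purely as an example object), the idea looks like this:

```go
// Generic sketch of server-side cache filtering; not the repo's nodeScopedCache.
// Assumes controller-runtime v0.15+ (cache.Options.ByObject); the Pod filter is
// only an example of scoping the watch to this node.
package main

import (
	"context"
	"log"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/fields"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	nodeName := "example-node" // illustrative; CNS reads this from its environment

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Cache: cache.Options{
			// The informers only LIST/WATCH objects matching these selectors,
			// so both the local cache and the watch traffic stay small even on
			// very large clusters.
			ByObject: map[client.Object]cache.ByObject{
				&corev1.Pod{}: {Field: fields.OneTermEqualSelector("spec.nodeName", nodeName)},
			},
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	if err := mgr.Start(context.Background()); err != nil {
		log.Fatal(err)
	}
}
```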
@@ -1220,6 +1202,25 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
return errors.Wrap(err, "failed to create manager")
}
// Build the IPAM Pool monitor
clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)
// this cachedscopedclient is built using the Manager's cached client, which is
// NOT SAFE TO USE UNTIL THE MANAGER IS STARTED!
// This is okay because it is only used to build the IPAMPoolMonitor, which does not
// attempt to use the client until it has received a NodeNetworkConfig to update, and
// that can only happen once the Manager has started and the NodeNetworkConfig
// reconciler has pushed the Monitor a NodeNetworkConfig.
cachedscopedcli := nncctrl.NewScopedClient(nodenetworkconfig.NewClient(manager.GetClient()), types.NamespacedName{Namespace: "kube-system", Name: nodeName})
poolOpts := ipampool.Options{
RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
}
poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, cachedscopedcli, clusterSubnetStateChan, &poolOpts)
httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor
// Start building the NNC Reconciler
// get our Node so that we can xref it against the NodeNetworkConfig's to make sure that the
// NNC is not stale and represents the Node we're running on.
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
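Both directscopedcli and cachedscopedcli wrap an NNC client so that every call targets the single NodeNetworkConfig named after this node in kube-system. A hypothetical wrapper illustrating the idea (not the nncctrl package's actual implementation; a ConfigMap stands in for the NNC type):

```go
// Hypothetical sketch of a client scoped to one fixed NamespacedName, in the
// spirit of nncctrl.NewScopedClient; the wrapper and the ConfigMap stand-in
// are assumptions, not the repo's implementation.
package main

import (
	"context"
	"log"
	"os"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ScopedClient pins every call to a single key, so callers like the pool
// monitor never need to know which object they operate on.
type ScopedClient struct {
	cli client.Client
	key types.NamespacedName
}

func NewScopedClient(cli client.Client, key types.NamespacedName) *ScopedClient {
	return &ScopedClient{cli: cli, key: key}
}

func (s *ScopedClient) Get(ctx context.Context, obj client.Object) error {
	return s.cli.Get(ctx, s.key, obj)
}

func main() {
	cli, err := client.New(ctrl.GetConfigOrDie(), client.Options{})
	if err != nil {
		log.Fatal(err)
	}
	nodeName := os.Getenv("NODENAME") // illustrative; CNS derives this from its config

	scoped := NewScopedClient(cli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})
	cm := &corev1.ConfigMap{} // a ConfigMap stands in for the NodeNetworkConfig CRD here
	if err := scoped.Get(context.Background(), cm); err != nil {
		log.Printf("scoped get failed: %v", err)
	}
}
```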
@@ -1252,6 +1253,16 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
httpRestServiceImplementation.RegisterPProfEndpoints()
}
// start the pool Monitor before the Reconciler, since it needs to be ready to receive an
// NodeNetworkConfig update by the time the Reconciler tries to send it.
go func() {
logger.Printf("Starting IPAM Pool Monitor")
if e := poolMonitor.Start(ctx); e != nil {
logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
}
}()
logger.Printf("initialized and started IPAM pool monitor")
// Start the Manager which starts the reconcile loop.
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
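The ordering the comments describe, sketched as a toy program (the monitor shape, Update method, and nnc struct are illustrative, not the ipampool package's real API): the monitor is started first but does nothing until the reconciler, running under the Manager, pushes it its first NodeNetworkConfig.

```go
// Toy sketch of the startup ordering described above; the monitor shape,
// Update method, and nnc struct are illustrative, not the ipampool API.
package main

import (
	"context"
	"log"
	"time"
)

type nnc struct{ requestedIPs int64 } // stand-in for a NodeNetworkConfig spec

type monitor struct {
	updates chan nnc
}

// Start blocks until the first update arrives, then runs its loop. It is safe
// to start before the Manager because it touches nothing (including its
// cached client) until something is pushed into updates.
func (m *monitor) Start(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case u := <-m.updates:
			log.Printf("pool monitor received update: requested IPs = %d", u.requestedIPs)
			// ...scale the IP pool against u here...
		}
	}
}

// Update is what the NNC reconciler calls once the Manager is running and the
// first NodeNetworkConfig event is observed.
func (m *monitor) Update(u nnc) { m.updates <- u }

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	m := &monitor{updates: make(chan nnc, 1)}

	// 1. Start the monitor first, exactly as the diff does...
	go func() {
		if err := m.Start(ctx); err != nil {
			log.Printf("pool monitor stopped: %v", err)
		}
	}()

	// 2. ...then start the Manager; here the reconciler's first push is
	// simulated directly instead.
	m.Update(nnc{requestedIPs: 16})
	time.Sleep(100 * time.Millisecond) // give the sketch time to log before exit
}
```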