// Copyright 2017 Microsoft. All rights reserved.
// MIT License
package restserver

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"net/http/httptest"
	"reflect"
	"strconv"
	"strings"
	"time"

	"github.com/Azure/azure-container-networking/cns"
	"github.com/Azure/azure-container-networking/cns/logger"
	"github.com/Azure/azure-container-networking/cns/nodesubnet"
	"github.com/Azure/azure-container-networking/cns/types"
	"github.com/Azure/azure-container-networking/common"
	"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
	"github.com/pkg/errors"
)

// This file contains the internal functions called by either HTTP APIs (api.go) or
// internal APIs (defined in internalapi.go).
// This will be used internally (say by RequestController in case of AKS)

// GetPartitionKey - Get dnc/service partition key
func (service *HTTPRestService) GetPartitionKey() (dncPartitionKey string) {
	service.RLock()
	dncPartitionKey = service.dncPartitionKey
	service.RUnlock()
	return
}

// SetNodeOrchestrator :- Set node orchestrator after registering with mDNC
func (service *HTTPRestService) SetNodeOrchestrator(r *cns.SetOrchestratorTypeRequest) {
	body, _ := json.Marshal(r)
	req, _ := http.NewRequest(http.MethodPost, "", bytes.NewBuffer(body))
	req.Header.Set(common.ContentType, common.JsonContent)
	service.setOrchestratorType(httptest.NewRecorder(), req)
}
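
// An illustrative (hypothetical) call, assuming the node has already
// registered with mDNC; all field values here are examples only:
//
//	service.SetNodeOrchestrator(&cns.SetOrchestratorTypeRequest{
//		OrchestratorType: cns.KubernetesCRD,
//		DncPartitionKey:  "partition-1", // hypothetical
//		NodeID:           "node-1",      // hypothetical
//	})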

func (service *HTTPRestService) SyncNodeStatus(dncEP, infraVnet, nodeID string, contextFromCNI json.RawMessage) (returnCode types.ResponseCode, errStr string) {
	logger.Printf("[Azure CNS] SyncNodeStatus")
	var (
		resp             *http.Response
		nodeInfoResponse cns.NodeInfoResponse
		body             []byte
		httpc            = common.GetHttpClient()
	)

	// try to retrieve NodeInfoResponse from mDNC
	url := fmt.Sprintf(common.SyncNodeNetworkContainersURLFmt, dncEP, infraVnet, nodeID, dncApiVersion)
	req, _ := http.NewRequestWithContext(context.TODO(), http.MethodGet, url, nil)
	resp, err := httpc.Do(req)
	if err == nil {
		if resp.StatusCode == http.StatusOK {
			err = json.NewDecoder(resp.Body).Decode(&nodeInfoResponse)
		} else {
			err = errors.Errorf("http err: %d", resp.StatusCode)
		}
		resp.Body.Close()
	}

	if err != nil {
		returnCode = types.UnexpectedError
		errStr = fmt.Sprintf("[Azure-CNS] Failed to sync node with error: %+v", err)
		logger.Errorf(errStr)
		return
	}

	var (
		ncsToBeAdded   = make(map[string]cns.CreateNetworkContainerRequest)
		ncsToBeDeleted = make(map[string]bool)
	)

	// determine new NCs and NCs to be deleted
	service.RLock()
	for ncid := range service.state.ContainerStatus {
		ncsToBeDeleted[ncid] = true
	}

	for _, nc := range nodeInfoResponse.NetworkContainers {
		ncid := nc.NetworkContainerid
		delete(ncsToBeDeleted, ncid)
		if savedNc, exists := service.state.ContainerStatus[ncid]; !exists || savedNc.CreateNetworkContainerRequest.Version < nc.Version {
			ncsToBeAdded[ncid] = nc
		}
	}
	service.RUnlock()

	skipNCVersionCheck := false
	ctx, cancel := context.WithTimeout(context.Background(), nmaAPICallTimeout)
	defer cancel()
	ncVersionListResp, err := service.nma.GetNCVersionList(ctx)
	if err != nil {
		skipNCVersionCheck = true
		logger.Errorf("failed to get nc version list from nmagent")
	}

	if !skipNCVersionCheck {
		nmaNCs := map[string]string{}
		for _, nc := range ncVersionListResp.Containers {
			nmaNCs[strings.TrimPrefix(lowerCaseNCGuid(nc.NetworkContainerID), cns.SwiftPrefix)] = nc.Version
		}

		// check if the version is valid and save it to service state
		for ncid := range ncsToBeAdded {
			waitingForUpdate, _, _ := service.isNCWaitingForUpdate(ncsToBeAdded[ncid].Version, ncsToBeAdded[ncid].NetworkContainerid, nmaNCs)

			body, err = json.Marshal(ncsToBeAdded[ncid])
			if err != nil {
				logger.Errorf("[Azure-CNS] Failed to marshal nc with nc id %s and content %v", ncid, ncsToBeAdded[ncid])
			}

			req, err = http.NewRequestWithContext(ctx, http.MethodPost, "", bytes.NewBuffer(body))
			if err != nil {
				logger.Errorf("[Azure CNS] Error received while creating http POST request for nc %v", ncsToBeAdded[ncid])
			}
			req.Header.Set(common.ContentType, common.JsonContent)

			w := httptest.NewRecorder()
			service.createOrUpdateNetworkContainer(w, req)
			result := w.Result()
			if result.StatusCode == http.StatusOK {
				var resp cns.CreateNetworkContainerResponse
				if err = json.Unmarshal(w.Body.Bytes(), &resp); err == nil && resp.Response.ReturnCode == types.Success {
					service.Lock()
					ncstatus := service.state.ContainerStatus[ncid]
					ncstatus.VfpUpdateComplete = !waitingForUpdate
					service.state.ContainerStatus[ncid] = ncstatus
					service.Unlock()
				}
			}
			result.Body.Close()
		}
	}

	service.Lock()
	service.saveState()
	service.Unlock()

	// delete dangling NCs
	for nc := range ncsToBeDeleted {
		var body bytes.Buffer
		json.NewEncoder(&body).Encode(&cns.DeleteNetworkContainerRequest{NetworkContainerid: nc})

		req, err = http.NewRequest(http.MethodPost, "", &body)
		if err == nil {
			req.Header.Set(common.ContentType, common.JsonContent)
			service.deleteNetworkContainer(httptest.NewRecorder(), req)
		} else {
			logger.Errorf("[Azure-CNS] Failed to delete NC request to sync state: %s", err.Error())
		}
	}

	return
}

// SyncHostNCVersion will check NC version from NMAgent and save it as host NC version in container status.
// If NMAgent NC version got updated, CNS will refresh the pending programming IP status.
func (service *HTTPRestService) SyncHostNCVersion(ctx context.Context, channelMode string) {
	service.Lock()
	defer service.Unlock()

	start := time.Now()
	programmedNCCount, err := service.syncHostNCVersion(ctx, channelMode)
	// even if we get an error, we want to write the CNI conflist if we have any NC programmed to any version
	if programmedNCCount > 0 {
		// This will only be done once per lifetime of the CNS process. This function is threadsafe and will panic
		// if it fails, so it is safe to call in a non-preemptable goroutine.
		go service.MustGenerateCNIConflistOnce()
	}

	if err != nil {
		logger.Errorf("sync host error %v", err)
	}
	syncHostNCVersionCount.WithLabelValues(strconv.FormatBool(err == nil)).Inc()
	syncHostNCVersionLatency.WithLabelValues(strconv.FormatBool(err == nil)).Observe(time.Since(start).Seconds())
}
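
// A minimal sketch of how a caller might drive SyncHostNCVersion on a loop;
// the interval and channel mode here are hypothetical:
//
//	ticker := time.NewTicker(30 * time.Second)
//	defer ticker.Stop()
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case <-ticker.C:
//			service.SyncHostNCVersion(ctx, cns.CRD)
//		}
//	}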

var errNonExistentContainerStatus = errors.New("nonExistentContainerStatus")

// syncHostNCVersion updates the CNS state with the latest programmed versions of NCs attached to the VM. If any NC in local CNS state
// does not match the version that DNC claims to have published, this function will call NMAgent and list the latest programmed versions of
// all NCs and update the CNS state accordingly. This function returns the total number of NCs on this VM that have been programmed to
// some version, NOT the number of NCs that are up-to-date.
func (service *HTTPRestService) syncHostNCVersion(ctx context.Context, channelMode string) (int, error) {
	outdatedNCs := map[string]struct{}{}
	programmedNCs := map[string]struct{}{}
	for idx := range service.state.ContainerStatus {
		// Will open a separate PR to convert all the NC version related variables to int. Change from string to int is a pain.
		localNCVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].HostVersion)
		if err != nil {
			logger.Errorf("Received err when change containerstatus.HostVersion %s to int, err msg %v", service.state.ContainerStatus[idx].HostVersion, err)
			continue
		}
		dncNCVersion, err := strconv.Atoi(service.state.ContainerStatus[idx].CreateNetworkContainerRequest.Version)
		if err != nil {
			logger.Errorf("Received err when change nc version %s in containerstatus to int, err msg %v", service.state.ContainerStatus[idx].CreateNetworkContainerRequest.Version, err)
			continue
		}
		// host NC version is the NC version from NMAgent; if it's smaller than the NC version from DNC, append it to indicate it needs an update.
		if localNCVersion < dncNCVersion {
			outdatedNCs[service.state.ContainerStatus[idx].ID] = struct{}{}
		} else if localNCVersion > dncNCVersion {
			logger.Errorf("NC version from NMAgent is larger than DNC, NC version from NMAgent is %d, NC version from DNC is %d", localNCVersion, dncNCVersion)
		}
		if localNCVersion > -1 {
			programmedNCs[service.state.ContainerStatus[idx].ID] = struct{}{}
		}
	}
	if len(outdatedNCs) == 0 {
		return len(programmedNCs), nil
	}
	ncVersionListResp, err := service.nma.GetNCVersionList(ctx)
	if err != nil {
		return len(programmedNCs), errors.Wrap(err, "failed to get nc version list from nmagent")
	}

	nmaNCs := map[string]string{}
	for _, nc := range ncVersionListResp.Containers {
		nmaNCs[strings.ToLower(nc.NetworkContainerID)] = nc.Version
	}
	for ncID := range outdatedNCs {
		nmaNCVersionStr, ok := nmaNCs[ncID]
		if !ok {
			// NMA doesn't have this NC that we need programmed yet, bail out
			continue
		}
		nmaNCVersion, err := strconv.Atoi(nmaNCVersionStr)
		if err != nil {
			logger.Errorf("failed to parse container version of %s: %s", ncID, err)
			continue
		}
		// Check whether it exists in service state and get the related nc info
		ncInfo, exist := service.state.ContainerStatus[ncID]
		if !exist {
			// if we marked this NC as needing an update, but it no longer exists in internal state when we reach
			// this point, our internal state has changed unexpectedly and we should bail out and try again.
			return len(programmedNCs), errors.Wrapf(errNonExistentContainerStatus, "can't find NC with ID %s in service state, stop updating this host NC version", ncID)
		}
		// if the NC still exists in state and is programmed to some version (doesn't have to be latest), add it to our set of NCs that have been programmed
		if nmaNCVersion > -1 {
			programmedNCs[ncID] = struct{}{}
		}

		localNCVersion, err := strconv.Atoi(ncInfo.HostVersion)
		if err != nil {
			logger.Errorf("failed to parse host nc version string %s: %s", ncInfo.HostVersion, err)
			continue
		}
		if localNCVersion > nmaNCVersion {
			logger.Errorf("NC version from NMA is decreasing: have %d, got %d", localNCVersion, nmaNCVersion)
			continue
		}

		if channelMode == cns.CRD {
			service.MarkIpsAsAvailableUntransacted(ncInfo.ID, nmaNCVersion)
		}
		logger.Printf("Updating NC %s host version from %s to %s", ncID, ncInfo.HostVersion, nmaNCVersionStr)
		ncInfo.HostVersion = nmaNCVersionStr
		logger.Printf("Updated NC %s host version to %s", ncID, ncInfo.HostVersion)
		service.state.ContainerStatus[ncID] = ncInfo
		// if we successfully updated the NC, pop it from the needs update set.
		delete(outdatedNCs, ncID)
	}
	// if we didn't empty out the needs update set, NMA has not programmed all the NCs we are expecting, and we
	// need to return an error indicating that
	if len(outdatedNCs) > 0 {
		return len(programmedNCs), errors.Errorf("unable to update some NCs: %v, missing or bad response from NMA", outdatedNCs)
	}

	return len(programmedNCs), nil
}
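
// For illustration (hypothetical values): suppose DNC has published nc1 at
// version 2 and nc2 at version 1, the local host versions are {nc1: 2, nc2: -1},
// and NMAgent's version list only contains nc1. Then nc2 stays in outdatedNCs
// and syncHostNCVersion returns (1, err): one NC (nc1) is programmed to some
// version, while NMA has not yet programmed nc2.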

func (service *HTTPRestService) ReconcileIPAssignment(podInfoByIP map[string]cns.PodInfo, ncReqs []*cns.CreateNetworkContainerRequest) types.ResponseCode {
	// index all the secondary IP configs for all the nc reqs, for easier lookup later on.
	allSecIPsIdx := make(map[string]*cns.CreateNetworkContainerRequest)
	for i := range ncReqs {
		for _, secIPConfig := range ncReqs[i].SecondaryIPConfigs {
			allSecIPsIdx[secIPConfig.IPAddress] = ncReqs[i]
		}
	}

	// we now need to reconcile IP assignment.
	// considering that a single pod may have multiple ips (such as in dual stack scenarios)
	// and that IP assignment in CNS (as done by requestIPConfigsHelper) does not allow
	// updates (it returns the existing state if one already exists for the pod's interface),
	// we need to assign all IPs for a pod interface or name+namespace at the same time.
	//
	// iterating over single IPs is not appropriate then, since assignment for the first IP for
	// a pod will prevent the second IP from being added. the following function call transforms
	// pod info indexed by ip address:
	//
	//	{
	//		"10.0.0.1": podInfo{interface: "aaa-eth0"},
	//		"fe80::1":  podInfo{interface: "aaa-eth0"},
	//	}
	//
	// to pod IPs indexed by pod key (interface or name+namespace, depending on scenario):
	//
	//	{
	//		"aaa-eth0": podIPs{v4IP: 10.0.0.1, v6IP: fe80::1}
	//	}
	//
	// such that we can iterate over pod interfaces, and assign all IPs for it at once.
	podKeyToPodIPs, err := newPodKeyToPodIPsMap(podInfoByIP)
	if err != nil {
		logger.Errorf("could not transform pods indexed by IP address to pod IPs indexed by interface: %v", err)
		return types.UnexpectedError
	}

2023-08-02 04:49:21 +03:00
for podKey , podIPs := range podKeyToPodIPs {
var (
desiredIPs [ ] string
ncIDs [ ] string
)
var ips [ ] net . IP
if podIPs . v4IP != nil {
ips = append ( ips , podIPs . v4IP )
}
2020-07-28 06:53:49 +03:00
2023-08-02 04:49:21 +03:00
if podIPs . v6IP != nil {
ips = append ( ips , podIPs . v6IP )
}
for _ , ip := range ips {
if ncReq , ok := allSecIPsIdx [ ip . String ( ) ] ; ok {
logger . Printf ( "secondary ip %s is assigned to pod %+v, ncId: %s ncVersion: %s" , ip , podIPs , ncReq . NetworkContainerid , ncReq . Version )
desiredIPs = append ( desiredIPs , ip . String ( ) )
ncIDs = append ( ncIDs , ncReq . NetworkContainerid )
} else {
// it might still be possible to see host networking pods here (where ips are not from ncs) if we are restoring using the kube podinfo provider
// todo: once kube podinfo provider reconcile flow is removed, this line will not be necessary/should be removed.
logger . Errorf ( "ip %s assigned to pod %+v but not found in any nc" , ip , podIPs )
2020-07-28 06:53:49 +03:00
}
2023-08-02 04:49:21 +03:00
}
if len ( desiredIPs ) == 0 {
// this may happen for pods in the host network
continue
}
jsonContext , err := podIPs . OrchestratorContext ( )
if err != nil {
logger . Errorf ( "Failed to marshal KubernetesPodInfo, error: %v" , err )
return types . UnexpectedError
}
ipconfigsRequest := cns . IPConfigsRequest {
DesiredIPAddresses : desiredIPs ,
OrchestratorContext : jsonContext ,
InfraContainerID : podIPs . InfraContainerID ( ) ,
PodInterfaceID : podIPs . InterfaceID ( ) ,
}
if _ , err := requestIPConfigsHelper ( service , ipconfigsRequest ) ; err != nil {
logger . Errorf ( "requestIPConfigsHelper failed for pod key %s, podInfo %+v, ncIds %v, error: %v" , podKey , podIPs , ncIDs , err )
return types . FailedToAllocateIPConfig
2020-07-28 06:53:49 +03:00
}
}
2024-10-18 02:59:40 +03:00
return types . Success
}

func (service *HTTPRestService) CreateNCs(ncReqs []*cns.CreateNetworkContainerRequest) types.ResponseCode {
	for _, ncReq := range ncReqs {
		returnCode := service.CreateOrUpdateNetworkContainerInternal(ncReq)
		if returnCode != types.Success {
			return returnCode
		}
	}

	return types.Success
}

func (service *HTTPRestService) ReconcileIPAMStateForSwift(ncReqs []*cns.CreateNetworkContainerRequest, podInfoByIP map[string]cns.PodInfo, nnc *v1alpha.NodeNetworkConfig) types.ResponseCode {
	logger.Printf("Reconciling CNS IPAM state with nc requests: [%+v], PodInfo [%+v], NNC: [%+v]", ncReqs, podInfoByIP, nnc)
	// if no nc reqs, there is no CRD state yet
	if len(ncReqs) == 0 {
		logger.Printf("CNS starting with no NC state, podInfoMap count %d", len(podInfoByIP))
		return types.Success
	}

	// first step in reconciliation is to create all the NCs in CNS, no IP assignment yet.
	if returnCode := service.CreateNCs(ncReqs); returnCode != types.Success {
		return returnCode
	}

	logger.Debugf("ncReqs created successfully, now save IPs")
	// now reconcile IPAM state.
	if returnCode := service.ReconcileIPAssignment(podInfoByIP, ncReqs); returnCode != types.Success {
		return returnCode
	}

	if err := service.MarkExistingIPsAsPendingRelease(nnc.Spec.IPsNotInUse); err != nil {
		logger.Errorf("[Azure CNS] Error. Failed to mark IPs as pending %v", nnc.Spec.IPsNotInUse)
		return types.UnexpectedError
	}

	return types.Success
}

// todo: there is some redundancy between this function and ReconcileIPAMStateForSwift. The difference is that this one
// doesn't include the NNC parameter. We may want to unify the common parts.
func (service *HTTPRestService) ReconcileIPAMStateForNodeSubnet(ncReqs []*cns.CreateNetworkContainerRequest, podInfoByIP map[string]cns.PodInfo) types.ResponseCode {
	logger.Printf("Reconciling CNS IPAM state with nc requests: [%+v], PodInfo [%+v]", ncReqs, podInfoByIP)

	if len(ncReqs) != 1 {
		logger.Errorf("Nodesubnet should always have 1 NC to hold secondary IPs")
		return types.NetworkContainerNotSpecified
	}

	// first step in reconciliation is to create all the NCs in CNS, no IP assignment yet.
	if returnCode := service.CreateNCs(ncReqs); returnCode != types.Success {
		return returnCode
	}

	logger.Debugf("ncReqs created successfully, now save IPs")
	// now reconcile IPAM state.
	if returnCode := service.ReconcileIPAssignment(podInfoByIP, ncReqs); returnCode != types.Success {
		return returnCode
	}

	return types.Success
}

var (
	errIPParse             = errors.New("parse IP")
	errMultipleIPPerFamily = errors.New("multiple IPs per family")
)

// newPodKeyToPodIPsMap groups IPs by pod key (as returned by cns.PodInfo.Key()) and returns them indexed by that key.
func newPodKeyToPodIPsMap(podInfoByIP map[string]cns.PodInfo) (map[string]podIPs, error) {
	podKeyToPodIPs := make(map[string]podIPs)

	for ipStr, podInfo := range podInfoByIP {
		id := podInfo.Key()

		ips, ok := podKeyToPodIPs[id]
		if !ok {
			ips.PodInfo = podInfo
		}

		ip := net.ParseIP(ipStr)
		switch {
		case ip == nil:
			return nil, errors.Wrapf(errIPParse, "could not parse ip string %q on pod %+v", ipStr, podInfo)
		case ip.To4() != nil:
			if ips.v4IP != nil {
				return nil, errors.Wrapf(errMultipleIPPerFamily, "multiple ipv4 addresses (%v, %v) associated to pod %+v", ips.v4IP, ip, podInfo)
			}
			ips.v4IP = ip
		case ip.To16() != nil:
			if ips.v6IP != nil {
				return nil, errors.Wrapf(errMultipleIPPerFamily, "multiple ipv6 addresses (%v, %v) associated to pod %+v", ips.v6IP, ip, podInfo)
			}
			ips.v6IP = ip
		}

		podKeyToPodIPs[id] = ips
	}

	return podKeyToPodIPs, nil
}
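
// An illustrative (hypothetical) use of newPodKeyToPodIPsMap, assuming a
// constructor of the form cns.NewPodInfo(infraContainerID, interfaceID, name,
// namespace); the IPs and identifiers are examples only:
//
//	in := map[string]cns.PodInfo{
//		"10.0.0.1": cns.NewPodInfo("ctr-1", "pod-a-eth0", "pod-a", "default"),
//		"fe80::1":  cns.NewPodInfo("ctr-1", "pod-a-eth0", "pod-a", "default"),
//	}
//	out, err := newPodKeyToPodIPsMap(in)
//	// on success, out has a single entry (keyed by the pod's Key()) whose
//	// v4IP is 10.0.0.1 and whose v6IP is fe80::1.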

// podIPs are all the IPs associated with a pod, along with pod info
type podIPs struct {
	cns.PodInfo
	v4IP net.IP
	v6IP net.IP
}

// GetNetworkContainerInternal gets network container details.
func (service *HTTPRestService) GetNetworkContainerInternal(
	req cns.GetNetworkContainerRequest,
) (cns.GetNetworkContainerResponse, types.ResponseCode) {
	getNetworkContainerResponses := service.getAllNetworkContainerResponses(req)
	return getNetworkContainerResponses[0], getNetworkContainerResponses[0].Response.ReturnCode
}

// DeleteNetworkContainerInternal deletes a network container.
func (service *HTTPRestService) DeleteNetworkContainerInternal(
	req cns.DeleteNetworkContainerRequest,
) types.ResponseCode {
	ncid := req.NetworkContainerid
	_, exist := service.getNetworkContainerDetails(ncid)
	if !exist {
		logger.Printf("network container for id %v doesn't exist", ncid)
		return types.Success
	}

	service.Lock()
	defer service.Unlock()
	if service.state.ContainerStatus != nil {
		delete(service.state.ContainerStatus, ncid)
	}

	if service.state.ContainerIDByOrchestratorContext != nil {
		for orchestratorContext, networkContainerIDs := range service.state.ContainerIDByOrchestratorContext { //nolint:gocritic // copy is ok
			if networkContainerIDs.Contains(ncid) {
				networkContainerIDs.Delete(ncid)
				if *networkContainerIDs == "" {
					delete(service.state.ContainerIDByOrchestratorContext, orchestratorContext)
					break
				}
			}
		}
	}

	service.saveState()
	return types.Success
}
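
// MustEnsureNoStaleNCs drops NCs from CNS state whose IDs are not in validNCIDs.
// If a stale NC still has assigned IPs, CNS state is unexpectedly inconsistent,
// so this function logs and panics rather than continuing.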
func (service *HTTPRestService) MustEnsureNoStaleNCs(validNCIDs []string) {
	valid := make(map[string]struct{})
	for _, ncID := range validNCIDs {
		valid[ncID] = struct{}{}
	}

	service.Lock()
	defer service.Unlock()

	ncIDToAssignedIPs := make(map[string][]cns.IPConfigurationStatus)
	for _, ipInfo := range service.PodIPConfigState { //nolint:gocritic // copy is fine; it's a larger change to modify the map to hold pointers
		if ipInfo.GetState() == types.Assigned {
			ncIDToAssignedIPs[ipInfo.NCID] = append(ncIDToAssignedIPs[ipInfo.NCID], ipInfo)
		}
	}

	mutated := false
	for ncID := range service.state.ContainerStatus {
		if _, ok := valid[ncID]; !ok {
			// stale NCs with assigned IPs are an unexpected CNS state which we need to alert on.
			if assignedIPs, hasAssignedIPs := ncIDToAssignedIPs[ncID]; hasAssignedIPs {
				msg := fmt.Sprintf("Unexpected state: found stale NC ID %s in CNS state with %d assigned IPs: %+v", ncID, len(assignedIPs), assignedIPs)
				logger.Errorf(msg)
				panic(msg)
			}
			logger.Errorf("[Azure CNS] Found stale NC ID %s in CNS state. Removing...", ncID)
			delete(service.state.ContainerStatus, ncID)
			mutated = true
		}
	}

	if mutated {
		_ = service.saveState()
	}
}

// This API will be called by CNS RequestController on CRD update.
func (service *HTTPRestService) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) types.ResponseCode {
	if req.NetworkContainerid == "" {
		logger.Errorf("[Azure CNS] Error. NetworkContainerid is empty")
		return types.NetworkContainerNotSpecified
	}

	// For now only RequestController uses this API which will be initialized only for AKS scenario.
	// Validate ContainerType is set as Docker
	if service.state.OrchestratorType != cns.KubernetesCRD && service.state.OrchestratorType != cns.Kubernetes {
		logger.Errorf("[Azure CNS] Error. Unsupported OrchestratorType: %s", service.state.OrchestratorType)
		return types.UnsupportedOrchestratorType
	}

	if req.NetworkContainerid == nodesubnet.NodeSubnetNCID {
		// For NodeSubnet scenarios, PrimaryCA must be empty
		if req.IPConfiguration.IPSubnet.IPAddress != "" {
			logger.Errorf("[Azure CNS] Error. PrimaryCA is invalid, NC Req: %v", req)
			return types.InvalidPrimaryIPConfig
		}
	} else {
		// For Swift scenarios, PrimaryCA must never be empty
		err := validateIPSubnet(req.IPConfiguration.IPSubnet)
		if err != nil {
			logger.Errorf("[Azure CNS] Error. PrimaryCA is invalid, NC Req: %v", req)
			return types.InvalidPrimaryIPConfig
		}
	}

	// Validate SecondaryIPConfig
	for _, secIPConfig := range req.SecondaryIPConfigs {
		// Validate Ipconfig
		if secIPConfig.IPAddress == "" {
			logger.Errorf("Failed to add IPConfig to state: %+v, empty IPSubnet.IPAddress", secIPConfig)
			return types.InvalidSecondaryIPConfig
		}
	}

	// Validate if state exists already
	existingNCInfo, ok := service.getNetworkContainerDetails(req.NetworkContainerid)
	if ok {
		existingReq := existingNCInfo.CreateNetworkContainerRequest
		if !reflect.DeepEqual(existingReq.IPConfiguration.IPSubnet, req.IPConfiguration.IPSubnet) {
			logger.Errorf("[Azure CNS] Error. PrimaryCA is not same, NCId %s, old CA %s/%d, new CA %s/%d",
				req.NetworkContainerid,
				existingReq.IPConfiguration.IPSubnet.IPAddress,
				existingReq.IPConfiguration.IPSubnet.PrefixLength,
				req.IPConfiguration.IPSubnet.IPAddress,
				req.IPConfiguration.IPSubnet.PrefixLength)
			return types.PrimaryCANotSame
		}
	}

	// This will Create Or Update the NC state.
	returnCode, returnMessage := service.saveNetworkContainerGoalState(*req)

	// If the NC was created successfully, log NC snapshot.
	if returnCode == 0 {
		logNCSnapshot(*req)
		service.publishIPStateMetrics()
	} else {
		logger.Errorf(returnMessage)
	}

	if service.Options[common.OptProgramSNATIPTables] == true {
		returnCode, returnMessage = service.programSNATRules(req)
		if returnCode != 0 {
			logger.Errorf(returnMessage)
		}
	}

	return returnCode
}
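
// A minimal sketch of invoking this internal API (all field values are
// hypothetical examples):
//
//	req := &cns.CreateNetworkContainerRequest{
//		NetworkContainerid: "nc-1",
//		Version:            "0",
//		IPConfiguration: cns.IPConfiguration{
//			IPSubnet: cns.IPSubnet{IPAddress: "10.240.0.4", PrefixLength: 16},
//		},
//	}
//	if rc := service.CreateOrUpdateNetworkContainerInternal(req); rc != types.Success {
//		// handle the non-success response code
//	}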

func (service *HTTPRestService) SetVFForAccelnetNICs() error {
	return service.setVFForAccelnetNICs()
}