зеркало из https://github.com/Azure/ARO-RP.git
Merge pull request #3403 from hawkowl/hawkowl/local-provision-fixes
Fix failures provisioning a cluster in dev due to NSG attaching
This commit is contained in:
Коммит
c02c83b1f9
|
@ -10,6 +10,7 @@ import (
|
|||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network"
|
||||
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
|
||||
|
@ -17,6 +18,7 @@ import (
|
|||
"github.com/Azure/go-autorest/autorest/azure"
|
||||
"github.com/Azure/go-autorest/autorest/to"
|
||||
utilrand "k8s.io/apimachinery/pkg/util/rand"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
|
||||
"github.com/Azure/ARO-RP/pkg/api"
|
||||
apisubnet "github.com/Azure/ARO-RP/pkg/api/util/subnet"
|
||||
|
@ -254,65 +256,103 @@ func (m *manager) subnetsWithServiceEndpoint(ctx context.Context, serviceEndpoin
|
|||
return subnets, nil
|
||||
}
|
||||
|
||||
func (m *manager) attachNSGs(ctx context.Context) (bool, error) {
|
||||
// attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not
|
||||
// enabled. This method is suitable for use with steps, and has default
|
||||
// timeout/polls set.
|
||||
func (m *manager) attachNSGs(ctx context.Context) error {
|
||||
// Since we need to guard against the case where NSGs are not ready
|
||||
// immediately after creation, we can have a relatively short retry period
|
||||
// of 30s and timeout of 3m. These numbers were chosen via a
|
||||
// highly-non-specific and data-adjacent process (picking them because they
|
||||
// seemed decent enough).
|
||||
//
|
||||
// If we get the NSG not-ready error after 3 minutes, it's unusual enough
|
||||
// that we should be raising it as an issue rather than tolerating it.
|
||||
return m._attachNSGs(ctx, 3*time.Minute, 30*time.Second)
|
||||
}
|
||||
|
||||
// _attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not
|
||||
// enabled. timeout and pollInterval are provided as arguments for testing
|
||||
// reasons.
|
||||
func (m *manager) _attachNSGs(ctx context.Context, timeout time.Duration, pollInterval time.Duration) error {
|
||||
if m.doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled {
|
||||
return true, nil
|
||||
return nil
|
||||
}
|
||||
var innerErr error
|
||||
|
||||
workerProfiles, _ := api.GetEnrichedWorkerProfiles(m.doc.OpenShiftCluster.Properties)
|
||||
workerSubnetId := workerProfiles[0].SubnetID
|
||||
|
||||
for _, subnetID := range []string{
|
||||
m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID,
|
||||
workerSubnetId,
|
||||
} {
|
||||
m.log.Printf("attaching network security group to subnet %s", subnetID)
|
||||
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
// TODO: there is probably an undesirable race condition here - check if etags can help.
|
||||
// This polling function protects the case below where the NSG might not be
|
||||
// ready to be referenced. We don't guard against trying to re-attach the
|
||||
// NSG since the inner loop is tolerant of that, and since we are attaching
|
||||
// the same NSG the only allowed failure case is when the NSG cannot be
|
||||
// attached to begin with, so it shouldn't happen in practice.
|
||||
_ = wait.PollImmediateUntil(pollInterval, func() (bool, error) {
|
||||
var c bool
|
||||
c, innerErr = func() (bool, error) {
|
||||
for _, subnetID := range []string{
|
||||
m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID,
|
||||
workerSubnetId,
|
||||
} {
|
||||
m.log.Printf("attaching network security group to subnet %s", subnetID)
|
||||
// TODO: there is probably an undesirable race condition here - check if etags can help.
|
||||
|
||||
s, err := m.subnet.Get(ctx, subnetID)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
// We use the outer context, not the timeout context, as we do not want
|
||||
// to time out the condition function itself, only stop retrying once
|
||||
// timeoutCtx's timeout has fired.
|
||||
s, err := m.subnet.Get(ctx, subnetID)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if s.SubnetPropertiesFormat == nil {
|
||||
s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{}
|
||||
}
|
||||
if s.SubnetPropertiesFormat == nil {
|
||||
s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{}
|
||||
}
|
||||
|
||||
nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
// Sometimes we get into the race condition between external services modifying
|
||||
// subnets and our validation code. We try to catch this early, but
|
||||
// these errors is propagated to make the user-facing error more clear incase
|
||||
// modification happened after we ran validation code and we lost the race
|
||||
if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil {
|
||||
if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) {
|
||||
continue
|
||||
// Sometimes we get into the race condition between external services modifying
|
||||
// subnets and our validation code. We try to catch this early, but
|
||||
// these errors is propagated to make the user-facing error more clear incase
|
||||
// modification happened after we ran validation code and we lost the race
|
||||
if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil {
|
||||
if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) {
|
||||
continue
|
||||
}
|
||||
|
||||
return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID)
|
||||
}
|
||||
|
||||
s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{
|
||||
ID: to.StringPtr(nsgID),
|
||||
}
|
||||
|
||||
// Because we attempt to attach the NSG immediately after the base resource deployment
|
||||
// finishes, the NSG is sometimes not yet ready to be referenced and used, causing
|
||||
// an error to occur here. So if this particular error occurs, return nil to retry.
|
||||
// But if some other type of error occurs, just return that error.
|
||||
err = m.subnet.CreateOrUpdate(ctx, subnetID, s)
|
||||
if err != nil {
|
||||
if nsgNotReadyErrorRegex.MatchString(err.Error()) {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
return true, nil
|
||||
}()
|
||||
|
||||
return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID)
|
||||
}
|
||||
return c, innerErr
|
||||
}, timeoutCtx.Done())
|
||||
|
||||
s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{
|
||||
ID: to.StringPtr(nsgID),
|
||||
}
|
||||
|
||||
// Because we attempt to attach the NSG immediately after the base resource deployment
|
||||
// finishes, the NSG is sometimes not yet ready to be referenced and used, causing
|
||||
// an error to occur here. So if this particular error occurs, return nil to retry.
|
||||
// But if some other type of error occurs, just return that error.
|
||||
err = m.subnet.CreateOrUpdate(ctx, subnetID, s)
|
||||
if err != nil {
|
||||
if nsgNotReadyErrorRegex.MatchString(err.Error()) {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
|
||||
return true, nil
|
||||
return innerErr
|
||||
}
|
||||
|
||||
func (m *manager) setMasterSubnetPolicies(ctx context.Context) error {
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network"
|
||||
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
|
||||
|
@ -242,11 +243,10 @@ func TestAttachNSGs(t *testing.T) {
|
|||
ctx := context.Background()
|
||||
|
||||
for _, tt := range []struct {
|
||||
name string
|
||||
oc *api.OpenShiftClusterDocument
|
||||
mocks func(*mock_subnet.MockManager)
|
||||
wantResult bool
|
||||
wantErr string
|
||||
name string
|
||||
oc *api.OpenShiftClusterDocument
|
||||
mocks func(*mock_subnet.MockManager)
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "Success - NSG attached to both subnets",
|
||||
|
@ -287,7 +287,6 @@ func TestAttachNSGs(t *testing.T) {
|
|||
},
|
||||
}).Return(nil)
|
||||
},
|
||||
wantResult: true,
|
||||
},
|
||||
{
|
||||
name: "Success - preconfigured NSG enabled",
|
||||
|
@ -313,8 +312,7 @@ func TestAttachNSGs(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
mocks: func(subnet *mock_subnet.MockManager) {},
|
||||
wantResult: true,
|
||||
mocks: func(subnet *mock_subnet.MockManager) {},
|
||||
},
|
||||
{
|
||||
name: "Failure - unable to get a subnet",
|
||||
|
@ -340,8 +338,7 @@ func TestAttachNSGs(t *testing.T) {
|
|||
mocks: func(subnet *mock_subnet.MockManager) {
|
||||
subnet.EXPECT().Get(ctx, "masterSubnetID").Return(&mgmtnetwork.Subnet{}, fmt.Errorf("subnet not found"))
|
||||
},
|
||||
wantResult: false,
|
||||
wantErr: "subnet not found",
|
||||
wantErr: "subnet not found",
|
||||
},
|
||||
{
|
||||
name: "Failure - NSG already attached to a subnet",
|
||||
|
@ -373,8 +370,7 @@ func TestAttachNSGs(t *testing.T) {
|
|||
},
|
||||
}, nil)
|
||||
},
|
||||
wantResult: false,
|
||||
wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.",
|
||||
wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.",
|
||||
},
|
||||
{
|
||||
name: "Failure - failed to CreateOrUpdate subnet because NSG not yet ready for use",
|
||||
|
@ -407,7 +403,6 @@ func TestAttachNSGs(t *testing.T) {
|
|||
},
|
||||
}).Return(fmt.Errorf("Some random stuff followed by the important part that we're trying to match: Resource /subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/aro-12345678/providers/Microsoft.Network/networkSecurityGroups/infra-nsg referenced by resource masterSubnetID was not found. and here's some more stuff that's at the end past the important part"))
|
||||
},
|
||||
wantResult: false,
|
||||
},
|
||||
{
|
||||
name: "Failure - failed to CreateOrUpdate subnet with arbitrary error",
|
||||
|
@ -440,8 +435,7 @@ func TestAttachNSGs(t *testing.T) {
|
|||
},
|
||||
}).Return(fmt.Errorf("I'm an arbitrary error here to make life harder"))
|
||||
},
|
||||
wantResult: false,
|
||||
wantErr: "I'm an arbitrary error here to make life harder",
|
||||
wantErr: "I'm an arbitrary error here to make life harder",
|
||||
},
|
||||
} {
|
||||
controller := gomock.NewController(t)
|
||||
|
@ -456,10 +450,7 @@ func TestAttachNSGs(t *testing.T) {
|
|||
subnet: subnet,
|
||||
}
|
||||
|
||||
result, err := m.attachNSGs(ctx)
|
||||
if result != tt.wantResult {
|
||||
t.Errorf("Got %v, wanted %v", result, tt.wantResult)
|
||||
}
|
||||
err := m._attachNSGs(ctx, 1*time.Millisecond, 30*time.Second)
|
||||
utilerror.AssertErrorMessage(t, err, tt.wantErr)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -285,7 +285,7 @@ func (m *manager) bootstrap() []steps.Step {
|
|||
steps.Action(m.ensureServiceEndpoints),
|
||||
steps.Action(m.setMasterSubnetPolicies),
|
||||
steps.AuthorizationRetryingAction(m.fpAuthorizer, m.deployBaseResourceTemplate),
|
||||
steps.Condition(m.attachNSGs, 3*time.Minute, true),
|
||||
steps.AuthorizationRetryingAction(m.fpAuthorizer, m.attachNSGs),
|
||||
steps.Action(m.updateAPIIPEarly),
|
||||
steps.Action(m.createOrUpdateRouterIPEarly),
|
||||
steps.Action(m.ensureGatewayCreate),
|
||||
|
|
|
@ -83,7 +83,9 @@ func (s *authorizationRefreshingActionStep) run(ctx context.Context, log *logrus
|
|||
err = s.auth.Rebuild()
|
||||
return false, err // retry step
|
||||
}
|
||||
log.Printf("non-auth error, giving up: %v", err)
|
||||
if err != nil {
|
||||
log.Printf("non-auth error, giving up: %v", err)
|
||||
}
|
||||
return true, err
|
||||
}, timeoutCtx.Done())
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче