try to fix e2e race condition

Amber Brown 2024-10-18 15:31:45 +11:00
Parent ac26d1918e
Commit 1cb1a57b23
4 changed files with 69 additions and 19 deletions

View file

@@ -34,7 +34,7 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
var manifests database.MaintenanceManifests
var manifestsClient *cosmosdb.FakeMaintenanceManifestDocumentClient
var clusters database.OpenShiftClusters
//var clustersClient cosmosdb.OpenShiftClusterDocumentClient
var clustersClient *cosmosdb.FakeOpenShiftClusterDocumentClient
var a Actuator
@@ -75,7 +75,7 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
BeforeEach(func() {
now := func() time.Time { return time.Unix(120, 0) }
manifests, manifestsClient = testdatabase.NewFakeMaintenanceManifests(now)
clusters, _ = testdatabase.NewFakeOpenShiftClusters()
clusters, clustersClient = testdatabase.NewFakeOpenShiftClusters()
a = &actuator{
log: log,
@@ -105,6 +105,9 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
Key: strings.ToLower(clusterResourceID),
OpenShiftCluster: &api.OpenShiftCluster{
ID: clusterResourceID,
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
},
},
})
@@ -130,6 +133,15 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
RunAfter: 0,
},
})
checker.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
Key: strings.ToLower(clusterResourceID),
OpenShiftCluster: &api.OpenShiftCluster{
ID: clusterResourceID,
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
},
},
})
})
It("expires them", func() {
@@ -139,6 +151,9 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
errs := checker.CheckMaintenanceManifests(manifestsClient)
Expect(errs).To(BeNil(), fmt.Sprintf("%v", errs))
errs = checker.CheckOpenShiftClusters(clustersClient)
Expect(errs).To(BeNil(), fmt.Sprintf("%v", errs))
})
})
@@ -151,6 +166,10 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
Key: strings.ToLower(clusterResourceID),
OpenShiftCluster: &api.OpenShiftCluster{
ID: clusterResourceID,
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
MaintenanceState: api.MaintenanceStateNone,
},
},
})
@@ -179,6 +198,16 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
RunAfter: 0,
},
})
checker.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
Key: strings.ToLower(clusterResourceID),
OpenShiftCluster: &api.OpenShiftCluster{
ID: clusterResourceID,
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
MaintenanceState: api.MaintenanceStateNone,
},
},
})
})
It("runs them", func() {
@@ -198,6 +227,9 @@ var _ = Describe("MIMO Actuator", Ordered, func() {
errs := checker.CheckMaintenanceManifests(manifestsClient)
Expect(errs).To(BeNil(), fmt.Sprintf("%v", errs))
errs = checker.CheckOpenShiftClusters(clustersClient)
Expect(errs).To(BeNil(), fmt.Sprintf("%v", errs))
})
})
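The unit-test hunks above all follow the same shape: the cluster document is seeded into the test database with ProvisioningStateSucceeded, the same document (in its expected post-run state) is registered with the checker, and the new CheckOpenShiftClusters calls assert that the actuator put the cluster document back the way it found it. A minimal sketch of that fixture/checker wiring is below; the NewFixture, WithOpenShiftClusters, Create and NewChecker helpers are assumptions based on how the rest of the test suite is usually set up and do not appear in this diff.

// Sketch only, not part of the commit. Assumes the usual testdatabase
// fixture/checker helpers; only the identifiers shown in the diff are verbatim.
clusters, clustersClient := testdatabase.NewFakeOpenShiftClusters()

fixture := testdatabase.NewFixture().WithOpenShiftClusters(clusters)
fixture.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
	Key: strings.ToLower(clusterResourceID),
	OpenShiftCluster: &api.OpenShiftCluster{
		ID: clusterResourceID,
		Properties: api.OpenShiftClusterProperties{
			ProvisioningState: api.ProvisioningStateSucceeded,
		},
	},
})
Expect(fixture.Create()).To(Succeed())

// After the actuator has run, the checker verifies the expected end state.
// The race fix adds the cluster-document check alongside the manifest check.
checker := testdatabase.NewChecker()
checker.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
	Key: strings.ToLower(clusterResourceID),
	OpenShiftCluster: &api.OpenShiftCluster{
		ID: clusterResourceID,
		Properties: api.OpenShiftClusterProperties{
			ProvisioningState: api.ProvisioningStateSucceeded,
		},
	},
})
errs := checker.CheckOpenShiftClusters(clustersClient)
Expect(errs).To(BeNil(), fmt.Sprintf("%v", errs))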

View file

@@ -125,6 +125,8 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
return false, nil
}
a.log.Infof("Processing %d manifests", len(manifestsToAction))
// Dequeue the document
oc, err := a.oc.Get(ctx, a.clusterResourceID)
if err != nil {
@@ -136,12 +138,10 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
return false, fmt.Errorf("failed dequeuing cluster document: %w", err) // This will include StatusPreconditionFailed errors
}
// Save these so we can reset them after
previousProvisioningState := oc.OpenShiftCluster.Properties.ProvisioningState
previousFailedProvisioningState := oc.OpenShiftCluster.Properties.FailedProvisioningState
// Mark the maintenance state as unplanned and put it in AdminUpdating
a.log.Infof("Marking cluster as in AdminUpdating")
oc, err = a.oc.PatchWithLease(ctx, a.clusterResourceID, func(oscd *api.OpenShiftClusterDocument) error {
oscd.OpenShiftCluster.Properties.LastProvisioningState = oscd.OpenShiftCluster.Properties.ProvisioningState
oscd.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating
oscd.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateUnplanned
return nil
@@ -151,7 +151,7 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
a.log.Error(err)
// attempt to dequeue the document, for what it's worth
_, leaseErr := a.oc.EndLease(ctx, a.clusterResourceID, previousProvisioningState, previousFailedProvisioningState, nil)
_, leaseErr := a.oc.EndLease(ctx, a.clusterResourceID, oc.OpenShiftCluster.Properties.LastProvisioningState, oc.OpenShiftCluster.Properties.FailedProvisioningState, nil)
if leaseErr != nil {
return false, fmt.Errorf("failed ending lease early on cluster document: %w", leaseErr)
}
@@ -162,12 +162,11 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
// Execute on the manifests we want to action
for _, doc := range manifestsToAction {
// here
f, ok := a.tasks[doc.MaintenanceManifest.MaintenanceTaskID]
if !ok {
a.log.Infof("not found %v", doc.MaintenanceManifest.MaintenanceTaskID)
continue
}
taskLog := a.log.WithFields(logrus.Fields{
"manifestID": doc.ID,
"taskID": doc.MaintenanceManifest.MaintenanceTaskID,
})
taskLog.Info("begin processing manifest")
// Attempt a dequeue
doc, err = a.mmf.Lease(ctx, a.clusterResourceID, doc.ID)
@@ -177,9 +176,23 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
continue
}
// error if we don't know what this task is, then continue
f, ok := a.tasks[doc.MaintenanceManifest.MaintenanceTaskID]
if !ok {
a.log.Errorf("not found %v", doc.MaintenanceManifest.MaintenanceTaskID)
msg := "task ID not registered"
_, err = a.mmf.EndLease(ctx, doc.ClusterResourceID, doc.ID, api.MaintenanceManifestStateFailed, &msg)
if err != nil {
a.log.Error(fmt.Errorf("failed ending lease early on manifest: %w", err))
}
continue
}
var state api.MaintenanceManifestState
var msg string
taskLog.Info("executing manifest")
// Perform the task with a timeout
err = taskContext.RunInTimeout(time.Minute*60, func() error {
innerErr := f(taskContext, doc, oc)
@@ -193,32 +206,36 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
msg = taskContext.GetResultMessage()
if err != nil {
a.log.Error(err)
if doc.Dequeues >= maxDequeueCount {
msg = fmt.Sprintf("did not succeed after %d times, failing -- %s", doc.Dequeues, err.Error())
state = api.MaintenanceManifestStateRetriesExceeded
taskLog.Error(msg)
} else if utilmimo.IsRetryableError(err) {
// If an error is retryable (i.e. explicitly marked as a transient error
// by wrapping it in utilmimo.TransientError), then mark it back as
// Pending so that it will get picked up and retried.
state = api.MaintenanceManifestStatePending
taskLog.Error(fmt.Errorf("task returned a retryable error: %w", err))
} else {
// Terminal errors (explicitly marked or unwrapped) cause task failure
state = api.MaintenanceManifestStateFailed
taskLog.Error(fmt.Errorf("task returned a terminal error: %w", err))
}
} else {
// Mark tasks that don't have an error as succeeded implicitly
state = api.MaintenanceManifestStateCompleted
taskLog.Info("manifest executed successfully")
}
_, err = a.mmf.EndLease(ctx, doc.ClusterResourceID, doc.ID, state, &msg)
if err != nil {
a.log.Error(err)
taskLog.Error(fmt.Errorf("failed ending lease on manifest: %w", err))
}
taskLog.Info("manifest processing complete")
}
// Remove any set maintenance state
a.log.Info("removing maintenance state on cluster")
oc, err = a.oc.PatchWithLease(ctx, a.clusterResourceID, func(oscd *api.OpenShiftClusterDocument) error {
oscd.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateNone
return nil
@@ -228,7 +245,8 @@ func (a *actuator) Process(ctx context.Context) (bool, error) {
}
// release the OpenShiftCluster
_, err = a.oc.EndLease(ctx, a.clusterResourceID, previousProvisioningState, previousFailedProvisioningState, nil)
a.log.Info("ending lease on cluster")
_, err = a.oc.EndLease(ctx, a.clusterResourceID, oc.OpenShiftCluster.Properties.LastProvisioningState, oc.OpenShiftCluster.Properties.FailedProvisioningState, nil)
if err != nil {
return false, fmt.Errorf("failed ending lease on cluster document: %w", err)
}
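Taken together, the actuator hunks above close the window in which the cluster document could be reset from stale local state: instead of capturing previousProvisioningState and previousFailedProvisioningState before the patch, the pre-maintenance state is recorded on the document itself (LastProvisioningState), and EndLease restores it from the document. A minimal sketch of the resulting lease pattern, assembled from the hunks above with error handling abbreviated; the error messages here are illustrative, not verbatim from the file.

// Sketch only, assembled from the diff above; not a verbatim copy of the file.
oc, err := a.oc.PatchWithLease(ctx, a.clusterResourceID, func(oscd *api.OpenShiftClusterDocument) error {
	// Record the pre-maintenance state on the document itself, so whoever
	// ends the lease later reads it from the document rather than from
	// locals captured before the patch.
	oscd.OpenShiftCluster.Properties.LastProvisioningState = oscd.OpenShiftCluster.Properties.ProvisioningState
	oscd.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating
	oscd.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateUnplanned
	return nil
})
if err != nil {
	return false, fmt.Errorf("failed marking cluster as AdminUpdating: %w", err)
}

// ... execute the maintenance manifests ...

// Restore whatever state the document says it was in before maintenance.
_, err = a.oc.EndLease(ctx, a.clusterResourceID,
	oc.OpenShiftCluster.Properties.LastProvisioningState,
	oc.OpenShiftCluster.Properties.FailedProvisioningState, nil)
if err != nil {
	return false, fmt.Errorf("failed ending lease on cluster document: %w", err)
}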

View file

@@ -14,7 +14,7 @@ import (
"github.com/Azure/ARO-RP/pkg/api/admin"
)
var _ = Describe("[Admin API] Cluster admin update action", func() {
var _ = Describe("[Admin API] Cluster admin update action", Serial, func() {
BeforeEach(skipIfNotInDevelopmentEnv)
It("must run cluster update operation on a cluster", func(ctx context.Context) {

View file

@@ -37,7 +37,7 @@ var _ = Describe("MIMO Actuator E2E Testing", func() {
})
})
It("Should be able to schedule and run a maintenance set via the admin API", func(ctx context.Context) {
It("Should be able to schedule and run a maintenance set via the admin API", Serial, func(ctx context.Context) {
var oc = &admin.OpenShiftCluster{}
testflag := "aro.e2e.testflag." + uuid.DefaultGenerator.Generate()
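
The two e2e changes above add Ginkgo v2's Serial decorator: on the admin-update Describe container and on the MIMO maintenance-set It. When the suite runs in parallel, Serial specs are held back and executed one at a time on a single process, so these tests can no longer race each other (or other specs) over the same cluster document. A short sketch of the decorator placement, mirroring the hunks above:

// Sketch only: Ginkgo v2 Serial decorator usage, mirroring the changes above.
var _ = Describe("[Admin API] Cluster admin update action", Serial, func() {
	It("must run cluster update operation on a cluster", func(ctx context.Context) {
		// every spec in this container runs serially
	})
})

var _ = Describe("MIMO Actuator E2E Testing", func() {
	It("Should be able to schedule and run a maintenance set via the admin API", Serial, func(ctx context.Context) {
		// only this spec is serial
	})
})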