package backend

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
	"context"
	"fmt"
	"net/http"
	"strings"
	"sync/atomic"
	"time"

	"github.com/Azure/go-autorest/autorest/azure"
	"github.com/Azure/go-autorest/autorest/to"
	"github.com/sirupsen/logrus"

	"github.com/Azure/ARO-RP/pkg/api"
	"github.com/Azure/ARO-RP/pkg/cluster"
	"github.com/Azure/ARO-RP/pkg/database"
	"github.com/Azure/ARO-RP/pkg/env"
	"github.com/Azure/ARO-RP/pkg/hive"
	"github.com/Azure/ARO-RP/pkg/metrics"
	"github.com/Azure/ARO-RP/pkg/util/billing"
	"github.com/Azure/ARO-RP/pkg/util/encryption"
	utillog "github.com/Azure/ARO-RP/pkg/util/log"
	"github.com/Azure/ARO-RP/pkg/util/recover"
)

type openShiftClusterBackend struct {
	*backend

	newManager func(context.Context, *logrus.Entry, env.Interface, database.OpenShiftClusters, database.Gateway, database.OpenShiftVersions, encryption.AEAD, billing.Manager, *api.OpenShiftClusterDocument, *api.SubscriptionDocument, hive.ClusterManager, metrics.Emitter) (cluster.Interface, error)
}
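
// newOpenShiftClusterBackend returns an openShiftClusterBackend wired to the
// production cluster manager constructor: newManager defaults to cluster.New.
// Keeping the constructor behind a struct field, rather than calling
// cluster.New directly, presumably allows tests to substitute a fake manager.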
func newOpenShiftClusterBackend(b *backend) *openShiftClusterBackend {
	return &openShiftClusterBackend{
		backend:    b,
		newManager: cluster.New,
	}
}
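
// maxDequeueCount is defined elsewhere in this package; it bounds how many
// times a document may be dequeued before the operation is declared failed in
// the check below.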

// try tries to dequeue an OpenShiftClusterDocument for work, and works it on a
// new goroutine. It returns a boolean to the caller indicating whether it
// succeeded in dequeuing anything; if this is false, the caller should sleep
// before calling again.
func (ocb *openShiftClusterBackend) try(ctx context.Context) (bool, error) {
	doc, err := ocb.dbOpenShiftClusters.Dequeue(ctx)
	if err != nil || doc == nil {
		return false, err
	}

	log := ocb.baseLog
	log = utillog.EnrichWithResourceID(log, doc.OpenShiftCluster.ID)
	log = utillog.EnrichWithCorrelationData(log, doc.CorrelationData)
	log = utillog.EnrichWithClusterVersion(log, doc.OpenShiftCluster.Properties.ClusterProfile.Version)
	log = utillog.EnrichWithClusterDeploymentNamespace(log, doc.OpenShiftCluster.Properties.HiveProfile.Namespace)

	if doc.Dequeues > maxDequeueCount {
		err := fmt.Errorf("dequeued %d times, failing", doc.Dequeues)
		return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err)
	}

	log.Print("dequeued")
	atomic.AddInt32(&ocb.workers, 1)
	ocb.m.EmitGauge("backend.openshiftcluster.workers.count", int64(atomic.LoadInt32(&ocb.workers)), nil)

	go func() {
		defer recover.Panic(log)

		t := time.Now()

		defer func() {
			atomic.AddInt32(&ocb.workers, -1)
			ocb.m.EmitGauge("backend.openshiftcluster.workers.count", int64(atomic.LoadInt32(&ocb.workers)), nil)
			ocb.cond.Signal()

			log.WithField("duration", time.Since(t).Seconds()).Print("done")
		}()

		err := ocb.handle(context.Background(), log, doc)
		if err != nil {
			log.Error(err)
		}
	}()

	return true, nil
}
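
// A minimal sketch of how a worker loop might drive try. This is illustrative
// only: the real loop lives elsewhere in the backend and its exact shape is
// an assumption here.
//
//	for {
//		didWork, err := ocb.try(ctx)
//		if err != nil {
//			log.Error(err)
//		}
//		if !didWork {
//			time.Sleep(10 * time.Second) // back off while the queue is empty
//		}
//	}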

// handle is responsible for handling the backend operation and its lease.
func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	stop := ocb.heartbeat(ctx, cancel, log, doc)
	defer stop()

	r, err := azure.ParseResourceID(doc.OpenShiftCluster.ID)
	if err != nil {
		return err
	}

	subscriptionDoc, err := ocb.dbSubscriptions.Get(ctx, r.SubscriptionID)
	if err != nil {
		return err
	}

	// Only attempt to access Hive if we are installing via Hive or adopting
	// clusters.
	installViaHive, err := ocb.env.LiveConfig().InstallViaHive(ctx)
	if err != nil {
		return err
	}

	adoptViaHive, err := ocb.env.LiveConfig().AdoptByHive(ctx)
	if err != nil {
		return err
	}

	var hr hive.ClusterManager
	if installViaHive || adoptViaHive {
		hiveShard := 1
		hiveRestConfig, err := ocb.env.LiveConfig().HiveRestConfig(ctx, hiveShard)
		if err != nil {
			return fmt.Errorf("failed getting RESTConfig for Hive shard %d: %w", hiveShard, err)
		}
		hr, err = hive.NewFromConfig(log, ocb.env, hiveRestConfig)
		if err != nil {
			return fmt.Errorf("failed creating HiveClusterManager: %w", err)
		}
	}

	m, err := ocb.newManager(ctx, log, ocb.env, ocb.dbOpenShiftClusters, ocb.dbGateway, ocb.dbOpenShiftVersions, ocb.aead, ocb.billing, doc, subscriptionDoc, hr, ocb.m)
	if err != nil {
		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err)
	}

	switch doc.OpenShiftCluster.Properties.ProvisioningState {
	case api.ProvisioningStateCreating:
		log.Print("creating")

		err = m.Install(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err)
		}
		// Re-get the document and check the state:
		// if Install == nil, we are done with the install;
		// if Install != nil, we need to terminate, release the lease and let
		// another backend worker pick up the next install phase.
		doc, err = ocb.dbOpenShiftClusters.Get(ctx, strings.ToLower(doc.OpenShiftCluster.ID))
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err)
		}
		if doc.OpenShiftCluster.Properties.Install == nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateSucceeded, nil)
		}
		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateCreating, nil)

	case api.ProvisioningStateAdminUpdating:
		log.Printf("admin updating (type: %s)", doc.OpenShiftCluster.Properties.MaintenanceTask)

		err = m.AdminUpdate(ctx)
		if err != nil {
			// The customer will continue to see the cluster in an ongoing
			// maintenance state.
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err)
		}
		// The maintenance task is complete, so we can clear the maintenance
		// state.
		doc, err = ocb.setNoMaintenanceState(ctx, doc)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err)
		}
		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateSucceeded, nil)

	case api.ProvisioningStateUpdating:
		log.Print("updating")

		err = m.Update(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateFailed, err)
		}
		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateSucceeded, nil)

	case api.ProvisioningStateDeleting:
		log.Print("deleting")
		t := time.Now()

		err = m.Delete(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err)
		}

		err = ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, nil, api.ProvisioningStateSucceeded, "", nil)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err)
		}

		stop()

		// This Sleep ensures that the monitor has enough time
		// to capture the deletion (by reading from the changefeed)
		// and stop monitoring the cluster.
		// TODO: provide better communication between the RP and the monitor.
		time.Sleep(time.Until(t.Add(time.Second * 20)))
		return ocb.dbOpenShiftClusters.Delete(ctx, doc)
	}

	return fmt.Errorf("unexpected provisioningState %q", doc.OpenShiftCluster.Properties.ProvisioningState)
}
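
// heartbeat starts a goroutine which renews the lease on the cluster document
// every 10 seconds. If a renewal fails, it cancels the operation's context so
// the in-flight work is abandoned. The returned stop function is idempotent:
// it closes the stop channel once and waits for the goroutine to exit.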
func (ocb *openShiftClusterBackend) heartbeat(ctx context.Context, cancel context.CancelFunc, log *logrus.Entry, doc *api.OpenShiftClusterDocument) func() {
	var stopped bool
	stop, done := make(chan struct{}), make(chan struct{})

	go func() {
		defer recover.Panic(log)

		defer close(done)

		t := time.NewTicker(10 * time.Second)
		defer t.Stop()

		for {
			_, err := ocb.dbOpenShiftClusters.Lease(ctx, doc.Key)
			if err != nil {
				log.Error(err)
				cancel()
				return
			}

			select {
			case <-t.C:
			case <-stop:
				return
			}
		}
	}()

	return func() {
		if !stopped {
			close(stop)
			<-done
			stopped = true
		}
	}
}
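
// updateAsyncOperation patches the async operation document (if any) with the
// final provisioning state and end time and, on failure, with either the
// propagated CloudError or a generic internal server error. When oc is
// non-nil, a copy of the cluster resource is attached so that the async
// operation result reflects the final cluster state.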
func (ocb *openShiftClusterBackend) updateAsyncOperation(ctx context.Context, log *logrus.Entry, id string, oc *api.OpenShiftCluster, provisioningState, failedProvisioningState api.ProvisioningState, backendErr error) error {
	if id != "" {
		_, err := ocb.dbAsyncOperations.Patch(ctx, id, func(asyncdoc *api.AsyncOperationDocument) error {
			asyncdoc.AsyncOperation.ProvisioningState = provisioningState

			now := time.Now()
			asyncdoc.AsyncOperation.EndTime = &now

			if provisioningState == api.ProvisioningStateFailed {
				// If the error is a CloudError, propagate it to the async
				// operation's error field; otherwise record a generic error.
				err, ok := backendErr.(*api.CloudError)
				if ok {
					log.Print(backendErr)
					asyncdoc.AsyncOperation.Error = err.CloudErrorBody
				} else {
					log.Error(backendErr)
					asyncdoc.AsyncOperation.Error = &api.CloudErrorBody{
						Code:    api.CloudErrorCodeInternalServerError,
						Message: "Internal server error.",
					}
				}
			}

			if oc != nil {
				//nolint:govet
				ocCopy := *oc
				ocCopy.Properties.ProvisioningState = provisioningState
				ocCopy.Properties.LastProvisioningState = ""
				ocCopy.Properties.FailedProvisioningState = failedProvisioningState

				asyncdoc.OpenShiftCluster = &ocCopy
			}

			return nil
		})
		if err != nil {
			return err
		}
	}

	return nil
}
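
// endLease records the outcome of a backend operation: for terminal states it
// updates the async operation, logs the result and emits metrics; for admin
// updates it restores the pre-update provisioning state and records any admin
// update error; finally it stops the heartbeat (if one was passed) and
// releases the document lease.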
func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) error {
	var adminUpdateError *string
	var failedProvisioningState api.ProvisioningState
	initialProvisioningState := doc.OpenShiftCluster.Properties.ProvisioningState

	if initialProvisioningState != api.ProvisioningStateAdminUpdating &&
		provisioningState == api.ProvisioningStateFailed {
		failedProvisioningState = initialProvisioningState
	}

	// If the cluster is in a non-terminal state, we are still in the same
	// operational context and the AsyncOperation should not be updated.
	if provisioningState.IsTerminal() {
		err := ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, doc.OpenShiftCluster, provisioningState, failedProvisioningState, backendErr)
		if err != nil {
			return err
		}
		ocb.asyncOperationResultLog(log, initialProvisioningState, backendErr)
		ocb.emitMetrics(log, doc, operationType, provisioningState, nil)
		ocb.emitProvisioningMetrics(doc, provisioningState)
	}

	if initialProvisioningState == api.ProvisioningStateAdminUpdating {
		provisioningState = doc.OpenShiftCluster.Properties.LastProvisioningState
		failedProvisioningState = doc.OpenShiftCluster.Properties.FailedProvisioningState

		if backendErr == nil {
			adminUpdateError = to.StringPtr("")
		} else {
			adminUpdateError = to.StringPtr(backendErr.Error())
		}
	}

	if stop != nil {
		stop()
	}

	ocb.emitMetrics(log, doc, operationType, provisioningState, nil)

	_, err := ocb.dbOpenShiftClusters.EndLease(ctx, doc.Key, provisioningState, failedProvisioningState, adminUpdateError)
	return err
}
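
// asyncOperationResultLog emits a structured QoS log line ("asyncqos")
// describing the result of the long-running operation, mapping CloudError
// status codes to a result type and classifying everything else as a server
// error.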
func (ocb *openShiftClusterBackend) asyncOperationResultLog(log *logrus.Entry, initialProvisioningState api.ProvisioningState, backendErr error) {
	log = log.WithFields(logrus.Fields{
		"LOGKIND":       "asyncqos",
		"resultType":    utillog.SuccessResultType,
		"operationType": initialProvisioningState.String(),
	})

	if backendErr == nil {
		log.Info("long running operation succeeded")
		return
	}

	if strings.Contains(strings.ToLower(backendErr.Error()), "one of the claims 'puid' or 'altsecid' or 'oid' should be present") {
		backendErr = api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidServicePrincipalClaims,
			"properties.servicePrincipalProfile", "The Azure Red Hat Openshift resource provider service principal has been removed from your tenant. To restore, please unregister and then re-register the Azure Red Hat OpenShift resource provider.")
	}

	err, ok := backendErr.(*api.CloudError)
	if ok {
		resultType := utillog.MapStatusCodeToResultType(err.StatusCode)
		log = log.WithField("resultType", resultType)

		if resultType == utillog.SuccessResultType {
			log.Info("long running operation succeeded")
			return
		}
	} else {
		log = log.WithField("resultType", utillog.ServerErrorResultType)
	}

	log = log.WithField("errorDetails", backendErr.Error())
	log.Info("long running operation failed")
}
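
// setNoMaintenanceState clears the cluster's maintenance state once an admin
// update has completed.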
func (ocb *openShiftClusterBackend) setNoMaintenanceState(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) {
	return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, func(doc *api.OpenShiftClusterDocument) error {
		doc.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateNone
		return nil
	})
}