ARO-RP/pkg/backend/openshiftcluster.go

package backend

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
	"context"
	"fmt"
	"net/http"
	"strings"
	"sync/atomic"
	"time"

	"github.com/Azure/go-autorest/autorest/azure"
	"github.com/Azure/go-autorest/autorest/to"
	"github.com/sirupsen/logrus"

	"github.com/Azure/ARO-RP/pkg/api"
	"github.com/Azure/ARO-RP/pkg/cluster"
	"github.com/Azure/ARO-RP/pkg/database"
	"github.com/Azure/ARO-RP/pkg/env"
	"github.com/Azure/ARO-RP/pkg/hive"
	"github.com/Azure/ARO-RP/pkg/metrics"
	"github.com/Azure/ARO-RP/pkg/util/billing"
	"github.com/Azure/ARO-RP/pkg/util/encryption"
	utillog "github.com/Azure/ARO-RP/pkg/util/log"
	"github.com/Azure/ARO-RP/pkg/util/recover"
)

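// openShiftClusterBackend is the backend worker for OpenShiftClusterDocuments.
// newManager constructs the cluster manager that carries out the dequeued
// operation; it is held as a field so it can be substituted, for example in
// unit tests.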
type openShiftClusterBackend struct {
	*backend

	newManager func(context.Context, *logrus.Entry, env.Interface, database.OpenShiftClusters, database.Gateway, database.OpenShiftVersions, encryption.AEAD, billing.Manager, *api.OpenShiftClusterDocument, *api.SubscriptionDocument, hive.ClusterManager, metrics.Emitter) (cluster.Interface, error)
}

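// newOpenShiftClusterBackend returns an openShiftClusterBackend backed by the
// shared backend state, using cluster.New as its manager constructor.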
func newOpenShiftClusterBackend(b *backend) *openShiftClusterBackend {
	return &openShiftClusterBackend{
		backend:    b,
		newManager: cluster.New,
	}
}

// try tries to dequeue an OpenShiftClusterDocument for work and works it on a
// new goroutine. It returns a boolean indicating whether it succeeded in
// dequeuing anything; if false, the caller should sleep before calling again.
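//
// A caller typically drives try in a polling loop, sleeping whenever nothing
// was dequeued. A minimal sketch (the actual driving loop elsewhere in this
// package may differ):
//
//	for {
//		didWork, err := ocb.try(ctx)
//		if err != nil {
//			log.Error(err)
//		}
//		if !didWork {
//			time.Sleep(time.Second)
//		}
//	}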
func (ocb *openShiftClusterBackend) try(ctx context.Context) (bool, error) {
	doc, err := ocb.dbOpenShiftClusters.Dequeue(ctx)
	if err != nil || doc == nil {
		return false, err
	}

	log := ocb.baseLog
	log = utillog.EnrichWithResourceID(log, doc.OpenShiftCluster.ID)
	log = utillog.EnrichWithCorrelationData(log, doc.CorrelationData)
	log = utillog.EnrichWithClusterVersion(log, doc.OpenShiftCluster.Properties.ClusterProfile.Version)
	log = utillog.EnrichWithClusterDeploymentNamespace(log, doc.OpenShiftCluster.Properties.HiveProfile.Namespace)

	if doc.Dequeues > maxDequeueCount {
		err := fmt.Errorf("dequeued %d times, failing", doc.Dequeues)
		return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err)
	}

	log.Print("dequeued")
	atomic.AddInt32(&ocb.workers, 1)
	ocb.m.EmitGauge("backend.openshiftcluster.workers.count", int64(atomic.LoadInt32(&ocb.workers)), nil)

	go func() {
		defer recover.Panic(log)

		t := time.Now()

		defer func() {
			atomic.AddInt32(&ocb.workers, -1)
			ocb.m.EmitGauge("backend.openshiftcluster.workers.count", int64(atomic.LoadInt32(&ocb.workers)), nil)
			ocb.cond.Signal()

			log.WithField("duration", time.Since(t).Seconds()).Print("done")
		}()

		err := ocb.handle(context.Background(), log, doc)
		if err != nil {
			log.Error(err)
		}
	}()

	return true, nil
}

// handle performs the backend operation recorded on the dequeued document
// (create, update, admin update or delete) while a heartbeat goroutine keeps
// the document lease alive, releasing the lease (or deleting the document)
// when the operation ends.
func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	stop := ocb.heartbeat(ctx, cancel, log, doc)
	defer stop()

	r, err := azure.ParseResourceID(doc.OpenShiftCluster.ID)
	if err != nil {
		return err
	}

	subscriptionDoc, err := ocb.dbSubscriptions.Get(ctx, r.SubscriptionID)
	if err != nil {
		return err
	}

	// Only attempt to access Hive if we are installing via Hive or adopting clusters
	installViaHive, err := ocb.env.LiveConfig().InstallViaHive(ctx)
	if err != nil {
		return err
	}

	adoptViaHive, err := ocb.env.LiveConfig().AdoptByHive(ctx)
	if err != nil {
		return err
	}

	var hr hive.ClusterManager
	if installViaHive || adoptViaHive {
		hiveShard := 1
		hiveRestConfig, err := ocb.env.LiveConfig().HiveRestConfig(ctx, hiveShard)
		if err != nil {
			return fmt.Errorf("failed getting RESTConfig for Hive shard %d: %w", hiveShard, err)
		}

		hr, err = hive.NewFromConfig(log, ocb.env, hiveRestConfig)
		if err != nil {
			return fmt.Errorf("failed creating HiveClusterManager: %w", err)
		}
	}

	m, err := ocb.newManager(ctx, log, ocb.env, ocb.dbOpenShiftClusters, ocb.dbGateway, ocb.dbOpenShiftVersions, ocb.aead, ocb.billing, doc, subscriptionDoc, hr, ocb.m)
	if err != nil {
		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err)
	}

	switch doc.OpenShiftCluster.Properties.ProvisioningState {
	case api.ProvisioningStateCreating:
		log.Print("creating")

		err = m.Install(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err)
		}

		// Re-get the document and check the state:
		// if Install == nil, we are done with the install;
		// if Install != nil, we need to terminate, release the lease and let
		// another backend worker pick up the next install phase.
		doc, err = ocb.dbOpenShiftClusters.Get(ctx, strings.ToLower(doc.OpenShiftCluster.ID))
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err)
		}

		if doc.OpenShiftCluster.Properties.Install == nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateSucceeded, nil)
		}

		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateCreating, nil)

	case api.ProvisioningStateAdminUpdating:
		log.Printf("admin updating (type: %s)", doc.OpenShiftCluster.Properties.MaintenanceTask)

		err = m.AdminUpdate(ctx)
		if err != nil {
			// Customer will continue to see the cluster in an ongoing maintenance state
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err)
		}

		// Maintenance task is complete, so we can clear the maintenance state
		doc, err = ocb.setNoMaintenanceState(ctx, doc)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err)
		}

		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateSucceeded, nil)

	case api.ProvisioningStateUpdating:
		log.Print("updating")

		err = m.Update(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateFailed, err)
		}

		return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateSucceeded, nil)

	case api.ProvisioningStateDeleting:
		log.Print("deleting")

		t := time.Now()

		err = m.Delete(ctx)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err)
		}

		err = ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, nil, api.ProvisioningStateSucceeded, "", nil)
		if err != nil {
			return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err)
		}

		stop()

		// This Sleep ensures that the monitor has enough time
		// to capture the deletion (by reading from the changefeed)
		// and stop monitoring the cluster.
		// TODO: Provide better communication between RP and Monitor
		time.Sleep(time.Until(t.Add(time.Second * 20)))

		return ocb.dbOpenShiftClusters.Delete(ctx, doc)
	}

	return fmt.Errorf("unexpected provisioningState %q", doc.OpenShiftCluster.Properties.ProvisioningState)
}

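// heartbeat starts a goroutine that renews the lease on doc immediately and
// then every 10 seconds; if a renewal fails, it logs the error and cancels the
// operation context. It returns an idempotent stop function which halts
// renewal and waits for the goroutine to exit.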
func (ocb *openShiftClusterBackend) heartbeat(ctx context.Context, cancel context.CancelFunc, log *logrus.Entry, doc *api.OpenShiftClusterDocument) func() {
	var stopped bool
	stop, done := make(chan struct{}), make(chan struct{})

	go func() {
		defer recover.Panic(log)
		defer close(done)

		t := time.NewTicker(10 * time.Second)
		defer t.Stop()

		for {
			_, err := ocb.dbOpenShiftClusters.Lease(ctx, doc.Key)
			if err != nil {
				log.Error(err)
				cancel()
				return
			}

			select {
			case <-t.C:
			case <-stop:
				return
			}
		}
	}()

	return func() {
		if !stopped {
			close(stop)
			<-done
			stopped = true
		}
	}
}

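// updateAsyncOperation records the outcome of the operation on the async
// operation document identified by id (a no-op if id is empty): it sets the
// final provisioning state and end time, attaches the CloudError on failure
// (or a generic internal server error for any other error type) and, when oc
// is provided, stores a copy of the cluster reflecting the final state.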
func (ocb *openShiftClusterBackend) updateAsyncOperation(ctx context.Context, log *logrus.Entry, id string, oc *api.OpenShiftCluster, provisioningState, failedProvisioningState api.ProvisioningState, backendErr error) error {
	if id != "" {
		_, err := ocb.dbAsyncOperations.Patch(ctx, id, func(asyncdoc *api.AsyncOperationDocument) error {
			asyncdoc.AsyncOperation.ProvisioningState = provisioningState

			now := time.Now()
			asyncdoc.AsyncOperation.EndTime = &now

			if provisioningState == api.ProvisioningStateFailed {
				// If the error is a CloudError, propagate it to the async
				// operation's error field; otherwise record a generic error.
				err, ok := backendErr.(*api.CloudError)
				if ok {
					log.Print(backendErr)
					asyncdoc.AsyncOperation.Error = err.CloudErrorBody
				} else {
					log.Error(backendErr)
					asyncdoc.AsyncOperation.Error = &api.CloudErrorBody{
						Code:    api.CloudErrorCodeInternalServerError,
						Message: "Internal server error.",
					}
				}
			}

			if oc != nil {
				//nolint:govet
				ocCopy := *oc
				ocCopy.Properties.ProvisioningState = provisioningState
				ocCopy.Properties.LastProvisioningState = ""
				ocCopy.Properties.FailedProvisioningState = failedProvisioningState

				asyncdoc.OpenShiftCluster = &ocCopy
			}

			return nil
		})
		if err != nil {
			return err
		}
	}

	return nil
}

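// endLease releases the lease on doc and records the final provisioning state
// (and, on failure, the failed provisioning state). When the final state is
// terminal it also updates the async operation document, logs the operation
// result and emits metrics. For admin updates the previous provisioning state
// is restored and the admin update error, if any, is recorded on the document.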
func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) error {
	var adminUpdateError *string
	var failedProvisioningState api.ProvisioningState
	initialProvisioningState := doc.OpenShiftCluster.Properties.ProvisioningState
	if initialProvisioningState != api.ProvisioningStateAdminUpdating &&
		provisioningState == api.ProvisioningStateFailed {
		failedProvisioningState = initialProvisioningState
	}

	// If the cluster is in a non-terminal state, we are still in the same
	// operational context and the AsyncOperation should not be updated.
	if provisioningState.IsTerminal() {
		err := ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, doc.OpenShiftCluster, provisioningState, failedProvisioningState, backendErr)
		if err != nil {
			return err
		}

		ocb.asyncOperationResultLog(log, initialProvisioningState, backendErr)
		ocb.emitMetrics(log, doc, operationType, provisioningState, nil)
		ocb.emitProvisioningMetrics(doc, provisioningState)
	}

	if initialProvisioningState == api.ProvisioningStateAdminUpdating {
		provisioningState = doc.OpenShiftCluster.Properties.LastProvisioningState
		failedProvisioningState = doc.OpenShiftCluster.Properties.FailedProvisioningState

		if backendErr == nil {
			adminUpdateError = to.StringPtr("")
		} else {
			adminUpdateError = to.StringPtr(backendErr.Error())
		}
	}

	if stop != nil {
		stop()
	}

	ocb.emitMetrics(log, doc, operationType, provisioningState, nil)

	_, err := ocb.dbOpenShiftClusters.EndLease(ctx, doc.Key, provisioningState, failedProvisioningState, adminUpdateError)
	return err
}

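// asyncOperationResultLog emits a structured "asyncqos" log entry describing
// the outcome of the long running operation. CloudError status codes are
// mapped to result types; the missing 'puid'/'altsecid'/'oid' claims error is
// rewritten into an InvalidServicePrincipalClaims CloudError before
// classification.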
func (ocb *openShiftClusterBackend) asyncOperationResultLog(log *logrus.Entry, initialProvisioningState api.ProvisioningState, backendErr error) {
	log = log.WithFields(logrus.Fields{
		"LOGKIND":       "asyncqos",
		"resultType":    utillog.SuccessResultType,
		"operationType": initialProvisioningState.String(),
	})

	if backendErr == nil {
		log.Info("long running operation succeeded")
		return
	}

	if strings.Contains(strings.ToLower(backendErr.Error()), "one of the claims 'puid' or 'altsecid' or 'oid' should be present") {
		backendErr = api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidServicePrincipalClaims,
			"properties.servicePrincipalProfile", "The Azure Red Hat Openshift resource provider service principal has been removed from your tenant. To restore, please unregister and then re-register the Azure Red Hat OpenShift resource provider.")
	}

	err, ok := backendErr.(*api.CloudError)
	if ok {
		resultType := utillog.MapStatusCodeToResultType(err.StatusCode)
		log = log.WithField("resultType", resultType)

		if resultType == utillog.SuccessResultType {
			log.Info("long running operation succeeded")
			return
		}
	} else {
		log = log.WithField("resultType", utillog.ServerErrorResultType)
	}

	log = log.WithField("errorDetails", backendErr.Error())
	log.Info("long running operation failed")
}

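// setNoMaintenanceState clears the cluster's maintenance state once an admin
// update has completed.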
func (ocb *openShiftClusterBackend) setNoMaintenanceState(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) {
	return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, func(doc *api.OpenShiftClusterDocument) error {
		doc.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateNone
		return nil
	})
}