ARO-RP/pkg/deploy/predeploy.go

561 строка
17 KiB
Go

package deploy
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"path/filepath"
"strings"
"time"
mgmtcompute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute"
azkeyvault "github.com/Azure/azure-sdk-for-go/services/keyvault/v7.0/keyvault"
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/Azure/go-autorest/autorest/to"
"k8s.io/apimachinery/pkg/util/wait"
"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/deploy/assets"
"github.com/Azure/ARO-RP/pkg/deploy/generator"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/util/arm"
"github.com/Azure/ARO-RP/pkg/util/keyvault"
)
const (
// rotateSecretAfter is the maximum age of the latest version of a rotated
// secret: on every deploy of the RP, a secret whose most recent version was
// created more than 7 days ago is replaced with a new one.
rotateSecretAfter = time.Hour * 24 * 7
// rpRestartScript is the shell script run on each VM instance of an RP
// scaleset to restart the aro-monitor, aro-portal and aro-rp services.
rpRestartScript = "systemctl restart aro-monitor; systemctl restart aro-portal; systemctl restart aro-rp"
)
// PreDeploy provisions everything the main deployment relies on, in order:
// subscription-scoped global RBAC, the resource groups, the subscription
// deployment (action groups), the RP and gateway managed identities, the
// global resources (plus the ACR replica where applicable), the gateway and
// RP predeploy templates (NSGs, keyvaults) and finally the service secrets.
//
// lbHealthcheckWaitTimeSec is passed through to configureServiceSecrets.
// The first error encountered aborts the sequence and is returned.
func (d *deployer) PreDeploy(ctx context.Context, lbHealthcheckWaitTimeSec int) error {
	// global RBAC
	if err := d.deployRPGlobalSubscription(ctx); err != nil {
		return err
	}

	cfg := d.config.Configuration

	// subscription resource group
	d.log.Infof("deploying rg %s in %s", *cfg.SubscriptionResourceGroupName, *cfg.SubscriptionResourceGroupLocation)
	if _, err := d.groups.CreateOrUpdate(ctx, *cfg.SubscriptionResourceGroupName, mgmtfeatures.ResourceGroup{
		Location: cfg.SubscriptionResourceGroupLocation,
	}); err != nil {
		return err
	}

	// global resource group
	d.log.Infof("deploying rg %s in %s", *cfg.GlobalResourceGroupName, *cfg.GlobalResourceGroupLocation)
	if _, err := d.globalgroups.CreateOrUpdate(ctx, *cfg.GlobalResourceGroupName, mgmtfeatures.ResourceGroup{
		Location: cfg.GlobalResourceGroupLocation,
	}); err != nil {
		return err
	}

	// RP resource group
	d.log.Infof("deploying rg %s in %s", d.config.RPResourceGroupName, d.config.Location)
	if _, err := d.groups.CreateOrUpdate(ctx, d.config.RPResourceGroupName, mgmtfeatures.ResourceGroup{
		Location: &d.config.Location,
	}); err != nil {
		return err
	}

	// gateway resource group
	d.log.Infof("deploying rg %s in %s", d.config.GatewayResourceGroupName, d.config.Location)
	if _, err := d.groups.CreateOrUpdate(ctx, d.config.GatewayResourceGroupName, mgmtfeatures.ResourceGroup{
		Location: &d.config.Location,
	}); err != nil {
		return err
	}

	// action groups
	if err := d.deployRPSubscription(ctx); err != nil {
		return err
	}

	// RP managed identity
	if err := d.deployManagedIdentity(ctx, d.config.RPResourceGroupName, generator.FileRPProductionManagedIdentity); err != nil {
		return err
	}
	rpMSI, err := d.userassignedidentities.Get(ctx, d.config.RPResourceGroupName, "aro-rp-"+d.config.Location)
	if err != nil {
		return err
	}

	// gateway managed identity
	if err = d.deployManagedIdentity(ctx, d.config.GatewayResourceGroupName, generator.FileGatewayProductionManagedIdentity); err != nil {
		return err
	}
	gwMSI, err := d.userassignedidentities.Get(ctx, d.config.GatewayResourceGroupName, "aro-gateway-"+d.config.Location)
	if err != nil {
		return err
	}

	rpPrincipalID, gwPrincipalID := rpMSI.PrincipalID.String(), gwMSI.PrincipalID.String()

	// ACR RBAC, RP version storage account
	if err = d.deployRPGlobal(ctx, rpPrincipalID, gwPrincipalID); err != nil {
		return err
	}

	// Due to https://github.com/Azure/azure-resource-manager-schemas/issues/1067
	// conditions cannot be used to define the ACR replication object
	// deployment. In addition, an ACR replica cannot be defined in the home
	// registry location, so the replica is skipped there.
	acrLocation := *cfg.GlobalResourceGroupLocation
	if override := cfg.ACRLocationOverride; override != nil && *override != "" {
		acrLocation = *override
	}
	replicaEnabled := cfg.ACRReplicaDisabled == nil || !*cfg.ACRReplicaDisabled
	if replicaEnabled && !strings.EqualFold(d.config.Location, acrLocation) {
		if err = d.deployRPGlobalACRReplication(ctx); err != nil {
			return err
		}
	}

	// NSGs and keyvaults. The gateway goes first because the RP predeploy
	// peers its vnet to the gateway vnet. Whether NSGs are deployed is keyed
	// on the existence of the gateway predeploy deployment, so that the RP
	// NSGs are refreshed when the gateway is deployed for the first time.
	gatewayPredeployName := strings.TrimSuffix(generator.FileGatewayProductionPredeploy, ".json")
	_, err = d.deployments.Get(ctx, d.config.GatewayResourceGroupName, gatewayPredeployName)
	isCreate := isDeploymentNotFoundError(err)
	if err != nil && !isCreate {
		return err
	}

	if err = d.deployPreDeploy(ctx, d.config.GatewayResourceGroupName, generator.FileGatewayProductionPredeploy, "gatewayServicePrincipalId", gwPrincipalID, isCreate); err != nil {
		return err
	}

	if err = d.deployPreDeploy(ctx, d.config.RPResourceGroupName, generator.FileRPProductionPredeploy, "rpServicePrincipalId", rpPrincipalID, isCreate); err != nil {
		return err
	}

	return d.configureServiceSecrets(ctx, lbHealthcheckWaitTimeSec)
}
// deployRPGlobal deploys the embedded RP global template into the global
// resource group as deployment "rp-global-<location>", passing the RP and
// gateway service principal IDs as template parameters.
//
// A failure reported as an azure.ServiceError with code "DeploymentFailed" is
// logged and retried once; any other error, or a second failure, is returned.
func (d *deployer) deployRPGlobal(ctx context.Context, rpServicePrincipalID, gatewayServicePrincipalID string) error {
	const maxAttempts = 2

	deploymentName := "rp-global-" + d.config.Location

	raw, err := assets.EmbeddedFiles.ReadFile(generator.FileRPProductionGlobal)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	params := d.getParameters(template["parameters"].(map[string]interface{}))
	params.Parameters["rpServicePrincipalId"] = &arm.ParametersParameter{Value: rpServicePrincipalID}
	params.Parameters["gatewayServicePrincipalId"] = &arm.ParametersParameter{Value: gatewayServicePrincipalID}

	for attempt := 1; ; attempt++ {
		d.log.Infof("deploying %s", deploymentName)
		err = d.globaldeployments.CreateOrUpdateAndWait(ctx, *d.config.Configuration.GlobalResourceGroupName, deploymentName, mgmtfeatures.Deployment{
			Properties: &mgmtfeatures.DeploymentProperties{
				Template:   template,
				Mode:       mgmtfeatures.Incremental,
				Parameters: params.Parameters,
			},
		})
		if err == nil {
			return nil
		}

		serviceErr, isServiceErr := err.(*azure.ServiceError)
		if !isServiceErr || serviceErr.Code != "DeploymentFailed" || attempt >= maxAttempts {
			return err
		}

		// Can get a Conflict ("Another operation is in progress") on the
		// ACR. Retry once.
		d.log.Print(err)
	}
}
// deployRPGlobalACRReplication deploys the embedded ACR replication template
// into the global resource group as deployment
// "rp-global-acr-replication-<location>", with the "location" template
// parameter set to the configured location.
func (d *deployer) deployRPGlobalACRReplication(ctx context.Context) error {
	raw, err := assets.EmbeddedFiles.ReadFile(generator.FileRPProductionGlobalACRReplication)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	params := d.getParameters(template["parameters"].(map[string]interface{}))
	params.Parameters["location"] = &arm.ParametersParameter{Value: d.config.Location}

	deploymentName := "rp-global-acr-replication-" + d.config.Location
	d.log.Infof("deploying %s", deploymentName)

	deployment := mgmtfeatures.Deployment{
		Properties: &mgmtfeatures.DeploymentProperties{
			Template:   template,
			Mode:       mgmtfeatures.Incremental,
			Parameters: params.Parameters,
		},
	}
	return d.globaldeployments.CreateOrUpdateAndWait(ctx, *d.config.Configuration.GlobalResourceGroupName, deploymentName, deployment)
}
// deployRPGlobalSubscription deploys the embedded global subscription
// template at subscription scope as deployment
// "rp-global-subscription-<location>".
//
// A failure reported as an azure.ServiceError with code "DeploymentFailed" is
// logged and retried, for up to five attempts in total; any other error, or
// the fifth failure, is returned.
func (d *deployer) deployRPGlobalSubscription(ctx context.Context) error {
	const maxAttempts = 5

	deploymentName := "rp-global-subscription-" + d.config.Location

	raw, err := assets.EmbeddedFiles.ReadFile(generator.FileRPProductionGlobalSubscription)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	d.log.Infof("deploying %s", deploymentName)

	for attempt := 1; ; attempt++ {
		err = d.globaldeployments.CreateOrUpdateAtSubscriptionScopeAndWait(ctx, deploymentName, mgmtfeatures.Deployment{
			Properties: &mgmtfeatures.DeploymentProperties{
				Template: template,
				Mode:     mgmtfeatures.Incremental,
			},
			Location: d.config.Configuration.GlobalResourceGroupLocation,
		})
		if err == nil {
			return nil
		}

		serviceErr, isServiceErr := err.(*azure.ServiceError)
		if !isServiceErr || serviceErr.Code != "DeploymentFailed" || attempt >= maxAttempts {
			return err
		}

		// Sometimes we see RoleDefinitionUpdateConflict when multiple RPs
		// are deploying at once. Retry a few times.
		d.log.Print(err)
	}
}
// deployRPSubscription deploys the embedded RP production subscription
// template into the subscription resource group as deployment
// "rp-production-subscription-<location>". The template is deployed without
// parameters.
func (d *deployer) deployRPSubscription(ctx context.Context) error {
	raw, err := assets.EmbeddedFiles.ReadFile(generator.FileRPProductionSubscription)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	deploymentName := "rp-production-subscription-" + d.config.Location
	d.log.Infof("deploying %s", deploymentName)

	deployment := mgmtfeatures.Deployment{
		Properties: &mgmtfeatures.DeploymentProperties{
			Template: template,
			Mode:     mgmtfeatures.Incremental,
		},
	}
	return d.deployments.CreateOrUpdateAndWait(ctx, *d.config.Configuration.SubscriptionResourceGroupName, deploymentName, deployment)
}
// deployManagedIdentity deploys the embedded template deploymentFile into
// resourceGroupName. The deployment is named after the file with its ".json"
// suffix removed, and the template is deployed without parameters.
func (d *deployer) deployManagedIdentity(ctx context.Context, resourceGroupName, deploymentFile string) error {
	raw, err := assets.EmbeddedFiles.ReadFile(deploymentFile)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	deploymentName := strings.TrimSuffix(deploymentFile, ".json")
	d.log.Infof("deploying %s", deploymentName)

	deployment := mgmtfeatures.Deployment{
		Properties: &mgmtfeatures.DeploymentProperties{
			Template: template,
			Mode:     mgmtfeatures.Incremental,
		},
	}
	return d.deployments.CreateOrUpdateAndWait(ctx, resourceGroupName, deploymentName, deployment)
}
// deployPreDeploy deploys the embedded predeploy template deploymentFile into
// resourceGroupName. The deployment is named after the file with its ".json"
// suffix removed.
//
// Template parameters set here: "deployNSGs" (isCreate), the parameter named
// spIDName (spID) and, only when the template declares it,
// "gatewayResourceGroupName".
func (d *deployer) deployPreDeploy(ctx context.Context, resourceGroupName, deploymentFile, spIDName, spID string, isCreate bool) error {
	raw, err := assets.EmbeddedFiles.ReadFile(deploymentFile)
	if err != nil {
		return err
	}

	var template map[string]interface{}
	if err = json.Unmarshal(raw, &template); err != nil {
		return err
	}

	declared := template["parameters"].(map[string]interface{})
	params := d.getParameters(declared)

	params.Parameters["deployNSGs"] = &arm.ParametersParameter{Value: isCreate}

	// Not every predeploy template takes the gateway resource group name;
	// only supply it where it is declared.
	if _, wanted := declared["gatewayResourceGroupName"]; wanted {
		params.Parameters["gatewayResourceGroupName"] = &arm.ParametersParameter{Value: d.config.GatewayResourceGroupName}
	}

	params.Parameters[spIDName] = &arm.ParametersParameter{Value: spID}

	deploymentName := strings.TrimSuffix(deploymentFile, ".json")
	d.log.Infof("deploying %s", deploymentName)

	deployment := mgmtfeatures.Deployment{
		Properties: &mgmtfeatures.DeploymentProperties{
			Template:   template,
			Mode:       mgmtfeatures.Incremental,
			Parameters: params.Parameters,
		},
	}
	return d.deployments.CreateOrUpdateAndWait(ctx, resourceGroupName, deploymentName, deployment)
}
// configureServiceSecrets makes sure all service and portal secrets exist:
// the V2 encryption secrets and the portal session key are created or
// rotated, the legacy encryption secrets are only created when missing, and
// the portal SSH key is created when missing.
//
// If any secret was created or rotated, the existing RP scalesets are
// restarted, with lbHealthcheckWaitTimeSec passed through to the restart.
func (d *deployer) configureServiceSecrets(ctx context.Context, lbHealthcheckWaitTimeSec int) error {
	type secretSpec struct {
		kv   keyvault.Manager
		name string
		size int
	}

	isRotated := false

	rotated := []secretSpec{
		{d.serviceKeyvault, env.EncryptionSecretV2Name, 64},
		{d.serviceKeyvault, env.FrontendEncryptionSecretV2Name, 64},
		{d.portalKeyvault, env.PortalServerSessionKeySecretName, 32},
	}
	for _, spec := range rotated {
		created, err := d.ensureAndRotateSecret(ctx, spec.kv, spec.name, spec.size)
		if err != nil {
			return err
		}
		if created {
			isRotated = true
		}
	}

	// don't rotate legacy secrets
	legacy := []secretSpec{
		{d.serviceKeyvault, env.EncryptionSecretName, 32},
		{d.serviceKeyvault, env.FrontendEncryptionSecretName, 32},
	}
	for _, spec := range legacy {
		created, err := d.ensureSecret(ctx, spec.kv, spec.name, spec.size)
		if err != nil {
			return err
		}
		if created {
			isRotated = true
		}
	}

	created, err := d.ensureSecretKey(ctx, d.portalKeyvault, env.PortalServerSSHKeySecretName)
	if err != nil {
		return err
	}
	if created {
		isRotated = true
	}

	if !isRotated {
		return nil
	}
	return d.restartOldScalesets(ctx, lbHealthcheckWaitTimeSec)
}
// ensureAndRotateSecret creates a new random secret of len bytes named
// secretName in kv, unless the secret already exists and its latest version
// was created less than rotateSecretAfter ago.
//
// isNew is true when a new secret version was written (or the write was
// attempted); err carries any keyvault or creation error.
func (d *deployer) ensureAndRotateSecret(ctx context.Context, kv keyvault.Manager, secretName string, len int) (isNew bool, err error) {
	secrets, err := kv.GetSecrets(ctx)
	if err != nil {
		return false, err
	}

	for i := range secrets {
		if filepath.Base(*secrets[i].ID) != secretName {
			continue
		}

		latest, err := kv.GetSecret(ctx, secretName)
		if err != nil {
			return false, err
		}

		// Leave the secret alone while its latest version is still younger
		// than the rotation period.
		createdAt := time.Unix(0, latest.Attributes.Created.Duration().Nanoseconds())
		rotateAt := createdAt.Add(rotateSecretAfter)
		if time.Now().Before(rotateAt) {
			return false, nil
		}
	}

	return true, d.createSecret(ctx, kv, secretName, len)
}
// ensureSecret creates a random secret of len bytes named secretName in kv
// only when no secret with that name exists yet. An existing secret is never
// replaced.
//
// isNew is true when the secret was written (or the write was attempted);
// err carries any keyvault or creation error.
func (d *deployer) ensureSecret(ctx context.Context, kv keyvault.Manager, secretName string, len int) (isNew bool, err error) {
	secrets, err := kv.GetSecrets(ctx)
	if err != nil {
		return false, err
	}

	for i := range secrets {
		name := filepath.Base(*secrets[i].ID)
		if name == secretName {
			// already present, nothing to do
			return false, nil
		}
	}

	return true, d.createSecret(ctx, kv, secretName, len)
}
// createSecret generates len cryptographically random bytes and stores them,
// base64 (standard encoding) encoded, as secret secretName in kv.
func (d *deployer) createSecret(ctx context.Context, kv keyvault.Manager, secretName string, len int) error {
	random := make([]byte, len)
	if _, err := rand.Read(random); err != nil {
		return err
	}

	encoded := base64.StdEncoding.EncodeToString(random)

	d.log.Infof("setting %s", secretName)
	parameters := azkeyvault.SecretSetParameters{
		Value: to.StringPtr(encoded),
	}
	return kv.SetSecret(ctx, secretName, parameters)
}
// ensureSecretKey stores a freshly generated 2048-bit RSA private key
// (PKCS#1 DER, base64 standard encoding) as secret secretName in kv, only
// when no secret with that name exists yet.
//
// isNew is true when the key was written (or the write was attempted); err
// carries any keyvault or key generation error.
func (d *deployer) ensureSecretKey(ctx context.Context, kv keyvault.Manager, secretName string) (isNew bool, err error) {
	secrets, err := kv.GetSecrets(ctx)
	if err != nil {
		return false, err
	}

	for i := range secrets {
		name := filepath.Base(*secrets[i].ID)
		if name == secretName {
			// already present, nothing to do
			return false, nil
		}
	}

	privateKey, err := rsa.GenerateKey(rand.Reader, 2048)
	if err != nil {
		return false, err
	}
	encoded := base64.StdEncoding.EncodeToString(x509.MarshalPKCS1PrivateKey(privateKey))

	d.log.Infof("setting %s", secretName)
	parameters := azkeyvault.SecretSetParameters{
		Value: to.StringPtr(encoded),
	}
	return true, kv.SetSecret(ctx, secretName, parameters)
}
// restartOldScalesets lists the scalesets in the RP resource group and
// restarts each of them in turn via restartOldScaleset, stopping at the first
// error.
func (d *deployer) restartOldScalesets(ctx context.Context, lbHealthcheckWaitTimeSec int) error {
	scalesets, err := d.vmss.List(ctx, d.config.RPResourceGroupName)
	if err != nil {
		return err
	}

	for i := range scalesets {
		name := *scalesets[i].Name
		if err := d.restartOldScaleset(ctx, name, lbHealthcheckWaitTimeSec); err != nil {
			return err
		}
	}

	return nil
}
// restartOldScaleset restarts the RP services on every VM instance of the
// given scaleset, one instance at a time, waiting for each instance to report
// healthy again before moving on to the next.
//
// vmssName must start with rpVMSSPrefix; otherwise a 400 CloudError is
// returned and nothing is restarted. lbHealthcheckWaitTimeSec is the number of
// seconds to sleep after each restart so the load balancer probe has time to
// update the instance health status. Any error from listing the instances,
// running the restart script or waiting for readiness aborts the loop and is
// returned.
func (d *deployer) restartOldScaleset(ctx context.Context, vmssName string, lbHealthcheckWaitTimeSec int) error {
	if !strings.HasPrefix(vmssName, rpVMSSPrefix) {
		return &api.CloudError{
			StatusCode: http.StatusBadRequest,
			CloudErrorBody: &api.CloudErrorBody{
				Code: api.CloudErrorCodeInvalidResource,
				Message: fmt.Sprintf("provided vmss %s does not match RP prefix",
					vmssName,
				),
			},
		}
	}

	scalesetVMs, err := d.vmssvms.List(ctx, d.config.RPResourceGroupName, vmssName, "", "", "")
	if err != nil {
		return err
	}

	for _, vm := range scalesetVMs {
		d.log.Printf("waiting for restart script to complete on older rp vmss %s, instance %s", vmssName, *vm.InstanceID)
		err = d.vmssvms.RunCommandAndWait(ctx, d.config.RPResourceGroupName, vmssName, *vm.InstanceID, mgmtcompute.RunCommandInput{
			CommandID: to.StringPtr("RunShellScript"),
			Script:    &[]string{rpRestartScript},
		})
		if err != nil {
			return err
		}

		// wait for load balancer probe to change the vm health status
		time.Sleep(time.Duration(lbHealthcheckWaitTimeSec) * time.Second)

		// Each instance gets its own one-hour readiness budget. The context
		// is released as soon as the wait finishes rather than deferred:
		// a defer inside this loop would keep every per-instance context and
		// its timer alive until the whole scaleset has been processed.
		timeoutCtx, cancel := context.WithTimeout(ctx, time.Hour)
		err = d.waitForReadiness(timeoutCtx, vmssName, *vm.InstanceID)
		cancel()
		if err != nil {
			return err
		}
	}

	return nil
}
// waitForReadiness polls the health of the given VM instance in the RP
// resource group, immediately and then every 10 seconds, until
// isVMInstanceHealthy reports it healthy or ctx is done.
func (d *deployer) waitForReadiness(ctx context.Context, vmssName string, vmInstanceID string) error {
	const pollInterval = 10 * time.Second

	instanceHealthy := func() (bool, error) {
		healthy := d.isVMInstanceHealthy(ctx, d.config.RPResourceGroupName, vmssName, vmInstanceID)
		return healthy, nil
	}

	return wait.PollImmediateUntil(pollInterval, instanceHealthy, ctx.Done())
}
// isVMInstanceHealthy reports whether the given scaleset VM instance is
// healthy according to its instance view.
//
// It returns false when the instance view cannot be fetched, or when the view
// carries a health status code other than "HealthState/healthy". An instance
// view without any health status code is treated as healthy.
func (d *deployer) isVMInstanceHealthy(ctx context.Context, resourceGroupName string, vmssName string, vmInstanceID string) bool {
	r, err := d.vmssvms.GetInstanceView(ctx, resourceGroupName, vmssName, vmInstanceID)
	if err != nil {
		// Include the underlying failure in the log: the caller only sees
		// "not healthy" and keeps polling, so without this an API error would
		// be indistinguishable from a VM that is merely still starting up.
		d.log.Printf("instance %s is unhealthy: %v", vmInstanceID, err)
		return false
	}

	health := r.VMHealth
	if health != nil && health.Status != nil && health.Status.Code != nil && *health.Status.Code != "HealthState/healthy" {
		d.log.Printf("instance %s is unhealthy", vmInstanceID)
		return false
	}

	return true
}