From 4f4b8e72b28b83b1a64bda52184b591ac1580b52 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Thu, 17 Sep 2020 09:36:41 -0700 Subject: [PATCH] feat: update VMSS node pools (#3830) --- cmd/root.go | 1 + cmd/root_test.go | 2 +- cmd/scale.go | 37 +++- cmd/update.go | 260 ++++++++++++++++++++++++ cmd/updated_test.go | 100 +++++++++ docs/community/developer-guide.md | 1 + test/e2e/cluster.sh | 42 +++- test/e2e/test_cluster_configs/base.json | 1 + 8 files changed, 433 insertions(+), 11 deletions(-) create mode 100644 cmd/update.go create mode 100644 cmd/updated_test.go diff --git a/cmd/root.go b/cmd/root.go index 74a5c0219..4803954ed 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -70,6 +70,7 @@ func NewRootCmd() *cobra.Command { rootCmd.AddCommand(newOrchestratorsCmd()) rootCmd.AddCommand(newUpgradeCmd()) rootCmd.AddCommand(newScaleCmd()) + rootCmd.AddCommand(newUpdateCmd()) rootCmd.AddCommand(newRotateCertsCmd()) rootCmd.AddCommand(newAddPoolCmd()) rootCmd.AddCommand(newGetLocationsCmd()) diff --git a/cmd/root_test.go b/cmd/root_test.go index c38268978..c61cccb3f 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -49,7 +49,7 @@ func TestNewRootCmd(t *testing.T) { t.Fatalf("root command should have use %s equal %s, short %s equal %s and long %s equal to %s", command.Use, rootName, command.Short, rootShortDescription, command.Long, rootLongDescription) } // The commands need to be listed in alphabetical order - expectedCommands := []*cobra.Command{newAddPoolCmd(), getCompletionCmd(command), newDeployCmd(), newGenerateCmd(), newGetLocationsCmd(), newGetLogsCmd(), newGetSkusCmd(), newGetVersionsCmd(), newOrchestratorsCmd(), newRotateCertsCmd(), newScaleCmd(), newUpgradeCmd(), newVersionCmd()} + expectedCommands := []*cobra.Command{newAddPoolCmd(), getCompletionCmd(command), newDeployCmd(), newGenerateCmd(), newGetLocationsCmd(), newGetLogsCmd(), newGetSkusCmd(), newGetVersionsCmd(), newOrchestratorsCmd(), newRotateCertsCmd(), newScaleCmd(), newUpdateCmd(), newUpgradeCmd(), newVersionCmd()} rc := command.Commands() for i, c := range expectedCommands { diff --git a/cmd/scale.go b/cmd/scale.go index b5e667996..36b74c872 100644 --- a/cmd/scale.go +++ b/cmd/scale.go @@ -44,6 +44,12 @@ type scaleCmd struct { agentPoolToScale string masterFQDN string + // lib input + updateVMSSModel bool + validateCmd bool + loadAPIModel bool + persistAPIModel bool + // derived containerService *api.ContainerService apiVersion string @@ -67,7 +73,12 @@ const ( // NewScaleCmd run a command to upgrade a Kubernetes cluster func newScaleCmd() *cobra.Command { - sc := scaleCmd{} + sc := scaleCmd{ + validateCmd: true, + updateVMSSModel: false, + loadAPIModel: true, + persistAPIModel: true, + } scaleCmd := &cobra.Command{ Use: scaleName, @@ -141,6 +152,7 @@ func (sc *scaleCmd) load() error { ctx, cancel := context.WithTimeout(context.Background(), armhelpers.DefaultARMOperationTimeout) defer cancel() + sc.updateVMSSModel = true if sc.apiModelPath == "" { sc.apiModelPath = filepath.Join(sc.deploymentDirectory, apiModelFilename) @@ -236,11 +248,15 @@ func (sc *scaleCmd) load() error { } func (sc *scaleCmd) run(cmd *cobra.Command, args []string) error { - if err := sc.validate(cmd); err != nil { - return errors.Wrap(err, "failed to validate scale command") + if sc.validateCmd { + if err := sc.validate(cmd); err != nil { + return errors.Wrap(err, "failed to validate scale command") + } } - if err := sc.load(); err != nil { - return errors.Wrap(err, "failed to load existing container service") + if sc.loadAPIModel { + if err := sc.load(); err != nil { + return errors.Wrap(err, "failed to load existing container service") + } } ctx, cancel := context.WithTimeout(context.Background(), armhelpers.DefaultARMOperationTimeout) @@ -382,7 +398,9 @@ func (sc *scaleCmd) run(cmd *cobra.Command, args []string) error { } } - return sc.saveAPIModel() + if sc.persistAPIModel { + return sc.saveAPIModel() + } } } else { for vmssListPage, err := sc.client.ListVirtualMachineScaleSets(ctx, sc.resourceGroupName); vmssListPage.NotDone(); err = vmssListPage.NextWithContext(ctx) { @@ -409,7 +427,7 @@ func (sc *scaleCmd) run(cmd *cobra.Command, args []string) error { if vmss.Sku != nil { currentNodeCount = int(*vmss.Sku.Capacity) - if int(*vmss.Sku.Capacity) == sc.newDesiredAgentCount { + if int(*vmss.Sku.Capacity) == sc.newDesiredAgentCount && !sc.updateVMSSModel { sc.printScaleTargetEqualsExisting(currentNodeCount) return nil } else if int(*vmss.Sku.Capacity) > sc.newDesiredAgentCount { @@ -533,7 +551,10 @@ func (sc *scaleCmd) run(cmd *cobra.Command, args []string) error { } } - return sc.saveAPIModel() + if sc.persistAPIModel { + return sc.saveAPIModel() + } + return nil } func (sc *scaleCmd) saveAPIModel() error { diff --git a/cmd/update.go b/cmd/update.go new file mode 100644 index 000000000..cbf428ebc --- /dev/null +++ b/cmd/update.go @@ -0,0 +1,260 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +package cmd + +import ( + "context" + "fmt" + "os" + "strconv" + + "github.com/Azure/aks-engine/pkg/api" + "github.com/Azure/aks-engine/pkg/armhelpers" + "github.com/Azure/aks-engine/pkg/engine" + "github.com/Azure/aks-engine/pkg/helpers" + "github.com/Azure/aks-engine/pkg/i18n" + "github.com/leonelquinteros/gotext" + "github.com/pkg/errors" + log "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + prefixed "github.com/x-cray/logrus-prefixed-formatter" +) + +type updateCmd struct { + authArgs + + // user input + apiModelPath string + resourceGroupName string + location string + agentPoolToUpdate string + + // derived + containerService *api.ContainerService + apiVersion string + agentPool *api.AgentPoolProfile + client armhelpers.AKSEngineClient + locale *gotext.Locale + nameSuffix string + agentPoolIndex int + logger *log.Entry + kubeconfig string +} + +const ( + updateName = "update" + updateShortDescription = "Update an existing AKS Engine-created VMSS node pool" + updateLongDescription = "Update an existing AKS Engine-created VMSS node pool in a Kubernetes cluster by updating its VMSS model" +) + +// newUpdateCmd returns an instance reference of updateCmd +func newUpdateCmd() *cobra.Command { + uc := updateCmd{} + + updateCmd := &cobra.Command{ + Use: updateName, + Short: updateShortDescription, + Long: updateLongDescription, + RunE: uc.run, + } + + f := updateCmd.Flags() + f.StringVarP(&uc.location, "location", "l", "", "location the cluster is deployed in") + f.StringVarP(&uc.resourceGroupName, "resource-group", "g", "", "the resource group where the cluster is deployed") + f.StringVarP(&uc.apiModelPath, "api-model", "m", "", "path to the generated apimodel.json file") + f.StringVar(&uc.agentPoolToUpdate, "node-pool", "", "node pool to scale") + addAuthFlags(&uc.authArgs, f) + + return updateCmd +} + +func (uc *updateCmd) validate(cmd *cobra.Command) error { + log.Debugln("validating update command line arguments...") + var err error + + uc.locale, err = i18n.LoadTranslations() + if err != nil { + return errors.Wrap(err, "error loading translation files") + } + + if uc.resourceGroupName == "" { + _ = cmd.Usage() + return errors.New("--resource-group must be specified") + } + + if uc.location == "" { + _ = cmd.Usage() + return errors.New("--location must be specified") + } + + uc.location = helpers.NormalizeAzureRegion(uc.location) + + if uc.apiModelPath == "" { + _ = cmd.Usage() + return errors.New("--api-model must be specified") + } + + if uc.agentPoolToUpdate == "" { + _ = cmd.Usage() + return errors.New("--node-pool must be specified") + } + + return nil +} + +func (uc *updateCmd) load() error { + logger := log.New() + logger.Formatter = new(prefixed.TextFormatter) + uc.logger = log.NewEntry(log.New()) + var err error + + ctx, cancel := context.WithTimeout(context.Background(), armhelpers.DefaultARMOperationTimeout) + defer cancel() + + if _, err = os.Stat(uc.apiModelPath); os.IsNotExist(err) { + return errors.Errorf("specified api model does not exist (%s)", uc.apiModelPath) + } + + apiloader := &api.Apiloader{ + Translator: &i18n.Translator{ + Locale: uc.locale, + }, + } + uc.containerService, uc.apiVersion, err = apiloader.LoadContainerServiceFromFile(uc.apiModelPath, true, true, nil) + if err != nil { + return errors.Wrap(err, "error parsing the api model") + } + + if uc.containerService.Properties.IsCustomCloudProfile() { + if err = writeCustomCloudProfile(uc.containerService); err != nil { + return errors.Wrap(err, "error writing custom cloud profile") + } + + if err = uc.containerService.Properties.SetCustomCloudSpec(api.AzureCustomCloudSpecParams{IsUpgrade: false, IsScale: true}); err != nil { + return errors.Wrap(err, "error parsing the api model") + } + } + + if err = uc.authArgs.validateAuthArgs(); err != nil { + return err + } + + if uc.client, err = uc.authArgs.getClient(); err != nil { + return errors.Wrap(err, "failed to get client") + } + + _, err = uc.client.EnsureResourceGroup(ctx, uc.resourceGroupName, uc.location, nil) + if err != nil { + return err + } + + if uc.containerService.Location == "" { + uc.containerService.Location = uc.location + } else if uc.containerService.Location != uc.location { + return errors.New("--location does not match api model location") + } + + agentPoolIndex := -1 + for i, pool := range uc.containerService.Properties.AgentPoolProfiles { + if pool.Name == uc.agentPoolToUpdate { + agentPoolIndex = i + uc.agentPool = pool + uc.agentPoolIndex = i + } + } + if agentPoolIndex == -1 { + return errors.Errorf("node pool %s was not found in the deployed api model", uc.agentPoolToUpdate) + } + if uc.agentPool.AvailabilityProfile != api.VirtualMachineScaleSets { + return errors.Errorf("aks-engine node pool update requires a VMSS node pool, %s is backed by a VM Availability Set", uc.agentPoolToUpdate) + } + + //allows to identify VMs in the resource group that belong to this cluster. + uc.nameSuffix = uc.containerService.Properties.GetClusterID() + log.Debugf("Cluster ID used in all agent pools: %s", uc.nameSuffix) + + uc.kubeconfig, err = engine.GenerateKubeConfig(uc.containerService.Properties, uc.location) + if err != nil { + return errors.New("Unable to derive kubeconfig from api model") + } + return nil +} + +func (uc *updateCmd) run(cmd *cobra.Command, args []string) error { + if err := uc.validate(cmd); err != nil { + return errors.Wrap(err, "failed to validate scale command") + } + if err := uc.load(); err != nil { + return errors.Wrap(err, "failed to load existing container service") + } + + ctx, cancel := context.WithTimeout(context.Background(), armhelpers.DefaultARMOperationTimeout) + defer cancel() + + sc := scaleCmd{ + location: uc.location, + apiModelPath: uc.apiModelPath, + resourceGroupName: uc.resourceGroupName, + agentPoolToScale: uc.agentPoolToUpdate, + validateCmd: false, + updateVMSSModel: true, + loadAPIModel: false, + persistAPIModel: false, + } + sc.RawAzureEnvironment = uc.RawAzureEnvironment + sc.rawSubscriptionID = uc.rawSubscriptionID + sc.SubscriptionID = uc.SubscriptionID + sc.AuthMethod = uc.AuthMethod + sc.rawClientID = uc.rawClientID + sc.ClientID = uc.ClientID + sc.ClientSecret = uc.ClientSecret + sc.CertificatePath = uc.CertificatePath + sc.PrivateKeyPath = uc.PrivateKeyPath + sc.IdentitySystem = uc.IdentitySystem + sc.language = uc.language + sc.logger = uc.logger + sc.containerService = uc.containerService + sc.apiVersion = uc.apiVersion + sc.client = uc.client + sc.nameSuffix = uc.nameSuffix + sc.kubeconfig = uc.kubeconfig + sc.agentPool = uc.agentPool + sc.agentPoolIndex = uc.agentPoolIndex + + for vmssListPage, err := sc.client.ListVirtualMachineScaleSets(ctx, sc.resourceGroupName); vmssListPage.NotDone(); err = vmssListPage.NextWithContext(ctx) { + if err != nil { + return errors.Wrap(err, "failed to get VMSS list in the resource group") + } + for _, vmss := range vmssListPage.Values() { + vmssName := *vmss.Name + if sc.agentPool.OSType == api.Windows { + possibleIndex, nameMungingErr := strconv.Atoi(vmssName[len(vmssName)-2:]) + if nameMungingErr != nil { + continue + } + if !(sc.containerService.Properties.GetAgentVMPrefix(sc.agentPool, possibleIndex) == vmssName) { + continue + } + } else { + if !sc.vmInAgentPool(vmssName, vmss.Tags) { + continue + } + } + + if vmss.Sku != nil { + sc.newDesiredAgentCount = int(*vmss.Sku.Capacity) + uc.agentPool.Count = sc.newDesiredAgentCount + } else { + return errors.Wrap(err, fmt.Sprintf("failed to find VMSS matching node pool %s in resource group %s", sc.agentPoolToScale, sc.resourceGroupName)) + } + } + } + + err := sc.run(cmd, args) + if err != nil { + return errors.Wrap(err, "aks-engine update failed") + } + + return nil +} diff --git a/cmd/updated_test.go b/cmd/updated_test.go new file mode 100644 index 000000000..5d212fdb9 --- /dev/null +++ b/cmd/updated_test.go @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +package cmd + +import ( + "testing" + + "github.com/pkg/errors" + "github.com/spf13/cobra" +) + +func TestNewUpdateCmd(t *testing.T) { + command := newUpdateCmd() + if command.Use != updateName || command.Short != updateShortDescription || command.Long != updateLongDescription { + t.Fatalf("update command should have use %s equal %s, short %s equal %s and long %s equal to %s", command.Use, updateName, command.Short, updateShortDescription, command.Long, updateLongDescription) + } + + expectedFlags := []string{"location", "resource-group", "api-model", "node-pool"} + for _, f := range expectedFlags { + if command.Flags().Lookup(f) == nil { + t.Fatalf("update command should have flag %s", f) + } + } + + command.SetArgs([]string{}) + if err := command.Execute(); err == nil { + t.Fatalf("expected an error when calling update with no arguments") + } +} + +func TestUpdateCmdValidate(t *testing.T) { + r := &cobra.Command{} + + cases := []struct { + uc *updateCmd + expectedErr error + name string + }{ + { + uc: &updateCmd{ + apiModelPath: "./not/used", + location: "centralus", + resourceGroupName: "", + agentPoolToUpdate: "agentpool1", + }, + expectedErr: errors.New("--resource-group must be specified"), + name: "NoResourceGroup", + }, + { + uc: &updateCmd{ + apiModelPath: "./not/used", + location: "", + resourceGroupName: "rgname", + agentPoolToUpdate: "agentpool1", + }, + expectedErr: errors.New("--location must be specified"), + name: "NoLocation", + }, + { + uc: &updateCmd{ + apiModelPath: "", + location: "centralus", + resourceGroupName: "rgname", + agentPoolToUpdate: "agentpool1", + }, + expectedErr: errors.New("--api-model must be specified"), + name: "NoApiModel", + }, + { + uc: &updateCmd{ + apiModelPath: "./not/used", + location: "centralus", + resourceGroupName: "rgname", + agentPoolToUpdate: "", + }, + expectedErr: errors.New("--node-pool must be specified"), + name: "NoNodePool", + }, + } + + for _, tc := range cases { + c := tc + t.Run(c.name, func(t *testing.T) { + t.Parallel() + err := c.uc.validate(r) + if err != nil && c.expectedErr != nil { + if err.Error() != c.expectedErr.Error() { + t.Fatalf("expected validate update command to return error %s, but instead got %s", c.expectedErr.Error(), err.Error()) + } + } else { + if c.expectedErr != nil { + t.Fatalf("expected validate update command to return error %s, but instead got no error", c.expectedErr.Error()) + } else if err != nil { + t.Fatalf("expected validate update command to return no error, but instead got %s", err.Error()) + } + } + }) + } +} diff --git a/docs/community/developer-guide.md b/docs/community/developer-guide.md index 2fb088f04..5d2853ac6 100644 --- a/docs/community/developer-guide.md +++ b/docs/community/developer-guide.md @@ -71,6 +71,7 @@ Available Commands: help Help about any command rotate-certs Rotate certificates on an existing AKS Engine-created Kubernetes cluster scale Scale an existing AKS Engine-created Kubernetes cluster + update Update an existing AKS Engine-created VMSS node pool upgrade Upgrade an existing AKS Engine-created Kubernetes cluster version Print the version of aks-engine diff --git a/test/e2e/cluster.sh b/test/e2e/cluster.sh index 3d24b994e..d644e51d8 100755 --- a/test/e2e/cluster.sh +++ b/test/e2e/cluster.sh @@ -7,6 +7,7 @@ TMP_BASENAME=$(basename ${TMP_DIR}) GOPATH="/go" WORK_DIR="/aks-engine" MASTER_VM_UPGRADE_SKU="${MASTER_VM_UPGRADE_SKU:-Standard_D4_v3}" +NODE_VM_UPGRADE_SKU="${NODE_VM_UPGRADE_SKU:-Standard_D4_v3}" AZURE_ENV="${AZURE_ENV:-AzurePublicCloud}" IDENTITY_SYSTEM="${IDENTITY_SYSTEM:-azure_ad}" ARC_LOCATION="eastus" @@ -204,7 +205,7 @@ if [ "${UPGRADE_CLUSTER}" = "true" ] || [ "${SCALE_CLUSTER}" = "true" ] || [ -n done done fi - + if [ "${UPGRADE_CLUSTER}" = "true" ]; then git reset --hard git remote rm $UPGRADE_FORK @@ -299,7 +300,44 @@ if [ -n "$ADD_NODE_POOL_INPUT" ]; then fi if [ "${SCALE_CLUSTER}" = "true" ]; then - for nodepool in $(jq -r '.properties.agentPoolProfiles[].name' < _output/$RESOURCE_GROUP/apimodel.json); do + nodepools=$(jq -r '.properties.agentPoolProfiles[].name' < _output/$RESOURCE_GROUP/apimodel.json) + for ((i = 0; i <= ${#nodepools[@]}; ++i)); do + if [ -n "$UPDATE_NODE_POOLS" ]; then + nodepool=$(jq -r --arg i $i '. | .properties.agentPoolProfiles[$i | tonumber].name' < _output/$RESOURCE_GROUP/apimodel.json) + # modify the master VM SKU to simulate vertical vm scaling via upgrade + docker run --rm \ + -v $(pwd):${WORK_DIR} \ + -w ${WORK_DIR} \ + -e RESOURCE_GROUP=$RESOURCE_GROUP \ + -e NODE_VM_UPGRADE_SKU=$NODE_VM_UPGRADE_SKU \ + ${DEV_IMAGE} \ + /bin/bash -c "jq --arg sku \"$NODE_VM_UPGRADE_SKU\" --arg i $i '. | .properties.agentPoolProfiles[$i | tonumber].vmSize = \$sku' < _output/$RESOURCE_GROUP/apimodel.json > _output/$RESOURCE_GROUP/apimodel-update.json" || exit 1 + docker run --rm \ + -v $(pwd):${WORK_DIR} \ + -w ${WORK_DIR} \ + -e RESOURCE_GROUP=$RESOURCE_GROUP \ + ${DEV_IMAGE} \ + /bin/bash -c "mv _output/$RESOURCE_GROUP/apimodel-update.json _output/$RESOURCE_GROUP/apimodel.json" || exit 1 + docker run --rm \ + -v $(pwd):${WORK_DIR} \ + -v /etc/ssl/certs:/etc/ssl/certs \ + -w ${WORK_DIR} \ + -e RESOURCE_GROUP=$RESOURCE_GROUP \ + -e REGION=$REGION \ + -e UPDATE_POOL_NAME=$UPDATE_POOL_NAME \ + ${DEV_IMAGE} \ + ./bin/aks-engine update \ + --azure-env ${AZURE_ENV} \ + --subscription-id ${AZURE_SUBSCRIPTION_ID} \ + --api-model _output/$RESOURCE_GROUP/apimodel.json \ + --node-pool $nodepool \ + --location $REGION \ + --resource-group $RESOURCE_GROUP \ + --auth-method client_secret \ + --client-id ${AZURE_CLIENT_ID} \ + --client-secret ${AZURE_CLIENT_SECRET} || exit 1 + az vmss list -g $RESOURCE_GROUP --subscription ${AZURE_SUBSCRIPTION_ID} --query '[].sku' | grep $NODE_VM_UPGRADE_SKU || exit 1 + fi docker run --rm \ -v $(pwd):${WORK_DIR} \ -v /etc/ssl/certs:/etc/ssl/certs \ diff --git a/test/e2e/test_cluster_configs/base.json b/test/e2e/test_cluster_configs/base.json index c4ab31d7a..98d2e7529 100644 --- a/test/e2e/test_cluster_configs/base.json +++ b/test/e2e/test_cluster_configs/base.json @@ -1,6 +1,7 @@ { "env": { "SCALE_CLUSTER": true, + "UPDATE_NODE_POOLS": true, "UPGRADE_CLUSTER": true, "GET_CLUSTER_LOGS": true, "GINKGO_SKIP_AFTER_SCALE_DOWN": "should report all nodes in a Ready state",