# azure-container-networking/.pipelines/npm/npm-scale-test.yaml

# PR validation: run for pull requests targeting master or a release/* branch
# that touch NPM sources, the NPM pipeline definitions, or the scale-test scripts.
pr:
  branches:
    include:
      - master
      - release/*
  paths:
    include:
      - npm/*
      - .pipelines/npm/*
      - test/scale/*


# CI trigger: run on pushes to master and on any pushed tag.
trigger:
  branches:
    include:
      - master
  tags:
    include:
      - "*"

# Pipeline-level variables.
# NOTE(review): VNET_NAME is not referenced in the visible part of this file;
# presumably it is consumed by a template or script — confirm before removing.
variables:
  - name: VNET_NAME
    value: npm-vnet

jobs:
  # Computes values shared by the later jobs and publishes them as output variables
  # of the "EnvironmentalVariables" step:
  #   RESOURCE_GROUP - timestamp-based resource group name for this run
  #   TAG            - image tag reported by `make npm-version`
  - job: setup
    displayName: "Configure Test Environment"
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    steps:
      - checkout: self

      - script: |
          # print Go toolchain details to the log
          go version
          go env
          which go
          echo $PATH
          mkdir -p '$(GOBIN)'
          mkdir -p '$(GOPATH)/pkg'
          # format: e2e-npm-<year>-<month>-<day>-<minute>-<second>
          RG="e2e-npm-$(date +%Y-%m-%d-%M-%S)"
          TAG=$(make npm-version)
          echo "Resource group: $RG"
          echo "Image tag: $TAG"

          # isOutput=true exposes the values to dependent jobs via dependencies.setup.outputs[...]
          echo "##vso[task.setvariable variable=RESOURCE_GROUP;isOutput=true;]$RG"
          echo "##vso[task.setvariable variable=TAG;isOutput=true;]$TAG"

        name: "EnvironmentalVariables"
        displayName: "Set environmental variables"
        condition: always()

  # Runs the shared container template once per matrix leg (Linux and Windows, amd64)
  # to build the NPM images. TAG comes from the setup job's output variables.
  - job: containerize
    dependsOn: [setup]
    displayName: Build Images
    variables:
      TAG: $[ dependencies.setup.outputs['EnvironmentalVariables.TAG'] ]
    pool:
      name: "$(BUILD_POOL_NAME_DEFAULT)"
    strategy:
      matrix:
        npm_linux_amd64:
          arch: amd64
          name: npm
          os: linux
        npm_windows_amd64:
          arch: amd64
          name: npm
          os: windows
    steps:
      - template: ../containers/container-template.yaml
        parameters:
          arch: $(arch)
          name: $(name)
          os: $(os)

  # Creates an AKS cluster, deploys NPM using the images tagged by this run, and then
  # executes the scale and connectivity tests. Runs once per matrix profile.
  - job: Create_Cluster_and_Run_Test
    displayName: "Run Scale Test"
    timeoutInMinutes: 360
    dependsOn:
      - containerize
      - setup
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    variables:
      # values published by the setup job
      RESOURCE_GROUP: $[ dependencies.setup.outputs['EnvironmentalVariables.RESOURCE_GROUP'] ]
      TAG: $[ dependencies.setup.outputs['EnvironmentalVariables.TAG'] ]
      # placeholder; the NPM deployment step overwrites it with the cluster FQDN
      FQDN: empty
    strategy:
      matrix:
        v2-linux:
          PROFILE: 'sc-lin'
          NUM_NETPOLS: 800
          INITIAL_CONNECTIVITY_TIMEOUT: 60
        # 2024/07/23: Windows Scale Test is consistently failing to establish initial connectivity in time
        # ws22:
        #   PROFILE: "sc-ws22"
        #   NUM_NETPOLS: 50
        #   INITIAL_CONNECTIVITY_TIMEOUT: 720
    steps:
      - checkout: self
      # Fail fast if the checked-out sources do not contain the scale-test scripts used below.
      - bash: |
          if [ ! -d $(Pipeline.Workspace)/s/test/scale/ ]; then
              echo "##vso[task.logissue type=error]$(Pipeline.Workspace)/s/test/scale/ does not exist"
              exit 1
          fi
        displayName: "Verify Directory Exists"
        failOnStderr: true
      # Downloads a pinned kubectl (v1.23.0) into the task's working directory; the scale-test
      # steps pass its path to the test scripts via --kubectl-binary.
      - task: AzureCLI@2
        displayName: "Download Kubectl"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            # -f : fail on HTTP errors instead of saving an error page as the binary
            # -sS: suppress the progress meter (written to stderr, which failOnStderr
            #      would treat as a failure) while still reporting real errors
            curl -fsSLO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
            chmod +x kubectl
      # Creates the resource group and a single-node AKS cluster named <RESOURCE_GROUP>-<PROFILE>,
      # stores its kubeconfig in ./kubeconfig and ~/.kube/config, installs kwok, and adds a
      # Windows Server 2022 node pool when the profile name ends in "ws22".
      # NOTE(review): the Windows admin password is hard-coded in the script below; consider
      # moving it to a secret pipeline variable.
      - task: AzureCLI@2
        displayName: "Create AKS Cluster"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            az extension add --name aks-preview
            az extension update --name aks-preview

            echo "Creating resource group named $(RESOURCE_GROUP)"
            az group create --name $(RESOURCE_GROUP) -l $(LOCATION) -o table

            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            echo "Creating cluster named $CLUSTER_NAME"
            az aks create \
                --resource-group $(RESOURCE_GROUP) \
                --name $CLUSTER_NAME \
                --generate-ssh-keys \
                --windows-admin-username e2eadmin \
                --windows-admin-password alpha@numeric!password2 \
                --network-plugin azure \
                --vm-set-type VirtualMachineScaleSets \
                --node-vm-size Standard_D4s_v3 \
                --node-count 1 \
                --tier standard \
                --max-pods 100

            echo "Getting credentials to $CLUSTER_NAME"
            az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
            mkdir -p ~/.kube/
            cp ./kubeconfig ~/.kube/config

            # install kwok on linux node
            cd $(Pipeline.Workspace)/s/test/scale/
            chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh
            ./run-kwok-as-pod.sh
            # need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled"
            kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5

            if [[ $(PROFILE) == *ws22 ]]; then
              echo "Adding Windows nodepool to $CLUSTER_NAME"
              az aks nodepool add \
                  --resource-group $(RESOURCE_GROUP) \
                  --cluster-name $CLUSTER_NAME \
                  --name awin22 \
                  --os-type Windows \
                  --os-sku Windows2022 \
                  --node-vm-size Standard_D4s_v3 \
                  --node-count 1 \
                  --max-pods 100
            fi

      # Applies the Linux and Windows NPM manifests with a raised memory limit, points both
      # daemonsets at the images tagged by this run, waits for the NPM pods to become Ready,
      # labels the nodes the tests schedule onto, and records the cluster FQDN in the FQDN variable.
      - task: AzureCLI@2
        displayName: "Deploy NPM to Test Cluster"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e

            # deploy azure-npm
            cp $(Pipeline.Workspace)/s/npm/azure-npm.yaml azure-npm.yaml
            # set higher memory limit
            sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm.yaml
            kubectl apply -f azure-npm.yaml

            cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm-win.yaml
            # set higher memory limit
            sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm-win.yaml
            kubectl apply -f azure-npm-win.yaml

            # swap azure-npm image with one built during run
            kubectl set image daemonset/azure-npm -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:linux-amd64-$(TAG)
            kubectl set image daemonset/azure-npm-win -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:windows-amd64-$(TAG)

            sleep 30s
            echo "waiting for NPM to start running..."
            kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m || {
                kubectl describe pod -n kube-system -l k8s-app=azure-npm
                echo "##vso[task.logissue type=error]NPM failed to start running"
                exit 1
            }
            echo "sleep 3m to let NPM restart in case of bootup failure due to HNS errors"
            sleep 3m

            kubectl get po -n kube-system -owide -A

            # xargs -I already runs the command once per input line; combining it with -n
            # makes GNU xargs print a warning on stderr, which failOnStderr treats as a failure
            if [[ $(PROFILE) == *ws22 ]]; then
              echo "labeling Windows nodes for scale test"
              kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true
            else
              echo "labeling Linux nodes for scale test"
              kubectl get node -o wide | grep "Ubuntu" | awk '{print $1}' | xargs -I {} kubectl label node {} scale-test=true connectivity-test=true
            fi

            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            echo "Showing cluster status for $CLUSTER_NAME"
            FQDN=`az aks show -n $CLUSTER_NAME -g $(RESOURCE_GROUP) --query fqdn -o tsv`
            echo "##vso[task.setvariable variable=FQDN]$FQDN"

      # Runs test-scale.sh with the large-scale parameters; the step's result is the
      # script's exit code.
      - task: AzureCLI@2
        displayName: "Scale Up Large"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            # 20 kwok nodes
            # 1000 kwok Pods
            # 30 real Pods
            # 300 ACLs per endpoint
            # ~4K IPSets
            # ~36K IPSet members
            # resolve the downloaded kubectl before changing directory
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/
            set +e
            ./test-scale.sh --kubectl-binary=$kubectlPath \
                --max-kwok-pods-per-node=50 \
                --num-kwok-deployments=10 \
                --num-kwok-replicas=100 \
                --max-real-pods-per-node=30 \
                --num-real-deployments=10 \
                --num-real-replicas=3 \
                --num-network-policies=$(NUM_NETPOLS) \
                --num-unapplied-network-policies=$(NUM_NETPOLS) \
                --num-unique-labels-per-pod=2 \
                --num-unique-labels-per-deployment=2 \
                --num-shared-labels-per-pod=10
            rc=$?
            exit $rc

      # Restarts the Windows NPM daemonset, checks that a kwok controller is running, then runs
      # test-connectivity.sh. On failure it captures NPM debug state into the per-cluster
      # directory (published later) and exits with the test's exit code.
      - task: AzureCLI@2
        displayName: "Test NPM Bootup Latency and Connectivity ($(PROFILE))"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            kubectl rollout restart -n kube-system ds azure-npm-win
            echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
            sleep 3m

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
                echo "##vso[task.logissue type=error]need at least one kwok pod running"
                exit 1
            }

            # each task runs in its own shell, so the kubectl path must be set here
            # (before changing directory) rather than inherited from an earlier step
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/connectivity/
            # notes for Windows:
            # initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
            # adding new network policy to all 30 Pods should happen within 30 seconds
            set +e
            ./test-connectivity.sh --kubectl-binary=$kubectlPath \
                --num-scale-pods-to-verify=all \
                --max-wait-for-initial-connectivity=$(INITIAL_CONNECTIVITY_TIMEOUT) \
                --max-wait-after-adding-netpol=30
            rc=$?
            if [[ $rc != 0 ]]; then
                echo "capturing cluster state due to failure"
                if [[ $(PROFILE) == *ws22 ]]; then
                    cd $(Pipeline.Workspace)/s/debug/windows/npm/
                    chmod u+x win-debug.sh
                    ./win-debug.sh
                    mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
                else
                    set -x
                    npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
                    kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
                    kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
                fi

                kubectl get pod -n scale-test
                kubectl get pod -n connectivity-test
                exit $rc
            fi

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
                echo "##vso[task.logissue type=error]need at least one kwok pod running"
                exit 1
            }

      # Runs test-scale.sh at a smaller scale with label, network-policy and pod deletion
      # rounds enabled; the step's result is the script's exit code.
      - task: AzureCLI@2
        displayName: "CRUD at Medium Scale"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            # will delete scale-test and connectivity-test namespaces from previous run
            # 10 kwok Pods (10 deployments x 1 replica)
            # 12 real Pods (3 deployments x 4 replicas)
            # resolve the downloaded kubectl before changing directory
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/
            set +e
            ./test-scale.sh --kubectl-binary=$kubectlPath \
                --max-kwok-pods-per-node=50 \
                --num-kwok-deployments=10 \
                --num-kwok-replicas=1 \
                --max-real-pods-per-node=30 \
                --num-real-deployments=3 \
                --num-real-replicas=4 \
                --num-network-policies=1 \
                --num-unapplied-network-policies=10 \
                --num-unique-labels-per-pod=2 \
                --num-unique-labels-per-deployment=2 \
                --num-shared-labels-per-pod=10 \
                --delete-labels \
                --delete-labels-interval=30 \
                --delete-labels-times=2 \
                --delete-netpols \
                --delete-netpols-interval=0 \
                --delete-netpols-times=1 \
                --delete-kwok-pods=10 \
                --delete-real-pods=6 \
                --delete-pods-interval=120 \
                --delete-pods-times=2
            rc=$?
            exit $rc

      # Checks that a kwok controller is running, then runs test-connectivity.sh after the CRUD
      # round. On failure it captures NPM debug state into the per-cluster directory (published
      # later) and exits with the test's exit code.
      - task: AzureCLI@2
        displayName: "Test Connectivity after CRUD ($(PROFILE))"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
                echo "##vso[task.logissue type=error]need at least one kwok pod running"
                exit 1
            }

            # each task runs in its own shell, so the kubectl path must be set here
            # (before changing directory) rather than inherited from an earlier step
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/connectivity/
            # initial connectivity should be established within 10 minutes
            # adding new network policy to all 12 Pods should happen within 20 seconds
            set +e
            ./test-connectivity.sh --kubectl-binary=$kubectlPath \
                --num-scale-pods-to-verify=all \
                --max-wait-for-initial-connectivity=$((10*60)) \
                --max-wait-after-adding-netpol=20
            rc=$?
            if [[ $rc != 0 ]]; then
                echo "capturing cluster state due to failure"
                if [[ $(PROFILE) == *ws22 ]]; then
                    cd $(Pipeline.Workspace)/s/debug/windows/npm/
                    chmod u+x win-debug.sh
                    ./win-debug.sh
                    mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
                else
                    set -x
                    npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
                    kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
                    kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
                fi

                kubectl get pod -n scale-test
                kubectl get pod -n connectivity-test
                exit $rc
            fi

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
                echo "##vso[task.logissue type=error]need at least one kwok pod running"
                exit 1
            }

      # Always runs: saves the current logs (and, when available, the previous container's logs)
      # of every NPM pod plus the cluster kubeconfig into the per-cluster directory.
      - bash: |
          export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
          echo "Getting cluster state for $CLUSTER_NAME"
          mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
          kubectl get pods -n kube-system -owide | grep npm | grep -v kwok
          npmPodList=`kubectl get pods -n kube-system -owide | grep npm | grep -v kwok | awk '{print $1}'`
          for npmPod in $npmPodList; do
              logFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE)-$npmPod.txt
              kubectl logs -n kube-system $npmPod > $logFile

              # capture any previous logs in case there was a crash
              # the file name carries the pod name so one pod cannot overwrite or delete another pod's file
              previousLogFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/previous-npm-logs_$(PROFILE)-$npmPod.txt
              if ! kubectl logs -n kube-system $npmPod -p > $previousLogFile; then
                  # remove the empty file if kubectl logs failed (e.g. there was no previous terminated container)
                  rm -f $previousLogFile
              fi
          done
          cp ./kubeconfig $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/.kubeconfig
        condition: always()
        displayName: "Get Logs"

      # Upload the per-cluster directory as a pipeline artifact, regardless of earlier step results.
      - publish: $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
        artifact: NpmLogs-$(RESOURCE_GROUP)-$(PROFILE)
        condition: always()

  # Deletes the resource group created for this run.
  # NOTE(review): no job-level condition is set, so with the default (succeeded) this job is
  # skipped when the test job fails and the resource group is left behind — presumably so the
  # cluster can be inspected with the published kubeconfig; confirm this is intended.
  - job: clean_up
    displayName: "Cleanup"
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    dependsOn:
      [Create_Cluster_and_Run_Test, setup]
    variables:
      RESOURCE_GROUP: $[ dependencies.setup.outputs['EnvironmentalVariables.RESOURCE_GROUP'] ]
    steps:
      - checkout: none
      - task: AzureCLI@2
        displayName: "Delete Test Cluster Resource Group"
        # condition is a step-level property, not a task input
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          inlineScript: |
            echo Deleting $(RESOURCE_GROUP)
            az group delete -n $(RESOURCE_GROUP) --yes