# NPM scale-test pipeline.
#
# Jobs, in dependency order:
#   setup                       - derives the resource group name and image tag
#                                 and publishes them as output variables.
#   containerize                - builds the Linux and Windows NPM images.
#   Create_Cluster_and_Run_Test - creates an AKS cluster, deploys NPM, runs the
#                                 scale and connectivity tests, collects logs.
#   clean_up                    - deletes the test resource group.
#
# Variables such as BUILD_POOL_NAME_DEFAULT, BUILD_VALIDATIONS_SERVICE_CONNECTION,
# LOCATION and IMAGE_REGISTRY are not defined in this file and must be supplied
# by the pipeline's environment.

pr:
  branches:
    include:
      - master
      - release/*
  paths:
    include:
      - npm/*
      - .pipelines/npm/*
      - test/scale/*

trigger:
  branches:
    include:
      - master
  tags:
    include:
      - "*"

variables:
  - name: VNET_NAME
    value: npm-vnet

jobs:
  # Computes RESOURCE_GROUP and TAG once so every later job uses the same values.
  - job: setup
    displayName: "Configure Test Environment"
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    steps:
      - checkout: self

      - script: |
          go version
          go env
          which go
          echo $PATH
          mkdir -p '$(GOBIN)'
          mkdir -p '$(GOPATH)/pkg'
          BUILD_NUMBER=$(Build.BuildNumber)
          # format: npm-<year>-<month>-<day>-<minute>-<second>
          RG=e2e-$(echo "npm-`date "+%Y-%m-%d-%M-%S"`")
          TAG=$(make npm-version)
          echo "Resource group: $RG"
          echo "Image tag: $TAG"

          echo "##vso[task.setvariable variable=RESOURCE_GROUP;isOutput=true;]$RG"
          echo "##vso[task.setvariable variable=TAG;isOutput=true;]$TAG"
        # The step name is the prefix other jobs use to read the outputs
        # (dependencies.setup.outputs['EnvironmentalVariables.<VAR>']).
        name: "EnvironmentalVariables"
        displayName: "Set environmental variables"
        condition: always()

  # Builds one image per matrix entry via the shared container template.
  - job: containerize
    dependsOn: [setup]
    displayName: Build Images
    variables:
      TAG: $[ dependencies.setup.outputs['EnvironmentalVariables.TAG'] ]
    pool:
      name: "$(BUILD_POOL_NAME_DEFAULT)"
    strategy:
      matrix:
        npm_linux_amd64:
          arch: amd64
          name: npm
          os: linux
        npm_windows_amd64:
          arch: amd64
          name: npm
          os: windows
    steps:
      - template: ../containers/container-template.yaml
        parameters:
          arch: $(arch)
          name: $(name)
          os: $(os)

  # Creates the cluster and runs the scale / connectivity tests for each profile.
  - job: Create_Cluster_and_Run_Test
    timeoutInMinutes: 360
    displayName: "Run Scale Test"
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    dependsOn: [containerize, setup]
    variables:
      RESOURCE_GROUP: $[ dependencies.setup.outputs['EnvironmentalVariables.RESOURCE_GROUP'] ]
      TAG: $[ dependencies.setup.outputs['EnvironmentalVariables.TAG'] ]
      # Placeholder; overwritten by the "Deploy NPM to Test Cluster" step.
      FQDN: empty
    strategy:
      matrix:
        v2-linux:
          PROFILE: "sc-lin"
          NUM_NETPOLS: 800
          INITIAL_CONNECTIVITY_TIMEOUT: 60
        # 2024/07/23: Windows Scale Test is consistently failing to establish initial connectivity in time
        # ws22:
        #   PROFILE: "sc-ws22"
        #   NUM_NETPOLS: 50
        #   INITIAL_CONNECTIVITY_TIMEOUT: 720
    steps:
      - checkout: self

      - bash: |
          test -d $(Pipeline.Workspace)/s/test/scale/ || {
            echo "##vso[task.logissue type=error]$(Pipeline.Workspace)/s/test/scale/ does not exist"
            exit 1
          }
        displayName: "Verify Directory Exists"
        failOnStderr: true

      # NOTE(review): the AzureCLI@2 task reference documents this input as
      # `failOnStandardError`; `failOnStderr` is kept as written in every
      # AzureCLI@2 step below and is presumably ignored by the task. Confirm
      # before renaming it, since curl and az write progress/warnings to stderr.
      #
      # `condition` is a step-level property, so it sits beside `inputs`
      # rather than inside it.
      - task: AzureCLI@2
        displayName: "Download Kubectl"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
            chmod +x kubectl

      - task: AzureCLI@2
        displayName: "Create AKS Cluster"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            az extension add --name aks-preview
            az extension update --name aks-preview

            echo "Creating resource group named $(RESOURCE_GROUP)"
            az group create --name $(RESOURCE_GROUP) -l $(LOCATION) -o table

            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            echo "Creating cluster named $CLUSTER_NAME"
            # NOTE(review): the Windows admin password below is hard-coded in
            # source control; consider moving it to a secret pipeline variable.
            az aks create \
                --resource-group $(RESOURCE_GROUP) \
                --name $CLUSTER_NAME \
                --generate-ssh-keys \
                --windows-admin-username e2eadmin \
                --windows-admin-password alpha@numeric!password2 \
                --network-plugin azure \
                --vm-set-type VirtualMachineScaleSets \
                --node-vm-size Standard_D4s_v3 \
                --node-count 1 \
                --tier standard \
                --max-pods 100

            echo "Getting credentials to $CLUSTER_NAME"
            az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
            mkdir -p ~/.kube/
            cp ./kubeconfig ~/.kube/config

            # install kwok on linux node
            cd $(Pipeline.Workspace)/s/test/scale/
            chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh
            ./run-kwok-as-pod.sh
            # need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled"
            kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5

            if [[ $(PROFILE) == *ws22 ]]; then
              echo "Adding Windows nodepool to $CLUSTER_NAME"
              az aks nodepool add \
                  --resource-group $(RESOURCE_GROUP) \
                  --cluster-name $CLUSTER_NAME \
                  --name awin22 \
                  --os-type Windows \
                  --os-sku Windows2022 \
                  --node-vm-size Standard_D4s_v3 \
                  --node-count 1 \
                  --max-pods 100
            fi

      - task: AzureCLI@2
        displayName: "Deploy NPM to Test Cluster"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e

            # deploy azure-npm
            cp $(Pipeline.Workspace)/s/npm/azure-npm.yaml azure-npm.yaml
            # set higher memory limit
            sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm.yaml
            kubectl apply -f azure-npm.yaml

            cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm-win.yaml
            # set higher memory limit
            sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm-win.yaml
            kubectl apply -f azure-npm-win.yaml

            # swap azure-npm image with one built during run
            kubectl set image daemonset/azure-npm -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:linux-amd64-$(TAG)
            kubectl set image daemonset/azure-npm-win -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:windows-amd64-$(TAG)

            sleep 30s
            echo "waiting for NPM to start running..."
            kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m || {
              kubectl describe pod -n kube-system -l k8s-app=azure-npm
              echo "##vso[task.logissue type=error]NPM failed to start running"
              exit 1
            }
            echo "sleep 3m to let NPM restart in case of bootup failure due to HNS errors"
            sleep 3m

            kubectl get po -n kube-system -owide -A

            if [[ $(PROFILE) == *ws22 ]]; then
              echo "labeling Windows nodes for scale test"
              kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
            else
              echo "labeling Linux nodes for scale test"
              kubectl get node -o wide | grep "Ubuntu" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
            fi

            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            echo "Showing cluster status for $CLUSTER_NAME"
            FQDN=`az aks show -n $CLUSTER_NAME -g $(RESOURCE_GROUP) --query fqdn -o tsv`
            echo "##vso[task.setvariable variable=FQDN]$FQDN"

      - task: AzureCLI@2
        displayName: "Scale Up Large"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            # 20 kwok nodes
            # 1000 kwok Pods
            # 30 real Pods
            # 300 ACLs per endpoint
            # ~4K IPSets
            # ~36K IPSet members
            # resolve the kubectl downloaded earlier before leaving the working directory
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/
            # disable errexit so the script's exit code can be captured and returned explicitly
            set +e
            ./test-scale.sh --kubectl-binary=$kubectlPath \
                --max-kwok-pods-per-node=50 \
                --num-kwok-deployments=10 \
                --num-kwok-replicas=100 \
                --max-real-pods-per-node=30 \
                --num-real-deployments=10 \
                --num-real-replicas=3 \
                --num-network-policies=$(NUM_NETPOLS) \
                --num-unapplied-network-policies=$(NUM_NETPOLS) \
                --num-unique-labels-per-pod=2 \
                --num-unique-labels-per-deployment=2 \
                --num-shared-labels-per-pod=10
            rc=$?
            exit $rc

      - task: AzureCLI@2
        displayName: "Test NPM Bootup Latency and Connectivity ($(PROFILE))"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            kubectl rollout restart -n kube-system ds azure-npm-win
            echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
            sleep 3m

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
              echo "##vso[task.logissue type=error]need at least one kwok pod running"
              exit 1
            }

            # each task runs in its own shell, so kubectlPath must be set here too;
            # resolve it before leaving the working directory (same as the scale steps)
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/connectivity/
            # notes for Windows:
            # initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
            # adding new network policy to all 30 Pods should happen within 30 seconds
            set +e
            ./test-connectivity.sh --kubectl-binary=$kubectlPath \
                --num-scale-pods-to-verify=all \
                --max-wait-for-initial-connectivity=$(INITIAL_CONNECTIVITY_TIMEOUT) \
                --max-wait-after-adding-netpol=30
            rc=$?
            if [[ $rc != 0 ]]; then
              echo "capturing cluster state due to failure"
              if [[ $(PROFILE) == *ws22 ]]; then
                cd $(Pipeline.Workspace)/s/debug/windows/npm/
                chmod u+x win-debug.sh
                ./win-debug.sh
                mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
              else
                set -x
                npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
                kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
                kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
              fi

              kubectl get pod -n scale-test
              kubectl get pod -n connectivity-test
              exit $rc
            fi

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
              echo "##vso[task.logissue type=error]need at least one kwok pod running"
              exit 1
            }

      - task: AzureCLI@2
        displayName: "CRUD at Medium Scale"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            # will delete scale-test and connectivity-test namespaces from previous run
            # 10 kwok Pods
            # 12 real Pods (3 deployments x 4 replicas)
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/
            set +e
            ./test-scale.sh --kubectl-binary=$kubectlPath \
                --max-kwok-pods-per-node=50 \
                --num-kwok-deployments=10 \
                --num-kwok-replicas=1 \
                --max-real-pods-per-node=30 \
                --num-real-deployments=3 \
                --num-real-replicas=4 \
                --num-network-policies=1 \
                --num-unapplied-network-policies=10 \
                --num-unique-labels-per-pod=2 \
                --num-unique-labels-per-deployment=2 \
                --num-shared-labels-per-pod=10 \
                --delete-labels \
                --delete-labels-interval=30 \
                --delete-labels-times=2 \
                --delete-netpols \
                --delete-netpols-interval=0 \
                --delete-netpols-times=1 \
                --delete-kwok-pods=10 \
                --delete-real-pods=6 \
                --delete-pods-interval=120 \
                --delete-pods-times=2
            rc=$?
            exit $rc

      - task: AzureCLI@2
        displayName: "Test Connectivity after CRUD ($(PROFILE))"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          failOnStderr: true
          inlineScript: |
            set -e
            export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
            mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
              echo "##vso[task.logissue type=error]need at least one kwok pod running"
              exit 1
            }

            # each task runs in its own shell, so kubectlPath must be set here too;
            # resolve it before leaving the working directory (same as the scale steps)
            kubectlPath=`pwd`/kubectl
            cd $(Pipeline.Workspace)/s/test/scale/connectivity/
            # initial connectivity should be established within 10 minutes
            # adding new network policy to all 12 Pods should happen within 20 seconds
            set +e
            ./test-connectivity.sh --kubectl-binary=$kubectlPath \
                --num-scale-pods-to-verify=all \
                --max-wait-for-initial-connectivity=$((10*60)) \
                --max-wait-after-adding-netpol=20
            rc=$?
            if [[ $rc != 0 ]]; then
              echo "capturing cluster state due to failure"
              if [[ $(PROFILE) == *ws22 ]]; then
                cd $(Pipeline.Workspace)/s/debug/windows/npm/
                chmod u+x win-debug.sh
                ./win-debug.sh
                mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
              else
                set -x
                npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
                kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
                kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
              fi

              kubectl get pod -n scale-test
              kubectl get pod -n connectivity-test
              exit $rc
            fi

            kubectl get pod -n kube-system -l app=kwok-controller -owide
            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
              echo "##vso[task.logissue type=error]need at least one kwok pod running"
              exit 1
            }

      # Collects current and previous-container logs for every NPM pod; runs
      # even when earlier steps failed so the logs are available for triage.
      - bash: |
          export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
          echo "Getting cluster state for $CLUSTER_NAME"
          mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
          kubectl get pods -n kube-system -owide | grep npm | grep -v kwok
          npmPodList=`kubectl get pods -n kube-system -owide | grep npm | grep -v kwok | awk '{print $1}'`
          for npmPod in $npmPodList; do
            logFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE)-$npmPod.txt
            kubectl logs -n kube-system $npmPod > $logFile

            # capture any previous logs in case there was a crash
            # (file name includes the pod name so pods do not overwrite each other)
            previousLogFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/previous-npm-logs_$(PROFILE)-$npmPod.txt
            kubectl logs -n kube-system $npmPod -p > $previousLogFile
            if [[ $? -ne 0 ]]; then
              # remove the empty file if kubectl logs failed (e.g. there was no previous terminated container)
              rm $previousLogFile
            fi
          done
          cp ./kubeconfig $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/.kubeconfig
        condition: always()
        displayName: "Get Logs"

      - publish: $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
        condition: always()
        artifact: NpmLogs-$(RESOURCE_GROUP)-$(PROFILE)

  # Deletes the resource group created for the test run.
  # NOTE(review): with no job-level condition this job is skipped when
  # Create_Cluster_and_Run_Test fails, so the resource group is left behind.
  # Confirm whether that is intentional (e.g. to allow debugging).
  - job: clean_up
    displayName: "Cleanup"
    pool:
      name: $(BUILD_POOL_NAME_DEFAULT)
      demands:
        - agent.os -equals Linux
        - Role -equals Build
    dependsOn: [Create_Cluster_and_Run_Test, setup]
    variables:
      RESOURCE_GROUP: $[ dependencies.setup.outputs['EnvironmentalVariables.RESOURCE_GROUP'] ]
    steps:
      - checkout: none

      - task: AzureCLI@2
        displayName: "Delete Test Cluster Resource Group"
        condition: succeeded()
        inputs:
          azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
          scriptType: "bash"
          scriptLocation: "inlineScript"
          inlineScript: |
            echo Deleting $(RESOURCE_GROUP)
            az group delete -n $(RESOURCE_GROUP) --yes