test(scale): [NPM] fix flakes in kwok and capture kernel state on failure (#2249)
* test(kwok): try standard tier for cluster
* Revert "test(kwok): try standard tier for cluster" This reverts commit f76e50a559.
* test: run kwok as pod
* fix: add execute permission to sh files
* fix: allow scheduling on linux for kwok pod
* fix: wait timeouts and add retry logic
* fix: make sure to reapply kwok nodes if wait fails
* test: print out cluster state if wait fails
* test: prevent kwok from scheduling on windows node
* test: first wait for kwok pods (20 minutes)
* style: rearrange wait check
* fix: scale up kwok controller for reliability
* fix: typo in scaling kwok pods
* fix: check kwok pods running in test-connectivity instead of test-scale
* fix: wait for pods before adding NetPol
* fix: 7 second timeout for windows agnhost connect
* feat: get cluster state on failure
* debug: fake a failure to verify log capture
* fix: bugs in getting cluster state
* fix: remove newline instead of "n"
* Revert "debug: fake a failure to verify log capture" This reverts commit 24ec927425.
* feat(win-debug): get prom metrics
* fix: leave timeout=5s for win
* style: remove new, unused --connect-timeout parameter
* style: comment
* feat: top node/pod
This commit is contained in:
Родитель
2382637912
Коммит
7e90960ed0
|
@ -96,7 +96,7 @@ jobs:
|
||||||
displayName: "Verify Directory Exists"
|
displayName: "Verify Directory Exists"
|
||||||
failOnStderr: true
|
failOnStderr: true
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
displayName: "Download Kubectl and Kwok"
|
displayName: "Download Kubectl"
|
||||||
inputs:
|
inputs:
|
||||||
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
|
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
|
||||||
scriptType: "bash"
|
scriptType: "bash"
|
||||||
|
@ -107,11 +107,6 @@ jobs:
|
||||||
set -e
|
set -e
|
||||||
curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
|
curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
|
||||||
chmod +x kubectl
|
chmod +x kubectl
|
||||||
|
|
||||||
KWOK_REPO=kubernetes-sigs/kwok
|
|
||||||
KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
|
|
||||||
wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_LATEST_RELEASE}/kwok-$(go env GOOS)-$(go env GOARCH)"
|
|
||||||
chmod +x kwok
|
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
displayName: "Create AKS Cluster"
|
displayName: "Create AKS Cluster"
|
||||||
inputs:
|
inputs:
|
||||||
|
@ -140,17 +135,22 @@ jobs:
|
||||||
--vm-set-type VirtualMachineScaleSets \
|
--vm-set-type VirtualMachineScaleSets \
|
||||||
--node-vm-size Standard_D4s_v3 \
|
--node-vm-size Standard_D4s_v3 \
|
||||||
--node-count 1 \
|
--node-count 1 \
|
||||||
|
--tier standard \
|
||||||
--max-pods 100
|
--max-pods 100
|
||||||
|
|
||||||
if [[ $(PROFILE) == *ws22 ]]; then
|
echo "Getting credentials to $CLUSTER_NAME"
|
||||||
# don't schedule anything on the linux system pool
|
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
|
||||||
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
|
mkdir -p ~/.kube/
|
||||||
az aks nodepool update \
|
cp ./kubeconfig ~/.kube/config
|
||||||
--cluster-name $CLUSTER_NAME \
|
|
||||||
-g $(RESOURCE_GROUP) \
|
|
||||||
-n nodepool1 \
|
|
||||||
--node-taints CriticalAddonsOnly=true:NoSchedule
|
|
||||||
|
|
||||||
|
# install kwok on linux node
|
||||||
|
cd $(Pipeline.Workspace)/s/test/scale/
|
||||||
|
chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh
|
||||||
|
./run-kwok-as-pod.sh
|
||||||
|
# need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled"
|
||||||
|
kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5
|
||||||
|
|
||||||
|
if [[ $(PROFILE) == *ws22 ]]; then
|
||||||
echo "Adding Windows nodepool to $CLUSTER_NAME"
|
echo "Adding Windows nodepool to $CLUSTER_NAME"
|
||||||
az aks nodepool add \
|
az aks nodepool add \
|
||||||
--resource-group $(RESOURCE_GROUP) \
|
--resource-group $(RESOURCE_GROUP) \
|
||||||
|
@ -163,11 +163,6 @@ jobs:
|
||||||
--max-pods 100
|
--max-pods 100
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Getting credentials to $CLUSTER_NAME"
|
|
||||||
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
|
|
||||||
mkdir -p ~/.kube/
|
|
||||||
cp ./kubeconfig ~/.kube/config
|
|
||||||
|
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
displayName: "Deploy NPM to Test Cluster"
|
displayName: "Deploy NPM to Test Cluster"
|
||||||
inputs:
|
inputs:
|
||||||
|
@ -230,15 +225,6 @@ jobs:
|
||||||
set -e
|
set -e
|
||||||
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
||||||
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
||||||
./kwok --kubeconfig ~/.kube/config \
|
|
||||||
--cidr=155.0.0.0/16 \
|
|
||||||
--node-ip=155.0.0.1 \
|
|
||||||
--manage-all-nodes=false \
|
|
||||||
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
|
|
||||||
--manage-nodes-with-label-selector= \
|
|
||||||
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
|
|
||||||
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log &
|
|
||||||
kwok_pid=$!
|
|
||||||
|
|
||||||
# 20 kwok nodes
|
# 20 kwok nodes
|
||||||
# 1000 kwok Pods
|
# 1000 kwok Pods
|
||||||
|
@ -262,7 +248,6 @@ jobs:
|
||||||
--num-unique-labels-per-deployment=2 \
|
--num-unique-labels-per-deployment=2 \
|
||||||
--num-shared-labels-per-pod=10
|
--num-shared-labels-per-pod=10
|
||||||
rc=$?
|
rc=$?
|
||||||
kill $kwok_pid
|
|
||||||
exit $rc
|
exit $rc
|
||||||
|
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
|
@ -277,20 +262,17 @@ jobs:
|
||||||
set -e
|
set -e
|
||||||
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
||||||
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
||||||
./kwok --kubeconfig ~/.kube/config \
|
|
||||||
--cidr=155.0.0.0/16 \
|
|
||||||
--node-ip=155.0.0.1 \
|
|
||||||
--manage-all-nodes=false \
|
|
||||||
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
|
|
||||||
--manage-nodes-with-label-selector= \
|
|
||||||
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
|
|
||||||
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log &
|
|
||||||
kwok_pid=$!
|
|
||||||
|
|
||||||
kubectl rollout restart -n kube-system ds azure-npm-win
|
kubectl rollout restart -n kube-system ds azure-npm-win
|
||||||
echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
|
echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
|
||||||
sleep 3m
|
sleep 3m
|
||||||
|
|
||||||
|
kubectl get pod -n kube-system -l app=kwok-controller -owide
|
||||||
|
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
|
||||||
|
echo "##vso[task.logissue type=error]need at least one kwok pod running"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
|
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
|
||||||
# notes for Windows:
|
# notes for Windows:
|
||||||
# initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
|
# initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
|
||||||
|
@ -302,11 +284,29 @@ jobs:
|
||||||
--max-wait-after-adding-netpol=30
|
--max-wait-after-adding-netpol=30
|
||||||
rc=$?
|
rc=$?
|
||||||
if [[ $rc != 0 ]]; then
|
if [[ $rc != 0 ]]; then
|
||||||
|
echo "capturing cluster state due to failure"
|
||||||
|
if [[ $(PROFILE) == *ws22 ]]; then
|
||||||
|
cd $(Pipeline.Workspace)/s/debug/windows/npm/
|
||||||
|
chmod u+x win-debug.sh
|
||||||
|
./win-debug.sh
|
||||||
|
mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
|
||||||
|
else
|
||||||
|
set -x
|
||||||
|
npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
|
||||||
|
kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
|
||||||
|
kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
|
||||||
|
fi
|
||||||
|
|
||||||
kubectl get pod -n scale-test
|
kubectl get pod -n scale-test
|
||||||
kubectl get pod -n connectivity-test
|
kubectl get pod -n connectivity-test
|
||||||
|
exit $rc
|
||||||
fi
|
fi
|
||||||
kill $kwok_pid
|
|
||||||
exit $rc
|
kubectl get pod -n kube-system -l app=kwok-controller -owide
|
||||||
|
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
|
||||||
|
echo "##vso[task.logissue type=error]need at least one kwok pod running"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
displayName: "CRUD at Medium Scale"
|
displayName: "CRUD at Medium Scale"
|
||||||
|
@ -320,15 +320,6 @@ jobs:
|
||||||
set -e
|
set -e
|
||||||
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
||||||
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
||||||
./kwok --kubeconfig ~/.kube/config \
|
|
||||||
--cidr=155.0.0.0/16 \
|
|
||||||
--node-ip=155.0.0.1 \
|
|
||||||
--manage-all-nodes=false \
|
|
||||||
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
|
|
||||||
--manage-nodes-with-label-selector= \
|
|
||||||
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
|
|
||||||
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log &
|
|
||||||
kwok_pid=$!
|
|
||||||
|
|
||||||
# will delete scale-test and connectivity-test namespaces from previous run
|
# will delete scale-test and connectivity-test namespaces from previous run
|
||||||
# 10 kwok Pods
|
# 10 kwok Pods
|
||||||
|
@ -359,7 +350,6 @@ jobs:
|
||||||
--delete-pods-interval=120 \
|
--delete-pods-interval=120 \
|
||||||
--delete-pods-times=2
|
--delete-pods-times=2
|
||||||
rc=$?
|
rc=$?
|
||||||
kill $kwok_pid
|
|
||||||
exit $rc
|
exit $rc
|
||||||
|
|
||||||
- task: AzureCLI@2
|
- task: AzureCLI@2
|
||||||
|
@ -374,15 +364,13 @@ jobs:
|
||||||
set -e
|
set -e
|
||||||
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
||||||
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
|
||||||
./kwok --kubeconfig ~/.kube/config \
|
|
||||||
--cidr=155.0.0.0/16 \
|
kubectl get pod -n kube-system -l app=kwok-controller -owide
|
||||||
--node-ip=155.0.0.1 \
|
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
|
||||||
--manage-all-nodes=false \
|
echo "##vso[task.logissue type=error]need at least one kwok pod running"
|
||||||
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
|
exit 1
|
||||||
--manage-nodes-with-label-selector= \
|
}
|
||||||
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
|
|
||||||
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log &
|
|
||||||
kwok_pid=$!
|
|
||||||
|
|
||||||
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
|
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
|
||||||
# initial connectivity should be established within 10 minutes
|
# initial connectivity should be established within 10 minutes
|
||||||
|
@ -394,11 +382,29 @@ jobs:
|
||||||
--max-wait-after-adding-netpol=20
|
--max-wait-after-adding-netpol=20
|
||||||
rc=$?
|
rc=$?
|
||||||
if [[ $rc != 0 ]]; then
|
if [[ $rc != 0 ]]; then
|
||||||
|
echo "capturing cluster state due to failure"
|
||||||
|
if [[ $(PROFILE) == *ws22 ]]; then
|
||||||
|
cd $(Pipeline.Workspace)/s/debug/windows/npm/
|
||||||
|
chmod u+x win-debug.sh
|
||||||
|
./win-debug.sh
|
||||||
|
mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
|
||||||
|
else
|
||||||
|
set -x
|
||||||
|
npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
|
||||||
|
kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
|
||||||
|
kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
|
||||||
|
fi
|
||||||
|
|
||||||
kubectl get pod -n scale-test
|
kubectl get pod -n scale-test
|
||||||
kubectl get pod -n connectivity-test
|
kubectl get pod -n connectivity-test
|
||||||
|
exit $rc
|
||||||
fi
|
fi
|
||||||
kill $kwok_pid
|
|
||||||
exit $rc
|
kubectl get pod -n kube-system -l app=kwok-controller -owide
|
||||||
|
kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
|
||||||
|
echo "##vso[task.logissue type=error]need at least one kwok pod running"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
- bash: |
|
- bash: |
|
||||||
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
|
||||||
|
|
|
@ -1,20 +1,23 @@
|
||||||
|
kubeconfig=$1
|
||||||
|
if [[ -z $1 ]]; then
|
||||||
|
echo "kubeconfig not provided. using default kubeconfig"
|
||||||
|
else
|
||||||
|
echo "using kubeconfig: $kubeconfig"
|
||||||
|
kubeconfigArg="--kubeconfig $kubeconfig"
|
||||||
|
fi
|
||||||
|
|
||||||
# NOTE: you may not be able to unzip logs.zip in Linux since it was compressed in Windows
|
# NOTE: you may not be able to unzip logs.zip in Linux since it was compressed in Windows
|
||||||
set -e
|
set -x
|
||||||
dateString=`date -I` # like 2022-09-24
|
dateString=`date -I` # like 2022-09-24
|
||||||
filepath=logs_$dateString
|
filepath=logs_$dateString
|
||||||
mkdir $filepath
|
mkdir $filepath
|
||||||
|
|
||||||
echo "gathering logs and writing to $filepath/"
|
echo "gathering logs and writing to $filepath/"
|
||||||
|
|
||||||
kubectl get pod -A -o wide --show-labels > $filepath/allpods.out
|
|
||||||
kubectl get netpol -A -o yaml > $filepath/all-netpol-yamls.out
|
|
||||||
kubectl describe netpol -A > $filepath/all-netpol-descriptions.out
|
|
||||||
|
|
||||||
npmPods=()
|
npmPods=()
|
||||||
nodes=()
|
nodes=()
|
||||||
for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
|
for npmPodOrNode in `kubectl $kubeconfigArg get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
|
||||||
# for loop will go over each item (npm pod, then its node, then the next npm pod, then its node, ...)
|
# for loop will go over each item (npm pod, then its node, then the next npm pod, then its node, ...)
|
||||||
set +e
|
|
||||||
echo $npmPodOrNode | grep -q azure-npm-win-
|
echo $npmPodOrNode | grep -q azure-npm-win-
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
npmPods+=($npmPodOrNode)
|
npmPods+=($npmPodOrNode)
|
||||||
|
@ -22,7 +25,6 @@ for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-colum
|
||||||
nodes+=($npmPodOrNode)
|
nodes+=($npmPodOrNode)
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
set -e
|
|
||||||
|
|
||||||
echo "npm pods: ${npmPods[@]}"
|
echo "npm pods: ${npmPods[@]}"
|
||||||
echo "nodes of npm pods: ${nodes[@]}"
|
echo "nodes of npm pods: ${nodes[@]}"
|
||||||
|
@ -33,22 +35,48 @@ for i in $(seq 1 ${#npmPods[*]}); do
|
||||||
node=${nodes[$j]}
|
node=${nodes[$j]}
|
||||||
|
|
||||||
echo "gathering logs. npm pod: $npmPod. node: $node"
|
echo "gathering logs. npm pod: $npmPod. node: $node"
|
||||||
kubectl logs -n kube-system $npmPod > $filepath/logs_$npmPod.out
|
kubectl $kubeconfigArg logs -n kube-system $npmPod > $filepath/logs_$npmPod.out
|
||||||
|
|
||||||
ips=()
|
ips=()
|
||||||
for ip in `kubectl get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
|
for ip in `kubectl $kubeconfigArg get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
|
||||||
ips+=($ip)
|
ips+=($ip)
|
||||||
done
|
done
|
||||||
echo "node $node has IPs: ${ips[@]}"
|
echo "node $node has IPs: ${ips[@]}"
|
||||||
|
|
||||||
echo "copying ps1 file into $npmPod"
|
echo "copying ps1 file into $npmPod"
|
||||||
kubectl cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1
|
kubectl $kubeconfigArg cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1
|
||||||
|
|
||||||
echo "executing ps1 file on $npmPod"
|
echo "executing ps1 file on $npmPod"
|
||||||
kubectl exec -it -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"
|
kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"
|
||||||
|
|
||||||
echo "copying logs.zip from $npmPod. NOTE: this will be a windows-based compressed archive (probably need windows to expand it)"
|
echo "copying logs.zip from $npmPod. NOTE: this will be a windows-based compressed archive (probably need windows to expand it)"
|
||||||
kubectl cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
|
kubectl $kubeconfigArg cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "finished getting HNS info. getting prometheus metrics"
|
||||||
|
|
||||||
|
mkdir -p $filepath/prometheus/node-metrics
|
||||||
|
for i in $(seq 1 ${#npmPods[*]}); do
|
||||||
|
j=$((i-1))
|
||||||
|
npmPod=${npmPods[$j]}
|
||||||
|
kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command "(Invoke-WebRequest -UseBasicParsing http://localhost:10091/node-metrics).Content" > $filepath/prometheus/node-metrics/$npmPod.out
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "finished getting prometheus metrics. getting cluster state"
|
||||||
|
|
||||||
|
kubectl $kubeconfigArg get pod -A -o wide --show-labels > $filepath/allpods.out
|
||||||
|
kubectl $kubeconfigArg get netpol -A -o yaml > $filepath/all-netpol-yamls.out
|
||||||
|
kubectl $kubeconfigArg describe netpol -A > $filepath/all-netpol-descriptions.out
|
||||||
|
|
||||||
|
for ns in `kubectl $kubeconfigArg get pod -A | grep -v Running | grep -v STATUS | awk '{print $1}' | sort | uniq`; do
|
||||||
|
echo "describing failed pods in namespace $ns..."
|
||||||
|
failingPods=`kubectl $kubeconfigArg get pod -n $ns | grep -v Running | grep -v STATUS | awk '{print $1}' | xargs echo`
|
||||||
|
if [[ -z $failingPods ]]; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
echo "failing Pods: $failingPods"
|
||||||
|
kubectl $kubeconfigArg describe pod -n $ns $failingPods > $filepath/describepod_$ns.out
|
||||||
|
break
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "finished gathering all logs. written to $filepath/"
|
echo "finished gathering all logs. written to $filepath/"
|
||||||
|
|
|
@ -29,7 +29,7 @@ REQUIRED PARAMETERS:
|
||||||
|
|
||||||
OPTIONAL PARAMETERS:
|
OPTIONAL PARAMETERS:
|
||||||
--kubeconfig=<path> path to kubeconfig file
|
--kubeconfig=<path> path to kubeconfig file
|
||||||
--kubectl-binary=<path> path to kubectl binary. Default is kubectl
|
--kubectl-binary=<path> path to kubectl binary. Default is kubectl
|
||||||
|
|
||||||
EXIT CODES:
|
EXIT CODES:
|
||||||
0 - success
|
0 - success
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
# source: https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/
|
||||||
|
KWOK_REPO=kubernetes-sigs/kwok
|
||||||
|
KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
|
||||||
|
kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml"
|
||||||
|
kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml"
|
|
@ -276,21 +276,49 @@ fi
|
||||||
|
|
||||||
## HELPER FUNCTIONS
|
## HELPER FUNCTIONS
|
||||||
wait_for_pods() {
|
wait_for_pods() {
|
||||||
# wait for all pods to run
|
|
||||||
minutesToWaitForRealPods=$(( 10 + $numRealPods / 250 ))
|
|
||||||
set -x
|
|
||||||
if [[ $numRealPods -gt 0 ]]; then
|
|
||||||
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
|
|
||||||
fi
|
|
||||||
set +x
|
|
||||||
|
|
||||||
# just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
|
|
||||||
minutesToWaitForKwokPods=$(( 1 + $numKwokPods / 500 ))
|
|
||||||
set -x
|
|
||||||
if [[ $numKwokPods -gt 0 ]]; then
|
if [[ $numKwokPods -gt 0 ]]; then
|
||||||
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
|
# wait up to 20 minutes
|
||||||
|
startDate=`date +%s`
|
||||||
|
count=0
|
||||||
|
while : ; do
|
||||||
|
echo "waiting for fake pods to run (try $count)"
|
||||||
|
count=$((count+1))
|
||||||
|
# just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
|
||||||
|
set +e -x
|
||||||
|
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout=0 && set -e +x && break
|
||||||
|
endDate=`date +%s`
|
||||||
|
set -e +x
|
||||||
|
if [[ $endDate -gt $(( startDate + (20*60) )) ]]; then
|
||||||
|
echo "timed out waiting for all kwok pods to run"
|
||||||
|
k get pod -n scale-test -owide
|
||||||
|
k get node
|
||||||
|
k get pod -n kube-system -l app=kwok-controller -owide
|
||||||
|
k top pod -n kube-system -l app=kwok-controller
|
||||||
|
k top node
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
# try recreating nodes if KWOK controller failed
|
||||||
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $numRealPods -gt 0 ]]; then
|
||||||
|
# wait up to 10 minutes
|
||||||
|
startDate=`date +%s`
|
||||||
|
count=0
|
||||||
|
while : ; do
|
||||||
|
echo "waiting for real pods to run (try $count)"
|
||||||
|
count=$((count+1))
|
||||||
|
set +e -x
|
||||||
|
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout=0 && set -e +x && break
|
||||||
|
set -e +x
|
||||||
|
endDate=`date +%s`
|
||||||
|
if [[ $endDate -gt $(( startDate + (10*60) )) ]]; then
|
||||||
|
echo "timed out waiting for all real pods to run"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
fi
|
fi
|
||||||
set +x
|
|
||||||
}
|
}
|
||||||
|
|
||||||
## FILE SETUP
|
## FILE SETUP
|
||||||
|
@ -444,19 +472,29 @@ echo
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG create ns scale-test
|
$KUBECTL $KUBECONFIG_ARG create ns scale-test
|
||||||
|
set +x
|
||||||
|
|
||||||
if [[ $numKwokNodes -gt 0 ]]; then
|
if [[ $numKwokNodes -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
if [[ $numRealPods -gt 0 ]]; then
|
if [[ $numRealPods -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real-$realPodType/
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real-$realPodType/
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
if [[ $numKwokPods -gt 0 ]]; then
|
if [[ $numKwokPods -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
if [[ $numRealServices -gt 0 ]]; then
|
if [[ $numRealServices -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/services/real/
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/services/real/
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
set +x
|
|
||||||
|
|
||||||
add_shared_labels() {
|
add_shared_labels() {
|
||||||
if [[ $numSharedLabelsPerPod -gt 0 ]]; then
|
if [[ $numSharedLabelsPerPod -gt 0 ]]; then
|
||||||
|
@ -489,16 +527,19 @@ if [[ $numUniqueLabelsPerPod -gt 0 ]]; then
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -x
|
# to better evaluate time to apply ACLs, wait for pods to come up first (takes a variable amount of time) before applying the NetPols
|
||||||
|
wait_for_pods
|
||||||
|
|
||||||
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
|
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
if [[ $numNetworkPolicies -gt 0 ]]; then
|
if [[ $numNetworkPolicies -gt 0 ]]; then
|
||||||
|
set -x
|
||||||
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
|
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
|
||||||
|
set +x
|
||||||
fi
|
fi
|
||||||
set +x
|
|
||||||
|
|
||||||
wait_for_pods
|
|
||||||
|
|
||||||
echo
|
echo
|
||||||
echo "done scaling at $(date -u). Had started at $startDate."
|
echo "done scaling at $(date -u). Had started at $startDate."
|
||||||
|
|
Загрузка…
Ссылка в новой задаче