diff --git a/.pipelines/npm/npm-scale-test.yaml b/.pipelines/npm/npm-scale-test.yaml
index c8a418053..866b887d1 100644
--- a/.pipelines/npm/npm-scale-test.yaml
+++ b/.pipelines/npm/npm-scale-test.yaml
@@ -96,7 +96,7 @@ jobs:
           displayName: "Verify Directory Exists"
           failOnStderr: true
       - task: AzureCLI@2
-        displayName: "Download Kubectl and Kwok"
+        displayName: "Download Kubectl"
         inputs:
           azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
           scriptType: "bash"
@@ -107,11 +107,6 @@ jobs:
             set -e
             curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
             chmod +x kubectl
-
-            KWOK_REPO=kubernetes-sigs/kwok
-            KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
-            wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_LATEST_RELEASE}/kwok-$(go env GOOS)-$(go env GOARCH)"
-            chmod +x kwok
       - task: AzureCLI@2
         displayName: "Create AKS Cluster"
         inputs:
@@ -140,17 +135,22 @@ jobs:
               --vm-set-type VirtualMachineScaleSets \
               --node-vm-size Standard_D4s_v3 \
               --node-count 1 \
+              --tier standard \
               --max-pods 100
-            if [[ $(PROFILE) == *ws22 ]]; then
-              # don't schedule anything on the linux system pool
-              echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
-              az aks nodepool update \
-                --cluster-name $CLUSTER_NAME \
-                -g $(RESOURCE_GROUP) \
-                -n nodepool1 \
-                --node-taints CriticalAddonsOnly=true:NoSchedule
+            echo "Getting credentials to $CLUSTER_NAME"
+            az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
+            mkdir -p ~/.kube/
+            cp ./kubeconfig ~/.kube/config
+            # install kwok on linux node
+            cd $(Pipeline.Workspace)/s/test/scale/
+            chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh
+            ./run-kwok-as-pod.sh
+            # need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled"
+            kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5
+
+            if [[ $(PROFILE) == *ws22 ]]; then
               echo "Adding Windows nodepool to $CLUSTER_NAME"
               az aks nodepool add \
                 --resource-group $(RESOURCE_GROUP) \
@@ -163,11 +163,6 @@ jobs:
                 --max-pods 100
             fi
-            echo "Getting credentials to $CLUSTER_NAME"
-            az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
-            mkdir -p ~/.kube/
-            cp ./kubeconfig ~/.kube/config
-
       - task: AzureCLI@2
         displayName: "Deploy NPM to Test Cluster"
         inputs:
@@ -230,15 +225,6 @@ jobs:
             set -e
             export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
             mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
-            ./kwok --kubeconfig ~/.kube/config \
-              --cidr=155.0.0.0/16 \
-              --node-ip=155.0.0.1 \
-              --manage-all-nodes=false \
-              --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
-              --manage-nodes-with-label-selector= \
-              --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
-              --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log &
-            kwok_pid=$!

             # 20 kwok nodes
             # 1000 kwok Pods
@@ -262,7 +248,6 @@ jobs:
               --num-unique-labels-per-deployment=2 \
               --num-shared-labels-per-pod=10
             rc=$?
-            kill $kwok_pid
             exit $rc

      - task: AzureCLI@2
@@ -277,20 +262,17 @@ jobs:
             set -e
             export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
             mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
-            ./kwok --kubeconfig ~/.kube/config \
-              --cidr=155.0.0.0/16 \
-              --node-ip=155.0.0.1 \
-              --manage-all-nodes=false \
-              --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
-              --manage-nodes-with-label-selector= \
-              --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
-              --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log &
-            kwok_pid=$!
             kubectl rollout restart -n kube-system ds azure-npm-win
             echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
             sleep 3m
+            kubectl get pod -n kube-system -l app=kwok-controller -owide
+            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
+              echo "##vso[task.logissue type=error]need at least one kwok pod running"
+              exit 1
+            }
+
             cd $(Pipeline.Workspace)/s/test/scale/connectivity/
             # notes for Windows:
             # initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
@@ -302,11 +284,29 @@ jobs:
               --max-wait-after-adding-netpol=30
             rc=$?
             if [[ $rc != 0 ]]; then
+              echo "capturing cluster state due to failure"
+              if [[ $(PROFILE) == *ws22 ]]; then
+                cd $(Pipeline.Workspace)/s/debug/windows/npm/
+                chmod u+x win-debug.sh
+                ./win-debug.sh
+                mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
+              else
+                set -x
+                npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
+                kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
+                kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
+              fi
+
               kubectl get pod -n scale-test
               kubectl get pod -n connectivity-test
+              exit $rc
             fi
-            kill $kwok_pid
-            exit $rc
+
+            kubectl get pod -n kube-system -l app=kwok-controller -owide
+            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
+              echo "##vso[task.logissue type=error]need at least one kwok pod running"
+              exit 1
+            }

      - task: AzureCLI@2
        displayName: "CRUD at Medium Scale"
@@ -320,15 +320,6 @@ jobs:
             set -e
             export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
             mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
-            ./kwok --kubeconfig ~/.kube/config \
-              --cidr=155.0.0.0/16 \
-              --node-ip=155.0.0.1 \
-              --manage-all-nodes=false \
-              --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
-              --manage-nodes-with-label-selector= \
-              --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
-              --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log &
-            kwok_pid=$!

             # will delete scale-test and connectivity-test namespaces from previous run
             # 10 kwok Pods
@@ -359,7 +350,6 @@ jobs:
               --delete-pods-interval=120 \
               --delete-pods-times=2
             rc=$?
-            kill $kwok_pid
             exit $rc

      - task: AzureCLI@2
@@ -374,15 +364,13 @@ jobs:
             set -e
             export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
             mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
-            ./kwok --kubeconfig ~/.kube/config \
-              --cidr=155.0.0.0/16 \
-              --node-ip=155.0.0.1 \
-              --manage-all-nodes=false \
-              --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
-              --manage-nodes-with-label-selector= \
-              --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
-              --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log &
-            kwok_pid=$!
+
+            kubectl get pod -n kube-system -l app=kwok-controller -owide
+            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
+              echo "##vso[task.logissue type=error]need at least one kwok pod running"
+              exit 1
+            }
+
             cd $(Pipeline.Workspace)/s/test/scale/connectivity/
             # initial connectivity should be established within 10 minutes
@@ -394,11 +382,29 @@ jobs:
               --max-wait-after-adding-netpol=20
             rc=$?
             if [[ $rc != 0 ]]; then
+              echo "capturing cluster state due to failure"
+              if [[ $(PROFILE) == *ws22 ]]; then
+                cd $(Pipeline.Workspace)/s/debug/windows/npm/
+                chmod u+x win-debug.sh
+                ./win-debug.sh
+                mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/
+              else
+                set -x
+                npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'`
+                kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out
+                kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out
+              fi
+
               kubectl get pod -n scale-test
               kubectl get pod -n connectivity-test
+              exit $rc
             fi
-            kill $kwok_pid
-            exit $rc
+
+            kubectl get pod -n kube-system -l app=kwok-controller -owide
+            kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || {
+              echo "##vso[task.logissue type=error]need at least one kwok pod running"
+              exit 1
+            }

      - bash: |
          export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
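Each scale and connectivity task above now brackets its work with the same four-line guard asserting that at least one kwok-controller pod is Running. A minimal sketch of that guard factored into a reusable function (the function name is hypothetical; the pipeline inlines the check):

#!/bin/bash
# assert_kwok_running: hypothetical helper mirroring the guard the pipeline tasks repeat.
# It prints the kwok-controller pods for the log, then fails the task if none is Running.
assert_kwok_running() {
    kubectl get pod -n kube-system -l app=kwok-controller -owide
    if ! kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running; then
        # the ##vso[task.logissue] directive surfaces the message as a task-level error in Azure Pipelines
        echo "##vso[task.logissue type=error]need at least one kwok pod running"
        exit 1
    fi
}

assert_kwok_running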
diff --git a/debug/windows/npm/win-debug.sh b/debug/windows/npm/win-debug.sh
index 28f13ca20..11e4a066c 100755
--- a/debug/windows/npm/win-debug.sh
+++ b/debug/windows/npm/win-debug.sh
@@ -1,20 +1,23 @@
+kubeconfig=$1
+if [[ -z $1 ]]; then
+    echo "kubeconfig not provided. using default kubeconfig"
+else
+    echo "using kubeconfig: $kubeconfig"
+    kubeconfigArg="--kubeconfig $kubeconfig"
+fi
+
 # NOTE: you may not be able to unzip logs.zip in Linux since it was compressed in Windows
-set -e
+set -x
 dateString=`date -I` # like 2022-09-24
 filepath=logs_$dateString
 mkdir $filepath
 echo "gathering logs and writing to $filepath/"
-kubectl get pod -A -o wide --show-labels > $filepath/allpods.out
-kubectl get netpol -A -o yaml > $filepath/all-netpol-yamls.out
-kubectl describe netpol -A > $filepath/all-netpol-descriptions.out
-
 npmPods=()
 nodes=()
-for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
+for npmPodOrNode in `kubectl $kubeconfigArg get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do
     # for loop will go over each item (npm pod, then its node, then the next npm pod, then its node, ...)
-    set +e
     echo $npmPodOrNode | grep -q azure-npm-win-
     if [ $? -eq 0 ]; then
         npmPods+=($npmPodOrNode)
@@ -22,7 +25,6 @@ for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-colum
         nodes+=($npmPodOrNode)
     fi
 done
-set -e

 echo "npm pods: ${npmPods[@]}"
 echo "nodes of npm pods: ${nodes[@]}"
@@ -33,22 +35,48 @@ for i in $(seq 1 ${#npmPods[*]}); do
     node=${nodes[$j]}
     echo "gathering logs. npm pod: $npmPod. node: $node"
-    kubectl logs -n kube-system $npmPod > $filepath/logs_$npmPod.out
+    kubectl $kubeconfigArg logs -n kube-system $npmPod > $filepath/logs_$npmPod.out

     ips=()
-    for ip in `kubectl get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
+    for ip in `kubectl $kubeconfigArg get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do
         ips+=($ip)
     done
     echo "node $node has IPs: ${ips[@]}"

     echo "copying ps1 file into $npmPod"
-    kubectl cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1
+    kubectl $kubeconfigArg cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1
     echo "executing ps1 file on $npmPod"
-    kubectl exec -it -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"
+    kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'"
     echo "copying logs.zip from $npmPod. NOTE: this will be a windows-based compressed archive (probably need windows to expand it)"
-    kubectl cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
+    kubectl $kubeconfigArg cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip
+done
+
+echo "finished getting HNS info. getting prometheus metrics"
+
+mkdir -p $filepath/prometheus/node-metrics
+for i in $(seq 1 ${#npmPods[*]}); do
+    j=$((i-1))
+    npmPod=${npmPods[$j]}
+    kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command "(Invoke-WebRequest -UseBasicParsing http://localhost:10091/node-metrics).Content" > $filepath/prometheus/node-metrics/$npmPod.out
+done
+
+echo "finished getting prometheus metrics. getting cluster state"
+
+kubectl $kubeconfigArg get pod -A -o wide --show-labels > $filepath/allpods.out
+kubectl $kubeconfigArg get netpol -A -o yaml > $filepath/all-netpol-yamls.out
+kubectl $kubeconfigArg describe netpol -A > $filepath/all-netpol-descriptions.out
+
+for ns in `kubectl $kubeconfigArg get pod -A | grep -v Running | grep -v STATUS | awk '{print $1}' | sort | uniq`; do
+    echo "describing failed pods in namespace $ns..."
+    failingPods=`kubectl $kubeconfigArg get pod -n $ns | grep -v Running | grep -v STATUS | awk '{print $1}' | xargs echo`
+    if [[ -z $failingPods ]]; then
+        continue
+    fi
+    echo "failing Pods: $failingPods"
+    kubectl $kubeconfigArg describe pod -n $ns $failingPods > $filepath/describepod_$ns.out
+    break
 done

 echo "finished gathering all logs. written to $filepath/"
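The script now takes an optional kubeconfig path as its first argument and forwards it to every kubectl call via $kubeconfigArg. Usage, as implied by the argument handling above:

# use the default kubeconfig (~/.kube/config), as the pipeline's failure path does:
./win-debug.sh

# use an explicit kubeconfig, forwarded to every kubectl call via --kubeconfig:
./win-debug.sh ./kubeconfig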
diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh
index 8feef98f6..0f8ae9c89 100755
--- a/test/scale/connectivity/test-connectivity.sh
+++ b/test/scale/connectivity/test-connectivity.sh
@@ -29,7 +29,7 @@ REQUIRED PARAMETERS:

 OPTIONAL PARAMETERS:
     --kubeconfig=                  path to kubeconfig file
-    --kubectl-binary=              path to kubectl binary
+    --kubectl-binary=              path to kubectl binary. Default is kubectl

 EXIT CODES:
 0 - success
diff --git a/test/scale/run-kwok-as-pod.sh b/test/scale/run-kwok-as-pod.sh
new file mode 100644
index 000000000..600bbd0a5
--- /dev/null
+++ b/test/scale/run-kwok-as-pod.sh
@@ -0,0 +1,5 @@
+# source: https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/
+KWOK_REPO=kubernetes-sigs/kwok
+KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
+kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml"
+kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml"
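run-kwok-as-pod.sh applies the upstream manifests and returns immediately; the pipeline then scales the controller to 5 replicas and relies on later Running checks. A hedged follow-up sketch (not part of the script) that would block until the controller is actually schedulable, reusing the app=kwok-controller label the pipeline already targets with kubectl scale:

#!/bin/bash
# assumption: kwok.yaml creates a kube-system Deployment labeled app=kwok-controller
# (the same label the pipeline uses); wait for it to report the Available condition
kubectl wait --for=condition=Available deployment \
    -n kube-system -l app=kwok-controller --timeout=2m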
diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh
index 1a24f7fe7..167959295 100755
--- a/test/scale/test-scale.sh
+++ b/test/scale/test-scale.sh
@@ -276,21 +276,49 @@ fi
 ## HELPER FUNCTIONS
 wait_for_pods() {
-    # wait for all pods to run
-    minutesToWaitForRealPods=$(( 10 + $numRealPods / 250 ))
-    set -x
-    if [[ $numRealPods -gt 0 ]]; then
-        $KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
-    fi
-    set +x
-
-    # just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
-    minutesToWaitForKwokPods=$(( 1 + $numKwokPods / 500 ))
-    set -x
     if [[ $numKwokPods -gt 0 ]]; then
-        $KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
+        # wait up to 20 minutes
+        startDate=`date +%s`
+        count=0
+        while : ; do
+            echo "waiting for fake pods to run (try $count)"
+            count=$((count+1))
+            # just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
+            set +e -x
+            $KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout=0 && set -e +x && break
+            endDate=`date +%s`
+            set -e +x
+            if [[ $endDate -gt $(( startDate + (20*60) )) ]]; then
+                echo "timed out waiting for all kwok pods to run"
+                $KUBECTL $KUBECONFIG_ARG get pod -n scale-test -owide
+                $KUBECTL $KUBECONFIG_ARG get node
+                $KUBECTL $KUBECONFIG_ARG get pod -n kube-system -l app=kwok-controller -owide
+                $KUBECTL $KUBECONFIG_ARG top pod -n kube-system -l app=kwok-controller
+                $KUBECTL $KUBECONFIG_ARG top node
+                exit 1
+            fi
+            # try recreating nodes if KWOK controller failed
+            $KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
+        done
+    fi
+
+    if [[ $numRealPods -gt 0 ]]; then
+        # wait up to 10 minutes
+        startDate=`date +%s`
+        count=0
+        while : ; do
+            echo "waiting for real pods to run (try $count)"
+            count=$((count+1))
+            set +e -x
+            $KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout=0 && set -e +x && break
+            set -e +x
+            endDate=`date +%s`
+            if [[ $endDate -gt $(( startDate + (10*60) )) ]]; then
+                echo "timed out waiting for all real pods to run"
+                exit 1
+            fi
+        done
     fi
-    set +x
 }

 ## FILE SETUP
@@ -444,19 +472,29 @@ echo
 set -x
 $KUBECTL $KUBECONFIG_ARG create ns scale-test
+set +x
+
 if [[ $numKwokNodes -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
+    set +x
 fi
 if [[ $numRealPods -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real-$realPodType/
+    set +x
 fi
 if [[ $numKwokPods -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
+    set +x
 fi
 if [[ $numRealServices -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/services/real/
+    set +x
 fi
-set +x
+
 add_shared_labels() {
     if [[ $numSharedLabelsPerPod -gt 0 ]]; then
@@ -489,16 +527,19 @@ if [[ $numUniqueLabelsPerPod -gt 0 ]]; then
     done
 fi
-set -x
+# to better evaluate time to apply ACLs, wait for pods to come up first (takes a variable amount of time) before applying the NetPols
+wait_for_pods
+
 if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
+    set +x
 fi
 if [[ $numNetworkPolicies -gt 0 ]]; then
+    set -x
     $KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
+    set +x
 fi
-set +x
-
-wait_for_pods

 echo
 echo "done scaling at $(date -u). Had started at $startDate."
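The rewritten wait_for_pods replaces a single long-timeout kubectl wait with a poll-with-deadline loop: a zero-timeout wait (which checks once and returns) is retried until it succeeds or a wall-clock budget expires, so the wait survives kwok-controller restarts mid-poll. A minimal standalone sketch of the pattern, using the 10-minute budget of the real-pod branch (the sleep between polls is an assumption; the script itself re-polls immediately):

#!/bin/bash
# poll-with-deadline: retry a zero-timeout kubectl wait until success or deadline
deadline=$(( $(date +%s) + 10*60 ))
until kubectl wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout=0; do
    if [[ $(date +%s) -gt $deadline ]]; then
        echo "timed out waiting for all real pods to run"
        exit 1
    fi
    sleep 10  # assumed back-off between polls
done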