Initial version of hpc monitoring integration into AKS

2024-07-09 13:46:10 -05:00 · 2024-07-09 13:46:10 -05:00 · 783c30f986
--- a/experimental/hpc_monitoring/aks/Dockerfile
+++ b/experimental/hpc_monitoring/aks/Dockerfile
@ -0,0 +1,15 @@
+# Image for the AKS HPC/AI monitoring DaemonSet: CUDA base image plus DCGM
+# and the hpc_data_collector.py script from the Azure/azurehpc repository.
+#
+# The base image can be overridden with --build-arg FROM_IMAGE_NAME=<image>.
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
+
+FROM ${FROM_IMAGE_NAME}
+
+# Refresh the package index and install in the SAME layer; a separately
+# cached "update" layer can otherwise be reused with a stale index.
+RUN apt-get update \
+    && apt-get -y install systemctl python3 python3-pip wget git nfs-common \
+    && rm -rf /var/lib/apt/lists/*
+
+# Register NVIDIA's CUDA apt repository keyring, then install DCGM from it.
+# The downloaded .deb and the apt lists are removed in the same layer so they
+# do not end up in the image.
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
+    && dpkg -i cuda-keyring_1.1-1_all.deb \
+    && rm -f cuda-keyring_1.1-1_all.deb \
+    && apt-get update \
+    && apt-get install -y datacenter-gpu-manager \
+    && rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): hpc_data_collector.sh launches nv-hostengine itself at
+# container start; confirm enabling the unit at build time is still needed.
+RUN systemctl --now enable nvidia-dcgm
+
+RUN pip install --no-cache-dir requests
+
+# Clone, copy and delete in ONE layer: removing the checkout in a later layer
+# would still ship it inside the earlier layer. A shallow clone is enough
+# because only a single file is needed.
+RUN git clone --depth 1 https://github.com/Azure/azurehpc.git /azurehpc \
+    && cp /azurehpc/experimental/hpc_monitoring/cc_hpc_monitoring/specs/default/cluster-init/files/hpc_data_collector.py /bin \
+    && rm -rf /azurehpc
--- a/experimental/hpc_monitoring/aks/hpc-ai-monitor-config.yaml
+++ b/experimental/hpc_monitoring/aks/hpc-ai-monitor-config.yaml
@ -0,0 +1,17 @@
+# ConfigMap holding the entrypoint script that the hpc-ai-monitor DaemonSet
+# mounts at /hpc_monitor.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: hpc-ai-monitor-config
+  namespace: kube-system
+data:
+  hpc_data_collector.sh: |
+    #!/bin/bash
+
+    # Exports the Log Analytics credentials, starts nv-hostengine in the
+    # background, then runs hpc_data_collector.py in the foreground.
+    #
+    # Usage: hpc_data_collector.sh <workspace-id> <shared-key>
+    #
+    # Expansions are quoted so the values are passed through verbatim.
+    export LOG_ANALYTICS_CUSTOMER_ID="$1"
+    export LOG_ANALYTICS_SHARED_KEY="$2"
+
+    /usr/bin/nv-hostengine -n --service-account nvidia-dcgm &
+
+    # Edit the flags below to change what is collected.
+    /bin/hpc_data_collector.py -fhm -gpum -ibm
--- a/experimental/hpc_monitoring/aks/hpc-ai-monitor.yaml
+++ b/experimental/hpc_monitoring/aks/hpc-ai-monitor.yaml
@ -0,0 +1,128 @@
+# DaemonSet that runs the HPC/AI monitoring collector on every matching node.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: hpc-ai-monitor
+  namespace: kube-system
+  labels:
+    app: hpc-ai-monitor
+spec:
+  selector:
+    matchLabels:
+      app: hpc-ai-monitor
+  template:
+    metadata:
+      labels:
+        app: hpc-ai-monitor
+    spec:
+      # Schedule only on Linux nodes labelled accelerator=nvidia.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/os
+                    operator: In
+                    values:
+                      - linux
+                  - key: accelerator
+                    operator: In
+                    values:
+                      - nvidia
+      containers:
+        - name: hpc-ai-monitor
+          image: <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
+          imagePullPolicy: Always
+          command:
+            - "/bin/bash"
+            - "-c"
+            # The expansions are double-quoted so the shell passes each secret
+            # as exactly one argument (no word splitting or globbing, and an
+            # empty value does not shift the positions).
+            - '/hpc_monitor/hpc_data_collector.sh "$log_analytics_customer_id" "$log_analytics_shared_key"'
+          env:
+            # Both values come from the log-analytics-key Secret.
+            - name: log_analytics_customer_id
+              valueFrom:
+                secretKeyRef:
+                  name: log-analytics-key
+                  key: log_analytics_customer_id
+            - name: log_analytics_shared_key
+              valueFrom:
+                secretKeyRef:
+                  name: log-analytics-key
+                  key: log_analytics_shared_key
+          resources:
+            limits:
+              cpu: 240m
+              memory: 2048Mi
+            requests:
+              cpu: 240m
+              memory: 2048Mi
+          securityContext:
+            privileged: true
+          # Host paths are mounted read-only at the same location inside the
+          # container, plus the script directory from the ConfigMap.
+          volumeMounts:
+            - name: localtime
+              mountPath: /etc/localtime
+              readOnly: true
+            - name: hyperv
+              mountPath: /var/lib/hyperv
+              readOnly: true
+            - name: devices
+              mountPath: /sys/devices
+              readOnly: true
+            - name: infiniband
+              mountPath: /sys/class/infiniband
+              readOnly: true
+            - name: net
+              mountPath: /sys/class/net
+              readOnly: true
+            - name: diskstats
+              mountPath: /proc/diskstats
+              readOnly: true
+            - name: stat
+              mountPath: /proc/stat
+              readOnly: true
+            - name: meminfo
+              mountPath: /proc/meminfo
+              readOnly: true
+            - name: loadavg
+              mountPath: /proc/loadavg
+              readOnly: true
+            - name: config
+              mountPath: /hpc_monitor
+              readOnly: true
+      volumes:
+        - name: localtime
+          hostPath:
+            path: /etc/localtime
+        - name: hyperv
+          hostPath:
+            path: /var/lib/hyperv
+        - name: devices
+          hostPath:
+            path: /sys/devices
+        - name: infiniband
+          hostPath:
+            path: /sys/class/infiniband
+        - name: net
+          hostPath:
+            path: /sys/class/net
+        - name: diskstats
+          hostPath:
+            path: /proc/diskstats
+        - name: stat
+          hostPath:
+            path: /proc/stat
+        - name: meminfo
+          hostPath:
+            path: /proc/meminfo
+        - name: loadavg
+          hostPath:
+            path: /proc/loadavg
+        - name: config
+          configMap:
+            name: hpc-ai-monitor-config
+            # Octal file mode (rwxr-xr-x) so the mounted script is executable.
+            defaultMode: 0755
+            items:
+              - key: hpc_data_collector.sh
+                path: hpc_data_collector.sh
+      # Tolerate every NoSchedule and NoExecute taint.
+      tolerations:
+        - effect: NoSchedule
+          operator: Exists
+        - effect: NoExecute
+          operator: Exists
--- a/experimental/hpc_monitoring/aks/log_analytics_secret_key.yaml
+++ b/experimental/hpc_monitoring/aks/log_analytics_secret_key.yaml
@ -0,0 +1,9 @@
+# Log Analytics credentials referenced by the hpc-ai-monitor DaemonSet via
+# secretKeyRef. Replace both placeholders before applying, and do not commit
+# the real values to version control.
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+  namespace: kube-system
+  name: log-analytics-key
+stringData:
+  # Quoted so the substituted values are always read as strings.
+  log_analytics_customer_id: '<YOUR LOG ANALYTICS WORKSPACE ID>'
+  log_analytics_shared_key: '<YOUR LOG ANALYTICS KEY>'
--- a/experimental/hpc_monitoring/aks/readme.md
+++ b/experimental/hpc_monitoring/aks/readme.md
@ -0,0 +1,25 @@
+# Integrate HPC/AI cluster monitoring with AKS 
+
+Shows how to run custom HPC/AI cluster monitoring (IB, GPU, CPU, disks, etc.) with AKS.
+ 
+## Prerequisites
+
+- An AKS cluster (NDmv4) is deployed; see this [blog post](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/deploy-ndm-v4-a100-kubernetes-cluster/ba-p/3838871).
+- You have a Log Analytics workspace.
+- DCGM Exporter (in the GPU operator) is disabled (`dcgmExporter.enabled=false`).
+- You will need to replace all references to \<YOUR ACR\>, \<YOUR TAG\>, \<YOUR LOG ANALYTICS WORKSPACE ID\>, and \<YOUR LOG ANALYTICS KEY\> in the provided files.
+
+## Build hpc monitoring container image
+
+```
+docker build -t <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG> .
+docker push <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
+```
+
+## Deploy HPC/AI Monitoring in AKS
+```
+kubectl apply -f log_analytics_secret_key.yaml
+kubectl apply -f hpc-ai-monitor-config.yaml
+kubectl apply -f hpc-ai-monitor.yaml
+``` 
+> Note: By default, HPC/AI monitoring monitors IB and GPU metrics (GPU utilization, GPU memory, and GPU tensor core), collected every 10 seconds. You can change what is monitored by modifying hpc-ai-monitor-config.yaml.