Initial version of hpc monitoring integration into AKS

This commit is contained in:
Cormac Garvey 2024-07-09 13:46:10 -05:00
Родитель c3d8aceffe
Коммит 783c30f986
5 изменённых файлов: 194 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,15 @@
# Image for the HPC/AI monitoring container: CUDA base image + NVIDIA DCGM +
# the hpc_data_collector.py script from the Azure/azurehpc repository.
#
# The base image can be overridden with: --build-arg FROM_IMAGE_NAME=<image>
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
FROM ${FROM_IMAGE_NAME}

# Refresh the package index and install in the SAME layer, so a cached
# "update" layer can never be combined with a newer "install" step.
RUN apt-get update \
    && apt-get -y install systemctl python3 python3-pip wget git nfs-common \
    && rm -rf /var/lib/apt/lists/*

# Register the NVIDIA CUDA apt repository keyring, then install DCGM from it.
# The downloaded .deb is removed in the same layer so it does not stay in the image.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
    && dpkg -i cuda-keyring_1.1-1_all.deb \
    && rm -f cuda-keyring_1.1-1_all.deb \
    && apt-get update \
    && apt-get install -y datacenter-gpu-manager \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): "--now" also tries to start the service during the build; a
# process started here does not survive into the final image. Confirm whether
# plain "enable" is sufficient.
RUN systemctl --now enable nvidia-dcgm

# Python dependency installed alongside the collector script.
RUN pip install --no-cache-dir requests

# Fetch only the collector script. Clone, copy and clean-up happen in one layer
# (a later "rm" layer would not shrink the image), and the clone target is
# given explicitly so the step does not depend on the current working directory.
RUN git clone --depth 1 https://github.com/Azure/azurehpc.git /azurehpc \
    && cp /azurehpc/experimental/hpc_monitoring/cc_hpc_monitoring/specs/default/cluster-init/files/hpc_data_collector.py /bin \
    && rm -rf /azurehpc

Просмотреть файл

@ -0,0 +1,17 @@
# ConfigMap that ships the container entry-point script for the
# hpc-ai-monitor DaemonSet (mounted there under /hpc_monitor).
apiVersion: v1
kind: ConfigMap
metadata:
  name: hpc-ai-monitor-config
  namespace: kube-system
data:
  hpc_data_collector.sh: |
    #!/bin/bash
    # Exports the Log Analytics credentials, starts the DCGM host engine
    # (nv-hostengine) in the background and runs hpc_data_collector.py.
    #
    # Usage: hpc_data_collector.sh <workspace id> <shared key>
    #
    export LOG_ANALYTICS_CUSTOMER_ID="$1"
    export LOG_ANALYTICS_SHARED_KEY="$2"
    /usr/bin/nv-hostengine -n --service-account nvidia-dcgm &
    # Change the flags below to change what is monitored.
    /bin/hpc_data_collector.py -fhm -gpum -ibm

Просмотреть файл

@ -0,0 +1,128 @@
# DaemonSet that runs the HPC/AI monitoring container on every Linux node
# labelled accelerator=nvidia.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: hpc-ai-monitor
  namespace: kube-system
  labels:
    app: hpc-ai-monitor
spec:
  selector:
    matchLabels:
      app: hpc-ai-monitor
  template:
    metadata:
      labels:
        app: hpc-ai-monitor
    spec:
      # Schedule only on Linux nodes that carry the label accelerator=nvidia.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
                  - key: accelerator
                    operator: In
                    values:
                      - nvidia
      containers:
        - name: hpc-ai-monitor
          # The script takes the workspace id and shared key as $1 and $2.
          # Both expansions are double-quoted so an empty value cannot shift
          # the shared key into the workspace-id position.
          command:
            - "/bin/bash"
            - "-c"
            - '/hpc_monitor/hpc_data_collector.sh "$log_analytics_customer_id" "$log_analytics_shared_key"'
          env:
            - name: log_analytics_customer_id
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_customer_id
            - name: log_analytics_shared_key
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_shared_key
          # Replace <YOUR ACR> and <YOUR TAG> before applying.
          image: <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
          resources:
            limits:
              cpu: 240m
              memory: 2048Mi
            requests:
              cpu: 240m
              memory: 2048Mi
          imagePullPolicy: Always
          securityContext:
            privileged: true
          # Host paths are mounted read-only; the container only reads them.
          volumeMounts:
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
            - name: hyperv
              mountPath: /var/lib/hyperv
              readOnly: true
            - name: devices
              mountPath: /sys/devices
              readOnly: true
            - name: infiniband
              mountPath: /sys/class/infiniband
              readOnly: true
            - name: net
              mountPath: /sys/class/net
              readOnly: true
            - name: diskstats
              mountPath: /proc/diskstats
              readOnly: true
            - name: stat
              mountPath: /proc/stat
              readOnly: true
            - name: meminfo
              mountPath: /proc/meminfo
              readOnly: true
            - name: loadavg
              mountPath: /proc/loadavg
              readOnly: true
            - name: config
              mountPath: /hpc_monitor
              readOnly: true
      volumes:
        - name: localtime
          hostPath:
            path: /etc/localtime
        - name: hyperv
          hostPath:
            path: /var/lib/hyperv
        - name: devices
          hostPath:
            path: /sys/devices
        - name: infiniband
          hostPath:
            path: /sys/class/infiniband
        - name: net
          hostPath:
            path: /sys/class/net
        - name: diskstats
          hostPath:
            path: /proc/diskstats
        - name: stat
          hostPath:
            path: /proc/stat
        - name: meminfo
          hostPath:
            path: /proc/meminfo
        - name: loadavg
          hostPath:
            path: /proc/loadavg
        # Entry-point script from the hpc-ai-monitor-config ConfigMap.
        - name: config
          configMap:
            name: hpc-ai-monitor-config
            # Decimal 493 == octal 0755 (rwxr-xr-x). Written in decimal so the
            # value does not depend on how a YAML parser treats a leading zero.
            defaultMode: 493
            items:
              - key: hpc_data_collector.sh
                path: hpc_data_collector.sh
      # Tolerate every NoSchedule / NoExecute taint.
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - effect: NoExecute
          operator: Exists

Просмотреть файл

@ -0,0 +1,9 @@
# Log Analytics credentials, referenced by the hpc-ai-monitor DaemonSet
# through secretKeyRef (secret name: log-analytics-key).
# Replace both placeholders before applying; do not commit real values.
apiVersion: v1
kind: Secret
metadata:
  name: log-analytics-key
  namespace: kube-system
type: Opaque
stringData:
  # Log Analytics workspace id.
  log_analytics_customer_id: "<YOUR LOG ANALYTICS WORKSPACE ID>"
  # Log Analytics shared key.
  log_analytics_shared_key: "<YOUR LOG ANALYTICS KEY>"

Просмотреть файл

@ -0,0 +1,25 @@
# Integrate HPC/AI cluster monitoring with AKS
Shows how to run custom HPC/AI cluster monitoring (IB, GPU, CPU, disks, etc.) with AKS.
## Prerequisites
- AKS cluster (NDmv4) is deployed, see [blog post](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/deploy-ndm-v4-a100-kubernetes-cluster/ba-p/3838871)
- You have a log analytics workspace.
- DCGM Exporter (in the GPU operator) is disabled (dcgmExporter.enabled=false).
- You will need to replace all references to \<YOUR ACR\>, \<YOUR TAG\>, \<YOUR LOG ANALYTICS WORKSPACE ID\>, and \<YOUR LOG ANALYTICS KEY\> in the provided files and commands.
## Build hpc monitoring container image
```
docker build -t <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG> .
docker push <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
```
## Deploy HPC/AI Monitoring in AKS
```
kubectl apply -f log_analytics_secret_key.yaml
kubectl apply -f hpc-ai-monitor-config.yaml
kubectl apply -f hpc-ai-monitor.yaml
```
>Note: By default, HPC/AI monitoring collects IB and GPU metrics (GPU utilization, GPU memory and GPU tensor core) every 10 seconds. You can change what is monitored by modifying hpc-ai-monitor-config.yaml.