зеркало из https://github.com/Azure/azurehpc.git
Initial version of hpc monitoring integration into AKS
This commit is contained in:
Родитель
c3d8aceffe
Коммит
783c30f986
|
@ -0,0 +1,15 @@
|
|||
# Image for the AKS HPC/AI monitoring DaemonSet: CUDA base image plus NVIDIA
# DCGM (datacenter-gpu-manager) and the azurehpc hpc_data_collector.py script.
# Override the base image at build time with --build-arg FROM_IMAGE_NAME=...
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04

FROM ${FROM_IMAGE_NAME}

# Refresh the package index and install in the SAME layer: a separately cached
# "apt update" layer can go stale and make a later install fail or pull old
# packages. The index is removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get -y install systemctl python3 python3-pip wget git nfs-common \
    && rm -rf /var/lib/apt/lists/*

# Register the NVIDIA CUDA repository keyring, then install DCGM from it.
# The downloaded .deb and the package index are deleted in the same layer so
# they do not persist in the image.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
    && dpkg -i cuda-keyring_1.1-1_all.deb \
    && rm -f cuda-keyring_1.1-1_all.deb \
    && apt-get update \
    && apt-get install -y datacenter-gpu-manager \
    && rm -rf /var/lib/apt/lists/*

# "systemctl" here is the command from the systemctl apt package installed above.
# NOTE(review): the container entry script launches nv-hostengine itself, so
# confirm whether enabling the unit at build time is still required.
RUN systemctl --now enable nvidia-dcgm

# HTTP client library for the collector; skip the pip download cache.
RUN pip install --no-cache-dir requests

# Fetch only the collector script. Clone, copy and delete happen in one layer;
# deleting the checkout in a later layer would leave it in the image history.
RUN git clone --depth 1 https://github.com/Azure/azurehpc.git /azurehpc \
    && cp /azurehpc/experimental/hpc_monitoring/cc_hpc_monitoring/specs/default/cluster-init/files/hpc_data_collector.py /bin \
    && rm -rf /azurehpc
|
|
@ -0,0 +1,17 @@
|
|||
---
# ConfigMap holding the container entry script for the hpc-ai-monitor
# DaemonSet. The DaemonSet mounts this key as an executable file and invokes
# it with the Log Analytics workspace ID and shared key as $1 and $2.
apiVersion: v1
kind: ConfigMap
metadata:
  name: hpc-ai-monitor-config
  namespace: kube-system
data:
  hpc_data_collector.sh: |
    #!/bin/bash

    # Starts the DCGM host engine (nv-hostengine) in the background, exports
    # the Log Analytics credentials and executes hpc_data_collector.py.
    #
    # Usage: hpc_data_collector.sh <workspace id> <shared key>
    #
    # Quote the positional parameters so the values are exported verbatim,
    # without word splitting or glob expansion.
    export LOG_ANALYTICS_CUSTOMER_ID="$1"
    export LOG_ANALYTICS_SHARED_KEY="$2"

    /usr/bin/nv-hostengine -n --service-account nvidia-dcgm &

    # The flags select which monitors the collector runs; edit them to change
    # what is collected.
    /bin/hpc_data_collector.py -fhm -gpum -ibm
|
|
@ -0,0 +1,128 @@
|
|||
---
# DaemonSet that runs the HPC/AI monitoring collector on every Linux node
# labelled accelerator=nvidia. The entry script comes from the
# hpc-ai-monitor-config ConfigMap and the Log Analytics credentials from the
# log-analytics-key Secret.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: hpc-ai-monitor
  namespace: kube-system
  labels:
    app: hpc-ai-monitor
spec:
  selector:
    matchLabels:
      app: hpc-ai-monitor
  template:
    metadata:
      labels:
        app: hpc-ai-monitor
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions must match: Linux node AND NVIDIA accelerator.
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
                  - key: accelerator
                    operator: In
                    values:
                      - nvidia
      containers:
        - name: hpc-ai-monitor
          command:
            - "/bin/bash"
            - "-c"
            # The variables are double-quoted inside the shell command so each
            # secret reaches the script as exactly one argument, with no word
            # splitting or glob expansion.
            - '/hpc_monitor/hpc_data_collector.sh "$log_analytics_customer_id" "$log_analytics_shared_key"'
          env:
            - name: log_analytics_customer_id
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_customer_id
            - name: log_analytics_shared_key
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_shared_key
          # Placeholder: replace <YOUR ACR> and <YOUR TAG> before applying.
          image: <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
          resources:
            limits:
              cpu: 240m
              memory: 2048Mi
            requests:
              cpu: 240m
              memory: 2048Mi
          imagePullPolicy: Always
          securityContext:
            # NOTE(review): presumably required for GPU/InfiniBand device
            # access from inside the container - confirm before tightening.
            privileged: true
          # Read-only views of host files and directories mounted for the
          # collector.
          volumeMounts:
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
            - name: hyperv
              mountPath: /var/lib/hyperv
              readOnly: true
            - name: devices
              mountPath: /sys/devices
              readOnly: true
            - name: infiniband
              mountPath: /sys/class/infiniband
              readOnly: true
            - name: net
              mountPath: /sys/class/net
              readOnly: true
            - name: diskstats
              mountPath: /proc/diskstats
              readOnly: true
            - name: stat
              mountPath: /proc/stat
              readOnly: true
            - name: meminfo
              mountPath: /proc/meminfo
              readOnly: true
            - name: loadavg
              mountPath: /proc/loadavg
              readOnly: true
            - name: config
              mountPath: /hpc_monitor
              readOnly: true
      volumes:
        - name: localtime
          hostPath:
            path: /etc/localtime
        - name: hyperv
          hostPath:
            path: /var/lib/hyperv
        - name: devices
          hostPath:
            path: /sys/devices
        - name: infiniband
          hostPath:
            path: /sys/class/infiniband
        - name: net
          hostPath:
            path: /sys/class/net
        - name: diskstats
          hostPath:
            path: /proc/diskstats
        - name: stat
          hostPath:
            path: /proc/stat
        - name: meminfo
          hostPath:
            path: /proc/meminfo
        - name: loadavg
          hostPath:
            path: /proc/loadavg
        - name: config
          configMap:
            name: hpc-ai-monitor-config
            # Octal file mode (rwxr-xr-x) so the mounted script is executable.
            defaultMode: 0755
            items:
              - key: hpc_data_collector.sh
                path: hpc_data_collector.sh
      # Tolerate every NoSchedule / NoExecute taint.
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - effect: NoExecute
          operator: Exists
|
|
@ -0,0 +1,9 @@
|
|||
---
# Log Analytics credentials read by the hpc-ai-monitor DaemonSet through
# secretKeyRef. Replace both placeholders before applying, and do not commit
# the filled-in file to version control.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: kube-system
  name: log-analytics-key
stringData:
  # Quoted so the substituted values are always read as strings.
  log_analytics_customer_id: "<YOUR LOG ANALYTICS WORKSPACE ID>"
  log_analytics_shared_key: "<YOUR LOG ANALYTICS KEY>"
|
|
@ -0,0 +1,25 @@
|
|||
# Integrate HPC/AI cluster monitoring with AKS
|
||||
|
||||
Shows how to run custom HPC/AI cluster monitoring (InfiniBand, GPU, CPU, disks, etc.) on AKS.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- AKS cluster (NDmv4) is deployed, see [blog post](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/deploy-ndm-v4-a100-kubernetes-cluster/ba-p/3838871)
|
||||
- You have a log analytics workspace.
|
||||
- The DCGM Exporter (in the GPU Operator) is disabled (`dcgmExporter.enabled=false`).
|
||||
- You will need to replace all references to \<YOUR ACR\> , \<YOUR TAG\>, \<YOUR LOG ANALYTICS WORKSPACE ID\>, and \<YOUR LOG ANALYTICS KEY\> in the provided scripts.
|
||||
|
||||
## Build hpc monitoring container image
|
||||
|
||||
```
|
||||
docker build -t \<YOUR ACR\>.azurecr.io/aks-ai-monitoring:\<YOUR TAG\> .
|
||||
docker push \<YOUR ACR\>.azurecr.io/aks-ai-monitoring:\<YOUR TAG\>
|
||||
```
|
||||
|
||||
## Deploy HPC/AI Monitoring in AKS
|
||||
```
|
||||
kubectl apply -f log_analytics_secret_key.yaml
|
||||
kubectl apply -f hpc-ai-monitor-config.yaml
|
||||
kubectl apply -f hpc-ai-monitor.yaml
|
||||
```
|
||||
> Note: By default, HPC/AI monitoring collects InfiniBand and GPU metrics (GPU utilization, GPU memory and GPU tensor core), sampled every 10 seconds. You can change what is monitored by modifying hpc-ai-monitor-config.yaml.
|
Загрузка…
Ссылка в новой задаче