зеркало из https://github.com/Azure/azurehpc.git
Initial version of hpc monitoring integration into AKS
This commit is contained in:
Родитель
c3d8aceffe
Коммит
783c30f986
|
@ -0,0 +1,15 @@
|
||||||
|
# Container image for HPC/AI monitoring on AKS: a CUDA base image with NVIDIA
# DCGM (datacenter-gpu-manager) and the hpc_data_collector.py script from the
# azurehpc repository installed into /bin.
#
# The base image can be overridden at build time:
#   docker build --build-arg FROM_IMAGE_NAME=<image> ...
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04

FROM ${FROM_IMAGE_NAME}

# Refresh the package index in the same layer as the install, so a cached
# "update" layer can never be combined with a later, stale install step.
RUN apt-get update \
    && apt-get -y install systemctl python3 python3-pip wget git nfs-common

# Register NVIDIA's CUDA apt repository via its keyring package, then install
# DCGM from it. The downloaded .deb and the apt lists are removed in the same
# layer so they do not end up in the final image.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
    && dpkg -i cuda-keyring_1.1-1_all.deb \
    && rm -f cuda-keyring_1.1-1_all.deb \
    && apt-get update \
    && apt-get install -y datacenter-gpu-manager \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): a process started during a build step does not survive into
# the running container, so "--now" presumably has no lasting effect here —
# confirm whether this step is still required.
RUN systemctl --now enable nvidia-dcgm

# Python dependency installed for the collector script; skip the pip cache to
# keep the layer small.
RUN pip install --no-cache-dir requests

# Only the collector script is needed from the repository: shallow-clone to an
# explicit path, copy the script and delete the clone within a single layer.
RUN git clone --depth 1 https://github.com/Azure/azurehpc.git /azurehpc \
    && cp /azurehpc/experimental/hpc_monitoring/cc_hpc_monitoring/specs/default/cluster-init/files/hpc_data_collector.py /bin \
    && rm -rf /azurehpc
|
|
@ -0,0 +1,17 @@
|
||||||
|
---
# ConfigMap that ships the container entrypoint script for the HPC/AI
# monitoring pods. Edit the flags on the last line of the script to change
# what is collected.
apiVersion: v1
kind: ConfigMap
metadata:
  name: hpc-ai-monitor-config
  namespace: kube-system
data:
  hpc_data_collector.sh: |
    #!/bin/bash

    # Starts nv-hostengine (NVIDIA DCGM host engine) in the background, sets
    # the Log Analytics environment variables and runs hpc_data_collector.py.
    #
    # Usage: hpc_data_collector.sh <workspace id> <shared key>
    #
    export LOG_ANALYTICS_CUSTOMER_ID="$1"
    export LOG_ANALYTICS_SHARED_KEY="$2"

    /usr/bin/nv-hostengine -n --service-account nvidia-dcgm &

    /bin/hpc_data_collector.py -fhm -gpum -ibm
|
|
@ -0,0 +1,128 @@
|
||||||
|
---
# DaemonSet that runs one HPC/AI monitoring pod on every Linux node labelled
# accelerator=nvidia. The pod runs the hpc_data_collector.sh script mounted
# from the hpc-ai-monitor-config ConfigMap and receives the Log Analytics
# credentials from the log-analytics-key Secret.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: hpc-ai-monitor
  namespace: kube-system
  labels:
    app: hpc-ai-monitor
spec:
  selector:
    matchLabels:
      app: hpc-ai-monitor
  template:
    metadata:
      labels:
        app: hpc-ai-monitor
    spec:
      # Both expressions must match: Linux nodes carrying accelerator=nvidia.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
                  - key: accelerator
                    operator: In
                    values:
                      - nvidia
      # Tolerate every NoSchedule and NoExecute taint.
      tolerations:
        - operator: Exists
          effect: NoSchedule
        - operator: Exists
          effect: NoExecute
      containers:
        - name: hpc-ai-monitor
          # Replace <YOUR ACR> and <YOUR TAG> before applying.
          image: '<YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>'
          imagePullPolicy: Always
          # The two variables are expanded by bash from the env entries below
          # and passed to the script as positional arguments.
          command:
            - '/bin/bash'
            - '-c'
            - /hpc_monitor/hpc_data_collector.sh $log_analytics_customer_id $log_analytics_shared_key
          env:
            - name: log_analytics_customer_id
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_customer_id
            - name: log_analytics_shared_key
              valueFrom:
                secretKeyRef:
                  name: log-analytics-key
                  key: log_analytics_shared_key
          # Requests equal limits for both CPU and memory.
          resources:
            requests:
              cpu: 240m
              memory: 2048Mi
            limits:
              cpu: 240m
              memory: 2048Mi
          securityContext:
            privileged: true
          # Host paths are mounted read-only at the same location inside the
          # container as on the node.
          volumeMounts:
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
            - name: hyperv
              mountPath: /var/lib/hyperv
              readOnly: true
            - name: devices
              mountPath: /sys/devices
              readOnly: true
            - name: infiniband
              mountPath: /sys/class/infiniband
              readOnly: true
            - name: net
              mountPath: /sys/class/net
              readOnly: true
            - name: diskstats
              mountPath: /proc/diskstats
              readOnly: true
            - name: stat
              mountPath: /proc/stat
              readOnly: true
            - name: meminfo
              mountPath: /proc/meminfo
              readOnly: true
            - name: loadavg
              mountPath: /proc/loadavg
              readOnly: true
            # Entrypoint script from the ConfigMap.
            - name: config
              mountPath: /hpc_monitor
              readOnly: true
      volumes:
        - name: localtime
          hostPath:
            path: /etc/localtime
        - name: hyperv
          hostPath:
            path: /var/lib/hyperv
        - name: devices
          hostPath:
            path: /sys/devices
        - name: infiniband
          hostPath:
            path: /sys/class/infiniband
        - name: net
          hostPath:
            path: /sys/class/net
        - name: diskstats
          hostPath:
            path: /proc/diskstats
        - name: stat
          hostPath:
            path: /proc/stat
        - name: meminfo
          hostPath:
            path: /proc/meminfo
        - name: loadavg
          hostPath:
            path: /proc/loadavg
        - name: config
          configMap:
            name: hpc-ai-monitor-config
            # Octal file mode (rwxr-xr-x) so the mounted script is executable.
            # Keep this an unquoted integer: the API field is numeric.
            defaultMode: 0755
            items:
              - key: hpc_data_collector.sh
                path: hpc_data_collector.sh
|
|
@ -0,0 +1,9 @@
|
||||||
|
---
# Log Analytics credentials read by the hpc-ai-monitor DaemonSet through
# secretKeyRef. Replace both placeholders before applying, and do not commit
# the filled-in file to version control.
apiVersion: v1
kind: Secret
metadata:
  name: log-analytics-key
  namespace: kube-system
type: Opaque
# stringData takes plain-text values; no manual base64 encoding is needed.
stringData:
  log_analytics_customer_id: '<YOUR LOG ANALYTICS WORKSPACE ID>'
  log_analytics_shared_key: '<YOUR LOG ANALYTICS KEY>'
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Integrate HPC/AI cluster monitoring with AKS
|
||||||
|
|
||||||
|
This example shows how to run custom HPC/AI cluster monitoring (IB, GPU, CPU, disks, etc.) with AKS.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- AKS cluster (NDmv4) is deployed, see [blog post](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/deploy-ndm-v4-a100-kubernetes-cluster/ba-p/3838871)
|
||||||
|
- You have a log analytics workspace.
|
||||||
|
- DCGM Exporter (in the GPU operator) is disabled (`dcgmExporter.enabled=false`).
|
||||||
|
- You will need to replace all references to \<YOUR ACR\>, \<YOUR TAG\>, \<YOUR LOG ANALYTICS WORKSPACE ID\>, and \<YOUR LOG ANALYTICS KEY\> in the provided files and commands.
|
||||||
|
|
||||||
|
## Build hpc monitoring container image
|
||||||
|
|
||||||
|
```
|
||||||
|
docker build -t <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG> .
|
||||||
|
docker push <YOUR ACR>.azurecr.io/aks-ai-monitoring:<YOUR TAG>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deploy HPC/AI Monitoring in AKS
|
||||||
|
```
|
||||||
|
kubectl apply -f log_analytics_secret_key.yaml
|
||||||
|
kubectl apply -f hpc-ai-monitor-config.yaml
|
||||||
|
kubectl apply -f hpc-ai-monitor.yaml
|
||||||
|
```
|
||||||
|
>Note: By default, HPC/AI monitoring collects IB and GPU metrics (GPU utilization, GPU memory and GPU tensor core), and metrics are collected every 10 seconds. You can change what is monitored by modifying hpc-ai-monitor-config.yaml.
|
Загрузка…
Ссылка в новой задаче