# SuperBench Config
#
# NOTE(review): this file was reconstructed from a flattened copy in which
# all indentation had been lost. The nesting below follows the key names
# (per-mode settings such as `env`, `mca` and `pattern` sit inside the
# `modes` list item; `parameters`, `frameworks` and `models` sit at the
# benchmark level). Confirm against the consumer's schema before relying
# on it.
version: v0.6
superbench:
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10
  # Shared fragments. Each one is anchored so that the benchmark entries
  # below can pull it in with a `<<:` merge key instead of repeating it.
  var:
    # Eight local processes, each prefixed with its own
    # CUDA_VISIBLE_DEVICES={proc_rank}.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # torch.distributed launch: 8 processes on 1 node, pytorch framework.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Parameter set merged into every *_models benchmark at the end of
    # this file.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 16
      num_steps: 128
      batch_size: 1
      precision:
        - float32
        - float16
      model_action:
        - train
  benchmarks:
    gpu-burn:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 300
        doubles: true
        tensor_core: true
    # nccl-bw variants. `default` and `gdr-only` use a single `local`
    # process; the remaining variants use `mpi` mode across all nodes and
    # differ only in `pattern`.
    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          # Values are quoted so they stay strings rather than integers.
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8
    nccl-bw:all-nodes:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: all
          pattern:
            type: all-nodes
      parameters:
        ngpus: 8
    nccl-bw:pair-wise:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: all
          pattern:
            type: pair-wise
      parameters:
        ngpus: 8
    nccl-bw:k-batch:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: all
          pattern:
            type: k-batch
      parameters:
        ngpus: 8
    nccl-bw:topo-aware:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: all
          pattern:
            type: topo-aware
            ibstat: ibstat_file.txt
            ibnetdiscover: ibnetdiscover_file.txt
            min_dist: 2
            max_dist: 6
      parameters:
        ngpus: 8
    # Two local mode entries of four processes each: the first lists the
    # even-numbered IB devices, the second the odd-numbered ones.
    ib-loopback:
      enable: true
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: true
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false
    # Single small copy (size 4096, no warm-up, one loop) with
    # check_data enabled.
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma
    # The entries below take all of their settings from the shared
    # fragments under `var`.
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        # The `$...` values are kept as literal strings here.
        # NOTE(review): presumably expanded by a shell at launch time;
        # confirm.
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
          env:
            UCX_NET_DEVICES: mlx5_0:1
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
          env:
            UCX_NET_DEVICES: mlx5_0:1
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    ort-inference:
      <<: *default_local_mode
      parameters:
        batch_size: 1
    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 1
        precision: int8
    # Model benchmarks: each merges the pytorch launch mode and the
    # common model parameters, and differs only in its `models` list.
    gpt_models:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
    bert_models:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
    lstm_models:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
    resnet_models:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
    densenet_models:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
    vgg_models:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config