From a9634ef5a818a741121c4fbee02c0f3db63e0625 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 21 Mar 2022 14:24:37 +0800 Subject: [PATCH] Config - Add inference config for NC A100 and NV A10 series (#329) Add inference config for preview SKUs, including: * [NC96ads_A100_v4](https://docs.microsoft.com/en-us/azure/virtual-machines/nc-a100-v4-series) * [NV18ads_A10_v5](https://docs.microsoft.com/en-us/azure/virtual-machines/nva10v5-series) --- .mypy.ini | 1 + .../azure/inference/nc96ads_a100_v4.yaml | 315 ++++++++++++++++++ .../azure/inference/nv18ads_a10_v5.yaml | 296 ++++++++++++++++ 3 files changed, 612 insertions(+) create mode 100644 superbench/config/azure/inference/nc96ads_a100_v4.yaml create mode 100644 superbench/config/azure/inference/nv18ads_a10_v5.yaml diff --git a/.mypy.ini b/.mypy.ini index e7bced23..bd47de1d 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,4 +1,5 @@ [mypy] +exclude = build/ ignore_missing_imports = True scripts_are_modules = True warn_return_any = True diff --git a/superbench/config/azure/inference/nc96ads_a100_v4.yaml b/superbench/config/azure/inference/nc96ads_a100_v4.yaml new file mode 100644 index 00000000..a3529292 --- /dev/null +++ b/superbench/config/azure/inference/nc96ads_a100_v4.yaml @@ -0,0 +1,315 @@ +version: v0.4 +superbench: + enable: null + monitor: + enable: true + sample_duration: 1 + sample_interval: 10 + var: + default_local_mode: &default_local_mode + enable: true + modes: + - name: local + proc_num: 4 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} + parallel: yes + default_pytorch_mode: &default_pytorch_mode + enable: true + modes: + - name: torch.distributed + proc_num: 4 + node_num: 1 + frameworks: + - pytorch + offline_inference_config: &offline_inference_config + duration: 0 + num_warmup: 64 + num_steps: 2048 + sample_count: 8192 + batch_size: 32 + precision: + - float32 + - float16 + model_action: + - inference + pin_memory: yes + online_inference_config: &online_inference_config + duration: 0 + num_warmup: 64 + num_steps: 2048 + sample_count: 8192 + batch_size: 1 + precision: + - float32 + - float16 + model_action: + - inference + pin_memory: yes + benchmarks: + kernel-launch: + <<: *default_local_mode + gemm-flops: + <<: *default_local_mode + nccl-bw: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + ngpus: 4 + cpu-memory-bw-latency: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + enable: true + modes: + - name: local + proc_num: 4 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N {proc_rank} + parallel: no + gpu-copy-bw: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: + - htod + - dtoh + copy_type: + - sm + - dma + cudnn-function: + <<: *default_local_mode + cublas-function: + <<: *default_local_mode + matmul: + <<: *default_local_mode + frameworks: + - pytorch + sharding-matmul: + <<: *default_pytorch_mode + computation-communication-overlap: + <<: *default_pytorch_mode + enable: false + ort-inference:fp16-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + batch_size: 32 + precision: float16 + ort-inference:fp16-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + batch_size: 1 + precision: float16 + tensorrt-inference:fp16-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 32 + precision: fp16 + tensorrt-inference:fp16-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 1 + precision: fp16 + tensorrt-inference:int8-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 32 + precision: int8 + tensorrt-inference:int8-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 1 + precision: int8 + # PyTorch offline inference + model-benchmarks:gpt-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - gpt2-small + - gpt2-large + parameters: + <<: *offline_inference_config + batch_size: 8 + seq_len: 224 + model-benchmarks:bert-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - bert-base + - bert-large + parameters: + <<: *offline_inference_config + seq_len: 224 + model-benchmarks:lstm-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - lstm + parameters: + <<: *offline_inference_config + batch_size: 224 + input_size: 224 + hidden_size: 1000 + seq_len: 32 + pin_memory: no + model-benchmarks:resnet-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *offline_inference_config + batch_size: 192 + num_steps: 512 + model-benchmarks:densenet-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - densenet169 + - densenet201 + parameters: + <<: *offline_inference_config + pin_memory: no + model-benchmarks:vgg-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *offline_inference_config + pin_memory: no + # PyTorch online inference + model-benchmarks:gpt-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - gpt2-small + - gpt2-large + parameters: + <<: *online_inference_config + seq_len: 224 + model-benchmarks:bert-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - bert-base + - bert-large + parameters: + <<: *online_inference_config + seq_len: 224 + model-benchmarks:lstm-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - lstm + parameters: + <<: *online_inference_config + input_size: 224 + hidden_size: 1000 + seq_len: 32 + pin_memory: no + model-benchmarks:resnet-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *online_inference_config + model-benchmarks:densenet-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - densenet169 + - densenet201 + parameters: + <<: *online_inference_config + pin_memory: no + model-benchmarks:vgg-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *online_inference_config + pin_memory: no diff --git a/superbench/config/azure/inference/nv18ads_a10_v5.yaml b/superbench/config/azure/inference/nv18ads_a10_v5.yaml new file mode 100644 index 00000000..1d3d4a2a --- /dev/null +++ b/superbench/config/azure/inference/nv18ads_a10_v5.yaml @@ -0,0 +1,296 @@ +version: v0.4 +superbench: + enable: null + monitor: + enable: false + sample_duration: 1 + sample_interval: 10 + var: + default_local_mode: &default_local_mode + enable: true + modes: + - name: local + proc_num: 1 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} + parallel: yes + offline_inference_config: &offline_inference_config + duration: 0 + num_warmup: 64 + num_steps: 2048 + sample_count: 8192 + batch_size: 32 + precision: + - float32 + - float16 + model_action: + - inference + pin_memory: yes + online_inference_config: &online_inference_config + duration: 0 + num_warmup: 64 + num_steps: 2048 + sample_count: 8192 + batch_size: 1 + precision: + - float32 + - float16 + model_action: + - inference + pin_memory: yes + benchmarks: + kernel-launch: + <<: *default_local_mode + gemm-flops: + <<: *default_local_mode + parameters: + precision: [fp32, fp16, tf32_tc, bf16_tc, fp16_tc, int8_tc, int4_tc] + cpu-memory-bw-latency: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + enable: true + modes: + - name: local + proc_num: 1 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N {proc_rank} + parallel: no + gpu-copy-bw: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: + - htod + - dtoh + copy_type: + - sm + - dma + cudnn-function: + <<: *default_local_mode + cublas-function: + <<: *default_local_mode + matmul: + <<: *default_local_mode + frameworks: + - pytorch + ort-inference:fp16-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + batch_size: 8 + precision: float16 + ort-inference:fp16-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + batch_size: 1 + precision: float16 + tensorrt-inference:fp16-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 8 + precision: fp16 + tensorrt-inference:fp16-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 1 + precision: fp16 + tensorrt-inference:int8-offline: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 32 + precision: int8 + tensorrt-inference:int8-online: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 1 + precision: int8 + # PyTorch offline inference + model-benchmarks:gpt-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - gpt2-small + - gpt2-large + parameters: + <<: *offline_inference_config + batch_size: 8 + seq_len: 224 + model-benchmarks:bert-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - bert-base + - bert-large + parameters: + <<: *offline_inference_config + seq_len: 224 + model-benchmarks:lstm-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - lstm + parameters: + <<: *offline_inference_config + batch_size: 224 + input_size: 224 + hidden_size: 1000 + seq_len: 32 + pin_memory: no + model-benchmarks:resnet-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *offline_inference_config + batch_size: 192 + num_steps: 512 + model-benchmarks:densenet-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - densenet169 + - densenet201 + parameters: + <<: *offline_inference_config + pin_memory: no + model-benchmarks:vgg-offline: + <<: *default_local_mode + frameworks: + - pytorch + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *offline_inference_config + pin_memory: no + # PyTorch online inference + model-benchmarks:gpt-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - gpt2-small + - gpt2-large + parameters: + <<: *online_inference_config + seq_len: 224 + model-benchmarks:bert-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - bert-base + - bert-large + parameters: + <<: *online_inference_config + seq_len: 224 + model-benchmarks:lstm-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - lstm + parameters: + <<: *online_inference_config + input_size: 224 + hidden_size: 1000 + seq_len: 32 + pin_memory: no + model-benchmarks:resnet-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *online_inference_config + model-benchmarks:densenet-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - densenet169 + - densenet201 + parameters: + <<: *online_inference_config + pin_memory: no + model-benchmarks:vgg-online: + <<: *default_local_mode + frameworks: + - pytorch + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *online_inference_config + pin_memory: no