Benchmarks: Add Feature - Add GDR-only nccl-tests for Nvidia machines (#299)
This commit adds GDR-only nccl-tests for Nvidia machines. It also bumps NCCL to v2.10.3-1 to achieve peak performance in this test.
This commit is contained in:
Родитель
682b2c120d
Коммит
433785fd0c
|
@ -103,6 +103,8 @@ jobs:
|
|||
tags: ${{ steps.metadata.outputs.tags }}
|
||||
cache-from: ${{ steps.metadata.outputs.cache_from }}
|
||||
cache-to: ${{ steps.metadata.outputs.cache_to }}
|
||||
build-args: |
|
||||
NUM_MAKE_JOBS=8
|
||||
labels: |
|
||||
org.opencontainers.image.source=${{ github.event.repository.html_url }}
|
||||
org.opencontainers.image.created=${{ github.event.repository.pushed_at }}
|
||||
|
|
|
@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
|
|||
# NVIDIA:
|
||||
# - CUDA: 11.1.1
|
||||
# - cuDNN: 8.0.5
|
||||
# - NCCL: bootstrap_tag
|
||||
# - NCCL: v2.10.3-1
|
||||
# Mellanox:
|
||||
# - OFED: 5.2-2.2.3.0
|
||||
# - HPC-X: v2.8.3
|
||||
|
@ -46,6 +46,8 @@ RUN apt-get update && \
|
|||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64
|
||||
|
||||
ARG NUM_MAKE_JOBS=
|
||||
|
||||
# Install Docker
|
||||
ENV DOCKER_VERSION=20.10.8
|
||||
RUN cd /tmp && \
|
||||
|
@ -85,16 +87,16 @@ RUN cd /tmp && \
|
|||
git reset --hard 7cccbc1 && \
|
||||
./autogen.sh && \
|
||||
./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \
|
||||
make -j && \
|
||||
make -j ${NUM_MAKE_JOBS} && \
|
||||
make install && \
|
||||
cd /tmp && \
|
||||
rm -rf nccl-rdma-sharp-plugins
|
||||
|
||||
# Install NCCL patch
|
||||
RUN cd /tmp && \
|
||||
git clone -b bootstrap_tag https://github.com/NVIDIA/nccl.git && \
|
||||
git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \
|
||||
cd nccl && \
|
||||
make -j src.build && \
|
||||
make -j ${NUM_MAKE_JOBS} src.build && \
|
||||
make install && \
|
||||
cd /tmp && \
|
||||
rm -rf nccl
|
||||
|
@ -117,7 +119,7 @@ ENV PATH="${PATH}" \
|
|||
WORKDIR ${SB_HOME}
|
||||
|
||||
ADD third_party third_party
|
||||
RUN make -j -C third_party cuda
|
||||
RUN make -j ${NUM_MAKE_JOBS} -C third_party cuda
|
||||
|
||||
ADD . .
|
||||
RUN python3 -m pip install .[nvidia,torch,ort] && \
|
||||
|
|
|
@ -43,7 +43,7 @@ superbench:
|
|||
<<: *default_local_mode
|
||||
gemm-flops:
|
||||
<<: *default_local_mode
|
||||
nccl-bw:
|
||||
nccl-bw:default:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
|
@ -51,6 +51,21 @@ superbench:
|
|||
parallel: no
|
||||
parameters:
|
||||
ngpus: 8
|
||||
nccl-bw:gdr-only:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
proc_num: 1
|
||||
parallel: no
|
||||
env:
|
||||
NCCL_IB_PCI_RELAXED_ORDERING: '1'
|
||||
NCCL_NET_GDR_LEVEL: '5'
|
||||
NCCL_P2P_DISABLE: '1'
|
||||
NCCL_SHM_DISABLE: '1'
|
||||
NCCL_MIN_NCHANNELS: '16'
|
||||
NCCL_IB_DISABLE: '0'
|
||||
parameters:
|
||||
ngpus: 8
|
||||
ib-loopback:
|
||||
enable: true
|
||||
modes:
|
||||
|
|
|
@ -39,7 +39,7 @@ superbench:
|
|||
<<: *default_local_mode
|
||||
gemm-flops:
|
||||
<<: *default_local_mode
|
||||
nccl-bw:
|
||||
nccl-bw:default:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
|
@ -47,6 +47,21 @@ superbench:
|
|||
parallel: no
|
||||
parameters:
|
||||
ngpus: 8
|
||||
nccl-bw:gdr-only:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
proc_num: 1
|
||||
parallel: no
|
||||
env:
|
||||
NCCL_IB_PCI_RELAXED_ORDERING: '1'
|
||||
NCCL_NET_GDR_LEVEL: '5'
|
||||
NCCL_P2P_DISABLE: '1'
|
||||
NCCL_SHM_DISABLE: '1'
|
||||
NCCL_MIN_NCHANNELS: '16'
|
||||
NCCL_IB_DISABLE: '0'
|
||||
parameters:
|
||||
ngpus: 8
|
||||
ib-loopback:
|
||||
enable: true
|
||||
modes:
|
||||
|
|
|
@ -33,7 +33,7 @@ superbench:
|
|||
model_action:
|
||||
- train
|
||||
benchmarks:
|
||||
nccl-bw:
|
||||
nccl-bw:default:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
|
@ -41,6 +41,21 @@ superbench:
|
|||
parallel: no
|
||||
parameters:
|
||||
ngpus: 8
|
||||
nccl-bw:gdr-only:
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
proc_num: 1
|
||||
parallel: no
|
||||
env:
|
||||
NCCL_IB_PCI_RELAXED_ORDERING: '1'
|
||||
NCCL_NET_GDR_LEVEL: '5'
|
||||
NCCL_P2P_DISABLE: '1'
|
||||
NCCL_SHM_DISABLE: '1'
|
||||
NCCL_MIN_NCHANNELS: '16'
|
||||
NCCL_IB_DISABLE: '0'
|
||||
parameters:
|
||||
ngpus: 8
|
||||
ib-loopback:
|
||||
enable: true
|
||||
modes:
|
||||
|
|
Загрузка…
Ссылка в новой задаче