Benchmarks: Add Feature - Add GDR-only nccl-tests for Nvidia machines (#299)

This commit adds GDR-only nccl-tests for Nvidia machines. It also bumps NCCL to v2.10.3-1 to achieve peak performance in this test.
This commit is contained in:
Ziyue Yang 2022-02-08 17:59:48 +08:00 коммит произвёл GitHub
Родитель 682b2c120d
Коммит 433785fd0c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 57 добавлений и 8 удалений

2
.github/workflows/build-image.yml поставляемый
Просмотреть файл

@ -103,6 +103,8 @@ jobs:
tags: ${{ steps.metadata.outputs.tags }}
cache-from: ${{ steps.metadata.outputs.cache_from }}
cache-to: ${{ steps.metadata.outputs.cache_to }}
build-args: |
NUM_MAKE_JOBS=8
labels: |
org.opencontainers.image.source=${{ github.event.repository.html_url }}
org.opencontainers.image.created=${{ github.event.repository.pushed_at }}

Просмотреть файл

@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
# NVIDIA:
# - CUDA: 11.1.1
# - cuDNN: 8.0.5
# - NCCL: bootstrap_tag
# - NCCL: v2.10.3-1
# Mellanox:
# - OFED: 5.2-2.2.3.0
# - HPC-X: v2.8.3
@ -46,6 +46,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64
ARG NUM_MAKE_JOBS=
# Install Docker
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
@ -85,16 +87,16 @@ RUN cd /tmp && \
git reset --hard 7cccbc1 && \
./autogen.sh && \
./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \
make -j && \
make -j ${NUM_MAKE_JOBS} && \
make install && \
cd /tmp && \
rm -rf nccl-rdma-sharp-plugins
# Install NCCL patch
RUN cd /tmp && \
git clone -b bootstrap_tag https://github.com/NVIDIA/nccl.git && \
git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \
cd nccl && \
make -j src.build && \
make -j ${NUM_MAKE_JOBS} src.build && \
make install && \
cd /tmp && \
rm -rf nccl
@ -117,7 +119,7 @@ ENV PATH="${PATH}" \
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make -j -C third_party cuda
RUN make -j ${NUM_MAKE_JOBS} -C third_party cuda
ADD . .
RUN python3 -m pip install .[nvidia,torch,ort] && \

Просмотреть файл

@ -43,7 +43,7 @@ superbench:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
@ -51,6 +51,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:

Просмотреть файл

@ -39,7 +39,7 @@ superbench:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
@ -47,6 +47,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:

Просмотреть файл

@ -33,7 +33,7 @@ superbench:
model_action:
- train
benchmarks:
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
@ -41,6 +41,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes: