Benchmarks: Add Feature - Add GDR-only nccl-tests for Nvidia machines (#299)

This commit adds GDR-only nccl-tests for NVIDIA machines. It also bumps NCCL to v2.10.3-1 to achieve peak performance in this test.
2022-02-08 17:59:48 +08:00 · 2022-02-08 17:59:48 +08:00 · 433785fd0c
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@ -103,6 +103,8 @@ jobs:
          tags: ${{ steps.metadata.outputs.tags }}
          cache-from: ${{ steps.metadata.outputs.cache_from }}
          cache-to: ${{ steps.metadata.outputs.cache_to }}
+          build-args: |  # one NAME=value per line; NUM_MAKE_JOBS feeds `make -j` via ARG in dockerfile/cuda11.1.1.dockerfile
+            NUM_MAKE_JOBS=8
          labels: |
            org.opencontainers.image.source=${{ github.event.repository.html_url }}
            org.opencontainers.image.created=${{ github.event.repository.pushed_at }}
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
 # NVIDIA:
 #   - CUDA: 11.1.1
 #   - cuDNN: 8.0.5
-#   - NCCL: bootstrap_tag
+#   - NCCL: v2.10.3-1
 # Mellanox:
 #   - OFED: 5.2-2.2.3.0
 #   - HPC-X: v2.8.3
@ -46,6 +46,8 @@ RUN apt-get update && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64

+ARG NUM_MAKE_JOBS=
+
 # Install Docker
 ENV DOCKER_VERSION=20.10.8
 RUN cd /tmp && \
@ -85,16 +87,16 @@ RUN cd /tmp && \
    git reset --hard 7cccbc1 && \
    ./autogen.sh && \
    ./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \
-    make -j && \
+    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    cd /tmp && \
    rm -rf nccl-rdma-sharp-plugins

 # Install NCCL patch
 RUN cd /tmp && \
-    git clone -b bootstrap_tag https://github.com/NVIDIA/nccl.git && \
+    git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \
    cd nccl && \
-    make -j src.build && \
+    make -j ${NUM_MAKE_JOBS} src.build && \
    make install && \
    cd /tmp && \
    rm -rf nccl
@ -117,7 +119,7 @@ ENV PATH="${PATH}" \
 WORKDIR ${SB_HOME}

 ADD third_party third_party
-RUN make -j -C third_party cuda
+RUN make -j ${NUM_MAKE_JOBS} -C third_party cuda

 ADD . .
 RUN python3 -m pip install .[nvidia,torch,ort] && \
--- a/superbench/config/azure_ndmv4.yaml
+++ b/superbench/config/azure_ndmv4.yaml
@ -43,7 +43,7 @@ superbench:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
-    nccl-bw:
+    nccl-bw:default:
      enable: true
      modes:
        - name: local
@ -51,6 +51,21 @@ superbench:
          parallel: no
      parameters:
        ngpus: 8
+    nccl-bw:gdr-only:  # GDR-only variant of nccl-bw; differs by the env overrides below
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:  # NCCL settings for the GDR-only run; quoted so the values stay strings
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'  # presumably the GPUDirect RDMA level; confirm meaning of 5 in NCCL docs
+            NCCL_P2P_DISABLE: '1'  # presumably turns off the P2P transport; confirm in NCCL docs
+            NCCL_SHM_DISABLE: '1'  # presumably turns off the shared-memory transport; confirm in NCCL docs
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'  # '0' means not disabled, so IB stays available
+      parameters:
+        ngpus: 8
    ib-loopback:
      enable: true
      modes:
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@ -39,7 +39,7 @@ superbench:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
-    nccl-bw:
+    nccl-bw:default:
      enable: true
      modes:
        - name: local
@ -47,6 +47,21 @@ superbench:
          parallel: no
      parameters:
        ngpus: 8
+    nccl-bw:gdr-only:  # GDR-only variant of nccl-bw; differs by the env overrides below
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:  # NCCL settings for the GDR-only run; quoted so the values stay strings
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'  # presumably the GPUDirect RDMA level; confirm meaning of 5 in NCCL docs
+            NCCL_P2P_DISABLE: '1'  # presumably turns off the P2P transport; confirm in NCCL docs
+            NCCL_SHM_DISABLE: '1'  # presumably turns off the shared-memory transport; confirm in NCCL docs
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'  # '0' means not disabled, so IB stays available
+      parameters:
+        ngpus: 8
    ib-loopback:
      enable: true
      modes:
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@ -33,7 +33,7 @@ superbench:
      model_action:
        - train
  benchmarks:
-    nccl-bw:
+    nccl-bw:default:
      enable: true
      modes:
        - name: local
@ -41,6 +41,21 @@ superbench:
          parallel: no
      parameters:
        ngpus: 8
+    nccl-bw:gdr-only:  # GDR-only variant of nccl-bw; differs by the env overrides below
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:  # NCCL settings for the GDR-only run; quoted so the values stay strings
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'  # presumably the GPUDirect RDMA level; confirm meaning of 5 in NCCL docs
+            NCCL_P2P_DISABLE: '1'  # presumably turns off the P2P transport; confirm in NCCL docs
+            NCCL_SHM_DISABLE: '1'  # presumably turns off the shared-memory transport; confirm in NCCL docs
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'  # '0' means not disabled, so IB stays available
+      parameters:
+        ngpus: 8
    ib-loopback:
      enable: true
      modes: