From 2101e933ccef2fd4113c8634d40129eab52a153a Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Mon, 29 Jul 2024 07:43:06 +0800
Subject: [PATCH] CI/CD - Fix MSCCL build error in CUDA12.4 docker build pipeline (#633)

**Description**
Fix MSCCL build error in CUDA12.4 docker build pipeline due to OOM issue.
---
Reviewer notes (free-form text placed before the first "diff --git" header):
- build-image.yml: job timeout raised from 600 to 1200 minutes; the
  NUM_MAKE_JOBS build arg is lowered to 8 for cuda12.4 (was 32) and
  cuda12.2 (was 64).
- rocm5.7.x / rocm6.0.x dockerfiles: the rccl clone is pinned to
  release/rocm-rel-5.7 and release/rocm-rel-6.0 respectively.
- rocm5.7.x dockerfile: an extra apt-get layer is added before the
  third_party build.
- third_party/Makefile: adds NUM_MAKE_JOBS (default: nproc --ignore=2) and
  uses it for the three sub-make invocations in cuda_msccl.
- NOTE(review): NUM_MAKE_JOBS is assigned with ?=, so a value that is set
  but empty would be kept and expand to a bare "-j" -- confirm that callers
  never pass it empty.
- NOTE(review): whitespace of context lines (YAML/Dockerfile indentation,
  Makefile recipe tabs) was restored by convention -- confirm against the
  target tree before applying.

 .github/workflows/build-image.yml |  6 +++---
 dockerfile/rocm5.7.x.dockerfile   | 13 ++++++++++++-
 dockerfile/rocm6.0.x.dockerfile   |  2 +-
 third_party/Makefile              |  8 +++++---
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 952430d5..78efa780 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -18,7 +18,7 @@ jobs:
   docker:
     name: Docker build ${{ matrix.name }}
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 600
+    timeout-minutes: 1200
     permissions:
       contents: read
       packages: write
@@ -29,12 +29,12 @@ jobs:
             dockerfile: cuda12.4
             tags: superbench/main:cuda12.4
             runner: [self-hosted, rocm-build]
-            build_args: "NUM_MAKE_JOBS=32"
+            build_args: "NUM_MAKE_JOBS=8"
           - name: cuda12.2
             dockerfile: cuda12.2
             tags: superbench/main:cuda12.2
             runner: [self-hosted, rocm-build]
-            build_args: "NUM_MAKE_JOBS=64"
+            build_args: "NUM_MAKE_JOBS=8"
           - name: cuda11.1.1
             dockerfile: cuda11.1.1
             tags: superbench/main:cuda11.1.1,superbench/superbench:latest
diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile
index 87af0e36..85ba1919 100644
--- a/dockerfile/rocm5.7.x.dockerfile
+++ b/dockerfile/rocm5.7.x.dockerfile
@@ -133,7 +133,7 @@ RUN cd /tmp && \
 
 # Install RCCL
 RUN cd /opt/ && \
-    git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \
+    git clone -b release/rocm-rel-5.7 https://github.com/ROCmSoftwarePlatform/rccl.git && \
     cd rccl && \
     mkdir build && \
     cd build && \
@@ -167,6 +167,17 @@ ADD third_party third_party
 # Apply patch
 RUN cd third_party/perftest && \
     git apply ../perftest_rocm6.patch
+
+# Update package index and install dependencies of ROCBLAS
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository universe && \
+    apt-get update && \
+    apt-get install -y --fix-missing \
+        make python3 python3-yaml python3-venv python3-joblib 'python3*-pip' libmsgpack-dev gfortran libomp-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
 
 ADD . .
diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile
index 648bc195..bd33e289 100644
--- a/dockerfile/rocm6.0.x.dockerfile
+++ b/dockerfile/rocm6.0.x.dockerfile
@@ -139,7 +139,7 @@ RUN cd /tmp && \
 
 # Install RCCL
 RUN cd /opt/ && \
-    git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \
+    git clone -b release/rocm-rel-6.0 https://github.com/ROCmSoftwarePlatform/rccl.git && \
     cd rccl && \
     mkdir build && \
     cd build && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 0a47bd45..7abac4fb 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -14,6 +14,8 @@ ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' |
 HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
 
+NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
+
 .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
 
 # Build all targets.
@@ -214,21 +216,21 @@ apex_rocm:
 cuda_msccl: sb_micro_path
 ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
 	cd ./msccl/executor/msccl-executor-nccl && \
-	make -j $(shell nproc --ignore=2) src.build && \
+	make -j ${NUM_MAKE_JOBS} src.build && \
 	cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
 	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
 endif
 ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
 	cd ./msccl/scheduler/msccl-scheduler && \
-	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \
+	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j ${NUM_MAKE_JOBS} && \
 	cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
 	cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
 endif
 ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 	cd ./msccl/tests/msccl-tests-nccl && \
-	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../..
+	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j ${NUM_MAKE_JOBS} && cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
 	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
 endif