CI/CD - Fix MSCCL build error in CUDA12.4 docker build pipeline (#633)
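**Description** Fix the MSCCL build error in the CUDA 12.4 Docker build pipeline, which was caused by an out-of-memory (OOM) failure during compilation. The number of parallel make jobs is now configurable through `NUM_MAKE_JOBS` and is reduced to 8 in the CI build arguments.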
This commit is contained in:
Родитель
e304cf1572
Коммит
2101e933cc
|
@ -18,7 +18,7 @@ jobs:
|
||||||
docker:
|
docker:
|
||||||
name: Docker build ${{ matrix.name }}
|
name: Docker build ${{ matrix.name }}
|
||||||
runs-on: ${{ matrix.runner }}
|
runs-on: ${{ matrix.runner }}
|
||||||
timeout-minutes: 600
|
timeout-minutes: 1200
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
packages: write
|
packages: write
|
||||||
|
@ -29,12 +29,12 @@ jobs:
|
||||||
dockerfile: cuda12.4
|
dockerfile: cuda12.4
|
||||||
tags: superbench/main:cuda12.4
|
tags: superbench/main:cuda12.4
|
||||||
runner: [self-hosted, rocm-build]
|
runner: [self-hosted, rocm-build]
|
||||||
build_args: "NUM_MAKE_JOBS=32"
|
build_args: "NUM_MAKE_JOBS=8"
|
||||||
- name: cuda12.2
|
- name: cuda12.2
|
||||||
dockerfile: cuda12.2
|
dockerfile: cuda12.2
|
||||||
tags: superbench/main:cuda12.2
|
tags: superbench/main:cuda12.2
|
||||||
runner: [self-hosted, rocm-build]
|
runner: [self-hosted, rocm-build]
|
||||||
build_args: "NUM_MAKE_JOBS=64"
|
build_args: "NUM_MAKE_JOBS=8"
|
||||||
- name: cuda11.1.1
|
- name: cuda11.1.1
|
||||||
dockerfile: cuda11.1.1
|
dockerfile: cuda11.1.1
|
||||||
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
|
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
|
||||||
|
|
|
@ -133,7 +133,7 @@ RUN cd /tmp && \
|
||||||
|
|
||||||
# Install RCCL
|
# Install RCCL
|
||||||
RUN cd /opt/ && \
|
RUN cd /opt/ && \
|
||||||
git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \
|
git clone -b release/rocm-rel-5.7 https://github.com/ROCmSoftwarePlatform/rccl.git && \
|
||||||
cd rccl && \
|
cd rccl && \
|
||||||
mkdir build && \
|
mkdir build && \
|
||||||
cd build && \
|
cd build && \
|
||||||
|
@ -167,6 +167,17 @@ ADD third_party third_party
|
||||||
# Apply patch
|
# Apply patch
|
||||||
RUN cd third_party/perftest && \
|
RUN cd third_party/perftest && \
|
||||||
git apply ../perftest_rocm6.patch
|
git apply ../perftest_rocm6.patch
|
||||||
|
|
||||||
|
# Update package index and install dependencies of ROCBLAS
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y software-properties-common && \
|
||||||
|
add-apt-repository universe && \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --fix-missing \
|
||||||
|
make python3 python3-yaml python3-venv python3-joblib 'python3*-pip' libmsgpack-dev gfortran libomp-dev && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
|
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
|
||||||
|
|
||||||
ADD . .
|
ADD . .
|
||||||
|
|
|
@ -139,7 +139,7 @@ RUN cd /tmp && \
|
||||||
|
|
||||||
# Install RCCL
|
# Install RCCL
|
||||||
RUN cd /opt/ && \
|
RUN cd /opt/ && \
|
||||||
git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \
|
git clone -b release/rocm-rel-6.0 https://github.com/ROCmSoftwarePlatform/rccl.git && \
|
||||||
cd rccl && \
|
cd rccl && \
|
||||||
mkdir build && \
|
mkdir build && \
|
||||||
cd build && \
|
cd build && \
|
||||||
|
|
|
@ -14,6 +14,8 @@ ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' |
|
||||||
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
|
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
|
||||||
ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
|
ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
|
||||||
|
|
||||||
|
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
|
||||||
|
|
||||||
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
|
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
|
||||||
|
|
||||||
# Build all targets.
|
# Build all targets.
|
||||||
|
@ -214,21 +216,21 @@ apex_rocm:
|
||||||
cuda_msccl: sb_micro_path
|
cuda_msccl: sb_micro_path
|
||||||
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
|
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
|
||||||
cd ./msccl/executor/msccl-executor-nccl && \
|
cd ./msccl/executor/msccl-executor-nccl && \
|
||||||
make -j $(shell nproc --ignore=2) src.build && \
|
make -j ${NUM_MAKE_JOBS} src.build && \
|
||||||
cd ../../..
|
cd ../../..
|
||||||
mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
|
mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
|
||||||
cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
|
cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
|
||||||
endif
|
endif
|
||||||
ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
|
ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
|
||||||
cd ./msccl/scheduler/msccl-scheduler && \
|
cd ./msccl/scheduler/msccl-scheduler && \
|
||||||
CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \
|
CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j ${NUM_MAKE_JOBS} && \
|
||||||
cd ../../..
|
cd ../../..
|
||||||
mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
|
mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
|
||||||
cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
|
cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
|
||||||
endif
|
endif
|
||||||
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
|
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
|
||||||
cd ./msccl/tests/msccl-tests-nccl && \
|
cd ./msccl/tests/msccl-tests-nccl && \
|
||||||
make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../..
|
make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j ${NUM_MAKE_JOBS} && cd ../../..
|
||||||
mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
|
mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
|
||||||
cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
|
cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
|
||||||
endif
|
endif
|
||||||
|
|
Загрузка…
Ссылка в новой задаче