Dockerfile - Add rocm6.0 dockerfile (#602)
**Description** Add rocm6.0 dockerfile.
This commit is contained in:
Родитель
e8777e24ab
Коммит
c2e7a543bb
|
@ -40,6 +40,11 @@ jobs:
|
|||
tags: superbench/main:rocm5.7
|
||||
runner: [self-hosted, rocm-build]
|
||||
build_args: "NUM_MAKE_JOBS=64"
|
||||
- name: rocm6.0
|
||||
dockerfile: rocm6.0.x
|
||||
tags: superbench/main:rocm6.0
|
||||
runner: [self-hosted, rocm-build]
|
||||
build_args: "NUM_MAKE_JOBS=64"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
|
|
@ -110,21 +110,18 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
|
|||
# Install OpenMPI
|
||||
ENV OPENMPI_VERSION=4.1.x
|
||||
# Check if Open MPI is installed
|
||||
RUN [ -d /usr/local/bin/mpirun ] || { \
|
||||
echo "Open MPI not found. Installing Open MPI..." && \
|
||||
cd /tmp && \
|
||||
RUN cd /tmp && \
|
||||
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
|
||||
cd ompi && \
|
||||
./autogen.pl && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
|
||||
../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
|
||||
make -j $(nproc) && \
|
||||
make -j $(nproc) install && \
|
||||
ldconfig && \
|
||||
cd / && \
|
||||
rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\
|
||||
}
|
||||
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
|
||||
|
||||
# Install Intel MLC
|
||||
RUN cd /tmp && \
|
||||
|
@ -166,11 +163,13 @@ RUN apt install rocm-cmake -y && \
|
|||
WORKDIR ${SB_HOME}
|
||||
|
||||
ADD third_party third_party
|
||||
RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
|
||||
# Apply patch
|
||||
RUN cd third_party/perftest && \
|
||||
git apply ../perftest_rocm6.patch
|
||||
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
|
||||
|
||||
ADD . .
|
||||
#ENV USE_HIPBLASLT_DATATYPE=1
|
||||
ENV CXX=/opt/rocm/bin/hipcc
|
||||
RUN python3 -m pip install .[amdworker] && \
|
||||
make cppbuild && \
|
||||
CXX=/opt/rocm/bin/hipcc make cppbuild && \
|
||||
make postinstall
|
||||
|
|
|
@ -0,0 +1,180 @@
|
|||
ARG BASE_IMAGE=rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1
|
||||
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
# OS:
|
||||
# - Ubuntu: 22.04
|
||||
# - Docker Client: 20.10.8
|
||||
# ROCm:
|
||||
# - ROCm: 6.0
|
||||
# Lib:
|
||||
# - torch: 2.0.1
|
||||
# - rccl: 2.18.3+hip6.0 develop:7e1cbb4
|
||||
# - hipblaslt: 950ca43
|
||||
# - openmpi: 4.1.x
|
||||
# - apex: 1.0.0
|
||||
# Intel:
|
||||
# - mlc: v3.10
|
||||
|
||||
LABEL maintainer="SuperBench"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
# Base toolchain, networking/diagnostic utilities and development libraries
# used by the build steps that follow. Recommended packages are not pulled in.
RUN apt-get update \
    && apt-get install -q -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        git \
        hipify-clang \
        iproute2 \
        jq \
        libaio-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libnuma-dev \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        numactl \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && rm -rf /tmp/*
|
||||
|
||||
ARG NUM_MAKE_JOBS=64
|
||||
|
||||
# Build CMake ${required_version} from source only when the image has no CMake
# or an older one. The full major.minor.patch version is compared, so an
# already-installed newer patch release is not replaced by an older one.
RUN required_version="3.24.1" && \
    cmake_version="$(cmake --version 2>/dev/null | grep -oP '(?<=cmake version )\d+\.\d+(\.\d+)?' || echo "0.0")" && \
    if [ "$(printf '%s\n' "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
        echo "existing cmake version is ${cmake_version}" && \
        cd /tmp && \
        wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
        tar xzf cmake-${required_version}.tar.gz && \
        cd cmake-${required_version} && \
        ./bootstrap --prefix=/usr --no-system-curl --parallel=${NUM_MAKE_JOBS} && \
        make -j ${NUM_MAKE_JOBS} && \
        make install && \
        cd / && \
        rm -rf /tmp/cmake-${required_version}*; \
    else \
        echo "CMake version ${cmake_version} is greater than or equal to ${required_version}"; \
    fi
|
||||
|
||||
# Install Docker: unpack the static release tarball straight into /usr/local/bin
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp \
    && wget -q -O docker.tgz https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz \
    && tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ \
    && rm docker.tgz
|
||||
|
||||
# Update system config: prepare root's SSH directory and the sshd runtime
# directory, relax sshd settings, and raise the open-file limits.
RUN mkdir -p /root/.ssh /var/run/sshd \
    && touch /root/.ssh/authorized_keys \
    && sed -i \
        -e "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" \
        -e "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" \
        -e "s/[# ]*Port.*/Port 22/" \
        /etc/ssh/sshd_config \
    && echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf \
    && echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
|
||||
|
||||
|
||||
# Install OFED
ENV OFED_VERSION=5.9-0.5.6.0
# Install the MLNX_OFED user-space stack unless ofed_info is already present.
# The Ubuntu release is resolved inside this RUN because a shell variable
# exported in one RUN instruction is gone by the next instruction, so it
# cannot be handed over through a separate RUN/ENV pair.
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
        echo "OFED not found. Installing OFED..." && \
        ubuntu_version="$(. /etc/os-release && echo "${VERSION_ID}")" && \
        echo "Ubuntu version: ${ubuntu_version}" && \
        ofed_pkg="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${ubuntu_version}-x86_64" && \
        cd /tmp && \
        wget -q "http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${ofed_pkg}.tgz" && \
        tar xzf "${ofed_pkg}.tgz" && \
        PATH=/usr/bin:${PATH} "${ofed_pkg}/mlnxofedinstall" --user-space-only --without-fw-update --force --all && \
        rm -rf MLNX_OFED_LINUX-${OFED_VERSION}*; \
    fi
|
||||
|
||||
# Add target file to help determine which device(s) to build for.
# Each line is one GPU target ID; the xnack feature suffix is spelled "xnack+" / "xnack-".
ENV ROCM_PATH=/opt/rocm
RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnack+\ngfx940\ngfx941\ngfx942:sramecc+:xnack-\n" >> ${ROCM_PATH}/bin/target.lst'
|
||||
|
||||
# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
# Build Open MPI from the v${OPENMPI_VERSION} branch with ROCm support and
# install it under /usr/local/mpi. The clone lands in /tmp/ompi, so that is the
# directory removed at the end to keep the sources and build tree out of the layer.
RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
    cd ompi && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
    make -j $(nproc) && \
    make -j $(nproc) install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/ompi
|
||||
|
||||
# Install Intel MLC: only the Linux/mlc binary is extracted from the archive
RUN cd /tmp \
    && wget -q -O mlc.tgz https://downloadmirror.intel.com/763324/mlc_v3.10.tgz \
    && tar -xzf mlc.tgz Linux/mlc \
    && cp ./Linux/mlc /usr/local/bin/ \
    && rm -rf ./Linux mlc.tgz
|
||||
|
||||
# Install RCCL: clone into /opt/rccl and build in /opt/rccl/build with hipcc
RUN git clone https://github.com/ROCmSoftwarePlatform/rccl.git /opt/rccl \
    && mkdir /opt/rccl/build \
    && cd /opt/rccl/build \
    && CXX=/opt/rocm/bin/hipcc cmake \
        -DHIP_COMPILER=clang \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_VERBOSE_MAKEFILE=1 \
        -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
        .. \
    && make -j${NUM_MAKE_JOBS}
|
||||
|
||||
# Runtime search paths and SuperBench/Ansible settings.
# /usr/local/mpi/bin and /usr/local/mpi/lib are listed first because this
# Dockerfile configures Open MPI with --prefix=/usr/local/mpi; without them its
# mpirun and libraries are not on the search paths.
# NOTE(review): this changes which mpirun wins if the base image ships another one - confirm.
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
    LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
|
||||
|
||||
# Write the current PATH, LD_LIBRARY_PATH and SB_MICRO_PATH values into
# /etc/environment, replacing whatever the file held before.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment
|
||||
|
||||
# Install rocm-cmake and refresh the Python packaging tools.
# apt-get (the script-stable CLI) is used, and the package index is refreshed in
# the same layer so the install does not depend on an index cached by an earlier layer.
RUN apt-get update && \
    apt-get install -y rocm-cmake && \
    python3 -m pip install --upgrade pip wheel setuptools==65.7
|
||||
|
||||
WORKDIR ${SB_HOME}
|
||||
|
||||
# Copy the third-party sources; COPY is used because this is a plain local
# directory and none of ADD's extra behaviours (archive extraction, URLs) is needed.
COPY third_party third_party
# Apply patch
RUN cd third_party/perftest && \
    git apply ../perftest_rocm6.patch
# Build the rocm target; each "-o <target>" tells make to treat that target as
# old, so cpu_hpl, cpu_stream and megatron_lm are not rebuilt here.
# NOTE(review): ROCM_VER is still rocm-5.5.0 on a ROCm 6.0 image - confirm this is intended.
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
    git apply ../megatron_deepspeed_rocm6.patch
|
||||
|
||||
# Copy the build context into the working directory; COPY is the idiomatic
# instruction for plain local files.
COPY . .
ENV USE_HIP_DATATYPE=1 \
    USE_HIPBLAS_COMPUTETYPE=1
# Install the Python package with the amdworker extra, build the C++ parts with
# hipcc as the C++ compiler, then run the postinstall target.
RUN python3 -m pip install .[amdworker] && \
    CXX=/opt/rocm/bin/hipcc make cppbuild && \
    make postinstall
|
|
@ -7,10 +7,12 @@ MPI_HOME ?= /usr/local/mpi
|
|||
HIP_HOME ?= /opt/rocm/hip
|
||||
RCCL_HOME ?= /opt/rocm/rccl
|
||||
HPCX_HOME ?= /opt/hpcx
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
|
||||
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
|
||||
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
|
||||
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
|
||||
ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
|
||||
|
||||
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
|
||||
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
|
||||
index 76086de..1533648 100644
|
||||
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
|
||||
+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
-#ifndef __HIP_PLATFORM_HCC__
|
||||
+#ifndef __HIP_PLATFORM_AMD__
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
|
||||
index 90e1c9f..d217aec 100644
|
||||
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
|
||||
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
-#ifndef __HIP_PLATFORM_HCC__
|
||||
+#ifndef __HIP_PLATFORM_AMD__
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
|
||||
index 74c9f3d..03b5fc8 100644
|
||||
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
|
||||
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
-#ifndef __HIP_PLATFORM_HCC__
|
||||
+#ifndef __HIP_PLATFORM_AMD__
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
#include <ATen/cuda/CUDAContext.h>
|
|
@ -1 +1 @@
|
|||
Subproject commit 5fb4f10a7e7827ed15e53c25810a10be279d6e23
|
||||
Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc
|
|
@ -0,0 +1,28 @@
|
|||
diff --git a/configure.ac b/configure.ac
|
||||
index 20eceda..c8f0c07 100755
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -237,7 +237,7 @@ AC_ARG_WITH([rocm],
|
||||
],
|
||||
[AS_CASE([$with_rocm],
|
||||
[yes|no], [],
|
||||
- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS"
|
||||
+ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS"
|
||||
LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"])
|
||||
])
|
||||
|
||||
diff --git a/src/rocm_memory.c b/src/rocm_memory.c
|
||||
index e9a9136..b6cb23a 100644
|
||||
--- a/src/rocm_memory.c
|
||||
+++ b/src/rocm_memory.c
|
||||
@@ -44,8 +44,8 @@ static int init_rocm(int device_id) {
|
||||
|
||||
hipDeviceProp_t prop = {0};
|
||||
ROCM_CHECK(hipGetDeviceProperties(&prop, device_id));
|
||||
- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n",
|
||||
- device_id, prop.name, prop.pciBusID, prop.gcnArch);
|
||||
+ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n",
|
||||
+ device_id, prop.name, prop.pciBusID, prop.gcnArchName);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
Загрузка…
Ссылка в новой задаче