Dockerfile - Upgrade to rocm5.7 dockerfile (#587)

**Description**
Upgrade the ROCm Dockerfile to ROCm 5.7.

---------

Co-authored-by: yukirora <yuting.jiang@microsoft.com>
This commit is contained in:
Yuting Jiang 2023-12-10 01:41:12 +08:00 коммит произвёл GitHub
Родитель 4fa60be7cd
Коммит 1f5031bd74
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 247 добавлений и 69 удалений

48
.github/workflows/build-image.yml поставляемый
Просмотреть файл

@ -17,7 +17,7 @@ on:
jobs:
docker:
name: Docker build ${{ matrix.name }}
runs-on: ubuntu-latest
runs-on: ${{ matrix.runner }}
permissions:
contents: read
packages: write
@ -27,29 +27,15 @@ jobs:
- name: cuda12.2
dockerfile: cuda12.2
tags: superbench/main:cuda12.2
runner: ubuntu-latest
- name: cuda11.1.1
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
- name: rocm5.1.3
dockerfile: rocm5.1.x
tags: superbench/main:rocm5.1.3
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.1.3_ubuntu20.04_py3.7_pytorch_1.11.0
- name: rocm5.1.1
dockerfile: rocm5.1.x
tags: superbench/main:rocm5.1.1
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.1.1_ubuntu20.04_py3.7_pytorch_1.10.0
- name: rocm5.0.1
dockerfile: rocm5.0.x
tags: superbench/main:rocm5.0.1
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.9.0
- name: rocm5.0
dockerfile: rocm5.0.x
tags: superbench/main:rocm5.0
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.9.0
runner: ubuntu-latest
- name: rocm5.7
dockerfile: rocm5.7.x
tags: superbench/main:rocm5.7
runner: [self-hosted, rocm-build]
steps:
- name: Checkout
uses: actions/checkout@v2
@ -57,24 +43,18 @@ jobs:
submodules: recursive
- name: Free disk space
run: |
mkdir /tmp/emptydir
mkdir -p /tmp/emptydir
for dir in /usr/share/swift /usr/share/dotnet /usr/local/share/powershell /usr/local/share/chromium /usr/local/lib/android /opt/ghc; do
sudo rsync -a --delete /tmp/emptydir/ ${dir}
done
sudo apt-get clean
sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
# Check if Docker images exist before trying to remove them
if sudo docker images -q --filter=reference="node" --filter=reference="buildpack-deps" | grep -q .; then
sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
else
echo "No Docker images found with the specified references."
fi
df -h
- name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
uses: hirnidrin/free-disk-space@main
with:
# This might remove tools that are actually needed, if set to "true" but frees about 6 GB
tool-cache: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: true
- name: Prepare metadata
id: metadata
run: |

Просмотреть файл

@ -0,0 +1,175 @@
# Base image for the build; override with --build-arg BASE_IMAGE=<image> to build on a different base.
ARG BASE_IMAGE=rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
FROM ${BASE_IMAGE}
# Versions this image is built around (keep in sync with the steps below):
# OS:
# - Ubuntu: 22.04
# - Docker Client: 20.10.8
# ROCm:
# - ROCm: 5.7
# Intel:
# - mlc: v3.10
LABEL maintainer="SuperBench"
# Keep apt/dpkg from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
# Refresh the package index and install the build/runtime tooling used by the later steps,
# then clear /tmp. The package index is left in place on purpose.
RUN apt-get update \
    && apt-get install -q -y --no-install-recommends \
        autoconf automake build-essential curl \
        dmidecode git hipify-clang iproute2 \
        jq libaio-dev libboost-program-options-dev libcap2 \
        libnuma-dev libpci-dev libssl-dev libtinfo5 \
        libtool lshw net-tools numactl \
        openssh-client openssh-server pciutils rsync \
        sudo util-linux vim wget \
    && rm -rf /tmp/*
# Parallelism for the from-source builds below; override with --build-arg NUM_MAKE_JOBS=<n>.
ARG NUM_MAKE_JOBS=16
# Build CMake from source only when the installed version (if any) is older than required_version.
# - The full dotted version (e.g. 3.26.4) is captured so it compares correctly against required_version.
# - "0.0" is used when cmake is missing, which always triggers the build.
# - sort -V puts the lower version first; if that is not required_version, the installed one is older.
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )\d+(\.\d+)+" || echo "0.0") && \
    required_version="3.26.4" && \
    if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
        echo "existing cmake version is ${cmake_version}" && \
        cd /tmp && \
        wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
        tar xzf cmake-${required_version}.tar.gz && \
        cd cmake-${required_version} && \
        ./bootstrap --prefix=/usr --no-system-curl --parallel=${NUM_MAKE_JOBS} && \
        make -j ${NUM_MAKE_JOBS} && \
        make install && \
        rm -rf /tmp/cmake-${required_version}*; \
    else \
        echo "CMake version is greater than or equal to ${required_version}"; \
    fi
# Docker client: download the static release archive and unpack its binaries
# directly into /usr/local/bin (the archive's top-level directory is stripped).
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp \
    && wget -q -O docker.tgz https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz \
    && tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ \
    && rm docker.tgz
# Update system config:
# - create root's SSH directory/authorized_keys and sshd's runtime directory,
# - allow root login and user environment files, and set the sshd port to 22,
# - raise the open-file limit to 1048576 for all users and for root.
# The Port expression is anchored so it only rewrites the "Port" directive itself
# (not other directives that merely contain "Port", e.g. GatewayPorts).
# printf is used instead of echo "...\n..." because escape handling in echo differs between shells.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/^[# ]*Port .*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' '* soft nofile 1048576' '* hard nofile 1048576' >> /etc/security/limits.conf && \
    printf '%s\n' 'root soft nofile 1048576' 'root hard nofile 1048576' >> /etc/security/limits.conf
# Ubuntu release of the base image, exposed as UBUNTU_VERSION for later steps.
# A variable exported inside a RUN step is gone when that step ends, so it cannot
# feed an ENV instruction; the value is therefore supplied as a build argument.
# Override together with BASE_IMAGE: --build-arg UBUNTU_VERSION=<release>.
ARG UBUNTU_VERSION=22.04
ENV UBUNTU_VERSION=${UBUNTU_VERSION}
RUN echo "Ubuntu version: $UBUNTU_VERSION"
# Install OFED
ENV OFED_VERSION=5.9-0.5.6.0
# Install the MLNX_OFED user-space components only when ofed_info is not already on PATH.
# The Ubuntu release in the package name comes from UBUNTU_VERSION; when that is unset or
# empty it is read from /etc/os-release so the download URL is never built with a blank release.
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
        echo "OFED not found. Installing OFED..."; \
        ubuntu_version="${UBUNTU_VERSION:-$(. /etc/os-release && echo "${VERSION_ID}")}" && \
        ofed_pkg="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${ubuntu_version}-x86_64" && \
        cd /tmp && \
        wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${ofed_pkg}.tgz && \
        tar xzf ${ofed_pkg}.tgz && \
        PATH=/usr/bin:${PATH} ${ofed_pkg}/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
        rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
    fi
# Install UCX
ENV UCX_VERSION=1.14.1
# Build UCX with ROCm support into /opt/ucx unless that directory already has content.
# - The prefix is spelled out as /opt/ucx: that is the directory this check inspects and the
#   one referenced by --with-ucx and LD_LIBRARY_PATH later in this file.
# - ls errors (directory missing) are silenced; a missing directory counts as empty.
# - The clone lands in /tmp/ucx, so that is the path removed after installation.
RUN if [ -z "$(ls -A /opt/ucx 2>/dev/null)" ]; then \
        echo "/opt/ucx is empty. Installing UCX..."; \
        cd /tmp && \
        git clone https://github.com/openucx/ucx.git -b v${UCX_VERSION} && \
        cd ucx && \
        ./autogen.sh && \
        mkdir build && \
        cd build && \
        ../configure --prefix=/opt/ucx --with-rocm=/opt/rocm --without-knem && \
        make -j $(nproc) && make -j $(nproc) install && cd / && rm -rf /tmp/ucx ; \
    else \
        echo "/opt/ucx is not empty. Skipping UCX installation."; \
    fi
# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
# Build Open MPI (branch v${OPENMPI_VERSION}) with UCX and ROCm support into /usr/local,
# unless an executable /usr/local/bin/mpirun already exists.
# - mpirun is a file, so the check uses -x (executable) rather than -d (directory).
# - The clone lands in /tmp/ompi, so that is the path removed after installation.
RUN [ -x /usr/local/bin/mpirun ] || { \
        echo "Open MPI not found. Installing Open MPI..." && \
        cd /tmp && \
        git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
        cd ompi && \
        ./autogen.pl && \
        mkdir build && \
        cd build && \
        ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --enable-mca-no-build=btl-uct --with-ucx=/opt/ucx --with-rocm=/opt/rocm && \
        make -j $(nproc) && \
        make -j $(nproc) install && \
        ldconfig && \
        cd / && \
        rm -rf /tmp/ompi ;\
    }
# Intel MLC v3.10: fetch the archive, extract only Linux/mlc from it,
# copy that binary to /usr/local/bin and remove the leftovers.
RUN cd /tmp \
    && wget -q -O mlc.tgz https://downloadmirror.intel.com/763324/mlc_v3.10.tgz \
    && tar -xzf mlc.tgz Linux/mlc \
    && cp ./Linux/mlc /usr/local/bin/ \
    && rm -rf ./Linux mlc.tgz
# RCCL: clone the repository's default branch into /opt/rccl and compile it with hipcc
# in /opt/rccl/build. Nothing is installed; the build directory itself is used later.
# NOTE(review): the clone is not pinned to a tag or commit, so rebuilds may pick up different sources.
RUN cd /opt/ \
    && git clone https://github.com/ROCmSoftwarePlatform/rccl.git \
    && mkdir rccl/build \
    && cd rccl/build \
    && CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. \
    && make -j${NUM_MAKE_JOBS}
# Runtime environment for SuperBench:
# - PATH / LD_LIBRARY_PATH put the SuperBench, /usr/local, UCX and ROCm locations first.
# - LD_PRELOAD loads the RCCL library built under /opt/rccl/build.
# - ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an unset variable
#   does not leave a trailing ":" (an empty LD_LIBRARY_PATH entry means the current directory).
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
    LD_PRELOAD="/opt/rccl/build/librccl.so${LD_PRELOAD:+:$LD_PRELOAD}" \
    LD_LIBRARY_PATH="/opt/ucx/lib:/usr/local/lib/:/opt/rocm/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
# Write PATH, LD_LIBRARY_PATH and SB_MICRO_PATH to /etc/environment, replacing its previous content.
RUN { \
        echo PATH="$PATH" && \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" && \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment
# Copy the SuperBench sources into SB_HOME and install the Python package with the AMD worker extras.
WORKDIR ${SB_HOME}
# COPY is used for plain local files; ADD's archive/URL handling is not needed here.
COPY . .
# apt-get update runs in the same layer as the install so a cached, stale package index is never used.
RUN apt-get update && \
    apt-get install -y rocm-cmake && \
    python3 -m pip install --upgrade pip wheel setuptools==65.7 && \
    python3 -m pip install .[amdworker] && \
    make postinstall
# Build the C++ micro-benchmarks.
RUN make cppbuild
COPY third_party third_party
# Build the ROCm third-party benchmarks against the RCCL build in /opt/rccl/build.
# The -o options ask make not to remake cpu_hpl, cpu_stream and megatron_lm.
# NOTE(review): ROCM_VER names a 5.5.0 tag while the rest of this image targets ROCm 5.7 - confirm this is intentional.
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm

Просмотреть файл

@ -39,7 +39,7 @@ docker buildx build \
export DOCKER_BUILDKIT=1
docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
--tag superbench-dev --file dockerfile/rocm5.7.x.dockerfile .
```
</TabItem>

Просмотреть файл

@ -184,7 +184,7 @@ setup(
**x,
'develop': x['dev'] + x['test'],
'cpuworker': x['torch'],
'amdworker': x['torch'] + x['ort'] + x['amd'],
'amdworker': x['torch'] + x['amd'],
'nvworker': x['torch'] + x['ort'] + x['nvidia'],
}
)(

Просмотреть файл

@ -14,25 +14,24 @@ if(CUDAToolkit_FOUND)
include(../cuda_common.cmake)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(gpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_copy.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_copy.cpp gpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Add HIP targets
set_source_files_properties(gpu_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(gpu_copy gpu_copy.cu)
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
add_compile_options(-O2)
target_link_libraries(gpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
target_link_libraries(gpu_copy numa)
install(TARGETS gpu_copy RUNTIME DESTINATION bin)

Просмотреть файл

@ -18,18 +18,16 @@ if(CUDAToolkit_FOUND)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION})
# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats kernel_launch.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o kernel_launch.cpp kernel_launch.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Add HIP targets
set_source_files_properties(kernel_launch.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(kernel_launch_overhead kernel_launch.cu)
# link hip device lib
add_executable(kernel_launch_overhead kernel_launch.cpp)
target_link_libraries(kernel_launch_overhead hip::device)
# Install targets
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()

Просмотреть файл

@ -3,14 +3,36 @@
# Set ROCM_PATH
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH /opt/rocm)
# Run hipconfig -R to get ROCm path
execute_process(
COMMAND hipconfig -R
RESULT_VARIABLE HIPCONFIG_RESULT
OUTPUT_VARIABLE ROCM_PATH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Check if hipconfig was successful
if(NOT HIPCONFIG_RESULT EQUAL 0)
message(FATAL_ERROR "Failed to run hipconfig -p. Make sure ROCm is installed and hipconfig is available.")
endif()
else()
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
# Set HIP_PATH
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH ${ROCM_PATH}/hip)
execute_process(
COMMAND hipconfig -p
RESULT_VARIABLE HIPCONFIG_RESULT
OUTPUT_VARIABLE HIP_PATH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Check if hipconfig was successful
if(NOT HIPCONFIG_RESULT EQUAL 0)
message(FATAL_ERROR "Failed to run hipconfig -p. Make sure ROCm is installed and hipconfig is available.")
endif()
else()
set(HIP_PATH $ENV{HIP_PATH})
endif()
@ -24,6 +46,8 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH})
# Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
endif()

24
third_party/Makefile поставляемый
Просмотреть файл

@ -10,6 +10,7 @@ HPCX_HOME ?= /opt/hpcx
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
@ -97,12 +98,13 @@ endif
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Since it takes several hours to build, avoid building again if rocblas-bench already exists.
rocm_rocblas: sb_micro_path
ifeq (, $(wildcard $(SB_MICRO_PATH)/bin/rocblas-bench))
if [ -d rocBLAS ]; then rm -rf rocBLAS; fi
git clone -b ${ROCBLAS_BRANCH} https://github.com/ROCmSoftwarePlatform/rocBLAS.git ./rocBLAS
cd ./rocBLAS && ./install.sh --dependencies --clients-only
cp -v ./rocBLAS/build/release/clients/staging/rocblas-bench $(SB_MICRO_PATH)/bin/
endif
@if [ ! -e $(SB_MICRO_PATH)/bin/rocblas-bench ] && [ -z `which rocblas-bench` ]; then \
if [ -d rocBLAS ]; then rm -rf rocBLAS; fi; \
git clone -b ${ROCBLAS_BRANCH} https://github.com/ROCmSoftwarePlatform/rocBLAS.git ./rocBLAS; \
sed -i 's|#include "gemm.hpp"|#include "Tensile/gemm.hpp"|' rocBLAS/clients/benchmarks/../../library/src/blas3/rocblas_trtri.hpp; \
cd ./rocBLAS && ./install.sh --dependencies --clients-only; \
cp -v $(SB_MICRO_PATH)/third_party/rocBLAS/build/release/clients/staging/rocblas-bench $(SB_MICRO_PATH)/bin/; \
fi
# Build hipblaslt-bench.
# hipBLASLt is released with rocm, like rocm-4.2.0 and so on.
@ -111,18 +113,18 @@ endif
rocm_hipblaslt: sb_micro_path
@if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \
if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \
git clone -b ${ROCBLAS_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \
git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \
cd ./hipBLASLt && ./install.sh -dc; \
cp -v ./hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \
cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \
fi
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
rocm_bandwidthTest: sb_micro_path
cp -r -v $(shell hipconfig -p)/samples/1_Utils/hipBusBandwidth ./
cd ./hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
# Build GPCNET from commit c56fd9.
gpcnet: sb_micro_path