diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index f1d7fe5d..ed821d65 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -18,6 +18,7 @@ jobs: docker: name: Docker build ${{ matrix.name }} runs-on: ${{ matrix.runner }} + timeout-minutes: 600 permissions: contents: read packages: write @@ -27,15 +28,23 @@ jobs: - name: cuda12.2 dockerfile: cuda12.2 tags: superbench/main:cuda12.2 - runner: ubuntu-latest + runner: [self-hosted, rocm-build] + build_args: "NUM_MAKE_JOBS=64" - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest runner: ubuntu-latest + build_args: "NUM_MAKE_JOBS=8" - name: rocm5.7 dockerfile: rocm5.7.x tags: superbench/main:rocm5.7 runner: [self-hosted, rocm-build] + build_args: "NUM_MAKE_JOBS=64" + - name: rocm6.0 + dockerfile: rocm6.0.x + tags: superbench/main:rocm6.0 + runner: [self-hosted, rocm-build] + build_args: "NUM_MAKE_JOBS=64" steps: - name: Checkout uses: actions/checkout@v2 @@ -75,7 +84,7 @@ jobs: fi DOCKERFILE=dockerfile/${{ matrix.dockerfile }}.dockerfile - BUILD_ARGS="NUM_MAKE_JOBS=8" + BUILD_ARGS=${{ matrix.build_args }} if [[ "${{ matrix.extra_args }}" ]]; then BUILD_ARGS="${BUILD_ARGS} ${{ matrix.extra_args }}" fi @@ -87,11 +96,11 @@ jobs: CACHE_TO="type=inline,mode=max" fi - echo ::set-output name=dockerfile::${DOCKERFILE} - echo ::set-output name=build_args::${BUILD_ARGS} - echo ::set-output name=tags::${TAGS} - echo ::set-output name=cache_from::${CACHE_FROM} - echo ::set-output name=cache_to::${CACHE_TO} + echo "dockerfile=${DOCKERFILE}" >> "$GITHUB_OUTPUT" + echo "build_args=${BUILD_ARGS}" >> "$GITHUB_OUTPUT" + echo "tags=${TAGS}" >> "$GITHUB_OUTPUT" + echo "cache_from=${CACHE_FROM}" >> "$GITHUB_OUTPUT" + echo "cache_to=${CACHE_TO}" >> "$GITHUB_OUTPUT" - name: Echo build args run: echo ${{ steps.metadata.outputs.build_args }} - name: Echo image tag @@ -106,6 +115,9 @@ jobs: with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Pull cache image + run: sudo docker pull ${{ steps.metadata.outputs.tags }} + continue-on-error: true - name: Login to the GitHub Container Registry uses: docker/login-action@v1 if: ${{ github.event_name == 'release' }} diff --git a/.gitmodules b/.gitmodules index d99bd2b7..339520d1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -24,3 +24,9 @@ [submodule "third_party/msccl"] path = third_party/msccl url = https://github.com/Azure/msccl +[submodule "third_party/Megatron/Megatron-LM"] + path = third_party/Megatron/Megatron-LM + url = https://github.com/NVIDIA/Megatron-LM.git +[submodule "third_party/Megatron/Megatron-DeepSpeed"] + path = third_party/Megatron/Megatron-DeepSpeed + url = https://github.com/microsoft/Megatron-DeepSpeed.git diff --git a/README.md b/README.md index cfcd4b6b..225bd7d7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ __SuperBench__ is a validation and profiling tool for AI infrastructure. -📢 [v0.9.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.9.0) has been released! +📢 [v0.10.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.10.0) has been released! 
## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ diff --git a/dockerfile/cuda12.2.dockerfile b/dockerfile/cuda12.2.dockerfile index a2e4559c..96944c52 100644 --- a/dockerfile/cuda12.2.dockerfile +++ b/dockerfile/cuda12.2.dockerfile @@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:23.10-py3 # NVIDIA: # - CUDA: 12.2.2 # - cuDNN: 8.9.5 -# - NCCL: v2.19.3-1 +# - NCCL: v2.18.3-1 # Mellanox: # - OFED: 23.07-0.5.1.2 # - HPC-X: v2.16 @@ -113,6 +113,13 @@ RUN cd /tmp && \ mv amd-blis /opt/AMD && \ rm -rf aocl-blis-linux-aocc-4.0.tar.gz +# Install NCCL 2.18.3 +RUN cd /tmp && \ + git clone -b v2.18.3-1 https://github.com/NVIDIA/nccl.git && \ + cd nccl && \ + make -j src.build && \ + make install && \ + rm -rf /tmp/nccl ENV PATH="${PATH}" \ LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile index cd5ab9ad..6a3db35e 100644 --- a/dockerfile/directx12.dockerfile +++ b/dockerfile/directx12.dockerfile @@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "% # Run the setup script to install the visual studio components RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat" +RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;" +RUN git config --system core.longpaths true # Install Superbench RUN python -m pip install setuptools==65.0.0 && \ python -m pip install --no-cache-dir .[amdworker] && \ diff --git a/dockerfile/etc/ndv4-topo.xml b/dockerfile/etc/ndv4-topo.xml index 1a8fa239..95352165 100644 --- a/dockerfile/etc/ndv4-topo.xml +++ b/dockerfile/etc/ndv4-topo.xml @@ -1,34 +1,34 @@ [XML topology content not recoverable: the element markup was stripped during extraction; this hunk rewrites all 34 lines of the NDv4 NCCL topology file] diff --git a/dockerfile/etc/ndv5-topo.xml b/dockerfile/etc/ndv5-topo.xml new file mode 100644 index 00000000..50ffa21c --- /dev/null +++ b/dockerfile/etc/ndv5-topo.xml @@ -0,0 +1,38 @@ [XML topology content not recoverable: the element markup was stripped during extraction; this hunk adds the 38-line NDv5 NCCL topology file] \ No newline at end of file diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index ee762e9e..4d68073c 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -17,6 +17,7 @@ RUN apt-get update && \ apt-get -q install -y --no-install-recommends \ autoconf \ automake \ + bc \ build-essential \ curl \ dmidecode \ @@ -27,6 +28,7 @@ RUN apt-get update && \ libaio-dev \ libboost-program-options-dev \ libcap2 \ + libcurl4-openssl-dev \ libnuma-dev \ libpci-dev \ libssl-dev \ @@ -38,6 +40,7 @@ RUN apt-get update && \ openssh-client \ openssh-server \ pciutils \ + python3-mpi4py \ rsync \ sudo \ util-linux \ @@ -46,11 +49,11 @@ RUN apt-get update && \ && \ rm -rf /tmp/* -ARG NUM_MAKE_JOBS=16 +ARG NUM_MAKE_JOBS= # Check if CMake is installed and its version RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ - required_version="3.26.4" && \ + required_version="3.24.1" && \ if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ echo "existing cmake version is ${cmake_version}" && \ cd /tmp && \ @@ -100,40 +103,26 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \ rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ fi -# Install UCX -ENV UCX_VERSION=1.14.1 -RUN if [ -z "$(ls -A /opt/ucx)" ]; then \ - echo "/opt/ucx is empty. 
Installing UCX..."; \ - cd /tmp && \ - git clone https://github.com/openucx/ucx.git -b v${UCX_VERSION} && \ - cd ucx && \ - ./autogen.sh && \ - mkdir build && \ - cd build && \ - ../configure -prefix=$UCX_DIR --with-rocm=/opt/rocm --without-knem && \ - make -j $(nproc) && make -j $(nproc) install && rm -rf /tmp/ucx-${UCX_VERSION} ; \ - else \ - echo "/opt/ucx is not empty. Skipping UCX installation."; \ - fi +# Add target file to help determine which device(s) to build for +ENV ROCM_PATH=/opt/rocm +RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnack+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst' # Install OpenMPI ENV OPENMPI_VERSION=4.1.x +ENV MPI_HOME=/usr/local/mpi # Check if Open MPI is installed -RUN [ -d /usr/local/bin/mpirun ] || { \ - echo "Open MPI not found. Installing Open MPI..." && \ - cd /tmp && \ +RUN cd /tmp && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ cd ompi && \ ./autogen.pl && \ mkdir build && \ cd build && \ - ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --enable-mca-no-build=btl-uct --with-ucx=/opt/ucx --with-rocm=/opt/rocm && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ make -j $(nproc) && \ make -j $(nproc) install && \ ldconfig && \ cd / && \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\ - } + rm -rf /tmp/ompi # Install Intel MLC RUN cd /tmp && \ @@ -148,12 +137,18 @@ RUN cd /opt/ && \ cd rccl && \ mkdir build && \ cd build && \ - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. && \ make -j${NUM_MAKE_JOBS} -ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ +# Install AMD SMI Python Library +RUN cd /opt/rocm/share/amd_smi && \ + python3 -m pip install --user . + +ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ - LD_LIBRARY_PATH="/opt/ucx/lib:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \ @@ -163,13 +158,19 @@ RUN echo PATH="$PATH" > /etc/environment && \ echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel setuptools==65.7 + WORKDIR ${SB_HOME} -ADD . . 
-RUN apt install rocm-cmake -y && \ - python3 -m pip install --upgrade pip wheel setuptools==65.7 && \ - python3 -m pip install .[amdworker] && \ - make postinstall -RUN make cppbuild ADD third_party third_party -RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +# Apply patch +RUN cd third_party/perftest && \ + git apply ../perftest_rocm6.patch +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm + +ADD . . +#ENV USE_HIPBLASLT_DATATYPE=1 +RUN python3 -m pip install .[amdworker] && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ + make postinstall diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile new file mode 100644 index 00000000..57194575 --- /dev/null +++ b/dockerfile/rocm6.0.x.dockerfile @@ -0,0 +1,181 @@ +ARG BASE_IMAGE=rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 + +FROM ${BASE_IMAGE} + +# OS: +# - Ubuntu: 22.04 +# - Docker Client: 20.10.8 +# ROCm: +# - ROCm: 6.0 +# Lib: +# - torch: 2.0.1 +# - rccl: 2.18.3+hip6.0 develop:7e1cbb4 +# - hipblaslt: release/rocm-rel-6.0 +# - openmpi: 4.1.x +# - apex: 1.0.0 +# Intel: +# - mlc: v3.10 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get -q install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + git \ + hipify-clang \ + iproute2 \ + jq \ + libaio-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libssl-dev \ + libtinfo5 \ + libtool \ + lshw \ + net-tools \ + numactl \ + openssh-client \ + openssh-server \ + pciutils \ + python3-mpi4py \ + rsync \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + rm -rf /tmp/* + +ARG NUM_MAKE_JOBS=64 + +# Check if CMake is installed and its version +RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ + required_version="3.24.1" && \ + if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ + echo "existing cmake version is ${cmake_version}" && \ + cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ + tar xzf cmake-${required_version}.tar.gz && \ + cd cmake-${required_version} && \ + ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && \ + rm -rf /tmp/cmake-${required_version}*; \ + else \ + echo "CMake version is greater than or equal to 3.24.1"; \ + fi + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> 
/etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + + +# Get Ubuntu version and set as an environment variable +RUN export UBUNTU_VERSION=$(lsb_release -r -s) +RUN echo "Ubuntu version: $UBUNTU_VERSION" +ENV UBUNTU_VERSION=${UBUNTU_VERSION} + +# Install OFED +ENV OFED_VERSION=5.9-0.5.6.0 +# Check if ofed_info is present and has a version +RUN if ! command -v ofed_info >/dev/null 2>&1; then \ + echo "OFED not found. Installing OFED..."; \ + cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ + fi + +# Add target file to help determine which device(s) to build for +ENV ROCM_PATH=/opt/rocm +RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnack+\ngfx940\ngfx941\ngfx942:sramecc+:xnack-\n" >> ${ROCM_PATH}/bin/target.lst' + +# Install OpenMPI +ENV OPENMPI_VERSION=4.1.x +ENV MPI_HOME=/usr/local/mpi +# Check if Open MPI is installed +RUN cd /tmp && \ + git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ + cd ompi && \ + ./autogen.pl && \ + mkdir build && \ + cd build && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + make -j $(nproc) && \ + make -j $(nproc) install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/ompi + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install RCCL +RUN cd /opt/ && \ + git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \ + cd rccl && \ + mkdir build && \ + cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. && \ + make -j${NUM_MAKE_JOBS} + +ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ + LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ + LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel setuptools==65.7 + +WORKDIR ${SB_HOME} + +ADD third_party third_party +# Apply patch +RUN cd third_party/perftest && \ + git apply ../perftest_rocm6.patch +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch + +ADD . . 
+ENV USE_HIP_DATATYPE=1 +ENV USE_HIPBLAS_COMPUTETYPE=1 +RUN python3 -m pip install .[amdworker] && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ + make postinstall diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx index a4b0fb1d..da473f24 100644 --- a/docs/developer-guides/using-docker.mdx +++ b/docs/developer-guides/using-docker.mdx @@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the export DOCKER_BUILDKIT=1 docker buildx build \ --platform linux/amd64 --cache-to type=inline,mode=max \ - --tag superbench-dev --file dockerfile/cuda12.1.dockerfile . + --tag superbench-dev --file dockerfile/cuda12.2.dockerfile . ``` diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index 8570306c..cf48c4ca 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. :::note Note You should checkout corresponding tag to use release version, for example, -`git clone -b v0.9.0 https://github.com/microsoft/superbenchmark` +`git clone -b v0.10.0 https://github.com/microsoft/superbenchmark` ::: ```bash diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md index 16c6d7a2..a6bb3bc1 100644 --- a/docs/getting-started/run-superbench.md +++ b/docs/getting-started/run-superbench.md @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] :::note Note You should deploy corresponding Docker image to use release version, for example, -`sb deploy -f local.ini -i superbench/superbench:v0.9.0-cuda12.1` +`sb deploy -f local.ini -i superbench/superbench:v0.10.0-cuda12.2` You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx index 8893c46b..340c7d61 100644 --- a/docs/superbench-config.mdx +++ b/docs/superbench-config.mdx @@ -70,7 +70,7 @@ superbench: ```yaml -version: v0.9 +version: v0.10 superbench: enable: benchmark_1 monitor: diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index fd0365f4..388bfa11 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -58,17 +58,18 @@ Large scale matmul operation using `torch.matmul` with one GPU. |--------------------------------|-----------|--------------------------------| | pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. | -### `cublaslt-gemm` +### `cublaslt-gemm` / `hipblaslt-gemm` #### Introduction -Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul). +Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul) or [`hipblasLt-bench`](https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md). #### Metrics -| Name | Unit | Description | -|----------------------------------------------------------|----------------|---------------------------------| -| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. 
| +| Name                                                       | Unit           | Description                     | +|-----------------------------------------------------------|----------------|---------------------------------| +| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops  | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. | +| hipblaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. | ### `cublas-function` @@ -243,6 +244,7 @@ or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils ### `gpu-copy-bw` Measure the memory copy bandwidth performed by GPU SM/DMA engine, including device-to-host, host-to-device and device-to-device. +For measurements of peer-to-peer communication performance between AMD GPUs, GPU memory buffers are allocated in `hipDeviceMallocUncached` (previously `hipDeviceMallocFinegrained`) mode to maximize performance. #### Metrics @@ -283,6 +285,7 @@ Measure the performance of NCCL/RCCL operations under multi nodes' traffic patterns performed by [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/44df0bf010dcc95e840ca0fb7466c67cff3f1f0f) or [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests/tree/dc1ad4853d7ec738387d42a75a58a98d7af00c7b). Support the following operations currently: allreduce, allgather, broadcast, reduce, reducescatter, alltoall. +Support both in-place and out-of-place measurements. Support the following traffic patterns: * `all-nodes`, validate the NCCL/RCCL performance across all VM nodes simultaneously. diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index 5fd11502..ffca2279 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -28,26 +28,29 @@ available tags are listed below for all stable versions. 
}> -| Tag | Description | -|-------------------|------------------------------------| -| v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | -| v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 | -| v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | -| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 | -| v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 | -| v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 | -| v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 | -| v0.5.0-cuda11.1.1 | SuperBench v0.5.0 with CUDA 11.1.1 | -| v0.4.0-cuda11.1.1 | SuperBench v0.4.0 with CUDA 11.1.1 | -| v0.3.0-cuda11.1.1 | SuperBench v0.3.0 with CUDA 11.1.1 | -| v0.2.1-cuda11.1.1 | SuperBench v0.2.1 with CUDA 11.1.1 | -| v0.2.0-cuda11.1.1 | SuperBench v0.2.0 with CUDA 11.1.1 | +| Tag | Description | +|--------------------|-------------------------------------| +| v0.10.0-cuda12.2 | SuperBench v0.10.0 with CUDA 12.2 | +| v0.10.0-cuda11.1.1 | SuperBench v0.10.0 with CUDA 11.1.1 | +| v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | +| v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 | +| v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | +| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 | +| v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 | +| v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 | +| v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 | +| v0.5.0-cuda11.1.1 | SuperBench v0.5.0 with CUDA 11.1.1 | +| v0.4.0-cuda11.1.1 | SuperBench v0.4.0 with CUDA 11.1.1 | +| v0.3.0-cuda11.1.1 | SuperBench v0.3.0 with CUDA 11.1.1 | +| v0.2.1-cuda11.1.1 | SuperBench v0.2.1 with CUDA 11.1.1 | +| v0.2.0-cuda11.1.1 | SuperBench v0.2.0 with CUDA 11.1.1 | | Tag | Description | |-------------------------------|--------------------------------------------------| +| v0.10.0-rocm5.7 | SuperBench v0.10.0 with ROCm 5.7 | | v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 | | v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 | | v0.9.0-rocm5.0.1 | SuperBench v0.9.0 with ROCm 5.0.1 | diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md index a0bd9964..c2f0e336 100644 --- a/docs/user-tutorial/data-diagnosis.md +++ b/docs/user-tutorial/data-diagnosis.md @@ -65,7 +65,7 @@ superbench: example: ```yaml # SuperBench rules -version: v0.9 +version: v0.10 superbench: rules: failure-rule: diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md index 7e393a18..dffee251 100644 --- a/docs/user-tutorial/result-summary.md +++ b/docs/user-tutorial/result-summary.md @@ -58,7 +58,7 @@ superbench: ```yaml title="Example" # SuperBench rules -version: v0.9 +version: v0.10 superbench: rules: kernel_launch: diff --git a/superbench/__init__.py b/superbench/__init__.py index bc20aebf..e1f4234f 100644 --- a/superbench/__init__.py +++ b/superbench/__init__.py @@ -6,5 +6,5 @@ Provide hardware and software benchmarks for AI systems. """ -__version__ = '0.9.0' +__version__ = '0.10.0' __author__ = 'Microsoft' diff --git a/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py b/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py index b856f939..08055fe7 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py +++ b/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py @@ -94,6 +94,17 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke): default=0, help='Number of graph launch iterations. Set to 0 to disable graph mode. 
Default: 0.', ) + self._parser.add_argument( + '--in_place', + action='store_true', + help='If specified, collect in-place numbers, else collect out-of-place numbers.', + ) + self._parser.add_argument( + '--data_type', + type=str, + default='float', + help='Data type used in NCCL operations. Default: float.', + ) def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -123,9 +134,10 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke): return False command = os.path.join(self._args.bin_dir, self._bin_name) - command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {} -G {}'.format( + command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {} -G {} -d {}'.format( self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus), - str(self._args.check), str(self._args.iters), str(self._args.warmup_iters), str(self._args.graph_iters) + str(self._args.check), str(self._args.iters), str(self._args.warmup_iters), str(self._args.graph_iters), + self._args.data_type ) self._commands.append(command) @@ -171,9 +183,9 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke): content = content[out_of_place_index + 1:out_of_bound_index] # Parse max out of bound bus bw as the result size_index = -1 - time_index = -1 - busbw_index = -1 - algbw_index = -1 + time_index = None + busbw_index = None + algbw_index = None for line in content: if 'time' in line and 'busbw' in line: # Get index of selected column @@ -181,11 +193,17 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke): line = re.sub(r' +', ' ', line).split(' ') # Get first index of condition in list, if it not existing, raise exception size_index = line.index('size') - time_index = line.index('time') - len(line) - busbw_index = line.index('busbw') - len(line) - algbw_index = line.index('algbw') - len(line) + # Need index from the end because sometimes previous fields (like redop) can be empty + if self._args.in_place: + time_index = -1 - list(reversed(line)).index('time') + busbw_index = -1 - list(reversed(line)).index('busbw') + algbw_index = -1 - list(reversed(line)).index('algbw') + else: + time_index = line.index('time') - len(line) + busbw_index = line.index('busbw') - len(line) + algbw_index = line.index('algbw') - len(line) break - if size_index != -1 and busbw_index != -1 and time_index != -1 and algbw_index != -1: + if size_index != -1 and busbw_index is not None and time_index is not None and algbw_index is not None: for line in content: line = line.strip(' ') line = re.sub(r' +', ' ', line).split(' ') diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py index ffc955ab..7232a140 100644 --- a/superbench/benchmarks/micro_benchmarks/dist_inference.py +++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py @@ -493,13 +493,12 @@ class DistInference(MicroBenchmarkWithInvoke): try: output_lines = [x.strip() for x in raw_output.strip().splitlines()] - step_time = None + step_times = [] for output_line in output_lines: - if ' ms per iteration' in output_line: - step_time = float(output_line.split(' ms per iteration')[0].split()[-1]) - break + if output_line.startswith('Latency of step'): + step_times.append(float(output_line.split(' ms')[0].split()[-1])) return self._process_numeric_result( - 'step_times', [step_time], reduce_type=ReduceType.MAX, cal_percentile=True + 'step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True ) except BaseException as e: return 
self._set_error_code_and_print_error_msg( diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/CMakeLists.txt index 728fffc9..27a220f9 100644 --- a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/CMakeLists.txt @@ -31,6 +31,14 @@ else() # link hip device lib add_executable(dist_inference dist_inference.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1") + if(DEFINED ENV{USE_HIPBLASLT_DATATYPE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1") + elseif(DEFINED ENV{USE_HIP_DATATYPE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1") + endif() + if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1") + endif() target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device) else() message(FATAL_ERROR "No CUDA or ROCm environment found.") diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu index 2f4e798c..c6d7cd03 100644 --- a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu +++ b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu @@ -45,6 +45,21 @@ #include #include using cublasLtHalf = hipblasLtHalf; +#if defined(USE_HIPBLASLT_DATATYPE) +#define DIST_INF_HIP_DATATYPE_R_16F HIPBLASLT_R_16F +#define DIST_INF_HIP_DATATYPE_R_32F HIPBLASLT_R_32F +#elif defined(USE_HIP_DATATYPE) +#define DIST_INF_HIP_DATATYPE_R_16F HIP_R_16F +#define DIST_INF_HIP_DATATYPE_R_32F HIP_R_32F +#else +#define DIST_INF_HIP_DATATYPE_R_16F HIPBLAS_R_16F +#define DIST_INF_HIP_DATATYPE_R_32F HIPBLAS_R_32F +#endif +#if defined(USE_HIPBLAS_COMPUTETYPE) +#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLAS_COMPUTE_32F +#else +#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32 +#endif #else #include #include @@ -229,16 +244,18 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t CHECK_CUBLASLT_ERROR(hipblasLtCreate(&handle)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, HIPBLAS_R_16F, k, n, k)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, HIPBLAS_R_16F, m, k, m)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, HIPBLAS_R_16F, m, n, m)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, HIPBLAS_R_16F, m, n, m)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, HIPBLAS_R_16F, k, m, k)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, HIPBLAS_R_16F, k, n, k)); - CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, HIPBLAS_R_16F, k, n, k)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matA, DIST_INF_HIP_DATATYPE_R_16F, k, n, k)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matB, DIST_INF_HIP_DATATYPE_R_16F, m, k, m)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matC, DIST_INF_HIP_DATATYPE_R_16F, m, n, m)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matD, DIST_INF_HIP_DATATYPE_R_16F, m, n, m)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matE, DIST_INF_HIP_DATATYPE_R_16F, k, m, k)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, DIST_INF_HIP_DATATYPE_R_16F, k, n, k)); + CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, DIST_INF_HIP_DATATYPE_R_16F, k, n, k)); - 
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul1, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); - CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul2, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); + CHECK_CUBLASLT_ERROR( + hipblasLtMatmulDescCreate(&matmul1, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F)); + CHECK_CUBLASLT_ERROR( + hipblasLtMatmulDescCreate(&matmul2, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F)); hipblasOperation_t trans = HIPBLAS_OP_N; CHECK_CUBLASLT_ERROR( @@ -336,8 +353,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t #endif std::chrono::steady_clock::time_point start_time, stop_time; + std::vector<double> step_times(num_iters, 0.); for (int i = 0; i < num_warmups + num_iters; ++i) { - if (i == num_warmups) { + if (i >= num_warmups) { start_time = std::chrono::steady_clock::now(); } #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) @@ -350,11 +368,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t model_forward(); #endif CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); + if (i >= num_warmups) { + stop_time = std::chrono::steady_clock::now(); + double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count(); + step_times[i - num_warmups] = step_time; + } + } + for (int i = 0; i < num_iters; i++) { + fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6); } - stop_time = std::chrono::steady_clock::now(); - double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); - fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters, - duration / num_iters / num_layers); #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310) // Destroy graph diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/CMakeLists.txt index 7d285772..410fba00 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/CMakeLists.txt @@ -27,6 +27,13 @@ else() # link hip device lib add_executable(gpu_copy gpu_copy.cpp) + + include(CheckSymbolExists) + check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) + if(${HIP_UNCACHED_MEMORY}) + target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY) + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") target_link_libraries(gpu_copy numa hip::device) else() diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index 929bd8d3..74710c3c 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -313,6 +313,25 @@ int SetGpu(int gpu_id) { return 0; } +#if defined(__HIP_PLATFORM_AMD__) +bool UseFineGrained(const SubBenchArgs &args) { + return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id; +} +cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) { + if (use_fine_grained) { +#if defined(HIP_UNCACHED_MEMORY) + return hipExtMallocWithFlags((void **)ptr, size, hipDeviceMallocUncached); +#else + return hipExtMallocWithFlags((void **)ptr, size, 
hipDeviceMallocFinegrained); +#endif + } else { + return cudaMalloc(ptr, size); + } +} +#else +cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) { return cudaMalloc(ptr, size); } +#endif + // Prepare data buffers and streams to be used. int PrepareBufAndStream(BenchArgs *args) { cudaError_t cuda_err = cudaSuccess; @@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) { return -1; } *(host_buf_ptrs[j]) = nullptr; - cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size); +#if defined(__HIP_PLATFORM_AMD__) + cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub)); +#else + cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size); +#endif if (cuda_err != cudaSuccess) { fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err); return -1; @@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank } // Prepare source buffers - cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size); +#if defined(__HIP_PLATFORM_AMD__) + cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true); +#else + cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size); +#endif if (cuda_err != cudaSuccess) { fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank); return -1; @@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank } // Prepare destination buffers - cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size); +#if defined(__HIP_PLATFORM_AMD__) + cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true); +#else + cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size); +#endif if (cuda_err != cudaSuccess) { fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank); return -1; diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py index 50897377..3feb582d 100644 --- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py @@ -4,7 +4,6 @@ """Module of the hipBlasLt GEMM benchmark.""" import os -import re from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode @@ -23,11 +22,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): super().__init__(name, parameters) self._bin_name = 'hipblaslt-bench' - self._in_types = ['fp32', 'fp16', 'bf16'] + self._in_types = ['fp32', 'fp16', 'bf16', 'fp8'] self._in_type_map = { 'fp16': '--a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r', 'fp32': '--a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --compute_type f32_r', 'bf16': '--a_type bf16_r --b_type bf16_r --c_type bf16_r --d_type bf16_r --compute_type f32_r', + 'fp8': '--a_type f8_r --b_type f8_r --c_type f8_r --d_type f8_r --compute_type f32_r', } def add_parser_arguments(self): @@ -42,6 +42,30 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): required=False, help='List of input data types, support {}.'.format(' '.join(self._in_types)), ) + self._parser.add_argument( + '--initialization', + type=str, + default='rand_int', + choices=['trig_float', 'rand_int', 'hpl'], + required=False, + help='Initialize matrix data.', + ) + self._parser.add_argument( + '--transA', + type=str, + default='N', + choices=['N', 'T', 'C'], + required=False, + help='Transpose matrix A.', + ) + self._parser.add_argument( + '--transB', 
+ type=str, + default='N', + choices=['N', 'T', 'C'], + required=False, + help='Transpose matrix B.', + ) def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -58,7 +82,9 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): self._precision_in_commands = [] for (_m, _n, _k, _b, _in_type) in self._shapes_to_run: command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -j {self._args.num_warmup}' + \ - f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \ + f' --transA {self._args.transA} --transB {self._args.transB}' + \ + f' --initialization {self._args.initialization}' command = command + f' -b {str(_b)}' if _b > 0 else command logger.info(command) self._commands.append(command) @@ -97,13 +123,12 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark): fields = lines[index + 1].strip().split(',') # Check the number of fields and the format of the first two fields - if len(fields) != 23 or not all( - re.match(r'\d*\.\d*$', item.strip()) or item.strip().isdigit() for item in fields[-2:] - ): + if len(fields) != 23: raise ValueError('Invalid result') self._result.add_result( - f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) + f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', + float(fields[-2]) / 1000 ) except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) diff --git a/superbench/benchmarks/micro_benchmarks/rocm_common.cmake b/superbench/benchmarks/micro_benchmarks/rocm_common.cmake index be60df12..1d2cc393 100644 --- a/superbench/benchmarks/micro_benchmarks/rocm_common.cmake +++ b/superbench/benchmarks/micro_benchmarks/rocm_common.cmake @@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") if(EXISTS ${HIP_PATH}) # Search for hip in common locations - list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) - set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH) + list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/) set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc") set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 23310978..5c3350e9 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -116,6 +116,9 @@ class MegatronGPT(ModelBenchmark): self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') + self._parser.add_argument( + '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.' + ) self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument( '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' 
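To make the new `hipblaslt-bench` flags shown earlier in this diff concrete, here is a minimal standalone sketch (not part of the PR) of the command string that `HipBlasLtBenchmark._preprocess` composes for one shape; the flag spellings come from the command template and `_in_type_map` above, while the shape and iteration values are illustrative:

```python
# Sketch of the command composed by HipBlasLtBenchmark._preprocess (illustrative values).
in_type_map = {
    'fp8': '--a_type f8_r --b_type f8_r --c_type f8_r --d_type f8_r --compute_type f32_r',
}
bin_path = 'hipblaslt-bench'  # resolved from the benchmark's bin_dir in the real code
m, n, k, b, in_type = 4096, 4096, 4096, 0, 'fp8'
num_warmup, num_steps = 20, 50

command = f'{bin_path} -m {m} -n {n} -k {k} -j {num_warmup}' + \
    f' -i {num_steps} {in_type_map[in_type]}' + \
    f' --transA N --transB N --initialization rand_int'
command = command + f' -b {b}' if b > 0 else command  # batch flag only for batched GEMMs
print(command)
# hipblaslt-bench -m 4096 -n 4096 -k 4096 -j 20 -i 50 --a_type f8_r --b_type f8_r
#   --c_type f8_r --d_type f8_r --compute_type f32_r --transA N --transB N --initialization rand_int
```

With `b == 0` the `-b` flag is omitted, matching the conditional in the diff.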
@@ -128,6 +131,13 @@ class MegatronGPT(ModelBenchmark): def _preprocess(self): if not super()._preprocess(): return False + if not self._args.code_base: + if self._args.deepspeed: + self._args.code_base = os.path.join( + os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/' + ) + else: + self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM') if not os.path.exists(self._args.code_base) or \ not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): @@ -156,35 +166,35 @@ class MegatronGPT(ModelBenchmark): def _parse_log(self, output): """Parse log output and get the performance.""" - tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') + tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)') elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') - mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') - max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') + mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)') + max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)') lines = output.splitlines() tflops = [] mem_allocated = [] max_mem_allocated = [] iteration_times = [] for line in lines: - if 'TFLOPs' in line: + if 'elapsed time per iteration' in line: tflops_matches = tflops_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line) if tflops_matches: - tflops_values = float(tflops_matches.group(1)) + tflops_values = float(tflops_matches.group(2)) tflops.append(tflops_values) if elapsed_time_match: elapsed_time_value = float(elapsed_time_match.group(1)) iteration_times.append(elapsed_time_value) - if 'MaxMemAllocated' in line: + if 'max allocated' in line: mem_allocated_match = mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line) if mem_allocated_match: - mem_allocated_value = float(mem_allocated_match.group(1)) + mem_allocated_value = float(mem_allocated_match.group(1)) / 1024 mem_allocated.append(mem_allocated_value) if max_mem_allocated_match: - max_mem_allocated_value = float(max_mem_allocated_match.group(1)) + max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024 max_mem_allocated.append(max_mem_allocated_value) return iteration_times, tflops, mem_allocated, max_mem_allocated @@ -224,7 +234,9 @@ class MegatronGPT(ModelBenchmark): --deepspeed \ --deepspeed_config {self._config_json_path} \ --zero-stage {self._args.zero_stage} \ - --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' + --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \ + --train-tokens {self._args.train_tokens} \ + --data-impl {self._args.data_impl}' if self._args.pipeline_model_parallel_size <= 1: deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' @@ -255,11 +267,10 @@ class MegatronGPT(ModelBenchmark): --num-attention-heads {self._args.num_attn_heads} \ --seq-length {self._args.seq_len} \ --max-position-embeddings {self._args.seq_len} \ - --train-tokens {self._args.train_tokens} \ --train-samples {self._args.num_steps * self._args.batch_size} \ --lr {self._args.lr} \ --min-lr {self._args.min_lr} \ - --split 949,50,1 \ + --split {self._args.split} \ --log-interval {self._args.log_interval} \ --eval-interval {self._args.eval_interval} \ --eval-iters {self._args.eval_iters} \ @@ -273,7 +284,8 @@ class MegatronGPT(ModelBenchmark): --optimizer adam \ --use-distributed-optimizer \ {precision_megatron} \ - 
--seed {self._args.seed}' + --seed {self._args.seed} \ + --log-throughput' if self._args.sequence_parallel: megatron_options = f'{megatron_options} --sequence-parallel' @@ -298,6 +310,8 @@ class MegatronGPT(ModelBenchmark): script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') if self._args.deepspeed: deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) + # No --log-throughput in Megatron-DeepSpeed by 20231219 + megatron_options = megatron_options.replace('--log-throughput', '').strip() if self._num_nodes > 1: command = f'torchrun {self._distributed_args} ' + \ f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' @@ -379,6 +393,7 @@ class MegatronGPT(ModelBenchmark): return False self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + master_addr = 'localhost' if self._num_nodes > 1: if not self._args.hostfile: sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') @@ -395,12 +410,13 @@ class MegatronGPT(ModelBenchmark): if self._num_nodes != len(hosts): logger.error('MPI init failed since hostfile not match the MPI setting.') return False + master_addr = hosts[0].split()[0] - addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) - port = os.getenv('MASTER_PORT', '29500') - node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) - self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ - f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' + addr = os.getenv('MASTER_ADDR', master_addr) + port = os.getenv('MASTER_PORT', '29500') + node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) + self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ + f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' return True def _generate_dataset(self): @@ -448,8 +464,7 @@ class MegatronGPT(ModelBenchmark): self._data_options = f'\ --vocab-file {self._vocab_path} \ --merge-file {self._merges_path} \ - --data-path {self._data_path} \ - --data-impl {self._args.data_impl}' + --data-path {self._data_path}' logger.info('Dataset preparation successfully.') return True diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index d1da27b4..fc625af9 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -265,8 +265,8 @@ class ModelBenchmark(Benchmark): # The unit of step time should be millisecond. 
step_times = self._train_step(precision) if isinstance(step_times, tuple): - step_times = step_times[0] info = step_times[1] + step_times = step_times[0] self._process_info(ModelAction.TRAIN, precision, info) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) if not step_times: diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 09398cac..18bed8dc 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -13,7 +13,7 @@ gpu = GPU() if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': import py3nvml.py3nvml as nvml elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': - from pyrsmi import rocml + import amdsmi as rocml class DeviceManager: @@ -150,7 +150,7 @@ class NvidiaDeviceManager(DeviceManager): try: cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0]) except Exception as err: - logger.error('Get device compute capability failed: {}'.format(str(err))) + logger.warning('Get device compute capability failed: {}'.format(str(err))) return None return cap @@ -166,7 +166,7 @@ class NvidiaDeviceManager(DeviceManager): try: util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx]) except Exception as err: - logger.error('Get device utilization failed: {}'.format(str(err))) + logger.warning('Get device utilization failed: {}'.format(str(err))) return None return util.gpu @@ -182,7 +182,7 @@ class NvidiaDeviceManager(DeviceManager): try: temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) except Exception as err: - logger.error('Get device temperature failed: {}'.format(str(err))) + logger.warning('Get device temperature failed: {}'.format(str(err))) temp = None return temp @@ -198,7 +198,7 @@ class NvidiaDeviceManager(DeviceManager): try: power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) except Exception as err: - logger.error('Get device power failed: {}'.format(str(err))) + logger.warning('Get device power failed: {}'.format(str(err))) return None return int(int(power) / 1000) @@ -214,7 +214,7 @@ class NvidiaDeviceManager(DeviceManager): try: powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) except Exception as err: - logger.error('Get device power limitation failed: {}'.format(str(err))) + logger.warning('Get device power limitation failed: {}'.format(str(err))) return None return int(int(powerlimit) / 1000) @@ -231,7 +231,7 @@ class NvidiaDeviceManager(DeviceManager): try: mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) except Exception as err: - logger.error('Get device memory failed: {}'.format(str(err))) + logger.warning('Get device memory failed: {}'.format(str(err))) return None, None return mem.used, mem.total @@ -304,7 +304,7 @@ class NvidiaDeviceManager(DeviceManager): except nvml.NVMLError: pass except Exception as err: - logger.error('Get device ECC information failed: {}'.format(str(err))) + logger.warning('Get device ECC information failed: {}'.format(str(err))) return None, None try: @@ -316,7 +316,7 @@ class NvidiaDeviceManager(DeviceManager): except nvml.NVMLError: pass except Exception as err: - logger.error('Get device ECC information failed: {}'.format(str(err))) + logger.warning('Get device ECC information failed: {}'.format(str(err))) return None, None return corrected_ecc, uncorrected_ecc @@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager): """Device management module for AMD.""" def 
__init__(self): """Constructor.""" - rocml.smi_initialize() + rocml.amdsmi_init() + self._device_handlers = rocml.amdsmi_get_processor_handles() super().__init__() def __del__(self): """Destructor.""" - rocml.smi_shutdown() + rocml.amdsmi_shut_down() def get_device_count(self): """Get the number of device. @@ -339,7 +340,7 @@ Return: count (int): count of device. """ - return rocml.smi_get_device_count() + return len(self._device_handlers) def get_device_utilization(self, idx): """Get the utilization of device. @@ -351,11 +352,11 @@ util (int): the utilization of device, None means failed to get the data. """ try: - util = rocml.smi_get_device_utilization(idx) + engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx]) except Exception as err: - logger.error('Get device utilization failed: {}'.format(str(err))) + logger.warning('Get device utilization failed: {}'.format(str(err))) return None - return util + return engine_usage['gfx_activity'] def get_device_temperature(self, idx): """Get the temperature of device, unit: celsius. @@ -366,8 +367,16 @@ Return: temp (int): the temperature of device, None means failed to get the data. """ - # Currently no API provided in rocml. - return None + try: + temp = rocml.amdsmi_get_temp_metric( + self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT + ) + except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException): + temp = None + except Exception as err: + logger.warning('Get device temperature failed: {}'.format(str(err))) + temp = None + return temp def get_device_power(self, idx): """Get the realtime power of device, unit: watt. @@ -379,11 +388,11 @@ temp (int): the realtime power of device, None means failed to get the data. """ try: - power = rocml.smi_get_device_average_power(idx) + power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) except Exception as err: - logger.error('Get device power failed: {}'.format(str(err))) + logger.warning('Get device power failed: {}'.format(str(err))) return None - return int(int(power) / 1000) + return int(power_measure['average_socket_power']) def get_device_power_limit(self, idx): """Get the power management limit of device, unit: watt. @@ -394,8 +403,12 @@ Return: temp (int): the power management limit of device, None means failed to get the data. """ - # Currently no API provided in rocml. - return None + try: + power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx]) + except Exception as err: + logger.warning('Get device power limit failed: {}'.format(str(err))) + return None + return int(power_measure['power_limit']) def get_device_memory(self, idx): """Get the memory information of device, unit: byte. @@ -408,10 +421,10 @@ total (int): the total device memory in bytes, None means failed to get the data. 
""" try: - mem_used = rocml.smi_get_device_memory_used(idx) - mem_total = rocml.smi_get_device_memory_total(idx) + mem_used = rocml.amdsmi_get_gpu_memory_usage(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM) + mem_total = rocml.amdsmi_get_gpu_memory_total(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM) except Exception as err: - logger.error('Get device memory failed: {}'.format(str(err))) + logger.warning('Get device memory failed: {}'.format(str(err))) return None, None return mem_used, mem_total @@ -425,8 +438,19 @@ class AmdDeviceManager(DeviceManager): corrected_ecc (int) : the count of single bit ecc error. uncorrected_ecc (int): the count of double bit ecc error. """ - # Currently no API provided in rocml. - return None, None + corrected_ecc = 0 + uncorrected_ecc = 0 + for block in rocml.AmdSmiGpuBlock: + try: + ecc_count = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], block) + corrected_ecc += ecc_count['correctable_count'] + uncorrected_ecc += ecc_count['uncorrectable_count'] + except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException): + pass + except Exception as err: + logger.info('Get device ECC information failed: {}'.format(str(err))) + + return corrected_ecc, uncorrected_ecc device_manager: Optional[DeviceManager] = DeviceManager() diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml index 71822453..9aec785f 100644 --- a/superbench/config/amd_mi100_hpe.yaml +++ b/superbench/config/amd_mi100_hpe.yaml @@ -3,7 +3,7 @@ # Server: # - Product: HPE Apollo 6500 -version: v0.9 +version: v0.10 superbench: enable: null var: diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml index 8aa8fd85..7e56d1a3 100644 --- a/superbench/config/amd_mi100_z53.yaml +++ b/superbench/config/amd_mi100_z53.yaml @@ -4,7 +4,7 @@ # - Product: G482-Z53 # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html -version: v0.9 +version: v0.10 superbench: enable: null var: diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml index 5ffa2631..7624a86d 100644 --- a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml +++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml @@ -1,4 +1,4 @@ -version: v0.9 +version: v0.10 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml index 5c78d866..befcd178 100644 --- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml +++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml @@ -1,4 +1,4 @@ -version: v0.9 +version: v0.10 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml index 75375cd7..af19e0a2 100644 --- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml +++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml @@ -1,4 +1,4 @@ -version: v0.9 +version: v0.10 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml index 4914780a..3ef0c399 100644 --- a/superbench/config/azure_ndmv4.yaml +++ b/superbench/config/azure_ndmv4.yaml @@ -3,7 +3,7 @@ # Azure NDm A100 v4 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series -version: v0.9 +version: v0.10 
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml
index 827626af..921a446b 100644
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.9
+version: v0.10
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 7bade5d3..9533806c 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.9
+version: v0.10
 superbench:
   enable: null
   monitor:
diff --git a/superbench/runner/playbooks/deploy.yaml b/superbench/runner/playbooks/deploy.yaml
index 516d252b..4830b97a 100644
--- a/superbench/runner/playbooks/deploy.yaml
+++ b/superbench/runner/playbooks/deploy.yaml
@@ -100,7 +100,7 @@
       docker run -itd --name={{ container }} \
         --privileged --net=host --ipc=host \
         {{ '--gpus=all' if nvidia_gpu_exist else '' }} \
-        {{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
+        {{ '--security-opt seccomp=unconfined --group-add video --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G' if amd_gpu_exist else '' }} \
        -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
        -v /var/run/docker.sock:/var/run/docker.sock \
        --entrypoint /bin/bash {{ docker_image }} && \
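The deploy playbook now passes the ROCm device nodes through to the container. As a hedged illustration of the same flag-selection logic outside Ansible, a Python launcher might assemble the arguments as below; the `nvidia_gpu_exist`/`amd_gpu_exist` names mirror the Jinja variables, while the helper itself is hypothetical.

```python
# Illustrative sketch of the deploy.yaml flag logic in plain Python.
def docker_run_args(nvidia_gpu_exist: bool, amd_gpu_exist: bool) -> list:
    args = ['docker', 'run', '-itd', '--privileged', '--net=host', '--ipc=host']
    if nvidia_gpu_exist:
        args.append('--gpus=all')
    if amd_gpu_exist:
        # ROCm workloads need the KFD compute node and DRI render nodes inside
        # the container, plus ptrace for profilers and a larger /dev/shm.
        args += [
            '--security-opt', 'seccomp=unconfined', '--group-add', 'video',
            '--device=/dev/kfd', '--device=/dev/dri',
            '--cap-add=SYS_PTRACE', '--shm-size=16G',
        ]
    return args
```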
diff --git a/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
index b818a26b..c6d85c38 100644
--- a/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
@@ -66,6 +66,8 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
         assert (benchmark._args.iters == 20)
         assert (benchmark._args.warmup_iters == 5)
         assert (benchmark._args.graph_iters == 0)
+        assert (benchmark._args.in_place is False)
+        assert (benchmark._args.data_type == 'float')

         # Check command list
         bin_names = [
@@ -74,7 +76,7 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
         ]

         command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
-        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0'.format(bin_names[0])
+        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0 -d float'.format(bin_names[0])
         assert (command == expected_command)

         # Check results and metrics.
@@ -91,6 +93,11 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
             'alltoall': alltoall,
         }

+        if 'SB_MODE_SERIAL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_SERIAL_INDEX')
+        if 'SB_MODE_PARALLEL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_PARALLEL_INDEX')
+
         for op in raw_output.keys():
             benchmark._args.operation = op
             assert (benchmark._process_raw_result(0, raw_output[op]))
@@ -131,3 +138,48 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
         assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
         assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
         assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
+
+    @decorator.load_data('tests/data/nccl_allreduce.log')
+    @decorator.load_data('tests/data/nccl_alltoall.log')
+    def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
+        """Test nccl-bw benchmark in-place parsing."""
+        benchmark_name = 'nccl-bw'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8 --in_place')
+
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark._args.in_place is True)
+
+        # Case with valid raw_output
+        raw_output = {
+            'allreduce': allreduce,
+            'alltoall': alltoall,
+        }
+
+        if 'SB_MODE_SERIAL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_SERIAL_INDEX')
+        if 'SB_MODE_PARALLEL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_PARALLEL_INDEX')
+
+        for op in raw_output.keys():
+            benchmark._args.operation = op
+            assert (benchmark._process_raw_result(0, raw_output[op]))
+
+            for name in ['time', 'algbw', 'busbw']:
+                for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
+                    metric = op + '_' + size + '_' + name
+                    assert (metric in benchmark.result)
+                    assert (len(benchmark.result[metric]) == 1)
+                    assert (isinstance(benchmark.result[metric][0], numbers.Number))
+
+        assert (benchmark.result['allreduce_8589934592_time'][0] == 63959.0)
+        assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.30)
+        assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.03)
+        assert (benchmark.result['alltoall_8589934592_time'][0] == 33234.0)
+        assert (benchmark.result['alltoall_8589934592_algbw'][0] == 258.47)
+        assert (benchmark.result['alltoall_8589934592_busbw'][0] == 226.16)
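nccl-tests and rccl-tests print each message size as one row with an out-of-place column group followed by an in-place group, each carrying time/algbw/busbw plus an error count. A hedged sketch of the column selection the new `--in_place` flag implies is shown below; the exact row layout is an assumption about the nccl-tests output format, not taken from the benchmark source.

```python
# Hedged sketch: pick the in-place or out-of-place (time, algbw, busbw) triple
# from one nccl-tests result row. Assumed trailing layout per row:
#   ... <oop time> <oop algbw> <oop busbw> <#wrong> <ip time> <ip algbw> <ip busbw> <#wrong>
def parse_bw_row(line: str, in_place: bool):
    cols = line.split()
    time_us, algbw, busbw = cols[-4:-1] if in_place else cols[-8:-5]
    return float(time_us), float(algbw), float(busbw)

# Under this assumption, the allreduce row for the 8 GiB size would yield
# (63959.0, 134.30, 235.03) with in_place=True, matching the test above.
```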
diff --git a/tests/benchmarks/micro_benchmarks/test_dist_inference.py b/tests/benchmarks/micro_benchmarks/test_dist_inference.py
index e24ec341..2fb6a8a4 100644
--- a/tests/benchmarks/micro_benchmarks/test_dist_inference.py
+++ b/tests/benchmarks/micro_benchmarks/test_dist_inference.py
@@ -3,7 +3,6 @@

 """Tests for distributed inference benchmark."""

-import numbers
 import unittest

 from tests.helper import decorator
@@ -209,19 +208,17 @@ class DistInferenceCppImplTest(BenchmarkTestCase, unittest.TestCase):
         # step_times
         assert (len(benchmark.raw_data) == 2)
         # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
-        test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
         assert (7 == len(benchmark.result))
-        for output_key in benchmark.result:
-            if output_key == 'return_code':
-                assert (benchmark.result[output_key] == [0])
-            else:
-                assert (output_key.startswith('step_times'))
-                assert (len(benchmark.result[output_key]) == 1)
-                assert (isinstance(benchmark.result[output_key][0], numbers.Number))
-                assert (test_latency == benchmark.result[output_key][0])
+        assert (benchmark.result['return_code'] == [0])
+        assert (benchmark.result['step_times'] == [1.9052048])
+        assert (benchmark.result['step_times_50'] == [1.851])
+        assert (benchmark.result['step_times_90'] == [1.89637])
+        assert (benchmark.result['step_times_95'] == [2.12037])
+        assert (benchmark.result['step_times_99'] == [2.67155])
+        assert (benchmark.result['step_times_99.9'] == [4.4198])

         # Negative case - invalid raw output.
-        assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
+        assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
         assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

     @decorator.cuda_test
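The rewritten test pins exact percentile values instead of comparing every metric against a single latency. A minimal sketch of how such metrics can be derived from the per-step latencies in the new `tests/data/dist_inference.log` follows; the nearest-rank selection is an assumption about the benchmark's exact method, and interpolating implementations would differ slightly.

```python
import math

# Hedged sketch: derive mean and percentile metrics from per-step latencies.
# Nearest-rank selection (ceil(n * p / 100) - 1) is an assumption.
def step_time_metrics(step_times):
    ordered = sorted(step_times)
    n = len(ordered)
    metrics = {'step_times': sum(ordered) / n}
    for pct in (50, 90, 95, 99, 99.9):
        rank = max(0, math.ceil(n * pct / 100) - 1)
        metrics[f'step_times_{pct:g}'] = ordered[rank]
    return metrics
```

With the 100 samples in the log data later in this diff, the mean comes out near the asserted 1.9052048 ms, and the 99.9th percentile lands on the 4.4198 ms outlier recorded at step 63.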
diff --git a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py
index f91019f6..a93d93fb 100644
--- a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py
@@ -55,7 +55,7 @@ class HipblasLtBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
         self.assertFalse(benchmark._preprocess())
         benchmark = benchmark_cls(
             self.benchmark_name,
-            parameters='--shapes 2:4,4:8,8:32 2:4,4:8,8:32:+4 --in_types fp16 fp32 bf16',
+            parameters='--shapes 2:4,4:8,8:32 2:4,4:8,8:32:+4 --in_types fp16 fp32 bf16 fp8',
         )
         self.assertTrue(benchmark._preprocess())
         self.assertEqual((2 * 2 * 3 + 2 * 2 * 7) * len(benchmark._args.in_types), len(benchmark._commands))
@@ -63,12 +63,16 @@
         def cmd(t, b, m, n, k):
             if b == 0:
                 return f'{benchmark._HipBlasLtBenchmark__bin_path} ' + \
-                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]}'
+                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]}' + \
+                    f' --transA {benchmark._args.transA} --transB {benchmark._args.transB}' + \
+                    f' --initialization {benchmark._args.initialization}'
             else:
                 return f'{benchmark._HipBlasLtBenchmark__bin_path} ' + \
-                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]} -b {b}'
+                    f'-m {m} -n {n} -k {k} -j 20 -i 50 {benchmark._in_type_map[t]} -b {b}' + \
+                    f' --transA {benchmark._args.transA} --transB {benchmark._args.transB}' + \
+                    f' --initialization {benchmark._args.initialization}'

-        for _t in ['fp16', 'fp32', 'bf16']:
+        for _t in ['fp16', 'fp32', 'bf16', 'fp8']:
             for _m in [2, 4]:
                 for _n in [4, 8]:
                     for _k in [8, 16, 32]:
@@ -102,7 +106,7 @@ N,N,0,1,896,896,896,1,896,802816,0,896,802816,896,802816,896,802816,fp16_r,f32_r
         self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

         self.assertEqual(2, len(benchmark.result))
-        self.assertEqual(58624.5, benchmark.result['fp16_1_896_896_896_flops'][0])
+        self.assertEqual(58.6245, benchmark.result['fp16_1_896_896_896_flops'][0])

         # Negative case - invalid raw output
         self.assertFalse(benchmark._process_raw_result(1, 'HipBLAS API failed'))
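The expected metric for the 896x896x896 fp16 GEMM drops from 58624.5 to 58.6245 because the hipBLASLt metric unit changed from GFLOPS to TFLOPS (see the release notes later in this diff). A one-line conversion in the result-processing path would look roughly like this sketch; the surrounding parsing is left out.

```python
# Hedged sketch: hipblaslt-bench reports GFLOPS; the benchmark now stores TFLOPS.
def gflops_to_tflops(gflops: float) -> float:
    return gflops / 1000.0

assert gflops_to_tflops(58624.5) == 58.6245
```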
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 7a8be1aa..2f1d076f 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -177,8 +177,7 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
         benchmark._data_options = f'\
 --vocab-file {self._tmp_dir}/gpt2-vocab.json \
 --merge-file {self._tmp_dir}/gpt2-merges.txt \
---data-path {self._tmp_dir}/dataset_text_document \
---data-impl mmap'
+--data-path {self._tmp_dir}/dataset_text_document'

         script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
         expected_command = 'torchrun {distributed_args} {script_path} \
@@ -197,7 +196,6 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
 --num-attention-heads 32 \
 --seq-length 2048 \
 --max-position-embeddings 2048 \
---train-tokens 300000000000 \
 --train-samples 20480 \
 --lr 0.00012 \
 --min-lr 1e-06 \
@@ -215,7 +213,8 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
 --optimizer adam \
 --use-distributed-optimizer \
 {precision} \
---seed 1234 {data_options}'
+--seed 1234 \
+--log-throughput {data_options}'

         precision = Precision.FLOAT32
         command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
         benchmark._data_options = f'\
 --vocab-file {self._tmp_dir}/gpt2-vocab.json \
 --merge-file {self._tmp_dir}/gpt2-merges.txt \
---data-path {self._tmp_dir}/dataset_text_document \
---data-impl mmap'
+--data-path {self._tmp_dir}/dataset_text_document'

         command = benchmark._megatron_command(Precision.BFLOAT16)
-        expected_command = 'deepspeed {script_path} \
---override-opt_param-scheduler \
+        expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
 --adam-beta1 0.9 \
 --adam-beta2 0.95 \
 --tensor-model-parallel-size 1 \
@@ -282,7 +279,6 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
 --num-attention-heads 32 \
 --seq-length 2048 \
 --max-position-embeddings 2048 \
---train-tokens 300000000000 \
 --train-samples 20480 \
 --lr 0.00012 \
 --min-lr 1e-06 \
@@ -306,7 +302,9 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
 --deepspeed \
 --deepspeed_config {benchmark._config_json_path} \
 --zero-stage 1 \
---pipeline-model-parallel-size 1 --no-pipeline-parallel'
+--pipeline-model-parallel-size 1 \
+--train-tokens 300000000000 \
+--data-impl mmap --no-pipeline-parallel'

         self.assertEqual(
             command,
@@ -346,12 +344,12 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
         iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
         assert (statistics.mean(iteration_times) == 75239.24)
         assert (statistics.mean(tflops) == 149.136)
-        assert (statistics.mean(mem_allocated) == 17.54)
-        assert (statistics.mean(max_mem_allocated) == 66.97)
+        assert (statistics.mean(mem_allocated) == 17.535637855529785)
+        assert (statistics.mean(max_mem_allocated) == 66.9744234085083)

         info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
         benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
         assert (benchmark.result is not None)
         assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
-        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
-        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
+        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
+        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
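With `--log-throughput` passed to Megatron, the per-iteration log lines carry the timing and throughput figures that `_parse_log` extracts. A hedged regex sketch of that extraction is below; the log phrases are modeled on Megatron-LM's `--log-throughput` output, and the real `_parse_log` (which also recovers the memory figures asserted above) may differ.

```python
import re

# Hedged sketch of pulling per-iteration metrics out of a Megatron training log.
ITER_RE = re.compile(r'elapsed time per iteration \(ms\): ([\d.]+)')
TFLOPS_RE = re.compile(r'throughput per GPU \(TFLOP/s/GPU\): ([\d.]+)')

def parse_megatron_log(raw_output: str):
    iteration_times = [float(m) for m in ITER_RE.findall(raw_output)]
    tflops = [float(m) for m in TFLOPS_RE.findall(raw_output)]
    return iteration_times, tflops
```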
diff --git a/tests/data/dist_inference.log b/tests/data/dist_inference.log
index 14b104cf..3d3b7e8c 100644
--- a/tests/data/dist_inference.log
+++ b/tests/data/dist_inference.log
@@ -1,2 +1,100 @@
-Parameters: m=80, n=128, k=128, alpha=1.000000, beta=1.000000, num_layers=50, num_warmups=20, num_iters=100, use_cuda_graph=0
-Time: 173 ms in total, 1.73 ms per iteration, 0.0346 ms per layer
+Latency of step 0: 1.8339 ms
+Latency of step 1: 1.84222 ms
+Latency of step 2: 1.90869 ms
+Latency of step 3: 1.85375 ms
+Latency of step 4: 1.87192 ms
+Latency of step 5: 1.84254 ms
+Latency of step 6: 1.91165 ms
+Latency of step 7: 1.8214 ms
+Latency of step 8: 1.91427 ms
+Latency of step 9: 1.89586 ms
+Latency of step 10: 1.86816 ms
+Latency of step 11: 1.85105 ms
+Latency of step 12: 1.84486 ms
+Latency of step 13: 1.84915 ms
+Latency of step 14: 1.82332 ms
+Latency of step 15: 1.91444 ms
+Latency of step 16: 1.85073 ms
+Latency of step 17: 1.81812 ms
+Latency of step 18: 2.67155 ms
+Latency of step 19: 1.85119 ms
+Latency of step 20: 1.87989 ms
+Latency of step 21: 1.83932 ms
+Latency of step 22: 1.84041 ms
+Latency of step 23: 1.84789 ms
+Latency of step 24: 1.85079 ms
+Latency of step 25: 1.82229 ms
+Latency of step 26: 1.83376 ms
+Latency of step 27: 1.851 ms
+Latency of step 28: 1.86246 ms
+Latency of step 29: 1.8371 ms
+Latency of step 30: 1.88932 ms
+Latency of step 31: 1.84459 ms
+Latency of step 32: 1.82725 ms
+Latency of step 33: 1.83566 ms
+Latency of step 34: 1.84041 ms
+Latency of step 35: 1.87058 ms
+Latency of step 36: 1.84038 ms
+Latency of step 37: 1.85555 ms
+Latency of step 38: 1.85848 ms
+Latency of step 39: 2.40561 ms
+Latency of step 40: 1.85029 ms
+Latency of step 41: 1.84562 ms
+Latency of step 42: 1.8351 ms
+Latency of step 43: 1.84196 ms
+Latency of step 44: 1.86032 ms
+Latency of step 45: 1.87147 ms
+Latency of step 46: 1.84832 ms
+Latency of step 47: 1.85715 ms
+Latency of step 48: 1.86012 ms
+Latency of step 49: 1.86327 ms
+Latency of step 50: 1.84388 ms
+Latency of step 51: 1.86396 ms
+Latency of step 52: 1.85538 ms
+Latency of step 53: 1.85564 ms
+Latency of step 54: 1.83979 ms
+Latency of step 55: 1.85334 ms
+Latency of step 56: 1.85712 ms
+Latency of step 57: 1.85284 ms
+Latency of step 58: 1.84534 ms
+Latency of step 59: 1.86041 ms
+Latency of step 60: 1.86305 ms
+Latency of step 61: 2.2213 ms
+Latency of step 62: 1.83054 ms
+Latency of step 63: 4.4198 ms
+Latency of step 64: 1.87245 ms
+Latency of step 65: 1.83845 ms
+Latency of step 66: 1.82047 ms
+Latency of step 67: 1.81191 ms
+Latency of step 68: 1.83887 ms
+Latency of step 69: 1.8463 ms
+Latency of step 70: 2.12037 ms
+Latency of step 71: 1.85782 ms
+Latency of step 72: 1.84939 ms
+Latency of step 73: 1.82054 ms
+Latency of step 74: 1.8866 ms
+Latency of step 75: 1.83937 ms
+Latency of step 76: 1.84167 ms
+Latency of step 77: 1.89637 ms
+Latency of step 78: 1.8392 ms
+Latency of step 79: 1.83754 ms
+Latency of step 80: 1.84721 ms
+Latency of step 81: 1.88112 ms
+Latency of step 82: 1.84474 ms
+Latency of step 83: 1.84084 ms
+Latency of step 84: 1.85134 ms
+Latency of step 85: 1.85315 ms
+Latency of step 86: 1.83406 ms
+Latency of step 87: 1.87803 ms
+Latency of step 88: 1.8369 ms
+Latency of step 89: 1.85909 ms
+Latency of step 90: 1.84519 ms
+Latency of step 91: 2.52689 ms
+Latency of step 92: 1.86594 ms
+Latency of step 93: 1.86974 ms
+Latency of step 94: 1.85219 ms
+Latency of step 95: 1.86255 ms
+Latency of step 96: 1.82652 ms
+Latency of step 97: 1.84379 ms
+Latency of step 98: 1.84553 ms
+Latency of step 99: 1.87082 ms
diff --git a/third_party/Makefile b/third_party/Makefile
index 41550ea8..69623af8 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -7,18 +7,20 @@ MPI_HOME ?= /usr/local/mpi
 HIP_HOME ?= /opt/rocm/hip
 RCCL_HOME ?= /opt/rocm/rccl
 HPCX_HOME ?= /opt/hpcx
+ROCM_PATH ?= /opt/rocm
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
+ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")

-.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
+.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm

 # Build all targets.
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
-rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
+rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
 cpu: common cpu_perftest
 common: cpu_hpl cpu_stream fio
 directx_amd: directx_amf_encoding_latency
@@ -62,7 +64,7 @@ endif
 cuda_nccl_tests: sb_micro_path
 ifneq (,$(wildcard nccl-tests/Makefile))
	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
-	cp -v ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
+	cp -v -r ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
 endif

 # Build perftest.
@@ -86,11 +88,11 @@ ifneq (,$(wildcard fio/Makefile))
	cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
 endif

-# Build rccl-tests from commit 2a18737 of default branch.
+# Build rccl-tests from commit 46375b1 of default branch.
 rocm_rccl_tests: sb_micro_path
 ifneq (, $(wildcard rccl-tests/Makefile))
-	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIP_HOME=$(HIP_HOME) RCCL_HOME=$(RCCL_HOME) -j
-	cp -v ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
+	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
+	cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
 endif

 # Build rocblas-bench.
@@ -175,42 +177,58 @@ directx_amf_encoding_latency:
	"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
	)

-# Install Megatron-LM
+# Install requirements for Megatron-LM
 megatron_lm:
-	if [ ! -d "Megatron/Megatron-LM" ]; then \
-		git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
-	fi
	cd Megatron && \
-	python -m pip install -r requirements.txt
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt

-# Install Megatron-DeepSpeed
+# Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:
-	if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
-		git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
-	fi
	cd Megatron && \
-	python -m pip install -r requirements.txt && \
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt && \
	python -m pip install DeepSpeed
-d "Megatron/Megatron-DeepSpeed" ]; then \ - git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \ - fi cd Megatron && \ - python -m pip install -r requirements.txt && \ + apt install -y python3-mpi4py && \ + python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install DeepSpeed +# Instal apex of ROCm due to dependency of Megatron +apex_rocm: + $(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)")) + $(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION)))) + $(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION)))) + if [ ! -d "apex" ]; then \ + git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \ + fi + cd apex && \ + if [ "$$(expr $(TORCH_MAJOR_VERSION) \> 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) \> 1)" -eq 1 ]; then \ + git checkout master ; \ + elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 1)" -eq 1 ]; then \ + git checkout release/1.1.0 ; \ + elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \ + git checkout release/1.0.0 ; \ + elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \ + git checkout release/1.0.0 ; \ + fi + pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex + # Build MSCCL for CUDA cuda_msccl: sb_micro_path ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile)) cd ./msccl/executor/msccl-executor-nccl && \ - make -j4 src.build && \ + make -j $(shell nproc --ignore=2) src.build && \ cd ../../.. mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \ cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/ endif ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile)) cd ./msccl/scheduler/msccl-scheduler && \ - CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j4 && \ + CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \ cd ../../.. mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \ cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/ endif ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile)) cd ./msccl/tests/msccl-tests-nccl && \ - make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j4 && cd ../../.. + make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../.. 

 # Build MSCCL for CUDA
 cuda_msccl: sb_micro_path
 ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
	cd ./msccl/executor/msccl-executor-nccl && \
-	make -j4 src.build && \
+	make -j $(shell nproc --ignore=2) src.build && \
	cd ../../..
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
 endif
 ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
	cd ./msccl/scheduler/msccl-scheduler && \
-	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j4 && \
+	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \
	cd ../../..
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
	cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
 endif
 ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
	cd ./msccl/tests/msccl-tests-nccl && \
-	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j4 && cd ../../..
+	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../..
	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
 endif
diff --git a/third_party/Megatron/Megatron-DeepSpeed b/third_party/Megatron/Megatron-DeepSpeed
new file mode 160000
index 00000000..71e8407c
--- /dev/null
+++ b/third_party/Megatron/Megatron-DeepSpeed
@@ -0,0 +1 @@
+Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
diff --git a/third_party/Megatron/Megatron-LM b/third_party/Megatron/Megatron-LM
new file mode 160000
index 00000000..52b7a18a
--- /dev/null
+++ b/third_party/Megatron/Megatron-LM
@@ -0,0 +1 @@
+Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
diff --git a/third_party/Megatron/megatron_deepspeed_rocm6.patch b/third_party/Megatron/megatron_deepspeed_rocm6.patch
new file mode 100644
index 00000000..05c72186
--- /dev/null
+++ b/third_party/Megatron/megatron_deepspeed_rocm6.patch
@@ -0,0 +1,26 @@
+diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
+index 90e1c9f..d217aec 100644
+--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
++++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
+@@ -4,7 +4,7 @@
+ #include
+ #include
+ #include
+-#ifndef __HIP_PLATFORM_HCC__
++#ifndef __HIP_PLATFORM_AMD__
+ #include
+ #endif
+ #include
+diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+index 74c9f3d..03b5fc8 100644
+--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
++++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+@@ -4,7 +4,7 @@
+ #include
+ #include
+ #include
+-#ifndef __HIP_PLATFORM_HCC__
++#ifndef __HIP_PLATFORM_AMD__
+ #include
+ #endif
+ #include
diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt
index 8b772329..a64133a0 100644
--- a/third_party/Megatron/requirements.txt
+++ b/third_party/Megatron/requirements.txt
@@ -10,4 +10,6 @@ tqdm
 sentencepiece
 wandb
 einops
-typing_extensions==4.5.0
+typing_extensions==4.9.0
+apex
+mpi4py
diff --git a/third_party/nccl-tests b/third_party/nccl-tests
index 8274cb47..1292b255 160000
--- a/third_party/nccl-tests
+++ b/third_party/nccl-tests
@@ -1 +1 @@
-Subproject commit 8274cb47b6dc70ce4411e7f114b77173d3892414
+Subproject commit 1292b25553bd0384f2faa2965f9d82b99797a348
diff --git a/third_party/perftest b/third_party/perftest
index 5fb4f10a..dffd1dd8 160000
--- a/third_party/perftest
+++ b/third_party/perftest
@@ -1 +1 @@
-Subproject commit 5fb4f10a7e7827ed15e53c25810a10be279d6e23
+Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc
diff --git a/third_party/perftest_rocm6.patch b/third_party/perftest_rocm6.patch
new file mode 100644
index 00000000..fe15bd25
--- /dev/null
+++ b/third_party/perftest_rocm6.patch
@@ -0,0 +1,28 @@
+diff --git a/configure.ac b/configure.ac
+index 20eceda..c8f0c07 100755
+--- a/configure.ac
++++ b/configure.ac
+@@ -237,7 +237,7 @@ AC_ARG_WITH([rocm],
+              ],
+              [AS_CASE([$with_rocm],
+                  [yes|no], [],
+-                 [CPPFLAGS="-I$with_rocm/include $CPPFLAGS"
++                 [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS"
+                   LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"])
+              ])
+
+diff --git a/src/rocm_memory.c b/src/rocm_memory.c
+index e9a9136..b6cb23a 100644
+--- a/src/rocm_memory.c
++++ b/src/rocm_memory.c
+@@ -44,8 +44,8 @@ static int init_rocm(int device_id) {
+
+     hipDeviceProp_t prop = {0};
+     ROCM_CHECK(hipGetDeviceProperties(&prop, device_id));
+-    printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n",
+-           device_id, prop.name, prop.pciBusID, prop.gcnArch);
++    printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n",
++           device_id, prop.name, prop.pciBusID, prop.gcnArchName);
+
+     return SUCCESS;
+ }
diff --git a/third_party/rccl-tests b/third_party/rccl-tests
index 2a18737d..46375b1c 160000
--- a/third_party/rccl-tests
+++ b/third_party/rccl-tests
@@ -1 +1 @@
-Subproject commit 2a18737dc681e03ce82c046caa71b28db65017b5
+Subproject commit 46375b1c527b2e3afe80fdd6dd136151bd939675
diff --git a/website/blog/2023-12-31-release-0-10.md b/website/blog/2023-12-31-release-0-10.md
new file mode 100644
index 00000000..60fc2402
--- /dev/null
+++ b/website/blog/2023-12-31-release-0-10.md
@@ -0,0 +1,53 @@
+---
+slug: release-sb-v0.10
+title: Releasing SuperBench v0.10
+author: Peng Cheng
+author_title: SuperBench Team
+author_url: https://github.com/cp5555
+author_image_url: https://github.com/cp5555.png
+tags: [superbench, announcement, release]
+---
+
+We are very happy to announce that **SuperBench 0.10.0** is officially released today!
+
+You can install and try SuperBench by following the [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
+
+## SuperBench 0.10.0 Release Notes
+
+### SuperBench Improvements
+
+- Support monitoring for AMD GPUs.
+- Support ROCm 5.7 and ROCm 6.0 dockerfiles.
+- Add MSCCL support for NVIDIA GPUs.
+- Fix NUMA domains swap issue in NDv4 topology file.
+- Add NDv5 topology file.
+- Pin NCCL and nccl-tests to v2.18.3 to avoid the hang issue in CUDA 12.2.
+
+### Micro-benchmark Improvements
+
+- Add HPL random generator to gemm-flops with ROCm.
+- Add DirectXGPURenderFPS benchmark to measure the FPS of rendering simple frames.
+- Add HWDecoderFPS benchmark to measure the FPS of hardware decoder performance.
+- Update Docker image for H100 support.
+- Update MLC version to 3.10 in the CUDA/ROCm dockerfiles.
+- Fix bugs in the GPU Burn test.
+- Support INT8 in cublasLt function benchmark.
+- Add hipBLASLt function benchmark.
+- Support cpu-gpu and gpu-cpu directions in ib-validation.
+- Support graph mode in NCCL/RCCL benchmarks for latency metrics.
+- Support C++ implementation in distributed inference benchmark.
+- Add O2 option for GPU-copy ROCm build.
+- Support different hipBLASLt data types in distributed inference.
+- Support in-place mode in NCCL/RCCL benchmarks.
+- Support data type option in NCCL/RCCL benchmarks.
+- Improve P2P performance with fine-grained GPU memory in GPU-copy test for AMD GPUs.
+- Update hipBLASLt GEMM metric unit to TFLOPS.
+- Support FP8 in the hipBLASLt benchmark.
+
+### Model Benchmark Improvements
+
+- Change torch.distributed.launch to torchrun.
+- Support Megatron-LM/Megatron-DeepSpeed GPT pretraining benchmark.
+
+### Result Analysis
+- Support baseline generation from multiple nodes.
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index c1d83edf..a533084e 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -101,7 +101,7 @@ module.exports = {
     announcementBar: {
       id: 'supportus',
       content:
-        '📢 v0.9.0 has been released! ' +
+        '📢 v0.10.0 has been released! ' +
         '⭐️ If you like SuperBench, give it a star on GitHub! ⭐️',
     },
     algolia: {
diff --git a/website/package-lock.json b/website/package-lock.json
index 7bf8c531..4ec6973f 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
@@ -11678,4 +11678,4 @@
       "integrity": "sha512-V50KMwwzqJV0NpZIZFwfOD5/lyny3WlSzRiXgA0G7VUnRlqttta1L6UQIHzd6EuBY/cHGfwTIck7w1yH6Q5zUw=="
     }
   }
-}
+}
\ No newline at end of file
diff --git a/website/package.json b/website/package.json
index 38ca1f75..f2bb9ed7 100644
--- a/website/package.json
+++ b/website/package.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "private": true,
   "scripts": {
     "docusaurus": "docusaurus",
@@ -38,4 +38,4 @@
     "last 1 safari version"
   ]
 }
-}
+}
\ No newline at end of file