Dockerfile - Add support for arm64 build (#660)
Add support for arm64 build: - Update the Dockerfile for arm64 builds - Extend CPU STREAM compilation for Neoverse - Handle onnxruntime-gpu installation - Filter third-party builds based on architecture - Disable the CUDA decode performance build for non-x86 architectures
This commit is contained in:
Родитель
59d36f7ff0
Коммит
479491279e
|
@ -28,21 +28,25 @@ jobs:
|
|||
- name: cuda12.4
|
||||
dockerfile: cuda12.4
|
||||
tags: superbench/main:cuda12.4
|
||||
platforms: linux/amd64 # TODO: linux/arm64
|
||||
runner: [self-hosted]
|
||||
build_args: "NUM_MAKE_JOBS=16"
|
||||
- name: cuda12.2
|
||||
dockerfile: cuda12.2
|
||||
tags: superbench/main:cuda12.2
|
||||
platforms: linux/amd64
|
||||
runner: [self-hosted]
|
||||
build_args: "NUM_MAKE_JOBS=16"
|
||||
- name: cuda11.1.1
|
||||
dockerfile: cuda11.1.1
|
||||
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
|
||||
platforms: linux/amd64
|
||||
runner: ubuntu-latest
|
||||
build_args: "NUM_MAKE_JOBS=8"
|
||||
- name: rocm6.2
|
||||
dockerfile: rocm6.2.x
|
||||
tags: superbench/main:rocm6.2
|
||||
platforms: linux/amd64
|
||||
runner: [self-hosted]
|
||||
build_args: "NUM_MAKE_JOBS=16"
|
||||
steps:
|
||||
|
@ -125,7 +129,7 @@ jobs:
|
|||
id: docker_build
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
platforms: linux/amd64
|
||||
platforms: ${{ matrix.platforms }}
|
||||
context: .
|
||||
file: ${{ steps.metadata.outputs.dockerfile }}
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
|
|
|
@ -19,6 +19,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
|
|||
LABEL maintainer="SuperBench"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
autoconf \
|
||||
|
@ -60,11 +61,13 @@ RUN apt-get update && \
|
|||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
|
||||
ARG NUM_MAKE_JOBS=
|
||||
ARG TARGETPLATFORM
|
||||
ARG TARGETARCH
|
||||
|
||||
# Install Docker
|
||||
ENV DOCKER_VERSION=20.10.8
|
||||
RUN cd /tmp && \
|
||||
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
|
||||
RUN TARGETARCH_HW=$(uname -m) && \
|
||||
wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
|
||||
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
|
||||
rm docker.tgz
|
||||
|
||||
|
@ -80,40 +83,43 @@ RUN mkdir -p /root/.ssh && \
|
|||
|
||||
# Install OFED
|
||||
ENV OFED_VERSION=23.07-0.5.1.2
|
||||
RUN cd /tmp && \
|
||||
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
|
||||
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
|
||||
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
|
||||
RUN TARGETARCH_HW=$(uname -m) && \
|
||||
cd /tmp && \
|
||||
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
|
||||
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
|
||||
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
|
||||
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
|
||||
|
||||
# Install HPC-X
|
||||
ENV HPCX_VERSION=v2.18
|
||||
RUN cd /opt && \
|
||||
RUN TARGETARCH_HW=$(uname -m) && \
|
||||
cd /opt && \
|
||||
rm -rf hpcx && \
|
||||
wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz -O hpcx.tbz && \
|
||||
wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
|
||||
tar xf hpcx.tbz && \
|
||||
mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 hpcx && \
|
||||
mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW} hpcx && \
|
||||
rm hpcx.tbz
|
||||
|
||||
# Install Intel MLC
|
||||
RUN cd /tmp && \
|
||||
# Installations specific to the amd64 platform
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
# Install Intel MLC
|
||||
cd /tmp && \
|
||||
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
|
||||
tar xzf mlc.tgz Linux/mlc && \
|
||||
cp ./Linux/mlc /usr/local/bin/ && \
|
||||
rm -rf ./Linux mlc.tgz
|
||||
|
||||
# Install AOCC compiler
|
||||
RUN cd /tmp && \
|
||||
rm -rf ./Linux mlc.tgz && \
|
||||
# Install AOCC compiler
|
||||
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb
|
||||
|
||||
# Install AMD BLIS
|
||||
RUN cd /tmp && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
# Install AMD BLIS
|
||||
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
mv amd-blis /opt/AMD && \
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
|
||||
else \
|
||||
echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
|
||||
fi
|
||||
|
||||
# Install NCCL 2.23.4
|
||||
RUN cd /tmp && \
|
||||
|
|
4
setup.py
4
setup.py
|
@ -215,8 +215,8 @@ setup(
|
|||
],
|
||||
'ort': [
|
||||
'onnx>=1.10.2',
|
||||
'onnxruntime-gpu==1.10.0; python_version<"3.10"',
|
||||
'onnxruntime-gpu; python_version>="3.10"',
|
||||
'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine == "x86_64"',
|
||||
'onnxruntime-gpu; python_version>="3.10" and platform_machine == "x86_64"',
|
||||
],
|
||||
'nvidia': ['py3nvml>=0.2.6'],
|
||||
'amd': ['amdsmi'],
|
||||
|
|
|
@ -23,7 +23,7 @@ class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
|
|||
super().__init__(name, parameters)
|
||||
|
||||
self._bin_name = 'streamZen3.exe'
|
||||
self.__cpu_arch = ['other', 'zen3', 'zen4']
|
||||
self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']
|
||||
|
||||
def add_parser_arguments(self):
|
||||
"""Add the specified arguments."""
|
||||
|
@ -80,6 +80,8 @@ class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
|
|||
exe = 'streamZen3.exe'
|
||||
elif self._args.cpu_arch == 'zen4':
|
||||
exe = 'streamZen4.exe'
|
||||
elif self._args.cpu_arch == 'neo2':
|
||||
exe = 'streamNeo2.exe'
|
||||
else:
|
||||
exe = 'streamx86.exe'
|
||||
|
||||
|
|
|
@ -4,114 +4,120 @@
|
|||
cmake_minimum_required(VERSION 3.18)
|
||||
project(cuda_decode_performance)
|
||||
|
||||
find_package(CUDA QUIET)
|
||||
if(CUDA_FOUND)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
# Check architecture
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
|
||||
message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.")
|
||||
else()
|
||||
find_package(CUDA QUIET)
|
||||
if(CUDA_FOUND)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
|
||||
set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
|
||||
set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
|
||||
set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
|
||||
set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
|
||||
set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
|
||||
set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
|
||||
set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
|
||||
set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
|
||||
set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
|
||||
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
||||
find_package(PkgConfig REQUIRED)
|
||||
pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
|
||||
pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
|
||||
pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
|
||||
pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
||||
find_package(PkgConfig REQUIRED)
|
||||
pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
|
||||
pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
|
||||
pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
|
||||
pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
|
||||
|
||||
set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
|
||||
find_library(AVCODEC_LIBRARY NAMES avcodec
|
||||
HINTS
|
||||
${PC_AVCODEC_LIBDIR}
|
||||
${PC_AVCODEC_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(AVFORMAT_LIBRARY NAMES avformat
|
||||
HINTS
|
||||
${PC_AVFORMAT_LIBDIR}
|
||||
${PC_AVFORMAT_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(AVUTIL_LIBRARY NAMES avutil
|
||||
HINTS
|
||||
${PC_AVUTIL_LIBDIR}
|
||||
${PC_AVUTIL_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(SWRESAMPLE_LIBRARY NAMES swresample
|
||||
HINTS
|
||||
${PC_SWRESAMPLE_LIBDIR}
|
||||
${PC_SWRESAMPLE_LIBRARY_DIRS}
|
||||
)
|
||||
set(AVCODEC_LIB ${AVCODEC_LIBRARY})
|
||||
set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
|
||||
set(AVUTIL_LIB ${AVUTIL_LIBRARY})
|
||||
set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
|
||||
endif()
|
||||
set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
|
||||
find_library(AVCODEC_LIBRARY NAMES avcodec
|
||||
HINTS
|
||||
${PC_AVCODEC_LIBDIR}
|
||||
${PC_AVCODEC_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(AVFORMAT_LIBRARY NAMES avformat
|
||||
HINTS
|
||||
${PC_AVFORMAT_LIBDIR}
|
||||
${PC_AVFORMAT_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(AVUTIL_LIBRARY NAMES avutil
|
||||
HINTS
|
||||
${PC_AVUTIL_LIBDIR}
|
||||
${PC_AVUTIL_LIBRARY_DIRS}
|
||||
)
|
||||
find_library(SWRESAMPLE_LIBRARY NAMES swresample
|
||||
HINTS
|
||||
${PC_SWRESAMPLE_LIBDIR}
|
||||
${PC_SWRESAMPLE_LIBRARY_DIRS}
|
||||
)
|
||||
set(AVCODEC_LIB ${AVCODEC_LIBRARY})
|
||||
set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
|
||||
set(AVUTIL_LIB ${AVUTIL_LIBRARY})
|
||||
set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
|
||||
endif()
|
||||
|
||||
set(APP_SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
|
||||
)
|
||||
set(APP_SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
|
||||
)
|
||||
|
||||
set(NV_DEC_SOURCES
|
||||
${NV_DEC_DIR}/NvDecoder.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
|
||||
)
|
||||
set(NV_DEC_SOURCES
|
||||
${NV_DEC_DIR}/NvDecoder.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
|
||||
)
|
||||
|
||||
set(NV_DEC_HDRS
|
||||
${NV_DEC_DIR}/NvDecoder.h
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
|
||||
${NVCODEC_UTILS_DIR}/NvCodecUtils.h
|
||||
${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
|
||||
)
|
||||
set(NV_DEC_HDRS
|
||||
${NV_DEC_DIR}/NvDecoder.h
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
|
||||
${NVCODEC_UTILS_DIR}/NvCodecUtils.h
|
||||
${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
|
||||
)
|
||||
|
||||
source_group( "headers" FILES ${NV_DEC_HDRS} )
|
||||
source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
|
||||
set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
|
||||
find_package(CUDA)
|
||||
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
|
||||
if ( CMAKE_COMPILER_IS_GNUCC )
|
||||
if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
|
||||
list(APPEND CUDA_NVCC_FLAGS -std=c++11)
|
||||
endif()
|
||||
endif()
|
||||
source_group( "headers" FILES ${NV_DEC_HDRS} )
|
||||
source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
|
||||
set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
|
||||
find_package(CUDA)
|
||||
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
|
||||
if ( CMAKE_COMPILER_IS_GNUCC )
|
||||
if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
|
||||
list(APPEND CUDA_NVCC_FLAGS -std=c++11)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Check if the file exists
|
||||
if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
|
||||
execute_process(
|
||||
COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
|
||||
RESULT_VARIABLE result
|
||||
)
|
||||
if(result)
|
||||
message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
|
||||
endif()
|
||||
endif ()
|
||||
# Check if the file exists
|
||||
if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
|
||||
execute_process(
|
||||
COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
|
||||
RESULT_VARIABLE result
|
||||
)
|
||||
if(result)
|
||||
message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
find_library(CUVID_LIB nvcuvid
|
||||
HINTS
|
||||
"/usr/local/lib/"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
|
||||
)
|
||||
find_library(CUVID_LIB nvcuvid
|
||||
HINTS
|
||||
"/usr/local/lib/"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
|
||||
)
|
||||
|
||||
cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
|
||||
cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
|
||||
|
||||
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}
|
||||
${NVCODEC_UTILS_DIR}
|
||||
${NV_CODEC_DIR}
|
||||
${NV_APPDEC_COMMON_DIR}
|
||||
${NV_FFMPEG_HDRS}
|
||||
${THIRD_PARTY_SAMPLE_DIR}
|
||||
)
|
||||
target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
|
||||
${NVCODEC_PUBLIC_INTERFACE_DIR}
|
||||
${NVCODEC_UTILS_DIR}
|
||||
${NV_CODEC_DIR}
|
||||
${NV_APPDEC_COMMON_DIR}
|
||||
${NV_FFMPEG_HDRS}
|
||||
${THIRD_PARTY_SAMPLE_DIR}
|
||||
)
|
||||
|
||||
target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
|
||||
${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
|
||||
target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
|
||||
${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
|
||||
|
||||
install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
|
||||
endif()
|
||||
|
||||
install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
|
||||
endif()
|
||||
|
|
|
@ -18,14 +18,19 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
|
|||
|
||||
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
|
||||
|
||||
# Build all targets.
|
||||
# Build targets.
|
||||
all: cuda rocm
|
||||
cuda_with_msccl: cuda cuda_msccl
|
||||
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
|
||||
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
|
||||
cpu: common cpu_perftest
|
||||
common: cpu_hpl cpu_stream fio
|
||||
cpu: common cpu_perftest cpu_stream
|
||||
common: fio
|
||||
|
||||
# Targets built only on non-aarch64 architectures
|
||||
ifneq ($(shell uname -m), aarch64)
|
||||
common: fio cpu_hpl
|
||||
directx_amd: directx_amf_encoding_latency
|
||||
endif
|
||||
|
||||
# Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
|
||||
sb_micro_path:
|
||||
|
@ -59,7 +64,7 @@ else
|
|||
endif
|
||||
if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
|
||||
git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
|
||||
cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS)
|
||||
cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
|
||||
cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
|
||||
|
||||
# Build nccl-tests from commit 8274cb4 of default branch.
|
||||
|
|
|
@ -1,16 +1,27 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
|
||||
GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000
|
||||
ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
|
||||
ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
|
||||
GEN_OUTPUT= streamx86.exe
|
||||
ZEN3_OUTPUT= streamZen3.exe
|
||||
ZEN4_OUTPUT= streamZen4.exe
|
||||
GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000
|
||||
ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
|
||||
ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
|
||||
NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
|
||||
|
||||
GEN_OUTPUT := streamx86.exe
|
||||
ZEN3_OUTPUT := streamZen3.exe
|
||||
ZEN4_OUTPUT := streamZen4.exe
|
||||
NEO2_OUTPUT := streamNeo2.exe
|
||||
|
||||
ARCH := $(shell uname -m)
|
||||
|
||||
ifeq ($(ARCH), aarch64)
|
||||
CFLAGS := -Ofast -fopenmp -DNTIMES=200
|
||||
CC := gcc
|
||||
all: NEO2
|
||||
else
|
||||
CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
|
||||
all: ZEN3 ZEN4 X86
|
||||
endif
|
||||
|
||||
ZEN3: stream.c
|
||||
$(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT)
|
||||
|
@ -18,6 +29,13 @@ ZEN4:
|
|||
$(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT)
|
||||
X86:
|
||||
$(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT)
|
||||
NEO2:
|
||||
$(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT)
|
||||
|
||||
ifeq ($(ARCH), aarch64)
|
||||
clean:
|
||||
rm $(NEO2_OUTPUT)
|
||||
else
|
||||
clean:
|
||||
rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)
|
||||
endif
|
||||
|
|
Загрузка…
Ссылка в новой задаче