Adding HPL benchmark (#482)
**Description** - Adding HPL benchmark --------- Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net> Co-authored-by: Peng Cheng <chengpeng5555@outlook.com>
This commit is contained in:
Родитель
644b5395df
Коммит
655bd0aa59
|
@ -128,6 +128,13 @@ RUN cd /tmp && \
|
|||
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb
|
||||
|
||||
# Install AMD BLIS
|
||||
RUN cd /tmp && \
|
||||
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
mv amd-blis /opt/AMD && \
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
|
||||
|
||||
# Add config files
|
||||
ADD dockerfile/etc /opt/microsoft/
|
||||
|
||||
|
|
|
@ -108,6 +108,13 @@ RUN cd /tmp && \
|
|||
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb
|
||||
|
||||
# Install AMD BLIS
|
||||
RUN cd /tmp && \
|
||||
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
mv amd-blis /opt/AMD && \
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
|
||||
|
||||
# Add config files
|
||||
ADD dockerfile/etc /opt/microsoft/
|
||||
|
||||
|
|
|
@ -108,6 +108,13 @@ RUN cd /tmp && \
|
|||
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb
|
||||
|
||||
# Install AMD BLIS
|
||||
RUN cd /tmp && \
|
||||
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
mv amd-blis /opt/AMD && \
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
|
||||
|
||||
# Install rccl-rdma-sharp-plugins
|
||||
ENV SHARP_VERSION=5.0
|
||||
RUN cd /opt/rocm && \
|
||||
|
|
|
@ -120,6 +120,13 @@ RUN cd /tmp && \
|
|||
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
|
||||
rm -rf aocc-compiler-4.0.0_1_amd64.deb
|
||||
|
||||
# Install AMD BLIS
|
||||
RUN cd /tmp && \
|
||||
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
|
||||
mv amd-blis /opt/AMD && \
|
||||
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
|
||||
|
||||
ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
|
||||
LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
|
||||
SB_HOME=/opt/superbench \
|
||||
|
|
|
@ -171,6 +171,21 @@ Supports the use of double unit types and the use of tensor cores.
|
|||
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
|
||||
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
|
||||
|
||||
### `cpu-hpl`
|
||||
|
||||
#### Introduction
|
||||
|
||||
HPL (High-Performance Linpack) evaluates compute bandwidth by solving dense linear systems in double-precision arithmetic.
|
||||
Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computers](https://netlib.org/benchmark/hpl/)
|
||||
|
||||
#### Metrics
|
||||
|
||||
| Name | Unit | Description |
|
||||
|---------------------|--------------------|----------------------------------------------------------------------------|
|
||||
| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). |
|
||||
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
|
||||
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |
|
||||
|
||||
### `cpu-stream`
|
||||
|
||||
#### Introduction
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Micro benchmark example for CPU HPL performance.
|
||||
|
||||
Commands to run:
|
||||
python3 examples/benchmarks/cpu_hpl_performance.py
|
||||
"""
|
||||
|
||||
from superbench.benchmarks import BenchmarkRegistry
|
||||
from superbench.common.utils import logger
|
||||
|
||||
if __name__ == '__main__':
    # Parameters forwarded to the 'cpu-hpl' benchmark argument parser.
    hpl_parameters = (
        '--cpu_arch zen3 '
        '--blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
    )
    context = BenchmarkRegistry.create_benchmark_context('cpu-hpl', parameters=hpl_parameters)

    # Launch the benchmark and log its summary when a benchmark object is returned.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)
|
|
@ -17,6 +17,7 @@ from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
|
|||
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
|
||||
|
@ -33,6 +34,7 @@ from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance impor
|
|||
__all__ = [
|
||||
'ComputationCommunicationOverlap',
|
||||
'CpuMemBwLatencyBenchmark',
|
||||
'CpuHplBenchmark',
|
||||
'CpuStreamBenchmark',
|
||||
'CublasBenchmark',
|
||||
'CublasLtBenchmark',
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Module for running the HPL benchmark."""
|
||||
|
||||
import os
|
||||
|
||||
from superbench.common.utils import logger
|
||||
from superbench.benchmarks import BenchmarkRegistry
|
||||
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
|
||||
|
||||
|
||||
class CpuHplBenchmark(MicroBenchmarkWithInvoke):
    """The HPL benchmark class.

    Builds the ``hpl_run.sh`` command line, renders ``HPL.dat`` from a template and parses
    the HPL output into the ``tests_pass``, ``time`` and ``throughput`` results.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'hpl_run.sh'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self.__cpu_arch = ['zen3', 'zen4']

        # Help texts use implicit string concatenation so that source indentation
        # does not leak into the text shown to the user.
        self._parser.add_argument(
            '--cpu_arch',
            type=str,
            default='zen4',
            required=False,
            help=(
                'The targeted cpu architectures to run HPL. Default is zen4. '
                'Possible values are {}.'
            ).format(' '.join(self.__cpu_arch))
        )
        self._parser.add_argument(
            '--blockSize',
            type=int,
            default=384,
            required=False,
            help='Size of blocks. This parameter is an HPL input. Default 384.'
        )
        self._parser.add_argument(
            '--coreCount',
            type=int,
            default=88,    # for HBv4 total number of cores is 176 => 88 per cpu
            required=False,
            help=(
                'Number of cores per CPU. Used for MPI and HPL configuration. '
                'Default 88 (HBv4 has a total of 176 w/ 2 cpus therefore 88 per cpu)'
            )
        )
        self._parser.add_argument(
            '--blocks',
            type=int,
            default=1,
            required=False,
            help='Number of blocks. This parameter is an HPL input. Default 1.'
        )
        self._parser.add_argument(
            '--problemSize',
            type=int,
            default=384000,
            required=False,
            help=(
                'This is the problem size designated by "N" notation. '
                'This parameter is an HPL input. Default is 384000'
            )
        )

    def _preprocess(self, hpl_template=None):
        """Preprocess/preparation operations before the benchmarking.

        Builds the benchmark command and writes ``HPL.dat`` into the current working
        directory by substituting the ``problemSize``, ``blockCount`` and ``blockSize``
        placeholders of the template.

        Args:
            hpl_template (str): optional path to the HPL.dat template. When it is omitted or
                empty, ``template_hpl.dat`` from the binary directory is used. The default
                allows this method to be called without arguments.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        # xhpl type: every value other than 'zen3' selects the zen4 binary.
        xhpl = 'xhpl_z3' if self._args.cpu_arch == 'zen3' else 'xhpl_z4'

        # command
        command = os.path.join(self._args.bin_dir, self._bin_name)
        command = command + ' ' + xhpl + ' ' + str(self._args.coreCount)

        # modify HPL.dat
        template_path = hpl_template or os.path.join(self._args.bin_dir, 'template_hpl.dat')
        # Checked in this order; at most one placeholder is substituted per line.
        placeholders = (
            ('problemSize', self._args.problemSize),
            ('blockCount', self._args.blocks),
            ('blockSize', self._args.blockSize),
        )
        hpl_input_file = os.path.join(os.getcwd(), 'HPL.dat')
        try:
            with open(template_path, 'r') as template_file:
                lines = template_file.readlines()
            with open(hpl_input_file, 'w') as hplfile:
                for line in lines:
                    for placeholder, value in placeholders:
                        if placeholder in line:
                            line = line.replace(placeholder, str(value))
                            break
                    hplfile.write(line)
        except OSError as e:
            logger.error(
                'Failed to generate {} from template {} - message: {}'.format(hpl_input_file, template_path, str(e))
            )
            return False

        self._commands.append(command)
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        # Save the raw output first so that it is kept even when parsing fails.
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        content = raw_output.splitlines()

        # Locate the header line of the result table.
        header_idx = next(
            (idx for idx, line in enumerate(content) if 'T/V' in line and 'Gflops' in line), None
        )
        if header_idx is None:
            logger.error('Cannot find the HPL result table in raw output - benchmark: {}.'.format(self.name))
            return False

        # The result row is read two lines below the header; field 5 is the elapsed
        # time and field 6 the Gflops value.
        try:
            fields = content[header_idx + 2].split()
            elapsed_time = float(fields[5])
            gflops = float(fields[6])
        except (IndexError, ValueError):
            logger.error('The HPL result row is missing or malformed - benchmark: {}.'.format(self.name))
            return False

        for line in content[header_idx + 2:]:
            if '1 tests completed and passed residual checks' in line:
                self._result.add_result('tests_pass', 1)
            elif '0 tests completed and passed residual checks' in line:
                self._result.add_result('tests_pass', 0)

        self._result.add_result('time', elapsed_time)
        self._result.add_result('throughput', gflops)

        return True
|
||||
|
||||
|
||||
BenchmarkRegistry.register_benchmark('cpu-hpl', CpuHplBenchmark)
|
|
@ -0,0 +1,66 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Tests for HPL benchmark."""
|
||||
|
||||
import unittest
|
||||
|
||||
from tests.helper import decorator
|
||||
from tests.helper.testcase import BenchmarkTestCase
|
||||
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
|
||||
|
||||
|
||||
class CpuHplBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Unit tests for the cpu-hpl micro benchmark."""
    @classmethod
    def setUpClass(cls):
        """Create the mock envs and the mock 'bin/hpl_run.sh' file once for the whole class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/hpl_run.sh'])
        return True

    @decorator.load_data('tests/data/hpl_results.log')
    def test_hpl(self, results):
        """Check argument parsing, command generation and result parsing of cpu-hpl."""
        benchmark_name = 'cpu-hpl'
        benchmark_class, predefine_params = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
            benchmark_name, Platform.CPU
        )
        assert (benchmark_class)

        parameters = (
            '--cpu_arch zen3 '
            '--blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
        )
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Basic information after preprocessing with the template shipped in third_party.
        assert (benchmark)
        ret = benchmark._preprocess(hpl_template='third_party/hpl-tests/template_hpl.dat')
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Parsed arguments must mirror the parameter string above.
        expected_args = {
            'cpu_arch': 'zen3',
            'blockSize': 224,
            'coreCount': 60,
            'blocks': 1,
            'problemSize': 224000,
        }
        for arg_name, expected_value in expected_args.items():
            assert (getattr(benchmark._args, arg_name) == expected_value)

        # Exactly one command, containing the core count, the wrapper script and the zen3 binary.
        assert (1 == len(benchmark._commands))
        generated_command = benchmark._commands[0]
        for fragment in ('60', 'hpl_run.sh', 'xhpl_z3'):
            assert (fragment in generated_command)

        # Results parsed from the recorded HPL log.
        assert (benchmark._process_raw_result(0, results))
        parsed = benchmark.result
        assert (parsed['return_code'][0] == 0)
        assert (float(parsed['time'][0]) == 4645.37)
        assert (float(parsed['throughput'][0]) == 8126.1)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -11,14 +11,14 @@ HPCX_HOME ?= /opt/hpcx
|
|||
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
|
||||
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
|
||||
|
||||
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream
|
||||
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl
|
||||
|
||||
# Build all targets.
|
||||
all: cuda rocm
|
||||
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
|
||||
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
|
||||
cpu: common cpu_perftest
|
||||
common: fio cpu_stream
|
||||
common: cpu_hpl cpu_stream fio
|
||||
|
||||
# Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
|
||||
sb_micro_path:
|
||||
|
@ -124,7 +124,23 @@ ifneq (,$(wildcard gpu-burn/Makefile))
|
|||
cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
|
||||
endif
|
||||
|
||||
#Build stream from main branch (only branch that exists)
|
||||
# Build HPL from main branch
|
||||
cpu_hpl: sb_micro_path
|
||||
ifneq (,$(wildcard hpl-tests/Makefile))
|
||||
cd ./hpl-tests && \
|
||||
wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
|
||||
tar xzf hpl-2.3.tar.gz && \
|
||||
cp Make.Linux_zen3 hpl-2.3 && \
|
||||
cp Make.Linux_zen4 hpl-2.3 && \
|
||||
make all
|
||||
cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen3/xhpl $(SB_MICRO_PATH)/bin/xhpl_z3
|
||||
cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen4/xhpl $(SB_MICRO_PATH)/bin/xhpl_z4
|
||||
cp -v ./hpl-tests/hpl_run.sh $(SB_MICRO_PATH)/bin/
|
||||
cp -v ./hpl-tests/bindmem.sh $(SB_MICRO_PATH)/bin/
|
||||
cp -v ./hpl-tests/template_hpl.dat $(SB_MICRO_PATH)/bin/
|
||||
endif
|
||||
|
||||
# Build STREAM
|
||||
cpu_stream: sb_micro_path
|
||||
ifneq (,$(wildcard stream-tests/Makefile))
|
||||
cd ./stream-tests && \
|
||||
|
@ -132,4 +148,3 @@ ifneq (,$(wildcard stream-tests/Makefile))
|
|||
make all
|
||||
cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/
|
||||
endif
|
||||
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# ######################################################################
|
||||
# HPL Makefile for Zen3 build
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# ######################################################################
|
||||
#
|
||||
# Shell path and command translations
|
||||
#
|
||||
SHELL = /bin/sh
|
||||
#
|
||||
CD = cd
|
||||
CP = cp
|
||||
LN_S = ln -fs
|
||||
MKDIR = mkdir -p
|
||||
RM = /bin/rm -f
|
||||
TOUCH = touch
|
||||
#
|
||||
# Build Type
|
||||
#
|
||||
ARCH = $(arch)
|
||||
#
|
||||
#HPL Directory
|
||||
#
|
||||
TOPdir = ../../..
|
||||
INCdir = $(TOPdir)/include
|
||||
BINdir = $(TOPdir)/bin/$(ARCH)
|
||||
LIBdir = $(TOPdir)/lib/$(ARCH)
|
||||
#
|
||||
HPLlib = $(LIBdir)/libhpl.a
|
||||
#
|
||||
# OpenMPI LIB
|
||||
#
|
||||
MPdir = $(omp)
|
||||
MPinc = -I$(MPdir)/include
|
||||
MPlib = $(MPdir)/lib/libmpi.so
|
||||
#
|
||||
# Add BLIS
|
||||
#
|
||||
LAdir = /opt/AMD/amd-blis
|
||||
LAinc = -I$(LAdir)/lib/include
|
||||
LAlib = $(LAdir)/lib/LP64/libblis-mt.so
|
||||
#
|
||||
# Fortran to C
|
||||
#
|
||||
Fort_C_FLAGS = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
|
||||
#
|
||||
# HPL includes and libraries
|
||||
#
|
||||
HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc)
|
||||
HPL_LIBS = $(HPLlib) $(LAlib) $(MPlib) -lm
|
||||
#
|
||||
# Compilers and linker path + flags
|
||||
#
|
||||
HPL_DEFS = $(Fort_C_FLAGS) -DHPL_PROGRESS_REPORT $(HPL_INCLUDES)
|
||||
CC = /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
CCNOOPT = $(HPL_DEFS)
|
||||
CCFLAGS = $(HPL_DEFS) -march=znver3 -fomit-frame-pointer -O3 -funroll-loops
|
||||
LINKER = /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
LINKFLAGS = $(CCFLAGS)
|
||||
ARCHIVER = ar
|
||||
ARFLAGS = r
|
||||
RANLIB = echo
|
|
@ -0,0 +1,62 @@
|
|||
# ######################################################################
|
||||
# HPL Makefile for Zen4 build
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# ######################################################################
|
||||
#
|
||||
# Shell path and command translations
|
||||
#
|
||||
SHELL = /bin/sh
|
||||
#
|
||||
CD = cd
|
||||
CP = cp
|
||||
LN_S = ln -fs
|
||||
MKDIR = mkdir -p
|
||||
RM = /bin/rm -f
|
||||
TOUCH = touch
|
||||
#
|
||||
# Build Type
|
||||
#
|
||||
ARCH = $(arch)
|
||||
#
|
||||
#HPL Directory
|
||||
#
|
||||
TOPdir = ../../..
|
||||
INCdir = $(TOPdir)/include
|
||||
BINdir = $(TOPdir)/bin/$(ARCH)
|
||||
LIBdir = $(TOPdir)/lib/$(ARCH)
|
||||
#
|
||||
HPLlib = $(LIBdir)/libhpl.a
|
||||
#
|
||||
# OpenMPI LIB
|
||||
#
|
||||
MPdir = $(omp)
|
||||
MPinc = -I$(MPdir)/include
|
||||
MPlib = $(MPdir)/lib/libmpi.so
|
||||
#
|
||||
# Add BLIS
|
||||
#
|
||||
LAdir = /opt/AMD/amd-blis
|
||||
LAinc = -I$(LAdir)/lib/include
|
||||
LAlib = $(LAdir)/lib/LP64/libblis-mt.so
|
||||
#
|
||||
# Fortran to C
|
||||
#
|
||||
Fort_C_FLAGS = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
|
||||
#
|
||||
# HPL includes and libraries
|
||||
#
|
||||
HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc)
|
||||
HPL_LIBS = $(HPLlib) $(LAlib) $(MPlib) -lm
|
||||
#
|
||||
# Compilers and linker path + flags
|
||||
#
|
||||
HPL_DEFS = $(Fort_C_FLAGS) -DHPL_PROGRESS_REPORT $(HPL_INCLUDES)
|
||||
CC = /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
CCNOOPT = $(HPL_DEFS)
|
||||
CCFLAGS = $(HPL_DEFS) -march=znver4 -fomit-frame-pointer -O3 -funroll-loops
|
||||
LINKER = /opt/AMD/aocc-compiler-4.0.0/bin/clang
|
||||
LINKFLAGS = $(CCFLAGS)
|
||||
ARCHIVER = ar
|
||||
ARFLAGS = r
|
||||
RANLIB = echo
|
|
@ -0,0 +1,43 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
|
||||
# None of these targets produce a file with the target's name.
.PHONY: all CONFIGURE ZEN3 ZEN4 clean

# Location of the MPI installation handed to the HPL build as `omp`.
# Prefer the HPC-X bundled Open MPI when it exists, otherwise fall back to /opt/ompi/.
ifneq (,$(wildcard /opt/hpcx/ompi))
OMPI_DIR ?= /opt/hpcx/ompi
else
OMPI_DIR ?= /opt/ompi/
endif

all: ZEN3 ZEN4 CONFIGURE

# Generate Make.Linux_zen3 from the stock ATHLON_CBLAS setup file, then derive
# Make.Linux_zen4 from it by changing only the -march flag.
CONFIGURE:
	cp ./hpl-2.3/setup/Make.Linux_ATHLON_CBLAS ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*ARCH .*=.*/ARCH =$$(arch)/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*TOPdir .*=.*/TOPdir = ..\/..\/../' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*MPdir .*=.*/MPdir = $$(omp)/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*MPinc .*=.*/MPinc = -I$$(MPdir)\/include/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*MPlib .*=.*/MPlib = $$(MPdir)\/lib\/libmpi.so/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*LAdir .*=.*/LAdir = \/opt\/AMD\/amd-blis/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/LAinc .*=/LAinc = -I$$(LAdir)\/lib\/include/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*LAlib .*=.*/LAlib = $$(LAdir)\/lib\/LP64\/libblis-mt.so/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*CC .*=.*/CC = \/opt\/AMD\/aocc-compiler-4.0.0\/bin\/clang/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*CCFLAGS .*=.*/CCFLAGS = $$(HPL_DEFS) -march=znver3 -fomit-frame-pointer -O3 -funroll-loops/' ./hpl-2.3/Make.Linux_zen3
	sed -i 's/.*LINKER .*=.*/LINKER = \/opt\/AMD\/aocc-compiler-4.0.0\/bin\/clang/' ./hpl-2.3/Make.Linux_zen3
	cp ./hpl-2.3/Make.Linux_zen3 ./hpl-2.3/Make.Linux_zen4
	sed -i 's/.*CCFLAGS .*=.*/CCFLAGS = $$(HPL_DEFS) -march=znver4 -fomit-frame-pointer -O3 -funroll-loops/' ./hpl-2.3/Make.Linux_zen4

# $(MAKE) is used for the recursive builds so that flags given to the parent make
# (for example -j or -n) are honored by the sub-make.
ZEN3: CONFIGURE
	cd ./hpl-2.3 && \
	$(MAKE) arch=Linux_zen3 omp=$(OMPI_DIR)

ZEN4: CONFIGURE
	cd ./hpl-2.3 && \
	$(MAKE) arch=Linux_zen4 omp=$(OMPI_DIR)

clean:
	cd ./hpl-2.3 && \
	$(MAKE) clean arch=Linux_zen3 && \
	$(MAKE) clean arch=Linux_zen4
|
|
@ -0,0 +1,3 @@
|
|||
#!/bin/bash
# Run the given command under numactl with memory interleaved across the nodes
# listed on the `cpubind` line of `numactl --show`.
# Usage: bindmem.sh <command> [args...]

# Take the value after "cpubind:", trim one leading/trailing space and turn the
# space-separated node list into a comma-separated one.
nodes=$(numactl --show | awk -F: '/^cpubind/ {print $2;}' | sed -e 's/^ //g' -e 's/ $//g' | tr ' ' ',')

# "$@" is quoted so that arguments containing whitespace reach the command unchanged.
exec numactl --interleave="${nodes}" "$@"
|
|
@ -0,0 +1,22 @@
|
|||
#!/bin/bash
# Launch an xhpl binary through mpirun with 2 ranks mapped by socket, each rank
# using <cores_per_cpu> processing elements and OpenMP threads.
# Usage: hpl_run.sh <xhpl_executable_name> <cores_per_cpu>
# The xhpl executable and bindmem.sh are looked up in this script's directory.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <xhpl_executable_name> <cores_per_cpu>" >&2
    exit 1
fi

source /opt/hpcx/hpcx-init.sh
hpcx_load

# Append the BLIS, Open MPI and AOCC library directories. The ${VAR:+...} form
# avoids a leading ':' (an empty search-path entry) when LD_LIBRARY_PATH is unset.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/opt/AMD/amd-blis/lib/LP64/:/opt/hpcx/ompi/lib/:/opt/AMD/aocc-compiler-4.0.0/lib/"

# On the off chance OMPI_MCA is set to UCX-only, disable that
unset OMPI_MCA_osc

BIN_PATH="$(dirname "$0")"
XHPL_EXE=$1
NCORES=$2

NT=${NCORES}
NR=2
MAP_BY=socket
set -x

mpirun --allow-run-as-root --map-by "${MAP_BY}:PE=${NT}" -np "${NR}" --bind-to core \
    -x OMP_NUM_THREADS="${NT}" -x OMP_PROC_BIND=close -x OMP_PLACES=cores \
    "${BIN_PATH}/bindmem.sh" "${BIN_PATH}/${XHPL_EXE}"
|
|
@ -0,0 +1,31 @@
|
|||
HPLinpack benchmark input file
|
||||
Innovative Computing Laboratory, University of Tennessee
|
||||
HPL.out output file name (if any)
|
||||
6 device out (6=stdout,7=stderr,file)
|
||||
1 # of problems sizes (N)
|
||||
problemSize Ns
|
||||
blockCount # of NBs
|
||||
blockSize # NBs
|
||||
0 MAP process mapping (0=Row-,1=Column-major)
|
||||
1 # of process grids (P x Q)
|
||||
1 Ps
|
||||
2 Qs
|
||||
16.0 threshold
|
||||
1 # of panel fact
|
||||
1 PFACTs (0=left, 1=Crout, 2=Right)
|
||||
1 # of recursive stopping criterium
|
||||
48 NBMINs (>= 1)
|
||||
1 # of panels in recursion
|
||||
8 NDIVs
|
||||
1 # of recursive panel fact.
|
||||
2 RFACTs (0=left, 1=Crout, 2=Right)
|
||||
1 # of broadcast
|
||||
7 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
|
||||
1 # of lookahead depth
|
||||
0 DEPTHs (>=0)
|
||||
2 SWAP (0=bin-exch,1=long,2=mix)
|
||||
64 swapping threshold
|
||||
0 L1 in (0=transposed,1=no-transposed) form
|
||||
0 U in (0=transposed,1=no-transposed) form
|
||||
1 Equilibration (0=no,1=yes)
|
||||
8 memory alignment in double (> 0)
|
Загрузка…
Ссылка в новой задаче