Adding HPL benchmark (#482)

**Description** - Adding HPL benchmark --------- Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net> Co-authored-by: Peng Cheng <chengpeng5555@outlook.com>
2023-03-21 12:44:08 -04:00 · 2023-03-21 12:44:08 -04:00 · 655bd0aa59
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@ -128,6 +128,13 @@ RUN cd /tmp && \
    apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -rf aocc-compiler-4.0.0_1_amd64.deb

+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/

--- a/dockerfile/cuda11.8.dockerfile
+++ b/dockerfile/cuda11.8.dockerfile
@ -108,6 +108,13 @@ RUN cd /tmp && \
    apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -rf aocc-compiler-4.0.0_1_amd64.deb

+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/

--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@ -108,6 +108,13 @@ RUN cd /tmp && \
    apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -rf aocc-compiler-4.0.0_1_amd64.deb

+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Install rccl-rdma-sharp-plugins
 ENV SHARP_VERSION=5.0
 RUN cd /opt/rocm && \
--- a/dockerfile/rocm5.1.x.dockerfile
+++ b/dockerfile/rocm5.1.x.dockerfile
@ -120,6 +120,13 @@ RUN cd /tmp && \
    apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -rf aocc-compiler-4.0.0_1_amd64.deb

+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
    LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@ -171,6 +171,21 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

+### `cpu-hpl`
+
+#### Introduction
+
+HPL (High Performance Linpack) evaluates compute bandwidth by solving dense linear systems in double-precision arithmetic.
+It is performed by the [High-Performance Linpack Benchmark for Distributed-Memory Computers](https://netlib.org/benchmark/hpl/).
+
+#### Metrics
+
+| Name                | Unit               | Description                                                                |
+|---------------------|--------------------|----------------------------------------------------------------------------|
+| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
+| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                         |
+| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                               |
+
 ### `cpu-stream`

 #### Introduction
--- a/examples/benchmarks/cpu_hpl_performance.py
+++ b/examples/benchmarks/cpu_hpl_performance.py
@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for CPU HPL performance.
+
+Commands to run:
+  python3 examples/benchmarks/cpu_hpl_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'cpu-hpl',
+        parameters='--cpu_arch zen3 \
+        --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
--- a/superbench/benchmarks/micro_benchmarks/init.py
+++ b/superbench/benchmarks/micro_benchmarks/init.py
@ -17,6 +17,7 @@ from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
+from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark
 from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
@ -33,6 +34,7 @@ from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance impor
 __all__ = [
    'ComputationCommunicationOverlap',
    'CpuMemBwLatencyBenchmark',
+    'CpuHplBenchmark',
    'CpuStreamBenchmark',
    'CublasBenchmark',
    'CublasLtBenchmark',
--- a/superbench/benchmarks/micro_benchmarks/cpu_hpl_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_hpl_performance.py
@ -0,0 +1,152 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module for running the HPL benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuHplBenchmark(MicroBenchmarkWithInvoke):
+    """The HPL benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'hpl_run.sh'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self.__cpu_arch = ['zen3', 'zen4']
+
+        self._parser.add_argument(
+            '--cpu_arch',
+            type=str,
+            default='zen4',
+            required=False,
+            help='The targeted cpu architectures to run \
+                HPL. Default is zen4. Possible values are {}.'.format(' '.join(self.__cpu_arch))
+        )
+        self._parser.add_argument(
+            '--blockSize',
+            type=int,
+            default=384,
+            required=False,
+            help='Size of blocks. This parameter is an HPL input. Default 384.'
+        )
+        self._parser.add_argument(
+            '--coreCount',
+            type=int,
+            default=88,    # for HBv4 total number of cores is 176 => 88 per cpu
+            required=False,
+            help='Number of cores per CPU. Used for MPI and HPL configuration. \
+            Default 88 (HBv4 has a total of 176 w/ 2 cpus therefore 88 per cpu)'
+        )
+        self._parser.add_argument(
+            '--blocks',
+            type=int,
+            default=1,
+            required=False,
+            help='Number of blocks. This parameter is an HPL input. Default 1.'
+        )
+        self._parser.add_argument(
+            '--problemSize',
+            type=int,
+            default=384000,
+            required=False,
+            help='This is the problem size designated by "N" notation. \
+            This parameter is an HPL input. Default is 384000'
+        )
+
+    def _preprocess(self, hpl_template):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        if not self._set_binary_path():
+            logger.error(
+                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
+            )
+            return False
+
+        # xhpl type
+        xhpl = 'xhpl_z4'
+        if self._args.cpu_arch == 'zen3':
+            xhpl = 'xhpl_z3'
+
+        # command
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        command = command + ' ' + xhpl + ' ' + str(self._args.coreCount)
+
+        # modify HPL.dat
+        if hpl_template:
+            hpl_input_file = hpl_template
+        else:
+            hpl_input_file = os.path.join(self._args.bin_dir, 'template_hpl.dat')
+        search_string = ['problemSize', 'blockCount', 'blockSize']
+        with open(hpl_input_file, 'r') as hplfile:
+            lines = hplfile.readlines()
+        hpl_input_file = os.path.join(os.getcwd(), 'HPL.dat')
+        with open(hpl_input_file, 'w') as hplfile:
+            for line in lines:
+                if search_string[0] in line:
+                    line = line.replace(search_string[0], str(self._args.problemSize))
+                elif search_string[1] in line:
+                    line = line.replace(search_string[1], str(self._args.blocks))
+                elif search_string[2] in line:
+                    line = line.replace(search_string[2], str(self._args.blockSize))
+                hplfile.write(line)
+
+        self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        content = raw_output.splitlines()
+
+        for idx, line in enumerate(content):
+            if 'T/V' in line and 'Gflops' in line:
+                break
+
+        results = content[idx + 2].split()
+
+        for line in content[idx + 2:]:
+            if '1 tests completed and passed residual checks' in line:
+                self._result.add_result('tests_pass', 1)
+            elif '0 tests completed and passed residual checks' in line:
+                self._result.add_result('tests_pass', 0)
+
+        self._result.add_result('time', float(results[5]))
+        self._result.add_result('throughput', float(results[6]))
+
+        # raw output
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-hpl', CpuHplBenchmark)
--- a/tests/benchmarks/micro_benchmarks/test_cpu_hpl_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cpu_hpl_performance.py
@ -0,0 +1,66 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for HPL benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+class CpuHplBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
+    """Test class for HPL benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/hpl_run.sh'])
+        return True
+
+    @decorator.load_data('tests/data/hpl_results.log')
+    def test_hpl(self, results):
+        """Test HPL benchmark command generation."""
+        benchmark_name = 'cpu-hpl'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+        assert (benchmark_class)
+
+        parameters = '--cpu_arch zen3 \
+        --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
+
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+
+        # Check basic information
+        assert (benchmark)
+        ret = benchmark._preprocess(hpl_template='third_party/hpl-tests/template_hpl.dat')
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark.name == benchmark_name)
+        assert (benchmark.type == BenchmarkType.MICRO)
+
+        # Check parameters specified in BenchmarkContext.
+
+        assert (benchmark._args.cpu_arch == 'zen3')
+        assert (benchmark._args.blockSize == 224)
+        assert (benchmark._args.coreCount == 60)
+        assert (benchmark._args.blocks == 1)
+        assert (benchmark._args.problemSize == 224000)
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('60' in benchmark._commands[0])
+        assert ('hpl_run.sh' in benchmark._commands[0])
+        assert ('xhpl_z3' in benchmark._commands[0])
+
+        # Check results
+        assert (benchmark._process_raw_result(0, results))
+        assert (benchmark.result['return_code'][0] == 0)
+        assert (float(benchmark.result['time'][0]) == 4645.37)
+        assert (float(benchmark.result['throughput'][0]) == 8126.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/data/hpl_results.log
+++ b/tests/data/hpl_results.log
--- a/third_party/Makefile
+++ b/third_party/Makefile
@ -11,14 +11,14 @@ HPCX_HOME ?= /opt/hpcx
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)

-.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream
+.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl

 # Build all targets.
 all: cuda rocm
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
 cpu: common cpu_perftest
-common: fio cpu_stream
+common: cpu_hpl cpu_stream fio

 # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
 sb_micro_path:
@ -124,7 +124,23 @@ ifneq (,$(wildcard gpu-burn/Makefile))
 	cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
 endif

-#Build stream from main branch (only branch that exists)
+# Build HPL 2.3 from the netlib release tarball (zen3 and zen4 variants), then install the binaries,
+# launcher scripts and HPL.dat template into $(SB_MICRO_PATH)/bin. Skipped when hpl-tests/Makefile is absent.
+cpu_hpl: sb_micro_path
+ifneq (,$(wildcard hpl-tests/Makefile))
+	cd ./hpl-tests && \
+    wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
+	tar xzf hpl-2.3.tar.gz && \
+	cp Make.Linux_zen3 hpl-2.3 && \
+	cp Make.Linux_zen4 hpl-2.3 && \
+	make all
+	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen3/xhpl $(SB_MICRO_PATH)/bin/xhpl_z3
+	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen4/xhpl $(SB_MICRO_PATH)/bin/xhpl_z4
+	cp -v ./hpl-tests/hpl_run.sh $(SB_MICRO_PATH)/bin/
+	cp -v ./hpl-tests/bindmem.sh $(SB_MICRO_PATH)/bin/
+	cp -v ./hpl-tests/template_hpl.dat $(SB_MICRO_PATH)/bin/
+endif
+
+# Build STREAM 
 cpu_stream: sb_micro_path
 ifneq (,$(wildcard stream-tests/Makefile))
 	cd ./stream-tests && \
@ -132,4 +148,3 @@ ifneq (,$(wildcard stream-tests/Makefile))
 	make all
 	cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/
 endif
-
--- a/third_party/hpl-tests/Make.Linux_zen3
+++ b/third_party/hpl-tests/Make.Linux_zen3
@ -0,0 +1,62 @@
+# ######################################################################
+# HPL Makefile for Zen3 build
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+# ######################################################################
+#
+# Shell path and command translations  
+#
+SHELL        = /bin/sh
+#
+CD           = cd
+CP           = cp
+LN_S         = ln -fs
+MKDIR        = mkdir -p
+RM           = /bin/rm -f
+TOUCH        = touch
+#
+# Build Type
+#
+ARCH         = $(arch)
+#
+#HPL Directory
+#
+TOPdir       = ../../..
+INCdir       = $(TOPdir)/include
+BINdir       = $(TOPdir)/bin/$(ARCH)
+LIBdir       = $(TOPdir)/lib/$(ARCH)
+#
+HPLlib       = $(LIBdir)/libhpl.a
+#
+# Open MPI library
+#
+MPdir        = $(omp)
+MPinc        = -I$(MPdir)/include
+MPlib        = $(MPdir)/lib/libmpi.so
+#
+# AMD BLIS library
+#
+LAdir        = /opt/AMD/amd-blis
+LAinc        = -I$(LAdir)/lib/include
+LAlib        = $(LAdir)/lib/LP64/libblis-mt.so
+#
+# Fortran to C
+#
+Fort_C_FLAGS      = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+#
+# HPL includes and libraries 
+#
+HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc)
+HPL_LIBS     = $(HPLlib) $(LAlib) $(MPlib) -lm
+#
+# Compilers and linker path + flags 
+#
+HPL_DEFS     = $(Fort_C_FLAGS) -DHPL_PROGRESS_REPORT $(HPL_INCLUDES)
+CC           = /opt/AMD/aocc-compiler-4.0.0/bin/clang
+CCNOOPT      = $(HPL_DEFS)
+CCFLAGS      = $(HPL_DEFS) -march=znver3 -fomit-frame-pointer -O3 -funroll-loops
+LINKER       = /opt/AMD/aocc-compiler-4.0.0/bin/clang
+LINKFLAGS    = $(CCFLAGS)
+ARCHIVER     = ar
+ARFLAGS      = r
+RANLIB       = echo
--- a/third_party/hpl-tests/Make.Linux_zen4
+++ b/third_party/hpl-tests/Make.Linux_zen4
@ -0,0 +1,62 @@
+# ######################################################################
+# HPL Makefile for Zen4 build
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+# ######################################################################
+#
+# Shell path and command translations  
+#
+SHELL        = /bin/sh
+#
+CD           = cd
+CP           = cp
+LN_S         = ln -fs
+MKDIR        = mkdir -p
+RM           = /bin/rm -f
+TOUCH        = touch
+#
+# Build Type
+#
+ARCH         = $(arch)
+#
+#HPL Directory
+#
+TOPdir       = ../../..
+INCdir       = $(TOPdir)/include
+BINdir       = $(TOPdir)/bin/$(ARCH)
+LIBdir       = $(TOPdir)/lib/$(ARCH)
+#
+HPLlib       = $(LIBdir)/libhpl.a
+#
+# Open MPI library
+#
+MPdir        = $(omp)
+MPinc        = -I$(MPdir)/include
+MPlib        = $(MPdir)/lib/libmpi.so
+#
+# AMD BLIS library
+#
+LAdir        = /opt/AMD/amd-blis
+LAinc        = -I$(LAdir)/lib/include
+LAlib        = $(LAdir)/lib/LP64/libblis-mt.so
+#
+# Fortran to C
+#
+Fort_C_FLAGS      = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+#
+# HPL includes and libraries 
+#
+HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc)
+HPL_LIBS     = $(HPLlib) $(LAlib) $(MPlib) -lm
+#
+# Compilers and linker path + flags 
+#
+HPL_DEFS     = $(Fort_C_FLAGS) -DHPL_PROGRESS_REPORT $(HPL_INCLUDES)
+CC           = /opt/AMD/aocc-compiler-4.0.0/bin/clang
+CCNOOPT      = $(HPL_DEFS)
+CCFLAGS      = $(HPL_DEFS) -march=znver4 -fomit-frame-pointer -O3 -funroll-loops
+LINKER       = /opt/AMD/aocc-compiler-4.0.0/bin/clang
+LINKFLAGS    = $(CCFLAGS)
+ARCHIVER     = ar
+ARFLAGS      = r
+RANLIB       = echo
--- a/third_party/hpl-tests/Makefile
+++ b/third_party/hpl-tests/Makefile
@ -0,0 +1,43 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+
+# None of these targets produces a file named after itself, so always run them.
+.PHONY: all CONFIGURE ZEN3 ZEN4 clean
+
+# Open MPI location handed to the HPL build: /opt/hpcx/ompi when that path exists, /opt/ompi/ otherwise.
+OMPI_DIR := $(if $(wildcard /opt/hpcx/ompi),/opt/hpcx/ompi,/opt/ompi/)
+
+all: ZEN3 ZEN4 CONFIGURE
+
+# Generate Make.Linux_zen3 from HPL's stock Make.Linux_ATHLON_CBLAS by rewriting the arch, MPI,
+# BLIS and compiler settings; Make.Linux_zen4 is a copy that differs only in the -march flag.
+CONFIGURE:
+	cp ./hpl-2.3/setup/Make.Linux_ATHLON_CBLAS ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*ARCH   .*=.*/ARCH         =$$(arch)/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*TOPdir .*=.*/TOPdir       = ..\/..\/../' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*MPdir .*=.*/MPdir        = $$(omp)/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*MPinc .*=.*/MPinc        = -I$$(MPdir)\/include/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*MPlib .*=.*/MPlib        = $$(MPdir)\/lib\/libmpi.so/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*LAdir .*=.*/LAdir        = \/opt\/AMD\/amd-blis/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/LAinc  .*=/LAinc        = -I$$(LAdir)\/lib\/include/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*LAlib .*=.*/LAlib        = $$(LAdir)\/lib\/LP64\/libblis-mt.so/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*CC .*=.*/CC           = \/opt\/AMD\/aocc-compiler-4.0.0\/bin\/clang/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*CCFLAGS .*=.*/CCFLAGS      = $$(HPL_DEFS) -march=znver3 -fomit-frame-pointer -O3 -funroll-loops/' ./hpl-2.3/Make.Linux_zen3
+	sed -i 's/.*LINKER .*=.*/LINKER       = \/opt\/AMD\/aocc-compiler-4.0.0\/bin\/clang/' ./hpl-2.3/Make.Linux_zen3
+	cp ./hpl-2.3/Make.Linux_zen3 ./hpl-2.3/Make.Linux_zen4
+	sed -i 's/.*CCFLAGS .*=.*/CCFLAGS      = $$(HPL_DEFS) -march=znver4 -fomit-frame-pointer -O3 -funroll-loops/' ./hpl-2.3/Make.Linux_zen4
+
+# Build xhpl for Zen3 into hpl-2.3/bin/Linux_zen3.
+ZEN3: CONFIGURE
+	cd ./hpl-2.3 && \
+	make arch=Linux_zen3 omp=$(OMPI_DIR)
+
+# Build xhpl for Zen4 into hpl-2.3/bin/Linux_zen4.
+ZEN4: CONFIGURE
+	cd ./hpl-2.3 && \
+	make arch=Linux_zen4 omp=$(OMPI_DIR)
+
+# Remove the build products of both architectures.
+clean:
+	cd ./hpl-2.3 && \
+	make clean arch=Linux_zen3 && \
+	make clean arch=Linux_zen4
--- a/third_party/hpl-tests/bindmem.sh
+++ b/third_party/hpl-tests/bindmem.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+# Run the given command with its memory interleaved across the NUMA nodes listed on the
+# "cpubind" line of `numactl --show`.
+# Usage: bindmem.sh <command> [args...]
+nodes=$(numactl --show | awk -F: '/^cpubind/ {print $2;}' | sed -e 's/^ //g' -e 's/ $//g' | tr ' ' ',')
+# "$@" is quoted so that arguments containing whitespace reach the command unchanged.
+exec numactl --interleave="${nodes}" "$@"
--- a/third_party/hpl-tests/hpl_run.sh
+++ b/third_party/hpl-tests/hpl_run.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+
+source /opt/hpcx/hpcx-init.sh
+hpcx_load
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/AMD/amd-blis/lib/LP64/:/opt/hpcx/ompi/lib/:/opt/AMD/aocc-compiler-4.0.0/lib/
+
+# On the off chance OMPI_MCA is set to UCX-only, disable that
+unset OMPI_MCA_osc
+
+BIN_PATH="$(dirname "$0")"
+XHPL_EXE=$1
+NCORES=$2
+
+NT=${NCORES}
+NR=2
+MAP_BY=socket
+set -x
+
+mpirun --allow-run-as-root --map-by ${MAP_BY}:PE=$NT -np $NR --bind-to core \
+    -x OMP_NUM_THREADS=$NT -x OMP_PROC_BIND=close -x OMP_PLACES=cores \
+    ${BIN_PATH}/bindmem.sh ${BIN_PATH}/${XHPL_EXE} 
--- a/third_party/hpl-tests/template_hpl.dat
+++ b/third_party/hpl-tests/template_hpl.dat
@ -0,0 +1,31 @@
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out     output file name (if any)
+6           device out (6=stdout,7=stderr,file)
+1           # of problems sizes (N)
+problemSize      Ns
+blockCount           # of NBs
+blockSize         # NBs
+0           MAP process mapping (0=Row-,1=Column-major)
+1           # of process grids (P x Q)
+1           Ps
+2           Qs
+16.0        threshold
+1           # of panel fact
+1           PFACTs (0=left, 1=Crout, 2=Right)
+1           # of recursive stopping criterium
+48           NBMINs (>= 1)
+1           # of panels in recursion
+8           NDIVs
+1           # of recursive panel fact.
+2           RFACTs (0=left, 1=Crout, 2=Right)
+1           # of broadcast
+7           BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+1           # of lookahead depth
+0           DEPTHs (>=0)
+2           SWAP (0=bin-exch,1=long,2=mix)
+64          swapping threshold
+0           L1 in (0=transposed,1=no-transposed) form
+0           U in (0=transposed,1=no-transposed) form
+1           Equilibration (0=no,1=yes)
+8           memory alignment in double (> 0)