diff --git a/Doxyfile b/Doxyfile index c9de961..fe641fe 100644 --- a/Doxyfile +++ b/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = X-Mem # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 2.1.13 +PROJECT_NUMBER = 2.1.14 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/README.md b/README.md index d899c6a..2a853ca 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ README ------------------------------------------------------------------------------------------------------------ -X-Mem: Extensible Memory Benchmarking Tool v2.1.13 +X-Mem: Extensible Memory Benchmarking Tool v2.1.14 ------------------------------------------------------------------------------------------------------------ The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power. @@ -10,7 +10,7 @@ Originally authored by Mark Gottscho (Email: ) as a Summer 2 This project is under active development. Stay tuned for more updates. -PROJECT REVISION DATE: April 22, 2015. +PROJECT REVISION DATE: April 23, 2015. ------------------------------------------------------------------------------------------------------------ LICENSE @@ -54,9 +54,8 @@ Extensibility: modularity via C++ object-oriented principles - Supports rapid addition of new benchmark kernel routines - Example: stream triad algorithm, impact of false sharing, etc. are possible with minor changes -Cross-platform: Currently implemented for Windows and GNU/Linux on x86-64, x86-64 with AVX extensions CPUs +Cross-platform: Currently implemented for Windows and GNU/Linux on x86, x86-64, and x86-64 with AVX extensions CPUs - Designed to allow straightforward porting to other operating systems and ISAs - - 32-bit x86 port under development - ARM port under development Memory throughput: @@ -92,9 +91,8 @@ There are a few runtime prerequisites in order for the software to run correctly HARDWARE: -- Intel x86-64 CPU with optional support for AVX extensions. AMD CPUs should also work although this has not been tested. -- COMING SOON: Intel 32-bit x86 CPU -- COMING SOON: ARMv7 CPUs +- Intel x86 or x86-64 CPU with optional support for AVX extensions. AMD CPUs should also work although this has not been tested. +- COMING SOON: ARM CPUs WINDOWS: @@ -109,7 +107,7 @@ WINDOWS: GNU/LINUX: -- GNU utilities with support for C++11. Tested with gcc 4.8.2 on Ubuntu 14.04 LTS for x86-64 CPU. +- GNU utilities with support for C++11. Tested with gcc 4.8.2 on Ubuntu 14.04 LTS for x86-64 CPU with AVX. - libhugetlbfs. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install libhugetlbfs0". - Potentially, administrator privileges, if you plan to use the --large_pages option. - During runtime, if the --large_pages option is selected, you may need to first manually ensure that large pages are available from the OS. This can be done by running "hugeadm --pool-list". It is recommended to set minimum pool to 1GB (in order to measure DRAM effectively). If needed, this can be done by running "hugeadm --pool-pages-min 2MB:512". Alternatively, run the linux_setup_runtime_hugetlbfs.sh script that is provided with X-Mem. @@ -313,7 +311,7 @@ WINDOWS: GNU/LINUX: -- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64. +- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64 with AVX. - Python 2.7. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install python2.7". You may need some other Python 2.7 packages as well. - SCons build system. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install scons". Build tested with SCons 2.3.4. - Kernel support for large (huge) pages. This support can be verified on your Linux installation by running "grep hugetlbfs /proc/filesystems". If you do not have huge page support in your kernel, you can build a kernel with the appropriate options switched on: "CONFIG_HUGETLB_PAGE" and "CONFIG_HUGETLBFS". diff --git a/X-Mem_Developer_Manual.pdf b/X-Mem_Developer_Manual.pdf index 5837db2..32f46d3 100644 Binary files a/X-Mem_Developer_Manual.pdf and b/X-Mem_Developer_Manual.pdf differ diff --git a/bin/xmem-linux-x64 b/bin/xmem-linux-x64 index 8bddf11..776f706 100755 Binary files a/bin/xmem-linux-x64 and b/bin/xmem-linux-x64 differ diff --git a/bin/xmem-linux-x64_avx b/bin/xmem-linux-x64_avx index eeb3dc4..7ec6e2c 100755 Binary files a/bin/xmem-linux-x64_avx and b/bin/xmem-linux-x64_avx differ diff --git a/bin/xmem-linux-x86 b/bin/xmem-linux-x86 index fc381a3..a4aca30 100755 Binary files a/bin/xmem-linux-x86 and b/bin/xmem-linux-x86 differ diff --git a/src/Benchmark.cpp b/src/Benchmark.cpp index ad53e00..0466192 100644 --- a/src/Benchmark.cpp +++ b/src/Benchmark.cpp @@ -45,9 +45,6 @@ Benchmark::Benchmark( void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, @@ -62,9 +59,6 @@ Benchmark::Benchmark( _mem_array(mem_array), _len(len), _iterations(iterations), -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration(passes_per_iteration), -#endif _num_worker_threads(num_worker_threads), _mem_node(mem_node), _cpu_node(cpu_node), @@ -272,12 +266,6 @@ uint32_t Benchmark::getIterations() const { return _iterations; } -#ifdef USE_SIZE_BASED_BENCHMARKS -uint32_t Benchmark::getPassesPerIteration() const { - return _passes_per_iteration; -} -#endif - chunk_size_t Benchmark::getChunkSize() const { return _chunk_size; } diff --git a/src/BenchmarkManager.cpp b/src/BenchmarkManager.cpp index 2c5ad2b..f1c4aa6 100644 --- a/src/BenchmarkManager.cpp +++ b/src/BenchmarkManager.cpp @@ -527,16 +527,9 @@ bool BenchmarkManager::__buildBenchmarks() { //Add the throughput benchmark benchmark_name = static_cast(&(std::ostringstream() << "Test #" << g_test_index << "T (Throughput)"))->str(); -#ifdef USE_SIZE_BASED_BENCHMARKS - //Determine number of passes for each benchmark. This is working set size-dependent, to ensure the timed duration of each run is sufficiently long, but not too long. - size_t passes_per_iteration = compute_number_of_passes((mem_array_len / __config.getNumWorkerThreads()) / KB); -#endif __tp_benchmarks.push_back(new ThroughputBenchmark(mem_array, mem_array_len, __config.getIterationsPerTest(), -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif __config.getNumWorkerThreads(), mem_node, cpu_node, @@ -556,16 +549,9 @@ bool BenchmarkManager::__buildBenchmarks() { //Special case: number of worker threads is 1, only need 1 latency thread in general to do unloaded latency tests. if (__config.getNumWorkerThreads() > 1 || __lat_benchmarks.size() < 1) { benchmark_name = static_cast(&(std::ostringstream() << "Test #" << g_test_index << "L (Latency)"))->str(); -#ifdef USE_SIZE_BASED_BENCHMARKS - //Determine number of passes for each benchmark. This is working set size-dependent, to ensure the timed duration of each run is sufficiently long, but not too long. - passes_per_iteration = compute_number_of_passes((mem_array_len / __config.getNumWorkerThreads()) / KB) / 4; -#endif __lat_benchmarks.push_back(new LatencyBenchmark(mem_array, mem_array_len, __config.getIterationsPerTest(), -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif __config.getNumWorkerThreads(), mem_node, cpu_node, @@ -600,16 +586,9 @@ bool BenchmarkManager::__buildBenchmarks() { //Add the throughput benchmark benchmark_name = static_cast(&(std::ostringstream() << "Test #" << g_test_index << "T (Throughput)"))->str(); -#ifdef USE_SIZE_BASED_BENCHMARKS - //Determine number of passes for each benchmark. This is working set size-dependent, to ensure the timed duration of each run is sufficiently long, but not too long. - size_t passes_per_iteration = compute_number_of_passes((mem_array_len / __config.getNumWorkerThreads()) / KB); -#endif __tp_benchmarks.push_back(new ThroughputBenchmark(mem_array, mem_array_len, __config.getIterationsPerTest(), -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif __config.getNumWorkerThreads(), mem_node, cpu_node, @@ -628,16 +607,9 @@ bool BenchmarkManager::__buildBenchmarks() { //Special case: number of worker threads is 1, only need 1 latency thread in general to do unloaded latency tests. if (__config.getNumWorkerThreads() > 1 || __lat_benchmarks.size() < 1) { benchmark_name = static_cast(&(std::ostringstream() << "Test #" << g_test_index << "L (Latency)"))->str(); -#ifdef USE_SIZE_BASED_BENCHMARKS - //Determine number of passes for each benchmark. This is working set size-dependent, to ensure the timed duration of each run is sufficiently long, but not too long. - passes_per_iteration = compute_number_of_passes((mem_array_len / __config.getNumWorkerThreads()) / KB) / 4; -#endif __lat_benchmarks.push_back(new LatencyBenchmark(mem_array, mem_array_len, __config.getIterationsPerTest(), -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif __config.getNumWorkerThreads(), mem_node, cpu_node, @@ -697,16 +669,9 @@ bool BenchmarkManager::runExtDelayInjectedLoadedLatencyBenchmark() { std::string benchmark_name = static_cast(&(std::ostringstream() << "Test #" << g_test_index++ << "E" << EXT_NUM_DELAY_INJECTED_LOADED_LATENCY_BENCHMARK << " (Extension: Delay-Injected Loaded Latency)"))->str(); -#ifdef USE_SIZE_BASED_BENCHMARKS - //Determine number of passes for each benchmark. This is working set size-dependent, to ensure the timed duration of each run is sufficiently long, but not too long. - passes_per_iteration = compute_number_of_passes((mem_array_len / __config.getNumWorkerThreads()) / KB) / 4; -#endif del_lat_benchmarks.push_back(new DelayInjectedLoadedLatencyBenchmark(mem_array, mem_array_len, __config.getIterationsPerTest(), -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif __config.getNumWorkerThreads(), mem_node, cpu_node, diff --git a/src/LatencyBenchmark.cpp b/src/LatencyBenchmark.cpp index 945f398..7b09883 100644 --- a/src/LatencyBenchmark.cpp +++ b/src/LatencyBenchmark.cpp @@ -52,9 +52,6 @@ LatencyBenchmark::LatencyBenchmark( void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, @@ -69,9 +66,6 @@ LatencyBenchmark::LatencyBenchmark( mem_array, len, iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif num_worker_threads, mem_node, cpu_node, @@ -303,9 +297,6 @@ bool LatencyBenchmark::_run_core() { if (t == 0) { //special case: thread 0 is always latency thread workers.push_back(new LatencyWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif lat_kernel_fptr, lat_kernel_dummy_fptr, cpu_id)); @@ -313,18 +304,12 @@ bool LatencyBenchmark::_run_core() { if (_pattern_mode == SEQUENTIAL) workers.push_back(new LoadWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif load_kernel_fptr_seq, load_kernel_dummy_fptr_seq, cpu_id)); else if (_pattern_mode == RANDOM) workers.push_back(new LoadWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif load_kernel_fptr_ran, load_kernel_dummy_fptr_ran, cpu_id)); diff --git a/src/LatencyWorker.cpp b/src/LatencyWorker.cpp index c0ab2a6..9e81eec 100644 --- a/src/LatencyWorker.cpp +++ b/src/LatencyWorker.cpp @@ -49,9 +49,6 @@ using namespace xmem; LatencyWorker::LatencyWorker( void* mem_array, size_t len, - #ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, - #endif RandomFunction kernel_fptr, RandomFunction kernel_dummy_fptr, int32_t cpu_affinity @@ -59,9 +56,6 @@ LatencyWorker::LatencyWorker( MemoryWorker( mem_array, len, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif cpu_affinity ), __kernel_fptr(kernel_fptr), @@ -87,22 +81,14 @@ void LatencyWorker::run() { tick_t elapsed_dummy_ticks = 0; tick_t adjusted_ticks = 0; bool warning = false; - -#ifdef USE_TIME_BASED_BENCHMARKS void* mem_array = NULL; size_t len = 0; tick_t target_ticks = g_ticks_per_ms * BENCHMARK_DURATION_MS; //Rough target run duration in ticks -#endif //Grab relevant setup state thread-safely and keep it local if (_acquireLock(-1)) { -#ifdef USE_TIME_BASED_BENCHMARKS mem_array = _mem_array; len = _len; -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - passes = _passes_per_iteration; -#endif bytes_per_pass = LATENCY_BENCHMARK_UNROLL_LENGTH * 8; cpu_affinity = _cpu_affinity; kernel_fptr = __kernel_fptr; @@ -134,7 +120,6 @@ void LatencyWorker::run() { } //Run benchmark -#ifdef USE_TIME_BASED_BENCHMARKS //Run actual version of function and loop overhead next_address = static_cast(mem_array); while (elapsed_ticks < target_ticks) { @@ -154,25 +139,6 @@ void LatencyWorker::run() { elapsed_dummy_ticks += (stop_tick - start_tick); p+=256; } -#endif - -#ifdef USE_SIZE_BASED_BENCHMARKS - //Time actual version of function and loop overhead - next_address = static_cast(mem_array); - start_tick = start_timer(); - for (p = 0; p < passes; p++) - (*kernel_fptr)(next_address, &next_address, len); - stop_tick = stop_timer(); - elapsed_ticks += (start_tick - stop_tick); - - //Time dummy version of function and loop overhead - next_address = static_cast(_mem_array); - start_tick = start_timer(); - for (p = 0; p < passes; p++) - (*kernel_dummy_fptr)(next_address, &next_address, len); - stop_tick = stop_timer(); - elapsed_dummy_ticks += (start_tick - stop_tick); -#endif adjusted_ticks = elapsed_ticks - elapsed_dummy_ticks; diff --git a/src/LoadWorker.cpp b/src/LoadWorker.cpp index 94c1495..9bad5df 100644 --- a/src/LoadWorker.cpp +++ b/src/LoadWorker.cpp @@ -49,9 +49,6 @@ using namespace xmem; LoadWorker::LoadWorker( void* mem_array, size_t len, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif SequentialFunction kernel_fptr, SequentialFunction kernel_dummy_fptr, int32_t cpu_affinity @@ -59,9 +56,6 @@ LoadWorker::LoadWorker( MemoryWorker( mem_array, len, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif cpu_affinity ), __use_sequential_kernel_fptr(true), @@ -75,9 +69,6 @@ LoadWorker::LoadWorker( LoadWorker::LoadWorker( void* mem_array, size_t len, - #ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, - #endif RandomFunction kernel_fptr, RandomFunction kernel_dummy_fptr, int32_t cpu_affinity @@ -85,9 +76,6 @@ LoadWorker::LoadWorker( MemoryWorker( mem_array, len, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif cpu_affinity ), __use_sequential_kernel_fptr(false), @@ -121,25 +109,16 @@ void LoadWorker::run() { tick_t elapsed_dummy_ticks = 0; tick_t adjusted_ticks = 0; bool warning = false; - -#ifdef USE_TIME_BASED_BENCHMARKS void* mem_array = NULL; size_t len = 0; tick_t target_ticks = g_ticks_per_ms * BENCHMARK_DURATION_MS; //Rough target run duration in ticks uint32_t p = 0; bytes_per_pass = THROUGHPUT_BENCHMARK_BYTES_PER_PASS; -#endif //Grab relevant setup state thread-safely and keep it local if (_acquireLock(-1)) { -#ifdef USE_TIME_BASED_BENCHMARKS mem_array = _mem_array; len = _len; -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - bytes_per_pass = _len; - passes = _passes_per_iteration; -#endif cpu_affinity = _cpu_affinity; use_sequential_kernel_fptr = __use_sequential_kernel_fptr; kernel_fptr_seq = __kernel_fptr_seq; @@ -176,7 +155,6 @@ void LoadWorker::run() { //Run the benchmark! uintptr_t* next_address = static_cast(mem_array); -#ifdef USE_TIME_BASED_BENCHMARKS //Run actual version of function and loop overhead while (elapsed_ticks < target_ticks) { if (use_sequential_kernel_fptr) { //sequential function semantics @@ -221,38 +199,6 @@ void LoadWorker::run() { elapsed_dummy_ticks += (stop_tick - start_tick); } -#endif - -#ifdef USE_SIZE_BASED_BENCHMARKS - next_address = static_cast(mem_array); - if (use_sequential_kernel_fptr) { //sequential function semantics - start_tick = start_timer(); - for (uint32_t p = 0; p < __passes_per_iteration; p++) - (*kernel_fptr_seq)(start_address, end_address); - stop_tick = stop_timer(); - } else { //random function semantics - start_tick = start_timer(); - for (uint32_t p = 0; p < __passes_per_iteration; p++) - (*kernel_fptr_ran)(next_address, &next_address, bytes_per_pass); - stop_tick = stop_timer(); - } - elapsed_ticks = stop_tick - start_tick; - - //Time dummy version of function and loop overhead - next_address = static_cast(mem_array); - if (use_sequential_kernel_fptr) { //sequential function semantics - start_tick = start_timer(); - for (uint32_t p = 0; p < __passes_per_iteration; p++) - (*kernel_dummy_fptr_seq)(start_address, end_address); - stop_tick = stop_timer(); - } else { //random function semantics - start_tick = start_timer(); - for (uint32_t p = 0; p < __passes_per_iteration; p++) - (*kernel_dummy_fptr_ran)(next_address, &next_address, bytes_per_pass); - stop_tick = stop_timer(); - } - elapsed_dummy_ticks = stop_tick - start_tick; -#endif //Unset processor affinity if (locked) diff --git a/src/MemoryWorker.cpp b/src/MemoryWorker.cpp index 396b0cd..741aa31 100644 --- a/src/MemoryWorker.cpp +++ b/src/MemoryWorker.cpp @@ -36,9 +36,6 @@ using namespace xmem; MemoryWorker::MemoryWorker( void* mem_array, size_t len, - #ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, - #endif int32_t cpu_affinity ) : _mem_array(mem_array), @@ -50,9 +47,6 @@ MemoryWorker::MemoryWorker( _elapsed_dummy_ticks(0), _adjusted_ticks(0), _warning(false), -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration(passes_per_iteration), -#endif _completed(false) { } diff --git a/src/README.md b/src/README.md index d899c6a..2a853ca 100644 --- a/src/README.md +++ b/src/README.md @@ -1,7 +1,7 @@ README ------------------------------------------------------------------------------------------------------------ -X-Mem: Extensible Memory Benchmarking Tool v2.1.13 +X-Mem: Extensible Memory Benchmarking Tool v2.1.14 ------------------------------------------------------------------------------------------------------------ The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power. @@ -10,7 +10,7 @@ Originally authored by Mark Gottscho (Email: ) as a Summer 2 This project is under active development. Stay tuned for more updates. -PROJECT REVISION DATE: April 22, 2015. +PROJECT REVISION DATE: April 23, 2015. ------------------------------------------------------------------------------------------------------------ LICENSE @@ -54,9 +54,8 @@ Extensibility: modularity via C++ object-oriented principles - Supports rapid addition of new benchmark kernel routines - Example: stream triad algorithm, impact of false sharing, etc. are possible with minor changes -Cross-platform: Currently implemented for Windows and GNU/Linux on x86-64, x86-64 with AVX extensions CPUs +Cross-platform: Currently implemented for Windows and GNU/Linux on x86, x86-64, and x86-64 with AVX extensions CPUs - Designed to allow straightforward porting to other operating systems and ISAs - - 32-bit x86 port under development - ARM port under development Memory throughput: @@ -92,9 +91,8 @@ There are a few runtime prerequisites in order for the software to run correctly HARDWARE: -- Intel x86-64 CPU with optional support for AVX extensions. AMD CPUs should also work although this has not been tested. -- COMING SOON: Intel 32-bit x86 CPU -- COMING SOON: ARMv7 CPUs +- Intel x86 or x86-64 CPU with optional support for AVX extensions. AMD CPUs should also work although this has not been tested. +- COMING SOON: ARM CPUs WINDOWS: @@ -109,7 +107,7 @@ WINDOWS: GNU/LINUX: -- GNU utilities with support for C++11. Tested with gcc 4.8.2 on Ubuntu 14.04 LTS for x86-64 CPU. +- GNU utilities with support for C++11. Tested with gcc 4.8.2 on Ubuntu 14.04 LTS for x86-64 CPU with AVX. - libhugetlbfs. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install libhugetlbfs0". - Potentially, administrator privileges, if you plan to use the --large_pages option. - During runtime, if the --large_pages option is selected, you may need to first manually ensure that large pages are available from the OS. This can be done by running "hugeadm --pool-list". It is recommended to set minimum pool to 1GB (in order to measure DRAM effectively). If needed, this can be done by running "hugeadm --pool-pages-min 2MB:512". Alternatively, run the linux_setup_runtime_hugetlbfs.sh script that is provided with X-Mem. @@ -313,7 +311,7 @@ WINDOWS: GNU/LINUX: -- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64. +- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64 with AVX. - Python 2.7. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install python2.7". You may need some other Python 2.7 packages as well. - SCons build system. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install scons". Build tested with SCons 2.3.4. - Kernel support for large (huge) pages. This support can be verified on your Linux installation by running "grep hugetlbfs /proc/filesystems". If you do not have huge page support in your kernel, you can build a kernel with the appropriate options switched on: "CONFIG_HUGETLB_PAGE" and "CONFIG_HUGETLBFS". diff --git a/src/ThroughputBenchmark.cpp b/src/ThroughputBenchmark.cpp index 92699da..56217ac 100644 --- a/src/ThroughputBenchmark.cpp +++ b/src/ThroughputBenchmark.cpp @@ -44,9 +44,6 @@ ThroughputBenchmark::ThroughputBenchmark( void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, @@ -61,9 +58,6 @@ ThroughputBenchmark::ThroughputBenchmark( mem_array, len, iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif num_worker_threads, mem_node, cpu_node, @@ -141,18 +135,12 @@ bool ThroughputBenchmark::_run_core() { if (_pattern_mode == SEQUENTIAL) workers.push_back(new LoadWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif kernel_fptr_seq, kernel_dummy_fptr_seq, cpu_id)); else if (_pattern_mode == RANDOM) workers.push_back(new LoadWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif kernel_fptr_ran, kernel_dummy_fptr_ran, cpu_id)); diff --git a/src/benchmark_kernels.cpp b/src/benchmark_kernels.cpp index d85e0e7..5f7af44 100644 --- a/src/benchmark_kernels.cpp +++ b/src/benchmark_kernels.cpp @@ -44,24 +44,66 @@ #include #include #include -#if defined(__gnu_linux__) && defined(ARCH_INTEL_X86_64) -#include //for Intel AVX intrinsics +#if defined(__gnu_linux__) && defined(ARCH_INTEL_X86_64) && (defined(HAS_WORD_128) || defined(HAS_WORD_256)) +//Intel intrinsics +#include +#include +#include #endif -#ifdef ARCH_INTEL_X86_64 -#define my_32b_set_128b_word(a, b, c, d) _mm_set_epi32x(a, b, c, d) -#define my_32b_set_256b_word(a, b, c, d, e, f, g, h) _mm256_set_epi32x(a, b, c, d, e, f, g, h) -#define my_64b_set_128b_word(a, b) _mm_set_epi64x(a, b) -#define my_64b_set_256b_word(a, b, c, d) _mm256_set_epi64x(a, b, c, d) +#if defined(__gnu_linux__) && defined(ARCH_INTEL_X86_64) && (defined(HAS_WORD_128) || defined(HAS_WORD_256)) +#define my_32b_set_128b_word(a, b, c, d) _mm_set_epi32(a, b, c, d) //SSE2 intrinsic, corresponds to ??? instruction. Header: emmintrin.h +#define my_32b_set_256b_word(a, b, c, d, e, f, g, h) _mm256_set_epi32(a, b, c, d, e, f, g, h) //AVX intrinsic, corresponds to ??? instruction. Header: immintrin.h +#define my_64b_set_128b_word(a, b) _mm_set_epi64x(a, b) //SSE2 intrinsic, corresponds to ??? instruction. Header: emmintrin.h +#define my_64b_set_256b_word(a, b, c, d) _mm256_set_epi64x(a, b, c, d) //AVX intrinsic, corresponds to ??? instruction. Header: immintrin.h -#define my_32b_extractLSB_128b(w) _mm_extract_epi32(w, 0) -#define my_32b_extractLSB_256b(w) _mm256_extract_epi32(w, 0) -#define my_64b_extractLSB_128b(w) _mm_extract_epi64(w, 0) -#define my_64b_extractLSB_256b(w) _mm256_extract_epi64(w, 0) +#define my_32b_extractLSB_128b(w) _mm_extract_epi32(w, 0) //SSE 4.1 intrinsic, corresponds to "pextrd" instruction. Header: smmintrin.h +#define my_32b_extractLSB_256b(w) _mm256_extract_epi32(w, 0) //AVX intrinsic, corresponds to ??? instruction. Header: immintrin.h +#define my_64b_extractLSB_128b(w) _mm_extract_epi64(w, 0) //SSE 4.1 intrinsic, corresponds to "pextrq" instruction. Header: smmintrin.h +#define my_64b_extractLSB_256b(w) _mm256_extract_epi64(w, 0) //AVX intrinsic, corresponds to ??? instruction. Header: immintrin.h #endif #ifdef ARCH_ARM -#error TODO: ARM intrinsics +#error TODO: ARM intrinsics? +#endif + +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) +/* Hand-coded assembly functions for the 128-bit and 256-bit benchmark kernels on Windows x86-64 where applicable. + * These are needed because the VC++ compiler, like gcc, optimizes away the vector instructions unless I use the volatile keyword. + * But I can't use something like volatile Word256_t* because it is incompatible with _mm_load_si256() with VC++, etc. + * Then, an alternative is to use inline assembly for Windows. However, the VC++ compiler does not support inline assembly in x86-64! + * The only remaining choice for this unfortunate circumstance is to use hand-coded assembly on Windows x86-64 builds. + * In this special case, I implemented the routine as a wrapper around the assembler function. + */ + +#ifdef HAS_WORD_128 +//128-bit +extern "C" int win_x86_64_asm_forwSequentialRead_Word128(Word128_t* first_word, Word128_t* last_word); +extern "C" int win_x86_64_asm_revSequentialRead_Word128(Word128_t* last_word, Word128_t* first_word); +extern "C" int win_x86_64_asm_forwSequentialWrite_Word128(Word128_t* first_word, Word128_t* last_word); +extern "C" int win_x86_64_asm_revSequentialWrite_Word128(Word128_t* last_word, Word128_t* first_word); +#endif + +#ifdef HAS_WORD_256 +//256-bit +extern "C" int win_x86_64_asm_forwSequentialRead_Word256(Word256_t* first_word, Word256_t* last_word); +extern "C" int win_x86_64_asm_revSequentialRead_Word256(Word256_t* last_word, Word256_t* first_word); +extern "C" int win_x86_64_asm_forwSequentialWrite_Word256(Word256_t* first_word, Word256_t* last_word); +extern "C" int win_x86_64_asm_revSequentialWrite_Word256(Word256_t* last_word, Word256_t* first_word); +#endif + +//Dummies +#ifdef HAS_WORD_128 +//128-bit +extern "C" int win_x86_64_asm_dummy_forwSequentialLoop_Word128(Word128_t* first_word, Word128_t* last_word); +extern "C" int win_x86_64_asm_dummy_revSequentialLoop_Word128(Word128_t* first_word, Word128_t* last_word); +#endif + +#ifdef HAS_WORD_256 +//256-bit +extern "C" int win_x86_64_asm_dummy_forwSequentialLoop_Word256(Word256_t* first_word, Word256_t* last_word); +extern "C" int win_x86_64_asm_dummy_revSequentialLoop_Word256(Word256_t* first_word, Word256_t* last_word); +#endif #endif using namespace xmem; @@ -588,63 +630,15 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address, std::mt19937_64 gen(time(NULL)); //Mersenne Twister random number generator, seeded at current time - /* - //TODO: probably remove this. - //Build a random directed Hamiltonian Cycle across the memory region - - //Let W be the list of memory locations that have not been reached yet. Each entry is an index in mem_base. - std::vector W; - size_t w_index = 0; - - //Initialize W to contain all memory locations, where each memory location appears exactly once in the list. The order does not strictly matter. - W.resize(num_pointers); - for (w_index = 0; w_index < num_pointers; w_index++) { - W.at(w_index) = w_index; - } - - //Build the directed Hamiltonian Cycle - size_t v = 0; //the current memory location. Always start at the first location for the Hamiltonian Cycle construction - size_t w = 0; //the next memory location - w_index = 0; - while (W.size() > 0) { //while we have not reached all memory locations - W.erase(W.begin() + w_index); - - //Normal case - if (W.size() > 0) { - //Choose the next w_index at random from W - w_index = gen() % W.size(); - - //Extract the memory location corresponding to this w_index - w = W[w_index]; - } else { //Last element visited needs to point at head of memory to complete the cycle - w = 0; - } - - //Create pointer v --> w. This corresponds to a directed edge in the graph with nodes v and w. - mem_region_base[v] = reinterpret_cast(mem_region_base + w); - - //Chase this pointer to move to next step - v = w; - } - */ - - //Do a random shuffle of memory pointers -#ifndef HAS_WORD_64 //special case for 32-bit architectures - Word32_t* mem_region_base = reinterpret_cast(start_address); -#endif + //Do a random shuffle of memory pointers. + //I had originally used a random Hamiltonian Cycle generator, but this was much slower and aside from + //rare instances, did not make any difference in random-access performance measurement. #ifdef HAS_WORD_64 Word64_t* mem_region_base = reinterpret_cast(start_address); +#else //special case for 32-bit architectures + Word32_t* mem_region_base = reinterpret_cast(start_address); #endif switch (chunk_size) { - //special case for 32-bit architectures -#ifndef HAS_WORD_64 - case CHUNK_32b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) - mem_region_base[i] = reinterpret_cast(mem_region_base+i); - } - std::shuffle(mem_region_base, mem_region_base + num_pointers, gen); - break; -#endif #ifdef HAS_WORD_64 case CHUNK_64b: for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) @@ -652,19 +646,25 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address, } std::shuffle(mem_region_base, mem_region_base + num_pointers, gen); break; +#else //special case for 32-bit architectures + case CHUNK_32b: + for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) + mem_region_base[i] = reinterpret_cast(mem_region_base+i); + } + std::shuffle(mem_region_base, mem_region_base + num_pointers, gen); + break; #endif #ifdef HAS_WORD_128 case CHUNK_128b: for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) -#ifndef HAS_WORD_64 //special case for 32-bit architectures +#ifdef HAS_WORD_64 + mem_region_base[i*2] = reinterpret_cast(mem_region_base+(i*2)); + mem_region_base[(i*2)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 64 bits +#else //special case for 32-bit architectures mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); mem_region_base[(i*4)+1] = 0xFFFFFFFF; //1-fill upper 96 bits mem_region_base[(i*4)+2] = 0xFFFFFFFF; mem_region_base[(i*4)+3] = 0xFFFFFFFF; -#endif -#ifdef HAS_WORD_64 - mem_region_base[i*2] = reinterpret_cast(mem_region_base+(i*2)); - mem_region_base[(i*2)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 64 bits #endif } std::shuffle(reinterpret_cast(mem_region_base), reinterpret_cast(mem_region_base) + num_pointers, gen); @@ -673,7 +673,12 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address, #ifdef HAS_WORD_256 case CHUNK_256b: for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) -#ifndef HAS_WORD_64 //special case for 32-bit architectures +#ifdef HAS_WORD_64 + mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); + mem_region_base[(i*4)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 192 bits + mem_region_base[(i*4)+2] = 0xFFFFFFFFFFFFFFFF; + mem_region_base[(i*4)+3] = 0xFFFFFFFFFFFFFFFF; +#else //special case for 32-bit architectures mem_region_base[i*8] = reinterpret_cast(mem_region_base+(i*8)); mem_region_base[(i*8)+1] = 0xFFFFFFFF; //1-fill upper 224 bits mem_region_base[(i*8)+2] = 0xFFFFFFFF; @@ -682,12 +687,6 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address, mem_region_base[(i*8)+5] = 0xFFFFFFFF; mem_region_base[(i*8)+6] = 0xFFFFFFFF; mem_region_base[(i*8)+7] = 0xFFFFFFFF; -#endif -#ifdef HAS_WORD_64 - mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); - mem_region_base[(i*4)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 192 bits - mem_region_base[(i*4)+2] = 0xFFFFFFFFFFFFFFFF; - mem_region_base[(i*4)+3] = 0xFFFFFFFFFFFFFFFF; #endif } std::shuffle(reinterpret_cast(mem_region_base), reinterpret_cast(mem_region_base) + num_pointers, gen); @@ -716,10 +715,6 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address, int32_t xmem::dummy_chasePointers(uintptr_t*, uintptr_t**, size_t len) { volatile uintptr_t placeholder = 0; //Try to defeat compiler optimizations removing this method -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) - placeholder = 0; -#endif return 0; } @@ -727,15 +722,7 @@ int32_t xmem::dummy_chasePointers(uintptr_t*, uintptr_t**, size_t len) { int32_t xmem::chasePointers(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { volatile uintptr_t* p = first_address; - -#ifdef USE_TIME_BASED_BENCHMARKS UNROLL512(p = reinterpret_cast(*p);) -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) { - UNROLL512(p = reinterpret_cast(*p);) - } -#endif *last_touched_address = const_cast(p); return 0; } @@ -747,45 +734,6 @@ int32_t xmem::chasePointers(uintptr_t* first_address, uintptr_t** last_touched_a *********************************************************************** ***********************************************************************/ -#if defined(_WIN32) && defined(ARCH_INTEL) -//TODO: will we need ARM asm on Win? - -//Hand-coded assembly functions for the SSE2/AVX benchmark routines. -//VC++ compiler does not support inline assembly in x86-64. -//And the compiler optimizes away the vector instructions unless I use volatile. -//But I can't use for example volatile Word256_t* because it is incompatible with _mm_load_si256() with VC++. -//Fortunately, I implemented the routine as a wrapper around a hand-coded assembler C function. - -#ifdef HAS_WORD_128 -//128-bit -extern "C" int win_asm_forwSequentialRead_Word128(Word128_t* first_word, Word128_t* last_word); -extern "C" int win_asm_revSequentialRead_Word128(Word128_t* last_word, Word128_t* first_word); -extern "C" int win_asm_forwSequentialWrite_Word128(Word128_t* first_word, Word128_t* last_word); -extern "C" int win_asm_revSequentialWrite_Word128(Word128_t* last_word, Word128_t* first_word); -#endif - -#ifdef HAS_WORD_256 -//256-bit -extern "C" int win_asm_forwSequentialRead_Word256(Word256_t* first_word, Word256_t* last_word); -extern "C" int win_asm_revSequentialRead_Word256(Word256_t* last_word, Word256_t* first_word); -extern "C" int win_asm_forwSequentialWrite_Word256(Word256_t* first_word, Word256_t* last_word); -extern "C" int win_asm_revSequentialWrite_Word256(Word256_t* last_word, Word256_t* first_word); -#endif - -//Dummies -#ifdef HAS_WORD_128 -//128-bit -extern "C" int win_asm_dummy_forwSequentialLoop_Word128(Word128_t* first_word, Word128_t* last_word); -extern "C" int win_asm_dummy_revSequentialLoop_Word128(Word128_t* first_word, Word128_t* last_word); -#endif - -#ifdef HAS_WORD_256 -//256-bit -extern "C" int win_asm_dummy_forwSequentialLoop_Word256(Word256_t* first_word, Word256_t* last_word); -extern "C" int win_asm_dummy_revSequentialLoop_Word256(Word256_t* first_word, Word256_t* last_word); -#endif -#endif - /* --------------------- DUMMY BENCHMARK ROUTINES --------------------------- */ int32_t xmem::dummy_empty(void*, void*) { @@ -816,8 +764,8 @@ int32_t xmem::dummy_forwSequentialLoop_Word64(void* start_address, void* end_add #ifdef HAS_WORD_128 int32_t xmem::dummy_forwSequentialLoop_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_dummy_forwSequentialLoop_Word128(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_dummy_forwSequentialLoop_Word128(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ volatile int32_t placeholder = 0; //Try our best to defeat compiler optimizations @@ -832,8 +780,8 @@ int32_t xmem::dummy_forwSequentialLoop_Word128(void* start_address, void* end_ad #ifdef HAS_WORD_256 int32_t xmem::dummy_forwSequentialLoop_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_dummy_forwSequentialLoop_Word256(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_dummy_forwSequentialLoop_Word256(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ volatile int32_t placeholder = 0; //Try our best to defeat compiler optimizations @@ -868,8 +816,8 @@ int32_t xmem::dummy_revSequentialLoop_Word64(void* start_address, void* end_addr #ifdef HAS_WORD_128 int32_t xmem::dummy_revSequentialLoop_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_dummy_revSequentialLoop_Word128(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_dummy_revSequentialLoop_Word128(static_cast(end_address), static_cast(start_address)); #endif volatile int32_t placeholder = 0; //Try our best to defeat compiler optimizations for (volatile Word128_t* wordptr = static_cast(end_address), *begptr = static_cast(start_address); wordptr > begptr;) { @@ -882,8 +830,8 @@ int32_t xmem::dummy_revSequentialLoop_Word128(void* start_address, void* end_add #ifdef HAS_WORD_256 int32_t xmem::dummy_revSequentialLoop_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_dummy_revSequentialLoop_Word256(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_dummy_revSequentialLoop_Word256(static_cast(end_address), static_cast(start_address)); #endif #ifdef __gnu_linux__ volatile int32_t placeholder = 0; //Try our best to defeat compiler optimizations @@ -1341,12 +1289,6 @@ int32_t xmem::dummy_revStride16Loop_Word256(void* start_address, void* end_addre #ifndef HAS_WORD_64 //special case: 32-bit architectures int32_t xmem::dummy_randomLoop_Word32(uintptr_t*, uintptr_t**, size_t len) { volatile uintptr_t* placeholder = NULL; //Try to defeat compiler optimizations removing this method - -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) - placeholder = NULL; -#endif - return 0; } #endif @@ -1354,19 +1296,13 @@ int32_t xmem::dummy_randomLoop_Word32(uintptr_t*, uintptr_t**, size_t len) { #ifdef HAS_WORD_64 int32_t xmem::dummy_randomLoop_Word64(uintptr_t*, uintptr_t**, size_t len) { volatile uintptr_t* placeholder = NULL; //Try to defeat compiler optimizations removing this method - -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) - placeholder = NULL; -#endif - return 0; } #endif #ifdef HAS_WORD_128 int32_t xmem::dummy_randomLoop_Word128(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1376,24 +1312,12 @@ int32_t xmem::dummy_randomLoop_Word128(uintptr_t* first_address, uintptr_t** las volatile uintptr_t val_extract; val = my_64b_set_128b_word(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF); -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machines UNROLL256(val_extract = my_32b_extractLSB_128b(val);) //Extract 32 LSB. #endif #ifdef HAS_WORD_64 UNROLL256(val_extract = my_64b_extractLSB_128b(val);) //Extract 64 LSB. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word128_t); i += 256) { -#ifndef HAS_WORD_64 //special case: 32-bit machines - UNROLL256(val_extract = my_32b_extractLSB_128b(val);) //Extract 32 LSB. -#endif -#ifdef HAS_WORD_64 - UNROLL256(val_extract = my_64b_extractLSB_128b(val);) //Extract 64 LSB. -#endif - } -#endif return 0; #endif @@ -1402,7 +1326,7 @@ int32_t xmem::dummy_randomLoop_Word128(uintptr_t* first_address, uintptr_t** las #ifdef HAS_WORD_256 int32_t xmem::dummy_randomLoop_Word256(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1412,24 +1336,12 @@ int32_t xmem::dummy_randomLoop_Word256(uintptr_t* first_address, uintptr_t** las volatile uintptr_t val_extract; val = my_64b_set_256b_word(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF); -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machines UNROLL128(val_extract = my_32b_extractLSB_256b(val);) //Extract 32 LSB. #endif #ifdef HAS_WORD_64 UNROLL128(val_extract = my_64b_extractLSB_256b(val);) //Extract 64 LSB. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word256_t); i += 128) { -#ifndef HAS_WORD_64 //special case: 32-bit machines - UNROLL128(val_extract = my_32b_extractLSB_256b(val);) //Extract 32 LSB. -#endif -#ifdef HAS_WORD_64 - UNROLL128(val_extract = my_64b_extractLSB_256b(val);) //Extract 64 LSB. -#endif - } -#endif return 0; #endif @@ -1472,8 +1384,8 @@ int32_t xmem::forwSequentialRead_Word64(void* start_address, void* end_address) #ifdef HAS_WORD_128 int32_t xmem::forwSequentialRead_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_forwSequentialRead_Word128(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_forwSequentialRead_Word128(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ register Word128_t val; @@ -1487,8 +1399,8 @@ int32_t xmem::forwSequentialRead_Word128(void* start_address, void* end_address) #ifdef HAS_WORD_256 int32_t xmem::forwSequentialRead_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_forwSequentialRead_Word256(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_forwSequentialRead_Word256(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ register Word256_t val; @@ -1520,8 +1432,8 @@ int32_t xmem::revSequentialRead_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revSequentialRead_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_revSequentialRead_Word128(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_revSequentialRead_Word128(static_cast(end_address), static_cast(start_address)); #endif #ifdef __gnu_linux__ register Word128_t val; @@ -1535,8 +1447,8 @@ int32_t xmem::revSequentialRead_Word128(void* start_address, void* end_address) #ifdef HAS_WORD_256 int32_t xmem::revSequentialRead_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_revSequentialRead_Word256(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_revSequentialRead_Word256(static_cast(end_address), static_cast(start_address)); #endif #ifdef __gnu_linux__ register Word256_t val; @@ -1570,8 +1482,8 @@ int32_t xmem::forwSequentialWrite_Word64(void* start_address, void* end_address) #ifdef HAS_WORD_128 int32_t xmem::forwSequentialWrite_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_forwSequentialWrite_Word128(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_forwSequentialWrite_Word128(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ register Word128_t val; @@ -1586,8 +1498,8 @@ int32_t xmem::forwSequentialWrite_Word128(void* start_address, void* end_address #ifdef HAS_WORD_256 int32_t xmem::forwSequentialWrite_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_forwSequentialWrite_Word256(static_cast(start_address), static_cast(end_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_forwSequentialWrite_Word256(static_cast(start_address), static_cast(end_address)); #endif #ifdef __gnu_linux__ register Word256_t val; @@ -1620,8 +1532,8 @@ int32_t xmem::revSequentialWrite_Word64(void* start_address, void* end_address) #ifdef HAS_WORD_128 int32_t xmem::revSequentialWrite_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_revSequentialWrite_Word128(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_revSequentialWrite_Word128(static_cast(end_address), static_cast(start_address)); #endif #ifdef __gnu_linux__ register Word128_t val; @@ -1636,8 +1548,8 @@ int32_t xmem::revSequentialWrite_Word128(void* start_address, void* end_address) #ifdef HAS_WORD_256 int32_t xmem::revSequentialWrite_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 - return win_asm_revSequentialWrite_Word256(static_cast(end_address), static_cast(start_address)); +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) + return win_x86_64_asm_revSequentialWrite_Word256(static_cast(end_address), static_cast(start_address)); #endif #ifdef __gnu_linux__ register Word256_t val; @@ -1680,7 +1592,7 @@ int32_t xmem::forwStride2Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride2Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1699,7 +1611,7 @@ int32_t xmem::forwStride2Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride2Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1744,7 +1656,7 @@ int32_t xmem::revStride2Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride2Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1763,7 +1675,7 @@ int32_t xmem::revStride2Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride2Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1810,7 +1722,7 @@ int32_t xmem::forwStride2Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride2Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1830,7 +1742,7 @@ int32_t xmem::forwStride2Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride2Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1876,7 +1788,7 @@ int32_t xmem::revStride2Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride2Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1896,7 +1808,7 @@ int32_t xmem::revStride2Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride2Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1944,7 +1856,7 @@ int32_t xmem::forwStride4Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride4Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -1963,7 +1875,7 @@ int32_t xmem::forwStride4Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride4Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2008,7 +1920,7 @@ int32_t xmem::revStride4Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride4Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2027,7 +1939,7 @@ int32_t xmem::revStride4Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride4Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2074,7 +1986,7 @@ int32_t xmem::forwStride4Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride4Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2094,7 +2006,7 @@ int32_t xmem::forwStride4Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride4Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2140,7 +2052,7 @@ int32_t xmem::revStride4Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride4Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2160,7 +2072,7 @@ int32_t xmem::revStride4Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride4Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2208,7 +2120,7 @@ int32_t xmem::forwStride8Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride8Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2227,7 +2139,7 @@ int32_t xmem::forwStride8Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride8Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2272,7 +2184,7 @@ int32_t xmem::revStride8Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride8Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2291,7 +2203,7 @@ int32_t xmem::revStride8Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride8Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2338,7 +2250,7 @@ int32_t xmem::forwStride8Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride8Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2358,7 +2270,7 @@ int32_t xmem::forwStride8Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride8Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2404,7 +2316,7 @@ int32_t xmem::revStride8Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride8Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2424,7 +2336,7 @@ int32_t xmem::revStride8Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride8Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2472,7 +2384,7 @@ int32_t xmem::forwStride16Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride16Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2491,7 +2403,7 @@ int32_t xmem::forwStride16Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::forwStride16Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2536,7 +2448,7 @@ int32_t xmem::revStride16Read_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride16Read_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2555,7 +2467,7 @@ int32_t xmem::revStride16Read_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride16Read_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2602,7 +2514,7 @@ int32_t xmem::forwStride16Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::forwStride16Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; #endif #ifdef __gnu_linux__ @@ -2622,7 +2534,7 @@ int32_t xmem::forwStride16Write_Word128(void* start_address, void* end_address) #ifdef HAS_WORD_256 int32_t xmem::forwStride16Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; #endif #ifdef __gnu_linux__ @@ -2670,7 +2582,7 @@ int32_t xmem::revStride16Write_Word64(void* start_address, void* end_address) { #ifdef HAS_WORD_128 int32_t xmem::revStride16Write_Word128(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2691,7 +2603,7 @@ int32_t xmem::revStride16Write_Word128(void* start_address, void* end_address) { #ifdef HAS_WORD_256 int32_t xmem::revStride16Write_Word256(void* start_address, void* end_address) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ @@ -2716,14 +2628,7 @@ int32_t xmem::revStride16Write_Word256(void* start_address, void* end_address) { int32_t xmem::randomRead_Word32(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { volatile uintptr_t* p = first_address; -#ifdef USE_TIME_BASED_BENCHMARKS UNROLL1024(p = reinterpret_cast(*p);) -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) { - UNROLL1024(p = reinterpret_cast(*p);) - } -#endif *last_touched_address = const_cast(p); return 0; } @@ -2733,14 +2638,7 @@ int32_t xmem::randomRead_Word32(uintptr_t* first_address, uintptr_t** last_touch int32_t xmem::randomRead_Word64(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { volatile uintptr_t* p = first_address; -#ifdef USE_TIME_BASED_BENCHMARKS UNROLL512(p = reinterpret_cast(*p);) -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) { - UNROLL512(p = reinterpret_cast(*p);) - } -#endif *last_touched_address = const_cast(p); return 0; } @@ -2748,31 +2646,19 @@ int32_t xmem::randomRead_Word64(uintptr_t* first_address, uintptr_t** last_touch #ifdef HAS_WORD_128 int32_t xmem::randomRead_Word128(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ volatile Word128_t* p = reinterpret_cast(first_address); register Word128_t val; -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machine UNROLL256(val = *p; p = reinterpret_cast(my_32b_extractLSB_128b(val));) //Do 128-bit load. Then extract 32 LSB to use as next load address. #endif #ifdef HAS_WORD_64 UNROLL256(val = *p; p = reinterpret_cast(my_64b_extractLSB_128b(val));) //Do 128-bit load. Then extract 64 LSB to use as next load address. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word128_t); i += 256) { -#ifndef HAS_WORD_64 //special case: 32-bit machine - UNROLL256(val = *p; p = reinterpret_cast(my_32b_extractLSB_128b(val));) //Do 128-bit load. Then extract 32 LSB to use as next load address. -#endif -#ifdef HAS_WORD_64 - UNROLL256(val = *p; p = reinterpret_cast(my_64b_extractLSB_128b(val));) //Do 128-bit load. Then extract 64 LSB to use as next load address. -#endif - } -#endif *last_touched_address = reinterpret_cast(const_cast(p)); //Trick compiler. First get rid of volatile qualifier, and then reinterpret pointer return 0; @@ -2782,31 +2668,19 @@ int32_t xmem::randomRead_Word128(uintptr_t* first_address, uintptr_t** last_touc #ifdef HAS_WORD_256 int32_t xmem::randomRead_Word256(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ volatile Word256_t* p = reinterpret_cast(first_address); register Word256_t val; -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machine UNROLL128(val = *p; p = reinterpret_cast(my_32b_extractLSB_256b(val));) //Do 256-bit load. Then extract 32 LSB to use as next load address. #endif #ifdef HAS_WORD_64 UNROLL128(val = *p; p = reinterpret_cast(my_64b_extractLSB_256b(val));) //Do 256-bit load. Then extract 64 LSB to use as next load address. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word256_t); i += 128) { -#ifndef HAS_WORD_64 //special case: 32-bit machine - UNROLL128(val = *p; p = reinterpret_cast(my_32b_extractLSB_256b(val));) //Do 256-bit load. Then extract 32 LSB to use as next load address. -#endif -#ifdef HAS_WORD_64 - UNROLL128(val = *p; p = reinterpret_cast(my_64b_extractLSB_256b(val));) //Do 256-bit load. Then extract 64 LSB to use as next load address. -#endif - } -#endif *last_touched_address = reinterpret_cast(const_cast(p)); //Trick compiler. First get rid of volatile qualifier, and then reinterpret pointer return 0; @@ -2821,14 +2695,7 @@ int32_t xmem::randomWrite_Word32(uintptr_t* first_address, uintptr_t** last_touc volatile uintptr_t* p = first_address; volatile uintptr_t* p2 = NULL; -#ifdef USE_TIME_BASED_BENCHMARKS UNROLL1024(p2 = reinterpret_cast(*p); *p = reinterpret_cast(p2); p = p2;) -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) { - UNROLL1024(p2 = reinterpret_cast(*p); *p = reinterpret_cast(p2); p = p2;) - } -#endif *last_touched_address = const_cast(p); return 0; } @@ -2839,14 +2706,7 @@ int32_t xmem::randomWrite_Word64(uintptr_t* first_address, uintptr_t** last_touc volatile uintptr_t* p = first_address; volatile uintptr_t* p2 = NULL; -#ifdef USE_TIME_BASED_BENCHMARKS UNROLL512(p2 = reinterpret_cast(*p); *p = reinterpret_cast(p2); p = p2;) -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) { - UNROLL512(p2 = reinterpret_cast(*p); *p = reinterpret_cast(p2); p = p2;) - } -#endif *last_touched_address = const_cast(p); return 0; } @@ -2854,31 +2714,19 @@ int32_t xmem::randomWrite_Word64(uintptr_t* first_address, uintptr_t** last_touc #ifdef HAS_WORD_128 int32_t xmem::randomWrite_Word128(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ volatile Word128_t* p = reinterpret_cast(first_address); register Word128_t val; -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machine UNROLL256(val = *p; *p = val; p = reinterpret_cast(my_32b_extractLSB_128b(val));) //Do 128-bit load. Then do 128-bit store. Then extract 32 LSB to use as next load address. #endif #ifdef HAS_WORD_64 UNROLL256(val = *p; *p = val; p = reinterpret_cast(my_64b_extractLSB_128b(val));) //Do 128-bit load. Then do 128-bit store. Then extract 64 LSB to use as next load address. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word128_t); i += 256) { -#ifndef HAS_WORD_64 //special case: 32-bit machine - UNROLL256(val = *p; *p = val; p = reinterpret_cast(my_32b_extractLSB_128b(val));) //Do 128-bit load. Then do 128-bit store. Then extract 32 LSB to use as next load address. -#endif -#ifdef HAS_WORD_64 - UNROLL256(val = *p; *p = val; p = reinterpret_cast(my_64b_extractLSB_128b(val));) //Do 128-bit load. Then do 128-bit store. Then extract 64 LSB to use as next load address. -#endif - } -#endif *last_touched_address = reinterpret_cast(const_cast(p)); //Trick compiler. First get rid of volatile qualifier, and then reinterpret pointer return 0; @@ -2888,31 +2736,19 @@ int32_t xmem::randomWrite_Word128(uintptr_t* first_address, uintptr_t** last_tou #ifdef HAS_WORD_256 int32_t xmem::randomWrite_Word256(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) { -#ifdef _WIN32 +#if defined(_WIN32) && defined(ARCH_INTEL_X86_64) return 0; //TODO: Implement for Windows. #endif #ifdef __gnu_linux__ volatile Word256_t* p = reinterpret_cast(first_address); register Word256_t val; -#ifdef USE_TIME_BASED_BENCHMARKS #ifndef HAS_WORD_64 //special case: 32-bit machine UNROLL128(val = *p; *p = val; p = reinterpret_cast(my_32b_extractLSB_256b(val));) //Do 256-bit load. Then do 256-bit store. Then extract 32 LSB to use as next load address. #endif #ifdef HAS_WORD_64 UNROLL128(val = *p; *p = val; p = reinterpret_cast(my_64b_extractLSB_256b(val));) //Do 256-bit load. Then do 256-bit store. Then extract 64 LSB to use as next load address. #endif -#endif -#ifdef USE_SIZE_BASED_BENCHMARKS - for (size_t i = 0; i < len / sizeof(Word256_t); i += 128) { -#ifndef HAS_WORD_64 //special case: 32-bit machine - UNROLL128(val = *p; *p = val; p = reinterpret_cast(my_32b_extractLSB_256b(val));) //Do 256-bit load. Then do 256-bit store. Then extract 32 LSB to use as next load address. -#endif -#ifdef HAS_WORD_64 - UNROLL128(val = *p; *p = val; p = reinterpret_cast(my_64b_extractLSB_256b(val));) //Do 256-bit load. Then do 256-bit store. Then extract 64 LSB to use as next load address. -#endif - } -#endif *last_touched_address = reinterpret_cast(const_cast(p)); //Trick compiler. First get rid of volatile qualifier, and then reinterpret pointer return 0; diff --git a/src/common.cpp b/src/common.cpp index d7d0737..8eb423c 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -147,9 +147,15 @@ void xmem::print_compile_time_options() { #ifdef ARCH_INTEL_X86_64 std::cout << "ARCH_INTEL_X86_64" << std::endl; #endif +#ifdef ARCH_INTEL_X86_64_SSE + std::cout << "ARCH_INTEL_X86_64_SSE" << std::endl; +#endif #ifdef ARCH_INTEL_X86_64_SSE2 std::cout << "ARCH_INTEL_X86_64_SSE2" << std::endl; #endif +#ifdef ARCH_INTEL_X86_64_SSE3 + std::cout << "ARCH_INTEL_X86_64_SSE3" << std::endl; +#endif #ifdef ARCH_INTEL_X86_64_AVX std::cout << "ARCH_INTEL_X86_64_AVX" << std::endl; #endif @@ -162,6 +168,24 @@ void xmem::print_compile_time_options() { #ifdef ARCH_ARM std::cout << "ARCH_ARM" << std::endl; #endif +#ifdef ARCH_ARM_64 + std::cout << "ARCH_ARM_64" << std::endl; +#endif +#ifdef ARCH_ARM_V7 + std::cout << "ARCH_ARM_V7" << std::endl; +#endif +#ifdef ARCH_ARM_V8 + std::cout << "ARCH_ARM_V8" << std::endl; +#endif +#ifdef ARCH_ARM_VFP_V3 + std::cout << "ARCH_ARM_VFP_V3" << std::endl; +#endif +#ifdef ARCH_ARM_VFP_V4 + std::cout << "ARCH_ARM_VFP_V4" << std::endl; +#endif +#ifdef ARCH_ARM_NEON + std::cout << "ARCH_ARM_NEON" << std::endl; +#endif #ifdef ARCH_64BIT std::cout << "ARCH_64BIT" << std::endl; #endif @@ -198,24 +222,12 @@ void xmem::print_compile_time_options() { std::cout << "USE_TSC_TIMER" << std::endl; #endif //TODO: ARM timer -#ifdef USE_TIME_BASED_BENCHMARKS - std::cout << "USE_TIME_BASED_BENCHMARKS" << std::endl; -#endif #ifdef BENCHMARK_DURATION_SEC std::cout << "BENCHMARK_DURATION_SEC = " << BENCHMARK_DURATION_SEC << std::endl; //This must be defined #endif #ifdef THROUGHPUT_BENCHMARK_BYTES_PER_PASS std::cout << "THROUGHPUT_BENCHMARK_BYTES_PER_PASS == " << THROUGHPUT_BENCHMARK_BYTES_PER_PASS << std::endl; #endif -#ifdef USE_SIZE_BASED_BENCHMARKS - std::cout << "USE_SIZE_BASED_BENCHMARKS" << std::endl; -#endif -#ifdef USE_PASSES_CURVE_1 - std::cout << "USE_PASSES_CURVE_1" << std::endl; -#endif -#ifdef USE_PASSES_CURVE_2 - std::cout << "USE_PASSES_CURVE_2" << std::endl; -#endif #ifdef POWER_SAMPLING_PERIOD_SEC std::cout << "POWER_SAMPLING_PERIOD_MS == " << POWER_SAMPLING_PERIOD_MS << std::endl; #endif @@ -379,20 +391,6 @@ int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) { #endif } -size_t xmem::compute_number_of_passes(size_t working_set_size_KB) { - size_t passes = 0; -#ifdef USE_PASSES_CURVE_1 - passes = 65536 / working_set_size_KB; -#else -#ifdef USE_PASSES_CURVE_2 - passes = (4*2097152) / working_set_size_KB^2; -#endif -#endif - if (passes < 1) - passes = 1; - return passes; -} - void xmem::init_globals() { //Initialize global variables to defaults. g_verbose = false; diff --git a/src/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.cpp b/src/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.cpp index dddfe75..14c8663 100644 --- a/src/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.cpp +++ b/src/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.cpp @@ -56,9 +56,6 @@ DelayInjectedLoadedLatencyBenchmark::DelayInjectedLoadedLatencyBenchmark( void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, @@ -71,9 +68,6 @@ DelayInjectedLoadedLatencyBenchmark::DelayInjectedLoadedLatencyBenchmark( mem_array, len, iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - passes_per_iteration, -#endif num_worker_threads, mem_node, cpu_node, @@ -277,18 +271,12 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() { if (t == 0) { //special case: thread 0 is always latency thread workers.push_back(new LatencyWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif lat_kernel_fptr, lat_kernel_dummy_fptr, cpu_id)); } else { workers.push_back(new LoadWorker(thread_mem_array, len_per_thread, -#ifdef USE_SIZE_BASED_BENCHMARKS - _passes_per_iteration, -#endif load_kernel_fptr, load_kernel_dummy_fptr, cpu_id)); diff --git a/src/include/Benchmark.h b/src/include/Benchmark.h index 68b0e9c..0b4050d 100644 --- a/src/include/Benchmark.h +++ b/src/include/Benchmark.h @@ -61,7 +61,6 @@ namespace xmem { * @param mem_array A pointer to a contiguous chunk of memory that has been allocated for benchmarking among potentially several worker threads. This should be aligned to a 256-bit boundary. * @param len Length of mem_array in bytes. This must be a multiple of 4 KB and should be at least the per-thread working set size times the number of worker threads. * @param iterations Number of iterations of the complete benchmark. Used to average results and provide a measure of consistency and reproducibility. - * @param passes_per_iteration Number of passes to do in each iteration, to ensure timed section of code is "long enough". * @param num_worker_threads The number of worker threads to use in the benchmark. * @param mem_node The logical memory NUMA node used in the benchmark. * @param cpu_node The logical CPU NUMA node to use for the benchmark. @@ -76,9 +75,6 @@ namespace xmem { void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, @@ -172,14 +168,6 @@ namespace xmem { */ uint32_t getIterations() const; -#ifdef USE_SIZE_BASED_BENCHMARKS - /** - * @brief Gets the number of passes in each iteration. - * @returns The number of passes per iteration for this benchmark. - */ - uint32_t getPassesPerIteration() const; -#endif - /** * @brief Gets the width of memory access used in this benchmark. * @returns The chunk size for this benchmark. @@ -256,9 +244,6 @@ namespace xmem { //Benchmark repetition uint32_t _iterations; /**< Number of iterations used in this benchmark. */ -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t _passes_per_iteration; /**< Number of passes per iteration in this benchmark. */ -#endif //Threading and affinity uint32_t _num_worker_threads; /**< The number of worker threads used in this benchmark. */ diff --git a/src/include/LatencyBenchmark.h b/src/include/LatencyBenchmark.h index 44634e2..6852915 100644 --- a/src/include/LatencyBenchmark.h +++ b/src/include/LatencyBenchmark.h @@ -53,9 +53,6 @@ namespace xmem { void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, diff --git a/src/include/LatencyWorker.h b/src/include/LatencyWorker.h index 2857980..1122578 100644 --- a/src/include/LatencyWorker.h +++ b/src/include/LatencyWorker.h @@ -53,9 +53,6 @@ namespace xmem { LatencyWorker( void* mem_array, size_t len, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif RandomFunction kernel_fptr, RandomFunction kernel_dummy_fptr, int32_t cpu_affinity diff --git a/src/include/LoadWorker.h b/src/include/LoadWorker.h index bc64ec0..9709ec5 100644 --- a/src/include/LoadWorker.h +++ b/src/include/LoadWorker.h @@ -52,9 +52,6 @@ namespace xmem { LoadWorker( void* mem_array, size_t len, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif SequentialFunction kernel_fptr, SequentialFunction kernel_dummy_fptr, int32_t cpu_affinity @@ -71,9 +68,6 @@ namespace xmem { LoadWorker( void* mem_array, size_t len, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif RandomFunction kernel_fptr, RandomFunction kernel_dummy_fptr, int32_t cpu_affinity diff --git a/src/include/MemoryWorker.h b/src/include/MemoryWorker.h index a7048dc..5a56359 100644 --- a/src/include/MemoryWorker.h +++ b/src/include/MemoryWorker.h @@ -48,15 +48,11 @@ namespace xmem { * @brief Constructor. * @param mem_array Pointer to the memory region to use by this worker. * @param len Length of the memory region to use by this worker. - * @param passes_per_iteration for size-based benchmarking, this is the number of passes to execute in a single benchmark iteration. * @param cpu_affinity Logical CPU identifier to lock this worker's thread to. */ MemoryWorker( void* mem_array, size_t len, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif int32_t cpu_affinity ); @@ -124,9 +120,6 @@ namespace xmem { tick_t _adjusted_ticks; /**< Elapsed ticks minus dummy elapsed ticks. */ bool _warning; /**< If true, results may be suspect. */ bool _completed; /**< If true, worker completed. */ -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t _passes_per_iteration; /**< Number of passes per iteration. */ -#endif }; }; diff --git a/src/include/ThroughputBenchmark.h b/src/include/ThroughputBenchmark.h index 50b69ed..5966145 100644 --- a/src/include/ThroughputBenchmark.h +++ b/src/include/ThroughputBenchmark.h @@ -52,9 +52,6 @@ namespace xmem { void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, diff --git a/src/include/common.h b/src/include/common.h index 1e8dceb..2e4ad99 100644 --- a/src/include/common.h +++ b/src/include/common.h @@ -45,7 +45,7 @@ namespace xmem { -#define VERSION "2.1.13" +#define VERSION "2.1.14" #if !defined(_WIN32) && !defined(__gnu_linux__) #error Neither Windows/GNULinux build environments were detected! @@ -55,43 +55,58 @@ namespace xmem { #ifdef _WIN32 #ifdef _M_IX86 //Intel x86 -#define ARCH_INTEL_X86 #define ARCH_INTEL +#define ARCH_INTEL_X86 #endif #ifdef _M_X64 //Intel x86-64 -#define ARCH_INTEL_X86_64 #define ARCH_INTEL +#define ARCH_INTEL_X86_64 #define ARCH_64BIT #define HAS_NUMA #endif #ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions -#define ARCH_INTEL_X86_64_SSE2 #define ARCH_INTEL +#if _M_IX86_FP == 1 +#define ARCH_INTEL_X86_64_SSE +#endif +#if _M_IX86_FP == 2 +#define ARCH_INTEL_X86_64_SSE +#define ARCH_INTEL_X86_64_SSE2 +#endif #endif #ifdef __AVX__ //Intel x86-64 AVX extensions -#define ARCH_INTEL_X86_64_AVX #define ARCH_INTEL +#define ARCH_INTEL_X86_64_AVX #endif #ifdef __AVX2__ //Intel x86-64 AVX2 extensions -#define ARCH_INTEL_X86_64_AVX2 #define ARCH_INTEL +#define ARCH_INTEL_X86_64_AVX2 #endif -#ifdef _AMD64 //AMD64 +#ifdef _M_AMD64 //AMD64 +#define ARCH_INTEL #define ARCH_AMD64 #define ARCH_64BIT -#define ARCH_INTEL #endif #ifdef _M_ARM //ARM architecture #define ARCH_ARM +#define ARCH_ARM_NEON //FIXME: I don't think there is a way to explicitly check for NEON support on Windows, so I suppose it is always present on any Windows-supported ARM platform anyway. #endif -//TODO: ARM 64-bit support +#ifdef _M_ARM_FP //ARM extensions +#if _M_ARM_FP >= 30 && _M_ARM_FP <= 39 +#define ARCH_ARM_VFP_V3 +#endif +#if _M_ARM_FP >= 40 && _M_ARM_FP <= 49 +#define ARCH_ARM_VFP_V3 +#define ARCH_ARM_VFP_V4 +#endif +#endif #endif @@ -99,42 +114,78 @@ namespace xmem { #ifdef __gnu_linux__ #ifdef __i386__ //Intel x86 -#define ARCH_INTEL_X86 #define ARCH_INTEL +#define ARCH_INTEL_X86 #endif #ifdef __x86_64__ //Intel x86-64 +#define ARCH_INTEL #define ARCH_INTEL_X86_64 #define ARCH_64BIT -#define ARCH_INTEL #define HAS_NUMA #endif -#ifdef __SSE2__ //Intel x86-64 SSE2 extensions -#define ARCH_INTEL_X86_64_SSE2 +#ifdef __SSE__ //Intel x86-64 SSE extensions #define ARCH_INTEL +#define ARCH_INTEL_X86_64_SSE +#endif + +#ifdef __SSE2__ //Intel x86-64 SSE2 extensions +#define ARCH_INTEL +#define ARCH_INTEL_X86_64_SSE2 +#endif + +#ifdef __SSE3__ //Intel x86-64 SSE3 extensions +#define ARCH_INTEL +#define ARCH_INTEL_X86_64_SSE3 #endif #ifdef __AVX__ //Intel x86-64 AVX extensions -#define ARCH_INTEL_X86_64_AVX #define ARCH_INTEL +#define ARCH_INTEL_X86_64_AVX #endif #ifdef __AVX2__ //Intel x86-64 AVX2 extensions -#define ARCH_INTEL_X86_64_AVX2 #define ARCH_INTEL +#define ARCH_INTEL_X86_64_AVX2 #endif #ifdef __amd64__ //AMD64 +#define ARCH_INTEL #define ARCH_AMD64 #define ARCH_64BIT #endif #ifdef __arm__ //ARM architecture #define ARCH_ARM +#define ARCH_ARM_VFP_V3 //FIXME: this is assumed, as I don't know how to check directly #endif -//TODO: ARM 64-bit support +#ifdef __aarch64__ //ARM 64-bit +#define ARCH_ARM +#define ARCH_ARM_64 +#define ARCH_ARM_VFP_V4 //FIXME: this is assumed, as I don't know how to check directly +#define ARCH_64BIT +#endif + +#ifdef __ARM_ARCH_7__ //ARMv7 +#define ARCH_ARM +#define ARCH_ARM_V7 +#endif + +#ifdef __ARM_ARCH_8__ //ARMv8 +#define ARCH_ARM +#define ARCH_ARM_V8 +#define ARCH_ARM_64 +#define ARCH_ARM_VFP_V4 //FIXME: this is assumed, as I don't know how to check directly +#define ARCH_64BIT +#endif + +#ifdef __ARM_NEON__ //ARM NEON extensions +#define ARCH_ARM +#define ARCH_ARM_NEON +#define ARCH_ARM_VFP_V4 //FIXME: this is assumed, as I don't know how to check directly +#endif #endif @@ -204,19 +255,8 @@ namespace xmem { #define USE_OS_TIMER /**< RECOMMENDED ENABLED. If enabled, uses the QPC timer on Windows and the POSIX clock_gettime() on GNU/Linux for all timing purposes. */ //#define USE_HW_TIMER /**< RECOMMENDED DISABLED. If enabled, uses the platform-specific hardware timer (e.g., TSC on Intel x86-64). This may be less portable or have other implementation-specific quirks but for most purposes should work fine. */ -//Benchmarking methodology. Only one may be selected! -#define USE_TIME_BASED_BENCHMARKS /**< RECOMMENDED ENABLED. All benchmarks run for an estimated amount of time, and the figures of merit are computed based on the amount of memory accesses completed in the time limit. This mode has more consistent runtime across different machines, memory performance, and working set sizes, but may have more conservative measurements for differing levels of cache hierarchy (overestimating latency and underestimating throughput). */ -//#define USE_SIZE_BASED_BENCHMARKS /**< RECOMMENDED DISABLED. All benchmarks run for an estimated amount of memory accesses, and the figures of merit are computed based on the length of time required to run the benchmark. This mode may have highly varying runtime across different machines, memory performance, and working set sizes, but may have more optimistic measurements across differing levels of cache hierarchy (underestimating latency and overestimating throughput). TODO: remove this feature entirely at some point, it just complicates things... */ - -#ifdef USE_TIME_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT! #define BENCHMARK_DURATION_MS 250 /**< RECOMMENDED VALUE: At least 1000. Number of milliseconds to run in each benchmark. */ #define THROUGHPUT_BENCHMARK_BYTES_PER_PASS 4096 /**< RECOMMENDED VALUE: 4096. Number of bytes read or written per pass of any ThroughputBenchmark. This must be less than or equal to the minimum working set size, which is currently 4 KB. */ -#endif //DO NOT COMMENT THIS OUT - -#ifdef USE_SIZE_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT -//#define USE_PASSES_CURVE_1 /**< RECOMMENDED DISABLED. The passes per iteration of a benchmark will be given by y = 65536 / working_set_size_KB */ -#define USE_PASSES_CURVE_2 /**< RECOMMENDED ENABLED. The passes per iteration of a benchmark will be given by y = 4*2097152 / working_set_size_KB^2 */ -#endif //DO NOT COMMENT THIS OUT #define POWER_SAMPLING_PERIOD_MS 1000 /**< RECOMMENDED VALUE: 1000. Sampling period in milliseconds for all power measurement mechanisms. */ @@ -254,32 +294,13 @@ namespace xmem { #error Only one type of timer may be defined! #endif -//Compile-time options checks: benchmarking type: size or time-limited -#if (defined(USE_TIME_BASED_BENCHMARKS) && defined(USE_SIZE_BASED_BENCHMARKS)) || (!defined(USE_TIME_BASED_BENCHMARKS) && !defined(USE_SIZE_BASED_BENCHMARKS)) -#error Exactly one of USE_TIME_BASED_BENCHMARKS and USE_SIZE_BASED_BENCHMARKS must be defined! -#endif - -#ifdef USE_TIME_BASED_BENCHMARKS -#ifndef BENCHMARK_DURATION_MS -#error BENCHMARK_DURATION_MS must be defined! -#endif #if BENCHMARK_DURATION_MS <= 0 #error BENCHMARK_DURATION_MS must be positive! #endif -#ifndef THROUGHPUT_BENCHMARK_BYTES_PER_PASS -#error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be defined! -#else + #if THROUGHPUT_BENCHMARK_BYTES_PER_PASS > DEFAULT_PAGE_SIZE || THROUGHPUT_BENCHMARK_BYTES_PER_PASS <= 0 #error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be less than or equal to the minimum possible working set size. It also must be a positive integer. #endif -#endif -#endif - -#ifdef USE_SIZE_BASED_BENCHMARKS -#if (defined(USE_PASSES_CURVE_1) && defined(USE_PASSES_CURVE_2)) || (!defined(USE_PASSES_CURVE_1) && !defined(USE_PASSES_CURVE_2)) -#error Exactly one passes curve must be defined. -#endif -#endif //Compile-time options checks: power sampling frequency. TODO: this should probably be a runtime option #if !defined(POWER_SAMPLING_PERIOD_MS) || POWER_SAMPLING_PERIOD_MS <= 0 diff --git a/src/include/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.h b/src/include/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.h index 8d5232f..5019bff 100644 --- a/src/include/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.h +++ b/src/include/ext/DelayInjectedLoadedLatencyBenchmark/DelayInjectedLoadedLatencyBenchmark.h @@ -55,9 +55,6 @@ namespace xmem { void* mem_array, size_t len, uint32_t iterations, -#ifdef USE_SIZE_BASED_BENCHMARKS - uint32_t passes_per_iteration, -#endif uint32_t num_worker_threads, uint32_t mem_node, uint32_t cpu_node, diff --git a/src/win/x86_64/win_asm_dummy_forwSequentialLoop_Word128.asm b/src/win/x86_64/win_x86_64_asm_dummy_forwSequentialLoop_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_dummy_forwSequentialLoop_Word128.asm rename to src/win/x86_64/win_x86_64_asm_dummy_forwSequentialLoop_Word128.asm diff --git a/src/win/x86_64/win_asm_dummy_forwSequentialLoop_Word256.asm b/src/win/x86_64/win_x86_64_asm_dummy_forwSequentialLoop_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_dummy_forwSequentialLoop_Word256.asm rename to src/win/x86_64/win_x86_64_asm_dummy_forwSequentialLoop_Word256.asm diff --git a/src/win/x86_64/win_asm_dummy_revSequentialLoop_Word128.asm b/src/win/x86_64/win_x86_64_asm_dummy_revSequentialLoop_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_dummy_revSequentialLoop_Word128.asm rename to src/win/x86_64/win_x86_64_asm_dummy_revSequentialLoop_Word128.asm diff --git a/src/win/x86_64/win_asm_dummy_revSequentialLoop_Word256.asm b/src/win/x86_64/win_x86_64_asm_dummy_revSequentialLoop_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_dummy_revSequentialLoop_Word256.asm rename to src/win/x86_64/win_x86_64_asm_dummy_revSequentialLoop_Word256.asm diff --git a/src/win/x86_64/win_asm_forwSequentialRead_Word128.asm b/src/win/x86_64/win_x86_64_asm_forwSequentialRead_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_forwSequentialRead_Word128.asm rename to src/win/x86_64/win_x86_64_asm_forwSequentialRead_Word128.asm diff --git a/src/win/x86_64/win_asm_forwSequentialRead_Word256.asm b/src/win/x86_64/win_x86_64_asm_forwSequentialRead_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_forwSequentialRead_Word256.asm rename to src/win/x86_64/win_x86_64_asm_forwSequentialRead_Word256.asm diff --git a/src/win/x86_64/win_asm_forwSequentialWrite_Word128.asm b/src/win/x86_64/win_x86_64_asm_forwSequentialWrite_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_forwSequentialWrite_Word128.asm rename to src/win/x86_64/win_x86_64_asm_forwSequentialWrite_Word128.asm diff --git a/src/win/x86_64/win_asm_forwSequentialWrite_Word256.asm b/src/win/x86_64/win_x86_64_asm_forwSequentialWrite_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_forwSequentialWrite_Word256.asm rename to src/win/x86_64/win_x86_64_asm_forwSequentialWrite_Word256.asm diff --git a/src/win/x86_64/win_asm_revSequentialRead_Word128.asm b/src/win/x86_64/win_x86_64_asm_revSequentialRead_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_revSequentialRead_Word128.asm rename to src/win/x86_64/win_x86_64_asm_revSequentialRead_Word128.asm diff --git a/src/win/x86_64/win_asm_revSequentialRead_Word256.asm b/src/win/x86_64/win_x86_64_asm_revSequentialRead_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_revSequentialRead_Word256.asm rename to src/win/x86_64/win_x86_64_asm_revSequentialRead_Word256.asm diff --git a/src/win/x86_64/win_asm_revSequentialWrite_Word128.asm b/src/win/x86_64/win_x86_64_asm_revSequentialWrite_Word128.asm similarity index 100% rename from src/win/x86_64/win_asm_revSequentialWrite_Word128.asm rename to src/win/x86_64/win_x86_64_asm_revSequentialWrite_Word128.asm diff --git a/src/win/x86_64/win_asm_revSequentialWrite_Word256.asm b/src/win/x86_64/win_x86_64_asm_revSequentialWrite_Word256.asm similarity index 100% rename from src/win/x86_64/win_asm_revSequentialWrite_Word256.asm rename to src/win/x86_64/win_x86_64_asm_revSequentialWrite_Word256.asm