From aaf39cbd5b73e3a4adefab3031bd704bbadc8bde Mon Sep 17 00:00:00 2001 From: Mark Gottscho Date: Sat, 17 Jan 2015 16:21:41 -0800 Subject: [PATCH] Porting the init sequence for querying system information to Linux. Right now it is in a rough and bandaid shape. Also the program segfaults in ThroughputBenchmark and crashes on illegal instruction in LatencyBenchmark for Linux. --- SConscript_unix | 2 +- src/README.md | 428 ++++++++-------- src/common.cpp | 78 ++- src/include/common.h | 922 +++++++++++++++++----------------- src/include/x86_64/TSCTimer.h | 214 ++++---- src/main.cpp | 9 +- src/x86_64/TSCTimer.cpp | 206 ++++---- 7 files changed, 977 insertions(+), 882 deletions(-) diff --git a/SConscript_unix b/SConscript_unix index 0403799..2eca9db 100644 --- a/SConscript_unix +++ b/SConscript_unix @@ -9,7 +9,7 @@ env = Environment() # Customize build settings # LINUX -env.Append(CPPFLAGS = '-Wall -g -O3 -std=c++11 -mavx') +env.Append(CPPFLAGS = '-Wall -g -O3 -std=c++11 -mavx -mavx2') env.Append(CPPPATH = ['src/include', '/usr/include']) env.Append(LIBS = ['pthread', 'numa']) diff --git a/src/README.md b/src/README.md index 94b6045..3b01c2f 100644 --- a/src/README.md +++ b/src/README.md @@ -1,214 +1,214 @@ -README ------------------------------------------------------------------------------------------------------------- - -X-Mem: Extensible Memory Benchmarking Tool v1.04 ------------------------------------------------------------------------------------------------------------- - -The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power. - -Originally authored by Mark Gottscho (Email: ) as a Summer 2014 intern at Microsoft Research, Redmond, WA. - -This project is under active development. Stay tuned for more updates. - -PROJECT REVISION DATE: January 14, 2015. - ------------------------------------------------------------------------------------------------------------- -LICENSE ------------------------------------------------------------------------------------------------------------- - -The MIT License (MIT) - -Copyright (c) 2014 Microsoft - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ------------------------------------------------------------------------------------------------------------- -FEATURES ------------------------------------------------------------------------------------------------------------- - -This tool is provided as open source with the hope of being useful to the broader research and development community. Here are some of X-Mem's features. - -Flexibility: Easy reconfiguration for different sets of tests - - Cache levels - - NUMA - - Multi-threading - - Forward and reverse strides - - Sequential and random access - - Read and write - - 32, 64, 128, 256-bit width memory instructions - -Extensibility: C++ object-oriented principles - - Supports rapid addition of new benchmark kernel routines by others - - Example: stream triad algorithm, impact of false sharing, etc. are possible with minor additions - -Cross-platform: Currently implemented for Windows on x86-64 CPUs with AVX extensions - - Designed to allow straightforward porting to other operating systems and ISAs - - Planning to implement Unix port - -Memory throughput: - - Accurate measurement of sustained memory throughput to all levels of cache - - Burst mode benchmark kernels possible (extensible!) - -Memory latency: - - Accurate measurement of round-trip unloaded memory latency to all levels of cache - - Loaded latency measurements planned - -Memory power: - - Currently collecting DRAM power via custom driver exposed in Windows performance counter API - - Supports custom power instrumentation without much modification - - Thorough Documentation: extensive Doxygen source comments, HTML, PDF docs - -For feature requests, please refer to the contact information at the end of this README. - ------------------------------------------------------------------------------------------------------------- -RUNTIME PREREQUISITES ------------------------------------------------------------------------------------------------------------- - -There are a few runtime prerequisites in order for the software to run correctly. - -HARDWARE: - -- Intel x86-64 CPU. RECOMMENDED: Recent CPU with SSE2 and AVX extended instruction sets for improved throughput benchmarking capabilities. - -WINDOWS: - -- Microsoft Windows 64-bit, 8.0 or later, Server 2012 or later. -- Microsoft Visual C++ 2013 Redistributables (64-bit) - -UNIX/LINUX: - -- TBD - ------------------------------------------------------------------------------------------------------------- -INSTALLATION ------------------------------------------------------------------------------------------------------------- - -The only file that is needed to run is xmem.exe. It has no other dependencies aside from the system prerequisites listed above. - ------------------------------------------------------------------------------------------------------------- -USAGE ------------------------------------------------------------------------------------------------------------- - -NOTE: On Windows, make sure you run X-Mem with Administrator privileges. This is needed in order to: - - Allocate "large pages" for improved performance as well as query - - Read performance counter data from the OS for reporting power (when applicable) - - Elevate thread priority and pin threads to CPUs for improved performance and benchmarking consistency - -xmem [options] - -Options: - - -h, --help Print usage and exit. - - -l, --latency Measure memory latency - - -t, --throughput Measure memory throughput - - -w, --working_set_size Working set size in KB. This must be a multiple of - 4KB. - - -n, --iterations Iterations per benchmark test - - -i, --base_test_index Numerical index of the first benchmark, for - tracking unique test IDs. - - -f, --output_file Output filename to use. If not specified, no - output file generated. - -Examples: - - xmem --help - - xmem -h - - xmem -t - - xmem -t --latency -n10 -w524288 -f results.csv -i 101 - ------------------------------------------------------------------------------------------------------------- -BUILDING FROM SOURCE ------------------------------------------------------------------------------------------------------------- - -Before building the source, enable and disable the relevant compile-time options in src/include/common.h, under the section "User-configurable compilation configuration". Please read the comments by each #define statement to understand the context of each option. - -After you have set the desired compile-time options, build the source. On Windows, running build-win.bat should suffice. On Unix, run build-unix.sh. - -If you customize your build, make sure you use the "Release" mode for your OS. Do not include debug capabilities as it can dramatically affect performance of the benchmarks, leading to pessimistic results. - ------------------------------------------------------------------------------------------------------------- -BUILD PREREQUISITES ------------------------------------------------------------------------------------------------------------- - -There are a few software build prerequisites, depending on your platform. - -WINDOWS: - -- Any version of Visual Studio 2013 64-bit (also known as version 12.0). -- Python 2.7. You can obtain it at . -- SCons build system. You can obtain it at . Build tested with SCons 2.3.4. - -UNIX/LINUX: - -- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64. -- Python 2.7. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install python2.7". You may need some other Python 2.7 packages as well. -- SCons build system. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install scons". Build tested with SCons 2.3.4. - ------------------------------------------------------------------------------------------------------------- -DOCUMENTATION BUILD PREREQUISITES ------------------------------------------------------------------------------------------------------------- - -The following tools are only needed for automatically regenerating source code documentation with HTML and PDF. - -WINDOWS: - -- doxygen tool. You can obtain it at . -- LaTeX distribution. You can get a Windows distribution at . -- make for Windows. You can obtain it at . You will have to manually add it to your Windows path. - -UNIX/LINUX: - -- doxygen tool. You can obtain it at . On Ubuntu systems, you can install with "sudo apt-get install doxygen". -- LaTeX distribution. On Ubuntu systems, LaTeX distributed with doxygen should actually be sufficient. You can install with "sudo apt-get install doxygen-latex". -- make. This should be included on any Unix/Linux system. - ------------------------------------------------------------------------------------------------------------- -SOURCE CODE DOCUMENTATION ------------------------------------------------------------------------------------------------------------- - -The tool comes with built-in Doxygen comments in the source code, which can be used to generate both HTML and LaTeX --> PDF documentation. Documentation is maintained under the doc/ subdirectory. To build documentation after modifying the source, run build-docs-win.bat on Windows, or build-docs-unix.sh on Unix systems. Note that Doxygen and a LaTeX distribution must be installed on the system. - ------------------------------------------------------------------------------------------------------------- -VERSION CONTROL ------------------------------------------------------------------------------------------------------------- - -This project is under version control using git. Its master repository is hosted at . - ------------------------------------------------------------------------------------------------------------- -CONTACT, FEEDBACK, AND BUG REPORTS ------------------------------------------------------------------------------------------------------------- - -For questions, comments, criticism, bug reports, and other feedback for this software, please contact Mark Gottscho via email at or via web at . - -For inquiries about this work while conducted at Microsoft, please contact Dr. Mohammed Shoaib at or Dr. Sriram Govindan at . - ------------------------------------------------------------------------------------------------------------- -ACKNOWLEDGMENT ------------------------------------------------------------------------------------------------------------- - -Mark Gottscho would like to thank Dr. Mohammed Shoaib of Microsoft Research and Dr. Sriram Govindan of Microsoft for their mentorship in the creation of this software. Further thanks to Dr. Bikash Sharma, Mark Santaniello, Mike Andrewartha, and Laura Caulfield of Microsoft for their contributions, feedback, and assistance. Finally, thank you to Dr. Jie Liu of Microsoft Research, Dr. Badriddine Khessib and Dr. Kushagra Vaid of Microsoft, and Prof. Puneet Gupta of UCLA for giving me the opportunity to create this work. +README +------------------------------------------------------------------------------------------------------------ + +X-Mem: Extensible Memory Benchmarking Tool v1.04 +------------------------------------------------------------------------------------------------------------ + +The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power. + +Originally authored by Mark Gottscho (Email: ) as a Summer 2014 intern at Microsoft Research, Redmond, WA. + +This project is under active development. Stay tuned for more updates. + +PROJECT REVISION DATE: January 14, 2015. + +------------------------------------------------------------------------------------------------------------ +LICENSE +------------------------------------------------------------------------------------------------------------ + +The MIT License (MIT) + +Copyright (c) 2014 Microsoft + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------------------------------------------------------------------------------------------ +FEATURES +------------------------------------------------------------------------------------------------------------ + +This tool is provided as open source with the hope of being useful to the broader research and development community. Here are some of X-Mem's features. + +Flexibility: Easy reconfiguration for different sets of tests + - Cache levels + - NUMA + - Multi-threading + - Forward and reverse strides + - Sequential and random access + - Read and write + - 32, 64, 128, 256-bit width memory instructions + +Extensibility: C++ object-oriented principles + - Supports rapid addition of new benchmark kernel routines by others + - Example: stream triad algorithm, impact of false sharing, etc. are possible with minor additions + +Cross-platform: Currently implemented for Windows on x86-64 CPUs with AVX extensions + - Designed to allow straightforward porting to other operating systems and ISAs + - Planning to implement Unix port + +Memory throughput: + - Accurate measurement of sustained memory throughput to all levels of cache + - Burst mode benchmark kernels possible (extensible!) + +Memory latency: + - Accurate measurement of round-trip unloaded memory latency to all levels of cache + - Loaded latency measurements planned + +Memory power: + - Currently collecting DRAM power via custom driver exposed in Windows performance counter API + - Supports custom power instrumentation without much modification + - Thorough Documentation: extensive Doxygen source comments, HTML, PDF docs + +For feature requests, please refer to the contact information at the end of this README. + +------------------------------------------------------------------------------------------------------------ +RUNTIME PREREQUISITES +------------------------------------------------------------------------------------------------------------ + +There are a few runtime prerequisites in order for the software to run correctly. + +HARDWARE: + +- Intel x86-64 CPU. RECOMMENDED: Recent CPU with SSE2 and AVX extended instruction sets for improved throughput benchmarking capabilities. + +WINDOWS: + +- Microsoft Windows 64-bit, 8.0 or later, Server 2012 or later. +- Microsoft Visual C++ 2013 Redistributables (64-bit) + +UNIX/LINUX: + +- TBD + +------------------------------------------------------------------------------------------------------------ +INSTALLATION +------------------------------------------------------------------------------------------------------------ + +The only file that is needed to run is xmem.exe. It has no other dependencies aside from the system prerequisites listed above. + +------------------------------------------------------------------------------------------------------------ +USAGE +------------------------------------------------------------------------------------------------------------ + +NOTE: On Windows, make sure you run X-Mem with Administrator privileges. This is needed in order to: + - Allocate "large pages" for improved performance as well as query + - Read performance counter data from the OS for reporting power (when applicable) + - Elevate thread priority and pin threads to CPUs for improved performance and benchmarking consistency + +xmem [options] + +Options: + + -h, --help Print usage and exit. + + -l, --latency Measure memory latency + + -t, --throughput Measure memory throughput + + -w, --working_set_size Working set size in KB. This must be a multiple of + 4KB. + + -n, --iterations Iterations per benchmark test + + -i, --base_test_index Numerical index of the first benchmark, for + tracking unique test IDs. + + -f, --output_file Output filename to use. If not specified, no + output file generated. + +Examples: + + xmem --help + + xmem -h + + xmem -t + + xmem -t --latency -n10 -w524288 -f results.csv -i 101 + +------------------------------------------------------------------------------------------------------------ +BUILDING FROM SOURCE +------------------------------------------------------------------------------------------------------------ + +Before building the source, enable and disable the relevant compile-time options in src/include/common.h, under the section "User-configurable compilation configuration". Please read the comments by each #define statement to understand the context of each option. + +After you have set the desired compile-time options, build the source. On Windows, running build-win.bat should suffice. On Unix, run build-unix.sh. + +If you customize your build, make sure you use the "Release" mode for your OS. Do not include debug capabilities as it can dramatically affect performance of the benchmarks, leading to pessimistic results. + +------------------------------------------------------------------------------------------------------------ +BUILD PREREQUISITES +------------------------------------------------------------------------------------------------------------ + +There are a few software build prerequisites, depending on your platform. + +WINDOWS: + +- Any version of Visual Studio 2013 64-bit (also known as version 12.0). +- Python 2.7. You can obtain it at . +- SCons build system. You can obtain it at . Build tested with SCons 2.3.4. + +UNIX/LINUX: + +- gcc with support for the C++11 standard. Tested with gcc version 4.8.2 on Ubuntu 14.04 LTS for x86-64. +- Python 2.7. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install python2.7". You may need some other Python 2.7 packages as well. +- SCons build system. You can obtain it at . On Ubuntu systems, you can install using "sudo apt-get install scons". Build tested with SCons 2.3.4. + +------------------------------------------------------------------------------------------------------------ +DOCUMENTATION BUILD PREREQUISITES +------------------------------------------------------------------------------------------------------------ + +The following tools are only needed for automatically regenerating source code documentation with HTML and PDF. + +WINDOWS: + +- doxygen tool. You can obtain it at . +- LaTeX distribution. You can get a Windows distribution at . +- make for Windows. You can obtain it at . You will have to manually add it to your Windows path. + +UNIX/LINUX: + +- doxygen tool. You can obtain it at . On Ubuntu systems, you can install with "sudo apt-get install doxygen". +- LaTeX distribution. On Ubuntu systems, LaTeX distributed with doxygen should actually be sufficient. You can install with "sudo apt-get install doxygen-latex". +- make. This should be included on any Unix/Linux system. + +------------------------------------------------------------------------------------------------------------ +SOURCE CODE DOCUMENTATION +------------------------------------------------------------------------------------------------------------ + +The tool comes with built-in Doxygen comments in the source code, which can be used to generate both HTML and LaTeX --> PDF documentation. Documentation is maintained under the doc/ subdirectory. To build documentation after modifying the source, run build-docs-win.bat on Windows, or build-docs-unix.sh on Unix systems. Note that Doxygen and a LaTeX distribution must be installed on the system. + +------------------------------------------------------------------------------------------------------------ +VERSION CONTROL +------------------------------------------------------------------------------------------------------------ + +This project is under version control using git. Its master repository is hosted at . + +------------------------------------------------------------------------------------------------------------ +CONTACT, FEEDBACK, AND BUG REPORTS +------------------------------------------------------------------------------------------------------------ + +For questions, comments, criticism, bug reports, and other feedback for this software, please contact Mark Gottscho via email at or via web at . + +For inquiries about this work while conducted at Microsoft, please contact Dr. Mohammed Shoaib at or Dr. Sriram Govindan at . + +------------------------------------------------------------------------------------------------------------ +ACKNOWLEDGMENT +------------------------------------------------------------------------------------------------------------ + +Mark Gottscho would like to thank Dr. Mohammed Shoaib of Microsoft Research and Dr. Sriram Govindan of Microsoft for their mentorship in the creation of this software. Further thanks to Dr. Bikash Sharma, Mark Santaniello, Mike Andrewartha, and Laura Caulfield of Microsoft for their contributions, feedback, and assistance. Finally, thank you to Dr. Jie Liu of Microsoft Research, Dr. Badriddine Khessib and Dr. Kushagra Vaid of Microsoft, and Prof. Puneet Gupta of UCLA for giving me the opportunity to create this work. diff --git a/src/common.cpp b/src/common.cpp index 1365e97..19938a8 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -52,10 +52,17 @@ namespace xmem { namespace common { size_t g_page_size; /**< Default page size on the system, in bytes. */ +#ifdef USE_LARGE_PAGES size_t g_large_page_size; /**< Large page size on the system, in bytes. */ +#endif uint32_t g_num_nodes; /**< Number of NUMA nodes in the system. */ - uint32_t g_num_logical_cpus; /**< Number of logical CPU cores in the system. */ + uint32_t g_num_logical_cpus; /**< Number of logical CPU cores in the system. This may be different than physical CPUs, e.g. Intel hyperthreading. */ + uint32_t g_num_physical_cpus; /**< Number of physical CPU cores in the system. */ uint32_t g_num_physical_packages; /**< Number of physical CPU packages in the system. Generally this is the same as number of NUMA nodes, unless UMA emulation is done in hardware. */ + uint32_t g_total_l1_caches; /**< Total number of L1 caches in the system. */ + uint32_t g_total_l2_caches; /**< Total number of L2 caches in the system. */ + uint32_t g_total_l3_caches; /**< Total number of L3 caches in the system. */ + uint32_t g_total_l4_caches; /**< Total number of L4 caches in the system. */ uint32_t g_starting_test_index; /**< Numeric identifier for the first benchmark test. */ uint32_t g_test_index; /**< Numeric identifier for the current benchmark test. */ }; @@ -110,6 +117,9 @@ void xmem::common::print_compile_time_options() { #ifdef _WIN64 std::cout << "Win64" << std::endl; #endif +#ifdef __unix__ + std::cout << "Unix" << std::endl; +#endif #ifdef ARCH_INTEL_X86 std::cout << "ARCH_INTEL_X86" << std::endl; #endif @@ -382,3 +392,69 @@ size_t xmem::common::compute_number_of_passes(size_t working_set_size_KB) { passes = 1; return passes; } + +int32_t xmem::common::query_sys_info() { +#ifdef VERBOSE + std::cout << std::endl; + std::cout << "Initializing default system information..."; +#endif + //Initialize to defaults. + g_num_nodes = DEFAULT_NUM_NODES; + g_num_physical_packages = DEFAULT_NUM_PHYSICAL_PACKAGES; + g_num_physical_cpus = DEFAULT_NUM_PHYSICAL_CPUS; + g_num_logical_cpus = DEFAULT_NUM_LOGICAL_CPUS; + g_total_l1_caches = DEFAULT_NUM_L1_CACHES; + g_total_l2_caches = DEFAULT_NUM_L2_CACHES; + g_total_l3_caches = DEFAULT_NUM_L3_CACHES; + g_total_l4_caches = DEFAULT_NUM_L4_CACHES; + g_page_size = DEFAULT_PAGE_SIZE; +#ifdef USE_LARGE_PAGES + g_large_page_size = DEFAULT_LARGE_PAGE_SIZE; +#endif + +#ifdef VERBOSE + std::cout << "done" << std::endl; + std::cout << "Querying system information..."; +#endif + +#ifdef _WIN32 +//TODO: refactor from win_common_third_party.cpp +#endif + +#ifdef __unix__ + //Check that NUMA is available. + if (numa_available() == -1) { + std::cout << "FATAL: NUMA API is not available on this system." << std::endl; + exit(-1); + } + + g_num_nodes = numa_max_node()+1; + g_num_physical_packages = g_num_nodes; //FIXME: this is totally a bandaid + g_num_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN); //FIXME: this isn't really portable -- requires glibc extensions to sysconf() + g_num_physical_cpus = g_num_logical_cpus / 2; //FIXME: this is totally a bandaid and assumes something like Intel HyperThreading + g_total_l1_caches = g_num_physical_cpus; //FIXME: this is totally a bandaid + g_total_l2_caches = g_num_physical_cpus; //FIXME: this is totally a bandaid + g_total_l3_caches = 1; //FIXME: this is totally a bandaid + g_total_l4_caches = 0; //FIXME: this is totally a bandaid + g_page_size = static_cast(sysconf(_SC_PAGESIZE)); +#ifdef USE_LARGE_PAGES + //g_large_page_size = //FIXME: implement +#endif +#endif + +#ifdef VERBOSE + std::cout << "done" << std::endl; + std::cout << "Number of NUMA nodes: " << g_num_nodes << std::endl; + std::cout << "Number of physical processor packages: " << g_num_physical_packages << std::endl; + std::cout << "Number of physical processor cores: " << g_num_physical_cpus << std::endl; + std::cout << "Number of logical processor cores: " << g_num_logical_cpus << std::endl; + std::cout << "Number of processor L1/L2/L3/L4 caches: " << g_total_l1_caches << "/" << g_total_l2_caches << "/" << g_total_l3_caches << "/" << g_total_l4_caches << std::endl; +#ifdef USE_LARGE_PAGES + std::cout << "(Large) page size to be used for benchmarks: " << g_large_page_size << " B" << std::endl; +#else + std::cout << "(Regular) page size to be used for benchmarks: " << g_page_size << " B" << std::endl; +#endif +#endif + + return 0; +} diff --git a/src/include/common.h b/src/include/common.h index e038e86..e1b24f7 100644 --- a/src/include/common.h +++ b/src/include/common.h @@ -1,453 +1,469 @@ -/* The MIT License (MIT) - * - * Copyright (c) 2014 Microsoft - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file - * - * @brief Header file for common preprocessor definitions, macros, functions, and global constants. - */ - -#ifndef __COMMON_H -#define __COMMON_H - -//Libraries -#include -#include - -#ifdef _WIN32 -#include -#endif - -#ifdef __unix__ -#include -#endif - -namespace xmem { - namespace common { - -#define VERSION "1.04" - -#if !defined(_WIN32) && !defined(__unix__) -#error Neither Windows or Unix build environments were detected! -#endif - -//Windows: convert platform-specific preprocessor flags for architecture to xmem-specific constants -#ifdef _WIN32 - -#ifdef _M_IX86 //Intel x86 -#define ARCH_INTEL_X86 -#endif - -#ifdef _M_X64 //Intel x86-64 -#define ARCH_INTEL_X86_64 -#endif - -#ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions -#define ARCH_INTEL_X86_64_SSE2 -#endif - -#ifdef __AVX__ //Intel x86-64 AVX extensions -#define ARCH_INTEL_X86_64_AVX -#endif - -#ifdef __AVX2__ //Intel x86-64 AVX2 extensions -#define ARCH_INTEL_X86_64_AVX2 -#endif - -#ifdef _AMD64 //AMD64 -#define ARCH_AMD64 -#endif - -#ifdef _M_ARM //ARM architecture -#define ARCH_ARM -#endif - -#endif - -//Unix: convert platform-specific preprocessor flags for architecture to xmem-specific constants -#ifdef __unix__ - -#ifdef __i386__ //Intel x86 -#define ARCH_INTEL_X86 -#endif - -#ifdef __x86_64__ //Intel x86-64 -#define ARCH_INTEL_X86_64 -#endif - -#ifdef __SSE2__ //Intel x86-64 SSE2 extensions -#define ARCH_INTEL_X86_64_SSE2 -#endif - -#ifdef __AVX__ //Intel x86-64 AVX extensions -#define ARCH_INTEL_X86_64_AVX -#endif - -#ifdef __AVX2__ //Intel x86-64 AVX2 extensions -#define ARCH_INTEL_X86_64_AVX2 -#endif - -#ifdef __amd64__ //AMD64 -#define ARCH_AMD64 -#endif - -#ifdef __arm__ //ARM architecture -#define ARCH_ARM -#endif - -#endif - -//Common size constants in bytes -#define KB 1024 -#define MB 1048576 -#define MB_4 4194304 -#define MB_16 16777216 -#define MB_64 67108864 -#define MB_256 268435456 -#define MB_512 536870912 -#define GB 1073741824 -#define GB_4 4294967296 - -//Default compile-time constants -#define DEFAULT_PAGE_SIZE 4096 /**< Default platform page size in bytes. This generally should not be relied on, but is a failsafe. */ -#define DEFAULT_WORKING_SET_SIZE DEFAULT_PAGE_SIZE /**< Default working set size in bytes. */ -#define DEFAULT_NUM_CPUS 1 /**< Default number of logical CPU cores. */ -#define DEFAULT_NUM_NODES 1 /**< Default number of NUMA nodes. */ -#define DEFAULT_THREAD_JOIN_TIMEOUT 600000 /**< Default number of milliseconds to wait for a thread to join. Negative values mean indefinite wait. */ -#define MIN_ELAPSED_TICKS 10000 /**< If any routine measured fewer than this number of ticks its results should be viewed with suspicion. This is because the latency of the timer itself will matter. */ - - -//Loop unrolling tricks. There are a bunch so that we can use the length needed for each situation. Unrolling too much hurts code size and instruction reuse. Yes, an unroll of 65536 is probably unnecessary. :) -//Note that I don't provide a default "UNROLL" macro, because I believe the programmer should know exactly how many times they are unrolling to make sure loop bounds are not violated. -#define UNROLL2(x) x x -#define UNROLL4(x) UNROLL2(x) UNROLL2(x) -#define UNROLL8(x) UNROLL4(x) UNROLL4(x) -#define UNROLL16(x) UNROLL8(x) UNROLL8(x) -#define UNROLL32(x) UNROLL16(x) UNROLL16(x) -#define UNROLL64(x) UNROLL32(x) UNROLL32(x) -#define UNROLL128(x) UNROLL64(x) UNROLL64(x) -#define UNROLL256(x) UNROLL128(x) UNROLL128(x) -#define UNROLL512(x) UNROLL256(x) UNROLL256(x) -#define UNROLL1024(x) UNROLL512(x) UNROLL512(x) -#define UNROLL2048(x) UNROLL1024(x) UNROLL1024(x) -#define UNROLL4096(x) UNROLL2048(x) UNROLL2048(x) -#define UNROLL8192(x) UNROLL4096(x) UNROLL4096(x) -#define UNROLL16384(x) UNROLL8192(x) UNROLL8192(x) -#define UNROLL32768(x) UNROLL16384(x) UNROLL16384(x) -#define UNROLL65536(x) UNROLL32768(x) UNROLL32768(x) - -#define LATENCY_BENCHMARK_UNROLL_LENGTH 512 /**< Number of unrolls in the latency benchmark pointer chasing core function. */ - -/***********************************************************************************************************/ -/***********************************************************************************************************/ -/***********************************************************************************************************/ -/***********************************************************************************************************/ -/* - * User-configurable compilation configuration - * - * Feel free to change these as needed. To disable an option, simply comment out its #define statement. To enable an option, ensure it is not commented out. - * In some cases, such as chunk size, stride size, etc. for throughput benchmarks, all combinations of the options will be used! This might dramatically increase runtime. - */ - -#define VERBOSE /**< Increases console output information detail by a lot. */ - -#define USE_ALL_NUMA_NODES /**< RECOMMENDED ENABLED. Test all NUMA node combinations for CPU and memory. If disabled, only node 0 is used for both CPU and memory. */ - -#define MULTITHREADING_ENABLE /**< RECOMMENDED ENABLED. Use multiple threads for benchmarks wherever applicable. Note that power measurement is always done with multiple threads separate from the benchmarking threads, regardless if this option is set or not. */ - -//Which timer to use in the benchmarks. Only one may be selected! -//#define USE_QPC_TIMER /**< RECOMMENDED ENABLED. WINDOWS ONLY. Use the Windows QueryPerformanceCounter timer API. This is a safe bet as it is more hardware-agnostic and has fewer quirks, but it has lower resolution than the TSC timer. */ -#define USE_TSC_TIMER /**< RECOMMENDED DISABLED. Use the Intel Time Stamp Counter native hardware timer. Only use this if you know what you are doing. */ - -#ifdef _WIN32 //DO NOT COMMENT THIS OUT -#define USE_LARGE_PAGES /**< RECOMMENDED ENABLED. Currently only implemented for Windows. Allocate memory using large pages rather than small normal pages. In general, this is highly recommended, as the TLB can skew benchmark results for DRAM. */ -#endif - -//Benchmarking methodology. Only one may be selected! -#define USE_TIME_BASED_BENCHMARKS /**< RECOMMENDED ENABLED. All benchmarks run for an estimated amount of time, and the figures of merit are computed based on the amount of memory accesses completed in the time limit. This mode has more consistent runtime across different machines, memory performance, and working set sizes, but may have more conservative measurements for differing levels of cache hierarchy (overestimating latency and underestimating throughput). */ -//#define USE_SIZE_BASED_BENCHMARKS /**< RECOMMENDED DISABLED. All benchmarks run for an estimated amount of memory accesses, and the figures of merit are computed based on the length of time required to run the benchmark. This mode may have highly varying runtime across different machines, memory performance, and working set sizes, but may have more optimistic measurements across differing levels of cache hierarchy (underestimating latency and overestimating throughput). */ - -#ifdef USE_TIME_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT! -#define BENCHMARK_DURATION_SEC 4 /**< RECOMMENDED VALUE: At least 2. Number of seconds to run in each benchmark. */ -#define THROUGHPUT_BENCHMARK_BYTES_PER_PASS 4096 /**< RECOMMENDED VALUE: 4096. Number of bytes read or written per pass of any ThroughputBenchmark. This must be less than or equal to the minimum working set size, which is currently 4 KB. */ -#endif //DO NOT COMMENT THIS OUT - -#ifdef USE_SIZE_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT -//#define USE_PASSES_CURVE_1 /**< RECOMMENDED DISABLED. The passes per iteration of a benchmark will be given by y = 65536 / working_set_size_KB */ -#define USE_PASSES_CURVE_2 /**< RECOMMENDED ENABLED. The passes per iteration of a benchmark will be given by y = 4*2097152 / working_set_size_KB^2 */ -#endif //DO NOT COMMENT THIS OUT - -//Chunk sizes -//#define USE_CHUNK_32b /**< RECOMMENDED DISABLED. Use 32-bit chunks. */ -#ifdef ARCH_INTEL_X86_64 //DO NOT COMMENT THIS OUT -//#define USE_CHUNK_64b /**< RECOMMENDED DISABLED. Use 64-bit chunks. */ -//#define USE_CHUNK_128b /**< RECOMMENDED DISABLED. Use 128-bit chunks. x86-64 processors with SSE only. TODO: Not yet implemented. */ -#ifdef ARCH_INTEL_X86_64_AVX -#define USE_CHUNK_256b /**< RECOMMENDED ENABLED. Use 256-bit chunks. x86-64 processors only with AVX ISA extensions. */ -#endif -#endif //DO NOT COMMENT THIS OUT - -//Throughput benchmark access patterns -#define USE_THROUGHPUT_SEQUENTIAL_PATTERN /**< RECOMMENDED ENABLED. Run the sequential family pattern of ThroughputBenchmarks. */ -//#define USE_THROUGHPUT_RANDOM_PATTERN /**< RECOMMENDED DISABLED. Run the random-access family pattern of ThroughputBenchmarks. TODO: Not yet implemented. */ - -#ifdef USE_THROUGHPUT_SEQUENTIAL_PATTERN //DO NOT COMMENT THIS OUT -//Throughput benchmark forward strides -#define USE_THROUGHPUT_FORW_STRIDE_1 /**< RECOMMENDED ENABLED. In throughput benchmarks with sequential pattern, do forward strides of 1 chunk (forward sequential). */ -//#define USE_THROUGHPUT_FORW_STRIDE_2 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 2 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_FORW_STRIDE_4 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 4 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_FORW_STRIDE_8 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 8 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_FORW_STRIDE_16 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 16 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ - -//Throughput benchmark reverse strides -//#define USE_THROUGHPUT_REV_STRIDE_1 /**< RECOMMENDED ENABLED. In throughput benchmarks with sequential pattern, do reverse strides of 1 chunk (reverse sequential). FIXME: Problem with reverse throughput benchmarks in terms of addressing. Don't use this for now. */ -//#define USE_THROUGHPUT_REV_STRIDE_2 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 2 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_REV_STRIDE_4 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 4 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_REV_STRIDE_8 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 8 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -//#define USE_THROUGHPUT_REV_STRIDE_16 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 16 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ -#endif //DO NOT COMMENT THIS OUT - -//Throughput benchmark reads and writes -#define USE_THROUGHPUT_READS /**< RECOMMENDED ENABLED. In throughput benchmarks, read from memory. */ -#define USE_THROUGHPUT_WRITES /**< RECOMMENDED ENABLED. In throughput benchmarks, write to memory. */ - -//Latency benchmark pointer chasing construction method -#define USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN /**< RECOMMENDED ENABLED. In latency benchmarks, generate the pointer chasing pattern using a random shuffle, which has a chance of creating small cycles. Much faster to run but strictly less correct. O(N) */ -//#define USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN /**< RECOMMENDED DISABLED. In latency benchmarks, generate the pointer chasing pattern using a random directed Hamiltonian Cycle across the entire memory space under test. Slow to compute as it is O(N^2), but strictly more correct. */ - -#define POWER_SAMPLING_PERIOD_SEC 1 /**< RECOMMENDED VALUE: 1. Sampling period in seconds for all power measurement mechanisms. */ -/***********************************************************************************************************/ -/***********************************************************************************************************/ -/***********************************************************************************************************/ -/***********************************************************************************************************/ - - -//Compile-time options checks -#if defined(USE_QPC_TIMER) && !defined(_WIN32) -#error USE_QPC_TIMER may only be defined for a Windows system! -#endif - -#if !defined(USE_QPC_TIMER) && !defined(USE_TSC_TIMER) -#error One type of timer must be selected! -#endif - -#if defined(USE_QPC_TIMER) && defined(USE_TSC_TIMER) -#error Only one type of timer may be specified! -#endif - -#if (defined(USE_TIME_BASED_BENCHMARKS) && defined(USE_SIZE_BASED_BENCHMARKS)) || (!defined(USE_TIME_BASED_BENCHMARKS) && !defined(USE_SIZE_BASED_BENCHMARKS)) -#error Exactly one of USE_TIME_BASED_BENCHMARKS and USE_SIZE_BASED_BENCHMARKS must be defined! -#endif - -#ifdef USE_TIME_BASED_BENCHMARKS -#ifndef BENCHMARK_DURATION_SEC -#error BENCHMARK_DURATION_SEC must be defined! -#endif -#if BENCHMARK_DURATION_SEC <= 0 -#error BENCHMARK_DURATION_SEC must be a positive integer! -#endif -#ifndef THROUGHPUT_BENCHMARK_BYTES_PER_PASS -#error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be defined! -#else -#if THROUGHPUT_BENCHMARK_BYTES_PER_PASS > DEFAULT_PAGE_SIZE || THROUGHPUT_BENCHMARK_BYTES_PER_PASS <= 0 -#error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be less than or equal to the minimum possible working set size. It also must be a positive integer. -#endif -#endif -#endif - -#ifdef USE_SIZE_BASED_BENCHMARKS -#if (defined(USE_PASSES_CURVE_1) && defined(USE_PASSES_CURVE_2)) || (!defined(USE_PASSES_CURVE_1) && !defined(USE_PASSES_CURVE_2)) -#error Exactly one passes curve must be defined. -#endif -#endif - -#if !defined(USE_CHUNK_32b) && !defined(USE_CHUNK_64b) && !defined(USE_CHUNK_128b) && !defined(USE_CHUNK_256b) -#error At least one chunk size compile-time option must be selected! -#endif - -#if !defined (USE_THROUGHPUT_SEQUENTIAL_PATTERN) && !defined(USE_THROUGHPUT_RANDOM_PATTERN) -#error At least one throughput pattern compile-time option must be set! -#endif - -#if defined(USE_THROUGHPUT_SEQUENTIAL_PATTERN) -#if !defined(USE_THROUGHPUT_FORW_STRIDE_1) && !defined(USE_THROUGHPUT_FORW_STRIDE_2) && !defined(USE_THROUGHPUT_FORW_STRIDE_4) && !defined(USE_THROUGHPUT_FORW_STRIDE_8) && !defined(USE_THROUGHPUT_FORW_STRIDE_16) && !defined(USE_THROUGHPUT_REV_STRIDE_1) && !defined(USE_THROUGHPUT_REV_STRIDE_2) && !defined(USE_THROUGHPUT_REV_STRIDE_4) && !defined(USE_THROUGHPUT_REV_STRIDE_8) && !defined(USE_THROUGHPUT_REV_STRIDE_16) -#error Throughput benchmark sequential pattern compile-time option was selected, but no stride options were set! At least one must be enabled. -#endif -#endif - -#if !defined(USE_THROUGHPUT_READS) && !defined(USE_THROUGHPUT_WRITES) -#error At least one read or write mode compile-time option must be selected for throughput benchmarks! -#endif - -#if !defined(USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN) && !defined(USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN) -#error One latency benchmark pattern compile-time option must be selected! -#endif - -#if defined(USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN) && defined(USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN) -#error Only one latency benchmark pattern compile-time option must be selected! -#endif - -#if !defined(POWER_SAMPLING_PERIOD_SEC) || POWER_SAMPLING_PERIOD_SEC <= 0 -#error POWER_SAMPLING_PERIOD_SEC must be defined and greater than 0! -#endif - - extern size_t g_page_size; - extern size_t g_large_page_size; - extern uint32_t g_num_nodes; - extern uint32_t g_num_logical_cpus; - extern uint32_t g_num_physical_packages; - extern uint32_t g_starting_test_index; - extern uint32_t g_test_index; - - //Typedef the platform specific stuff to word sizes to match 4 different chunk options - typedef uint32_t Word32_t; -#ifdef ARCH_INTEL_X86_64 - typedef uint64_t Word64_t; -#endif -#ifdef ARCH_INTEL_X86_64_AVX - typedef __m128i Word128_t; - typedef __m256i Word256_t; -#endif - /** - * @brief Memory access patterns are broadly categorized by sequential or random-access. - */ - typedef enum { -#ifdef USE_THROUGHPUT_SEQUENTIAL_PATTERN - SEQUENTIAL, -#endif -#ifdef USE_THROUGHPUT_RANDOM_PATTERN - RANDOM, -#endif - NUM_PATTERN_MODES - } pattern_mode_t; - - /** - * @brief Memory access batterns are broadly categorized by reads and writes. - */ - typedef enum { -#ifdef USE_THROUGHPUT_READS - READ, -#endif -#ifdef USE_THROUGHPUT_WRITES - WRITE, -#endif - NUM_RW_MODES - } rw_mode_t; - - /** - * @brief Legal memory read/write chunk sizes in bits. - */ - typedef enum { -#ifdef USE_CHUNK_32b - CHUNK_32b, -#endif -#ifdef USE_CHUNK_64b - CHUNK_64b, -#endif -#ifdef USE_CHUNK_128b - CHUNK_128b, -#endif -#ifdef USE_CHUNK_256b - CHUNK_256b, -#endif - NUM_CHUNK_SIZES - } chunk_size_t; - - /** - * @brief Prints a basic welcome message to the console with useful information. - */ - void print_welcome_message(); - - /** - * @brief Prints the various C/C++ types to the console for this machine. - */ - void print_types_report(); - - /** - * @brief Prints compile-time option information to the console. - */ - void print_compile_time_options(); - - /** - * @brief Tests any enabled timers and outputs results to the console for sanity checking. - */ - void test_timers(); - - /** - * @brief Checks to see if the calling thread can be locked to all logical CPUs in the system, and reports to the console the progress. - */ - void test_thread_affinities(); - - /** - * @brief Sets the affinity of the calling thread to the lowest numbered logical CPU in the given NUMA node. - * TODO: Improve this functionality, it is quite limiting. - * @param numa_node The NUMA node number to select a CPU from. - * @returns True on success. - */ - bool lock_thread_to_numa_node(uint32_t numa_node); - - /** - * @brief Clears the affinity of the calling thread to any given NUMA node. - * @returns True on success. - */ - bool unlock_thread_to_numa_node(); - - /** - * @brief Sets the affinity of the calling thread to a given logical CPU. - * @param cpu_id The logical CPU identifier to lock the thread to. - * @returns True on success. - */ - bool lock_thread_to_cpu(uint32_t cpu_id); - - /** - * @brief Clears the affinity of the calling thread to any given logical CPU. - * @returns True on success. - */ - bool unlock_thread_to_cpu(); - - /** - * @brief Gets the CPU ID for a logical CPU of interest in a particular NUMA node. - * For example, if numa_node is 1 and cpu_in_node is 2, and there are 4 logical CPUs per node, then this will give the answer 6 (6th CPU), assuming CPU IDs start at 0. - * @param numa_node The NUMA node of interest. - * @param cpu_in_node The Nth logical CPU in the node. - * @returns The Nth logical CPU ID in the specified NUMA node. If none is found, returns -1. - */ - int32_t cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node); - - /** - * @brief Computes the number of passes to use for a given working set size in KB, when size-based benchmarking mode is enabled at compile-time. - * You may want to change this implementation to suit your needs. See the compile-time options in common.h. - * @param working_set_size_KB The working set size of the memory in KB. - * @returns The number of passes to use. - */ - size_t compute_number_of_passes(size_t working_set_size_KB); - - /** - * @brief Queries the page sizes from the system and sets relevant global variables. - * @returns False if the default value has to be used because the appropriate values could not be queried successfully from the OS. - */ - bool config_page_size(); - }; -}; - -#endif +/* The MIT License (MIT) + * + * Copyright (c) 2014 Microsoft + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file + * + * @brief Header file for common preprocessor definitions, macros, functions, and global constants. + */ + +#ifndef __COMMON_H +#define __COMMON_H + +//Libraries +#include +#include + +#ifdef _WIN32 +#include +#endif + +#ifdef __unix__ +#include +#endif + +namespace xmem { + namespace common { + +#define VERSION "1.04" + +#if !defined(_WIN32) && !defined(__unix__) +#error Neither Windows or Unix build environments were detected! +#endif + +//Windows: convert platform-specific preprocessor flags for architecture to xmem-specific constants +#ifdef _WIN32 + +#ifdef _M_IX86 //Intel x86 +#define ARCH_INTEL_X86 +#endif + +#ifdef _M_X64 //Intel x86-64 +#define ARCH_INTEL_X86_64 +#endif + +#ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions +#define ARCH_INTEL_X86_64_SSE2 +#endif + +#ifdef __AVX__ //Intel x86-64 AVX extensions +#define ARCH_INTEL_X86_64_AVX +#endif + +#ifdef __AVX2__ //Intel x86-64 AVX2 extensions +#define ARCH_INTEL_X86_64_AVX2 +#endif + +#ifdef _AMD64 //AMD64 +#define ARCH_AMD64 +#endif + +#ifdef _M_ARM //ARM architecture +#define ARCH_ARM +#endif + +#endif + +//Unix: convert platform-specific preprocessor flags for architecture to xmem-specific constants +#ifdef __unix__ + +#ifdef __i386__ //Intel x86 +#define ARCH_INTEL_X86 +#endif + +#ifdef __x86_64__ //Intel x86-64 +#define ARCH_INTEL_X86_64 +#endif + +#ifdef __SSE2__ //Intel x86-64 SSE2 extensions +#define ARCH_INTEL_X86_64_SSE2 +#endif + +#ifdef __AVX__ //Intel x86-64 AVX extensions +#define ARCH_INTEL_X86_64_AVX +#endif + +#ifdef __AVX2__ //Intel x86-64 AVX2 extensions +#define ARCH_INTEL_X86_64_AVX2 +#endif + +#ifdef __amd64__ //AMD64 +#define ARCH_AMD64 +#endif + +#ifdef __arm__ //ARM architecture +#define ARCH_ARM +#endif + +#endif + +//Common size constants in bytes +#define KB 1024 +#define MB 1048576 +#define MB_4 4194304 +#define MB_16 16777216 +#define MB_64 67108864 +#define MB_256 268435456 +#define MB_512 536870912 +#define GB 1073741824 +#define GB_4 4294967296 + +//Default compile-time constants +#define DEFAULT_PAGE_SIZE 4*KB /**< Default platform page size in bytes. This generally should not be relied on, but is a failsafe. */ +#define DEFAULT_LARGE_PAGE_SIZE 2*MB /**< Default platform large page size in bytes. This generally should not be relied on, but is a failsafe. */ +#define DEFAULT_WORKING_SET_SIZE DEFAULT_PAGE_SIZE /**< Default working set size in bytes. */ +#define DEFAULT_NUM_NODES 1 /**< Default number of NUMA nodes. */ +#define DEFAULT_NUM_PHYSICAL_PACKAGES 1 /**< Default number of physical packages. */ +#define DEFAULT_NUM_PHYSICAL_CPUS 1 /**< Default number of physical CPU cores. */ +#define DEFAULT_NUM_LOGICAL_CPUS 1 /**< Default number of logical CPU cores. */ +#define DEFAULT_NUM_L1_CACHES 0 /**< Default number of L1 caches. */ +#define DEFAULT_NUM_L2_CACHES 0 /**< Default number of L2 caches. */ +#define DEFAULT_NUM_L3_CACHES 0 /**< Default number of L3 caches. */ +#define DEFAULT_NUM_L4_CACHES 0 /**< Default number of L4 caches. */ +#define DEFAULT_THREAD_JOIN_TIMEOUT 600000 /**< Default number of milliseconds to wait for a thread to join. Negative values mean indefinite wait. TODO: remove this */ +#define MIN_ELAPSED_TICKS 10000 /**< If any routine measured fewer than this number of ticks its results should be viewed with suspicion. This is because the latency of the timer itself will matter. */ + + +//Loop unrolling tricks. There are a bunch so that we can use the length needed for each situation. Unrolling too much hurts code size and instruction reuse. Yes, an unroll of 65536 is probably unnecessary. :) +//Note that I don't provide a default "UNROLL" macro, because I believe the programmer should know exactly how many times they are unrolling to make sure loop bounds are not violated. +#define UNROLL2(x) x x +#define UNROLL4(x) UNROLL2(x) UNROLL2(x) +#define UNROLL8(x) UNROLL4(x) UNROLL4(x) +#define UNROLL16(x) UNROLL8(x) UNROLL8(x) +#define UNROLL32(x) UNROLL16(x) UNROLL16(x) +#define UNROLL64(x) UNROLL32(x) UNROLL32(x) +#define UNROLL128(x) UNROLL64(x) UNROLL64(x) +#define UNROLL256(x) UNROLL128(x) UNROLL128(x) +#define UNROLL512(x) UNROLL256(x) UNROLL256(x) +#define UNROLL1024(x) UNROLL512(x) UNROLL512(x) +#define UNROLL2048(x) UNROLL1024(x) UNROLL1024(x) +#define UNROLL4096(x) UNROLL2048(x) UNROLL2048(x) +#define UNROLL8192(x) UNROLL4096(x) UNROLL4096(x) +#define UNROLL16384(x) UNROLL8192(x) UNROLL8192(x) +#define UNROLL32768(x) UNROLL16384(x) UNROLL16384(x) +#define UNROLL65536(x) UNROLL32768(x) UNROLL32768(x) + +#define LATENCY_BENCHMARK_UNROLL_LENGTH 512 /**< Number of unrolls in the latency benchmark pointer chasing core function. */ + +/***********************************************************************************************************/ +/***********************************************************************************************************/ +/***********************************************************************************************************/ +/***********************************************************************************************************/ +/* + * User-configurable compilation configuration + * + * Feel free to change these as needed. To disable an option, simply comment out its #define statement. To enable an option, ensure it is not commented out. + * In some cases, such as chunk size, stride size, etc. for throughput benchmarks, all combinations of the options will be used! This might dramatically increase runtime. + */ + +#define VERBOSE /**< Increases console output information detail by a lot. */ + +#define USE_ALL_NUMA_NODES /**< RECOMMENDED ENABLED. Test all NUMA node combinations for CPU and memory. If disabled, only node 0 is used for both CPU and memory. */ + +#define MULTITHREADING_ENABLE /**< RECOMMENDED ENABLED. Use multiple threads for benchmarks wherever applicable. Note that power measurement is always done with multiple threads separate from the benchmarking threads, regardless if this option is set or not. */ + +//Which timer to use in the benchmarks. Only one may be selected! +//#define USE_QPC_TIMER /**< RECOMMENDED ENABLED. WINDOWS ONLY. Use the Windows QueryPerformanceCounter timer API. This is a safe bet as it is more hardware-agnostic and has fewer quirks, but it has lower resolution than the TSC timer. */ +#define USE_TSC_TIMER /**< RECOMMENDED DISABLED. Use the Intel Time Stamp Counter native hardware timer. Only use this if you know what you are doing. */ + +#ifdef _WIN32 //DO NOT COMMENT THIS OUT +#define USE_LARGE_PAGES /**< RECOMMENDED ENABLED. Currently only implemented for Windows. Allocate memory using large pages rather than small normal pages. In general, this is highly recommended, as the TLB can skew benchmark results for DRAM. */ +#endif + +//Benchmarking methodology. Only one may be selected! +#define USE_TIME_BASED_BENCHMARKS /**< RECOMMENDED ENABLED. All benchmarks run for an estimated amount of time, and the figures of merit are computed based on the amount of memory accesses completed in the time limit. This mode has more consistent runtime across different machines, memory performance, and working set sizes, but may have more conservative measurements for differing levels of cache hierarchy (overestimating latency and underestimating throughput). */ +//#define USE_SIZE_BASED_BENCHMARKS /**< RECOMMENDED DISABLED. All benchmarks run for an estimated amount of memory accesses, and the figures of merit are computed based on the length of time required to run the benchmark. This mode may have highly varying runtime across different machines, memory performance, and working set sizes, but may have more optimistic measurements across differing levels of cache hierarchy (underestimating latency and overestimating throughput). */ + +#ifdef USE_TIME_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT! +#define BENCHMARK_DURATION_SEC 4 /**< RECOMMENDED VALUE: At least 2. Number of seconds to run in each benchmark. */ +#define THROUGHPUT_BENCHMARK_BYTES_PER_PASS 4096 /**< RECOMMENDED VALUE: 4096. Number of bytes read or written per pass of any ThroughputBenchmark. This must be less than or equal to the minimum working set size, which is currently 4 KB. */ +#endif //DO NOT COMMENT THIS OUT + +#ifdef USE_SIZE_BASED_BENCHMARKS //DO NOT COMMENT THIS OUT +//#define USE_PASSES_CURVE_1 /**< RECOMMENDED DISABLED. The passes per iteration of a benchmark will be given by y = 65536 / working_set_size_KB */ +#define USE_PASSES_CURVE_2 /**< RECOMMENDED ENABLED. The passes per iteration of a benchmark will be given by y = 4*2097152 / working_set_size_KB^2 */ +#endif //DO NOT COMMENT THIS OUT + +//Chunk sizes +//#define USE_CHUNK_32b /**< RECOMMENDED DISABLED. Use 32-bit chunks. */ +#ifdef ARCH_INTEL_X86_64 //DO NOT COMMENT THIS OUT +//#define USE_CHUNK_64b /**< RECOMMENDED DISABLED. Use 64-bit chunks. */ +//#define USE_CHUNK_128b /**< RECOMMENDED DISABLED. Use 128-bit chunks. x86-64 processors with SSE only. TODO: Not yet implemented. */ +#ifdef ARCH_INTEL_X86_64_AVX //TODO: Is this supposed to be AVX2 instead of AVX? +#define USE_CHUNK_256b /**< RECOMMENDED ENABLED. Use 256-bit chunks. x86-64 processors only with AVX ISA extensions. */ +#endif +#ifdef ARCH_INTEL_X86_64_AVX512 +//#define USE_CHUNK_512b /**< TODO. Not yet implemented. */ +#endif +#endif //DO NOT COMMENT THIS OUT + +//Throughput benchmark access patterns +#define USE_THROUGHPUT_SEQUENTIAL_PATTERN /**< RECOMMENDED ENABLED. Run the sequential family pattern of ThroughputBenchmarks. */ +//#define USE_THROUGHPUT_RANDOM_PATTERN /**< RECOMMENDED DISABLED. Run the random-access family pattern of ThroughputBenchmarks. TODO: Not yet implemented. */ + +#ifdef USE_THROUGHPUT_SEQUENTIAL_PATTERN //DO NOT COMMENT THIS OUT +//Throughput benchmark forward strides +#define USE_THROUGHPUT_FORW_STRIDE_1 /**< RECOMMENDED ENABLED. In throughput benchmarks with sequential pattern, do forward strides of 1 chunk (forward sequential). */ +//#define USE_THROUGHPUT_FORW_STRIDE_2 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 2 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_FORW_STRIDE_4 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 4 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_FORW_STRIDE_8 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 8 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_FORW_STRIDE_16 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do forward strides of 16 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ + +//Throughput benchmark reverse strides +//#define USE_THROUGHPUT_REV_STRIDE_1 /**< RECOMMENDED ENABLED. In throughput benchmarks with sequential pattern, do reverse strides of 1 chunk (reverse sequential). FIXME: Problem with reverse throughput benchmarks in terms of addressing. Don't use this for now. */ +//#define USE_THROUGHPUT_REV_STRIDE_2 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 2 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_REV_STRIDE_4 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 4 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_REV_STRIDE_8 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 8 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +//#define USE_THROUGHPUT_REV_STRIDE_16 /**< RECOMMENDED DISABLED. In throughput benchmarks with sequential pattern, do reverse strides of 16 chunks. TODO: Not yet implemented for 128 and 256-bit chunks */ +#endif //DO NOT COMMENT THIS OUT + +//Throughput benchmark reads and writes +#define USE_THROUGHPUT_READS /**< RECOMMENDED ENABLED. In throughput benchmarks, read from memory. */ +#define USE_THROUGHPUT_WRITES /**< RECOMMENDED ENABLED. In throughput benchmarks, write to memory. */ + +//Latency benchmark pointer chasing construction method +#define USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN /**< RECOMMENDED ENABLED. In latency benchmarks, generate the pointer chasing pattern using a random shuffle, which has a chance of creating small cycles. Much faster to run but strictly less correct. O(N) */ +//#define USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN /**< RECOMMENDED DISABLED. In latency benchmarks, generate the pointer chasing pattern using a random directed Hamiltonian Cycle across the entire memory space under test. Slow to compute as it is O(N^2), but strictly more correct. */ + +#define POWER_SAMPLING_PERIOD_SEC 1 /**< RECOMMENDED VALUE: 1. Sampling period in seconds for all power measurement mechanisms. */ +/***********************************************************************************************************/ +/***********************************************************************************************************/ +/***********************************************************************************************************/ +/***********************************************************************************************************/ + + +//Compile-time options checks +#if defined(USE_QPC_TIMER) && !defined(_WIN32) +#error USE_QPC_TIMER may only be defined for a Windows system! +#endif + +#if !defined(USE_QPC_TIMER) && !defined(USE_TSC_TIMER) +#error One type of timer must be selected! +#endif + +#if defined(USE_QPC_TIMER) && defined(USE_TSC_TIMER) +#error Only one type of timer may be specified! +#endif + +#if (defined(USE_TIME_BASED_BENCHMARKS) && defined(USE_SIZE_BASED_BENCHMARKS)) || (!defined(USE_TIME_BASED_BENCHMARKS) && !defined(USE_SIZE_BASED_BENCHMARKS)) +#error Exactly one of USE_TIME_BASED_BENCHMARKS and USE_SIZE_BASED_BENCHMARKS must be defined! +#endif + +#ifdef USE_TIME_BASED_BENCHMARKS +#ifndef BENCHMARK_DURATION_SEC +#error BENCHMARK_DURATION_SEC must be defined! +#endif +#if BENCHMARK_DURATION_SEC <= 0 +#error BENCHMARK_DURATION_SEC must be a positive integer! +#endif +#ifndef THROUGHPUT_BENCHMARK_BYTES_PER_PASS +#error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be defined! +#else +#if THROUGHPUT_BENCHMARK_BYTES_PER_PASS > DEFAULT_PAGE_SIZE || THROUGHPUT_BENCHMARK_BYTES_PER_PASS <= 0 +#error THROUGHPUT_BENCHMARK_BYTES_PER_PASS must be less than or equal to the minimum possible working set size. It also must be a positive integer. +#endif +#endif +#endif + +#ifdef USE_SIZE_BASED_BENCHMARKS +#if (defined(USE_PASSES_CURVE_1) && defined(USE_PASSES_CURVE_2)) || (!defined(USE_PASSES_CURVE_1) && !defined(USE_PASSES_CURVE_2)) +#error Exactly one passes curve must be defined. +#endif +#endif + +#if !defined(USE_CHUNK_32b) && !defined(USE_CHUNK_64b) && !defined(USE_CHUNK_128b) && !defined(USE_CHUNK_256b) +#error At least one chunk size compile-time option must be selected! +#endif + +#if !defined (USE_THROUGHPUT_SEQUENTIAL_PATTERN) && !defined(USE_THROUGHPUT_RANDOM_PATTERN) +#error At least one throughput pattern compile-time option must be set! +#endif + +#if defined(USE_THROUGHPUT_SEQUENTIAL_PATTERN) +#if !defined(USE_THROUGHPUT_FORW_STRIDE_1) && !defined(USE_THROUGHPUT_FORW_STRIDE_2) && !defined(USE_THROUGHPUT_FORW_STRIDE_4) && !defined(USE_THROUGHPUT_FORW_STRIDE_8) && !defined(USE_THROUGHPUT_FORW_STRIDE_16) && !defined(USE_THROUGHPUT_REV_STRIDE_1) && !defined(USE_THROUGHPUT_REV_STRIDE_2) && !defined(USE_THROUGHPUT_REV_STRIDE_4) && !defined(USE_THROUGHPUT_REV_STRIDE_8) && !defined(USE_THROUGHPUT_REV_STRIDE_16) +#error Throughput benchmark sequential pattern compile-time option was selected, but no stride options were set! At least one must be enabled. +#endif +#endif + +#if !defined(USE_THROUGHPUT_READS) && !defined(USE_THROUGHPUT_WRITES) +#error At least one read or write mode compile-time option must be selected for throughput benchmarks! +#endif + +#if !defined(USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN) && !defined(USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN) +#error One latency benchmark pattern compile-time option must be selected! +#endif + +#if defined(USE_LATENCY_BENCHMARK_RANDOM_SHUFFLE_PATTERN) && defined(USE_LATENCY_BENCHMARK_RANDOM_HAMILTONIAN_CYCLE_PATTERN) +#error Only one latency benchmark pattern compile-time option must be selected! +#endif + +#if !defined(POWER_SAMPLING_PERIOD_SEC) || POWER_SAMPLING_PERIOD_SEC <= 0 +#error POWER_SAMPLING_PERIOD_SEC must be defined and greater than 0! +#endif + + extern size_t g_page_size; + extern size_t g_large_page_size; + extern uint32_t g_num_nodes; + extern uint32_t g_num_logical_cpus; + extern uint32_t g_num_physical_packages; + extern uint32_t g_starting_test_index; + extern uint32_t g_test_index; + + //Typedef the platform specific stuff to word sizes to match 4 different chunk options + typedef uint32_t Word32_t; +#ifdef ARCH_INTEL_X86_64 + typedef uint64_t Word64_t; +#endif +#ifdef ARCH_INTEL_X86_64_AVX + typedef __m128i Word128_t; + typedef __m256i Word256_t; +#endif + /** + * @brief Memory access patterns are broadly categorized by sequential or random-access. + */ + typedef enum { +#ifdef USE_THROUGHPUT_SEQUENTIAL_PATTERN + SEQUENTIAL, +#endif +#ifdef USE_THROUGHPUT_RANDOM_PATTERN + RANDOM, +#endif + NUM_PATTERN_MODES + } pattern_mode_t; + + /** + * @brief Memory access batterns are broadly categorized by reads and writes. + */ + typedef enum { +#ifdef USE_THROUGHPUT_READS + READ, +#endif +#ifdef USE_THROUGHPUT_WRITES + WRITE, +#endif + NUM_RW_MODES + } rw_mode_t; + + /** + * @brief Legal memory read/write chunk sizes in bits. + */ + typedef enum { +#ifdef USE_CHUNK_32b + CHUNK_32b, +#endif +#ifdef USE_CHUNK_64b + CHUNK_64b, +#endif +#ifdef USE_CHUNK_128b + CHUNK_128b, +#endif +#ifdef USE_CHUNK_256b + CHUNK_256b, +#endif + NUM_CHUNK_SIZES + } chunk_size_t; + + /** + * @brief Prints a basic welcome message to the console with useful information. + */ + void print_welcome_message(); + + /** + * @brief Prints the various C/C++ types to the console for this machine. + */ + void print_types_report(); + + /** + * @brief Prints compile-time option information to the console. + */ + void print_compile_time_options(); + + /** + * @brief Tests any enabled timers and outputs results to the console for sanity checking. + */ + void test_timers(); + + /** + * @brief Checks to see if the calling thread can be locked to all logical CPUs in the system, and reports to the console the progress. + */ + void test_thread_affinities(); + + /** + * @brief Sets the affinity of the calling thread to the lowest numbered logical CPU in the given NUMA node. + * TODO: Improve this functionality, it is quite limiting. + * @param numa_node The NUMA node number to select a CPU from. + * @returns True on success. + */ + bool lock_thread_to_numa_node(uint32_t numa_node); + + /** + * @brief Clears the affinity of the calling thread to any given NUMA node. + * @returns True on success. + */ + bool unlock_thread_to_numa_node(); + + /** + * @brief Sets the affinity of the calling thread to a given logical CPU. + * @param cpu_id The logical CPU identifier to lock the thread to. + * @returns True on success. + */ + bool lock_thread_to_cpu(uint32_t cpu_id); + + /** + * @brief Clears the affinity of the calling thread to any given logical CPU. + * @returns True on success. + */ + bool unlock_thread_to_cpu(); + + /** + * @brief Gets the CPU ID for a logical CPU of interest in a particular NUMA node. + * For example, if numa_node is 1 and cpu_in_node is 2, and there are 4 logical CPUs per node, then this will give the answer 6 (6th CPU), assuming CPU IDs start at 0. + * @param numa_node The NUMA node of interest. + * @param cpu_in_node The Nth logical CPU in the node. + * @returns The Nth logical CPU ID in the specified NUMA node. If none is found, returns -1. + */ + int32_t cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node); + + /** + * @brief Computes the number of passes to use for a given working set size in KB, when size-based benchmarking mode is enabled at compile-time. + * You may want to change this implementation to suit your needs. See the compile-time options in common.h. + * @param working_set_size_KB The working set size of the memory in KB. + * @returns The number of passes to use. + */ + size_t compute_number_of_passes(size_t working_set_size_KB); + + /** + * @brief Queries the page sizes from the system and sets relevant global variables. + * @returns False if the default value has to be used because the appropriate values could not be queried successfully from the OS. + */ + bool config_page_size(); + + /** + * @brief Sets up global variables based on system information at runtime. + * @returns 0 on success. + */ + int32_t query_sys_info(); + }; +}; + +#endif diff --git a/src/include/x86_64/TSCTimer.h b/src/include/x86_64/TSCTimer.h index c64cac9..93597db 100644 --- a/src/include/x86_64/TSCTimer.h +++ b/src/include/x86_64/TSCTimer.h @@ -1,107 +1,107 @@ -/* The MIT License (MIT) - * - * Copyright (c) 2014 Microsoft - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file - * - * @brief Header file for the TSCTimer class as well as some C-style functions for working with the TSC timer hardware directly. - */ - -#ifndef __TSCTIMER_H -#define __TSCTIMER_H - -//Headers -#include - -#ifdef ARCH_INTEL_X86_64 - -//Headers -#include - -//Libraries -#include - -namespace xmem { - namespace timers { - namespace x86_64 { - /** - * @brief This class implements a simple high resolution stopwatch timer based on Intel's TimeStamp Counter (TSC) hardware timer. - * - * WARNING: These objects are NOT thread safe! - * - * Note that the nanoseconds per tick -could- be inaccurate due to dynamic frequency scaling of the processor on certain platforms. For this reason, - * you should either fix the clock frequency and/or validate the ns per tick before trusting results from this timer, or otherwise know that your processor has a - * frequency-invariant TSC timer. - * - * Also, the hardware implementation of this timer may vary, and in some cases, different CPU cores may have different timers - * and they are not guaranteed to be synchronized! It is recommended to ensure the thread using the TSCTimer stays on the same core for the duration of the - * timed code. - * - * Definitely only use this class if you know what you are doing! - */ - class TSCTimer : public Timer { - public: - /** - * @brief Constructor. - * - * This may take a noticeable amount of time (milliseconds to seconds) - * in order to measure the tick rate against another known timer. - */ - TSCTimer(); - - /** - * @brief Starts the timer. - */ - virtual void start(); - - /** - * @brief Stops the timer. - * @returns Elapsed time since last start() call in ticks. - */ - virtual uint64_t stop(); - - private: - uint64_t __start_tick; /**< The reading from the TSC register when start() was last called. */ - uint64_t __stop_tick; /**< The reading from the TSC register when stop() was last called. */ - }; - - /** - * @brief Query the TSC hardware timer for the start of a timed section of code. - * @returns The starting tick for some timed section of code using the hardware TSC timer. - */ - uint64_t start_tsc_timer(); - - /** - * @brief Query the TSC hardware timer for the end of a timed section of code. - * @returns The ending tick for some timed section of code using the hardware TSC timer. - */ - uint64_t stop_tsc_timer(); - }; - }; -}; - -#else -#error This file should only be included in Intel x86_64 builds. -#endif - -#endif +/* The MIT License (MIT) + * + * Copyright (c) 2014 Microsoft + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file + * + * @brief Header file for the TSCTimer class as well as some C-style functions for working with the TSC timer hardware directly. + */ + +#ifndef __TSCTIMER_H +#define __TSCTIMER_H + +//Headers +#include + +#ifdef ARCH_INTEL_X86_64 + +//Headers +#include + +//Libraries +#include + +namespace xmem { + namespace timers { + namespace x86_64 { + /** + * @brief This class implements a simple high resolution stopwatch timer based on Intel's TimeStamp Counter (TSC) hardware timer. + * + * WARNING: These objects are NOT thread safe! + * + * Note that the nanoseconds per tick -could- be inaccurate due to dynamic frequency scaling of the processor on certain platforms. For this reason, + * you should either fix the clock frequency and/or validate the ns per tick before trusting results from this timer, or otherwise know that your processor has a + * frequency-invariant TSC timer. + * + * Also, the hardware implementation of this timer may vary, and in some cases, different CPU cores may have different timers + * and they are not guaranteed to be synchronized! It is recommended to ensure the thread using the TSCTimer stays on the same core for the duration of the + * timed code. + * + * Definitely only use this class if you know what you are doing! + */ + class TSCTimer : public Timer { + public: + /** + * @brief Constructor. + * + * This may take a noticeable amount of time (milliseconds to seconds) + * in order to measure the tick rate against another known timer. + */ + TSCTimer(); + + /** + * @brief Starts the timer. + */ + virtual void start(); + + /** + * @brief Stops the timer. + * @returns Elapsed time since last start() call in ticks. + */ + virtual uint64_t stop(); + + private: + uint64_t __start_tick; /**< The reading from the TSC register when start() was last called. */ + uint64_t __stop_tick; /**< The reading from the TSC register when stop() was last called. */ + }; + + /** + * @brief Query the TSC hardware timer for the start of a timed section of code. + * @returns The starting tick for some timed section of code using the hardware TSC timer. + */ + uint64_t start_tsc_timer(); + + /** + * @brief Query the TSC hardware timer for the end of a timed section of code. + * @returns The ending tick for some timed section of code using the hardware TSC timer. + */ + uint64_t stop_tsc_timer(); + }; + }; +}; + +#else +#error This file should only be included in Intel x86_64 builds. +#endif + +#endif diff --git a/src/main.cpp b/src/main.cpp index 35ca68c..1c7407d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,7 +39,7 @@ #include #ifdef _WIN32 -//FIXME +//FIXME. Clean this up. #include #include #endif @@ -60,13 +60,16 @@ int main(int argc, char* argv[]) { common::print_compile_time_options(); #endif +//FIXME. Clean this up. #ifdef _WIN32 - //FIXME if (common::win::third_party::query_sys_info()) { +#endif +#ifdef __unix__ + if (common::query_sys_info()) { +#endif std::cerr << "ERROR occurred while querying CPU information." << std::endl; return -1; } -#endif config::Configurator config; bool configSuccess = !config.configureFromInput(argc, argv); diff --git a/src/x86_64/TSCTimer.cpp b/src/x86_64/TSCTimer.cpp index e8530d5..381b254 100644 --- a/src/x86_64/TSCTimer.cpp +++ b/src/x86_64/TSCTimer.cpp @@ -1,103 +1,103 @@ -/* The MIT License (MIT) - * - * Copyright (c) 2014 Microsoft - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file - * - * @brief Implementation file for the TSCTimer class as well as some C-style functions for working with the TSC timer hardware directly. - */ - -//Headers -#include - -//Libraries -#ifdef _WIN32 -#include -#include -#endif - -#ifdef __unix__ -#include -#include -#include -#include -#endif - -using namespace xmem::timers::x86_64; - -TSCTimer::TSCTimer() : - Timer(), - __start_tick(0), - __stop_tick(0) -{ - start(); -#ifdef _WIN32 - Sleep(1000); -#endif -#ifdef __unix__ - struct timespec duration, remainder; - duration.tv_sec = 1; - duration.tv_nsec = 0; - nanosleep(&duration, &remainder); -#endif - _ticks_per_sec = stop(); - _ns_per_tick = 1/((double)(_ticks_per_sec)) * 1e9; -} - -void TSCTimer::start() { - __start_tick = start_tsc_timer(); -} - -uint64_t TSCTimer::stop() { - __stop_tick = stop_tsc_timer(); - return (__stop_tick - __start_tick); -} - - -uint64_t xmem::timers::x86_64::start_tsc_timer() { -#ifdef _WIN32 - int32_t dontcare[4]; - __cpuid(dontcare, 0); //Serializing instruction. This forces all previous instructions to finish -#endif -#ifdef __unix__ - int32_t dc0, dc1, dc2, dc3, dc4; - __cpuid(dc0, dc1, dc2, dc3, dc4); //Serializing instruction. This forces all previous instructions to finish -#endif - return __rdtsc(); //Get clock tick -} - -uint64_t xmem::timers::x86_64::stop_tsc_timer() { - uint64_t tick; - uint32_t filler; -#ifdef _WIN32 - int32_t dontcare[4]; - tick = __rdtscp(&filler); //Get clock tick. This is a partially serializing instruction. All previous instructions must finish - __cpuid(dontcare, 0); //Fully serializing instruction. We do this to prevent later instructions from being moved inside the timed section -#endif -#ifdef __unix__ - int32_t dc0, dc1, dc2, dc3, dc4; - tick = __rdtscp(&filler); //Get clock tick. This is a partially serializing instruction. All previous instructions must finish - __cpuid(dc0, dc1, dc2, dc3, dc4); //Serializing instruction. This forces all previous instructions to finish -#endif - return tick; -} +/* The MIT License (MIT) + * + * Copyright (c) 2014 Microsoft + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file + * + * @brief Implementation file for the TSCTimer class as well as some C-style functions for working with the TSC timer hardware directly. + */ + +//Headers +#include + +//Libraries +#ifdef _WIN32 +#include +#include +#endif + +#ifdef __unix__ +#include +#include +#include +#include +#endif + +using namespace xmem::timers::x86_64; + +TSCTimer::TSCTimer() : + Timer(), + __start_tick(0), + __stop_tick(0) +{ + start(); +#ifdef _WIN32 + Sleep(1000); +#endif +#ifdef __unix__ + struct timespec duration, remainder; + duration.tv_sec = 1; + duration.tv_nsec = 0; + nanosleep(&duration, &remainder); +#endif + _ticks_per_sec = stop(); + _ns_per_tick = 1/((double)(_ticks_per_sec)) * 1e9; +} + +void TSCTimer::start() { + __start_tick = start_tsc_timer(); +} + +uint64_t TSCTimer::stop() { + __stop_tick = stop_tsc_timer(); + return (__stop_tick - __start_tick); +} + + +uint64_t xmem::timers::x86_64::start_tsc_timer() { +#ifdef _WIN32 + int32_t dontcare[4]; + __cpuid(dontcare, 0); //Serializing instruction. This forces all previous instructions to finish +#endif +#ifdef __unix__ + int32_t dc0, dc1, dc2, dc3, dc4; + __cpuid(dc0, dc1, dc2, dc3, dc4); //Serializing instruction. This forces all previous instructions to finish +#endif + return __rdtsc(); //Get clock tick +} + +uint64_t xmem::timers::x86_64::stop_tsc_timer() { + uint64_t tick; + uint32_t filler; +#ifdef _WIN32 + int32_t dontcare[4]; + tick = __rdtscp(&filler); //Get clock tick. This is a partially serializing instruction. All previous instructions must finish + __cpuid(dontcare, 0); //Fully serializing instruction. We do this to prevent later instructions from being moved inside the timed section +#endif +#ifdef __unix__ + int32_t dc0, dc1, dc2, dc3, dc4; + tick = __rdtscp(&filler); //Get clock tick. This is a partially serializing instruction. All previous instructions must finish + __cpuid(dc0, dc1, dc2, dc3, dc4); //Serializing instruction. This forces all previous instructions to finish +#endif + return tick; +}