X-Mem now builds and runs correctly on GNU/Linux x86-64 with AVX, x86-64, and x86 (32-bit). In 32-bit mode, NUMA support is disabled at build time. This is because libnuma does not appear to support 32-bit x86 or ARM, and disabling it avoids that complication for now.

This commit is contained in:
Mark Gottscho 2015-04-22 18:54:25 -07:00
Родитель f6d824d77d
Коммит 8a52ccd072
21 изменённых файлов: 109 добавлений и 53 удалений

Просмотреть файл

@ -38,7 +38,7 @@ PROJECT_NAME = X-Mem
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = 2.1.12
PROJECT_NUMBER = 2.1.13
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a

Просмотреть файл

@ -1,7 +1,7 @@
README
------------------------------------------------------------------------------------------------------------
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
------------------------------------------------------------------------------------------------------------
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.

Просмотреть файл

@ -13,7 +13,9 @@ env.Append(CPPFLAGS = '-Wall -Wno-unused-but-set-variable -Wno-unused-variable -
env.Append(CPPPATH = ['src/include', '/usr/include', '/lib'])
env.Append(CPPPATH = ['src/include/ext/DelayInjectedLoadedLatencyBenchmark']) # Extension: Delay-injected loaded latency benchmark
env.Append(CPPPATH = ['src/include/ext/StreamBenchmark']) # Extension: Stream benchmark
env.Append(LIBS = ['pthread', 'numa', 'hugetlbfs'])
env.Append(LIBPATH = ['/usr/lib32'])
env.Append(LIBS = ['pthread', 'hugetlbfs'])
env.Append(LINKFLAGS = ['-m32'])
# List all C++ source files
sources = [

Двоичные данные
X-Mem_Developer_Manual.pdf

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x64

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x64_avx

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x86 Executable file

Двоичный файл не отображается.

Просмотреть файл

@ -8,8 +8,10 @@ PYTHON=https://www.python.org/ftp/python/2.7.9/Python-2.7.9.tgz
SCONS=http://sourceforge.net/projects/scons/files/scons/2.3.4/scons-2.3.4.tar.gz
GCC=http://www.netgull.com/gcc/releases/gcc-4.8.2/gcc-4.8.2.tar.gz
DOXYGEN=http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.9.1.linux.bin.tar.gz
LIBNUMA=ftp://oss.sgi.com/www/projects/libnuma/download/numactl-2.0.10.tar.gz
wget $LIBHUGETLBFS
wget $PYTHON
wget $SCONS
wget $GCC
wget $DOXYGEN
wget $LIBNUMA

Просмотреть файл

@ -101,7 +101,7 @@ bool Benchmark::run() {
//Write to all of the memory region of interest to make sure
//pages are resident in physical memory and are not shared
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + _len));
bool success = _run_core();

Просмотреть файл

@ -57,7 +57,9 @@
#endif
#ifdef __gnu_linux__
#ifdef HAS_NUMA
#include <numa.h>
#endif
extern "C" {
#include <hugetlbfs.h> //for allocating and freeing huge pages
}
@ -72,6 +74,9 @@ BenchmarkManager::BenchmarkManager(
__num_numa_nodes(g_num_nodes),
__benchmark_num_numa_nodes(g_num_nodes),
__mem_arrays(),
#ifndef HAS_NUMA
__orig_malloc_addr(NULL),
#endif
__mem_array_lens(),
__tp_benchmarks(),
__lat_benchmarks(),
@ -140,7 +145,12 @@ BenchmarkManager::~BenchmarkManager() {
if (__config.useLargePages())
free_huge_pages(__mem_arrays[i]);
else
#ifdef HAS_NUMA
numa_free(__mem_arrays[i], __mem_array_lens[i]);
#endif
#ifndef HAS_NUMA
free(__orig_malloc_addr); //this is somewhat of a band-aid
#endif
#endif
}
//Close results file
@ -399,8 +409,14 @@ void BenchmarkManager::__setupWorkingSets(size_t working_set_size) {
__mem_arrays[numa_node] = VirtualAllocExNuma(GetCurrentProcess(), NULL, allocation_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numa_node); //Windows NUMA allocation. Make the allocation one page bigger than necessary so that we can do alignment.
#endif
#ifdef __gnu_linux__
#ifdef HAS_NUMA
numa_set_strict(1); //Enforce NUMA memory allocation to land on specified node or fail otherwise. Alternative node fallback is forbidden.
__mem_arrays[numa_node] = numa_alloc_onnode(allocation_size, numa_node);
#endif
#ifndef HAS_NUMA //special case
__mem_arrays[numa_node] = malloc(allocation_size);
__orig_malloc_addr = __mem_arrays[numa_node];
#endif
#endif
}

Просмотреть файл

@ -55,8 +55,11 @@ Configurator::Configurator(
__runThroughput(true),
__working_set_size_per_thread(DEFAULT_WORKING_SET_SIZE_PER_THREAD),
__num_worker_threads(DEFAULT_NUM_WORKER_THREADS),
__use_chunk_32b(false),
#ifndef HAS_WORD_64
__use_chunk_32b(true),
#endif
#ifdef HAS_WORD_64
__use_chunk_32b(false),
__use_chunk_64b(true),
#endif
#ifdef HAS_WORD_128
@ -65,7 +68,12 @@ Configurator::Configurator(
#ifdef HAS_WORD_256
__use_chunk_256b(false),
#endif
#ifdef HAS_NUMA
__numa_enabled(true),
#endif
#ifndef HAS_NUMA
__numa_enabled(false),
#endif
__iterations(1),
__use_random_access_pattern(false),
__use_sequential_access_pattern(true),
@ -285,7 +293,7 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
}
//Check NUMA selection
if (options[NUMA_DISABLE])
if (options[NUMA_DISABLE]) //NUMA is not supported currently on anything but x86-64 systems anyway.
__numa_enabled = false;
//Check if large pages should be used for allocation of memory under test.
@ -627,11 +635,13 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
std::cout << std::endl;
std::cout << "---> Number of worker threads: ";
std::cout << __num_worker_threads << std::endl;
#ifdef HAS_NUMA
std::cout << "---> NUMA enabled: ";
if (__numa_enabled)
std::cout << "yes" << std::endl;
else
std::cout << "no" << std::endl;
#endif
std::cout << "---> Large pages: ";
if (__use_large_pages)
std::cout << "yes" << std::endl;

Просмотреть файл

@ -94,7 +94,8 @@ LatencyBenchmark::LatencyBenchmark(
void LatencyBenchmark::report_benchmark_info() const {
std::cout << "CPU NUMA Node: " << _cpu_node << std::endl;
std::cout << "Memory NUMA Node: " << _mem_node << std::endl;
std::cout << "Latency measurement chunk size: 64-bit" << std::endl;
std::cout << "Latency measurement chunk size: ";
std::cout << sizeof(uintptr_t)*8 << "-bit" << std::endl;
std::cout << "Latency measurement access pattern: random read (pointer-chasing)" << std::endl;
if (_num_worker_threads > 1) {
@ -223,13 +224,18 @@ bool LatencyBenchmark::_run_core() {
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
if (!buildRandomPointerPermutation(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
#ifndef HAS_WORD_64 //special case: 32-bit architectures
CHUNK_32b)) {
#endif
#ifdef HAS_WORD_64
CHUNK_64b)) {
#endif
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
return false;
}

Просмотреть файл

@ -130,7 +130,7 @@ void LatencyWorker::run() {
for (uint32_t i = 0; i < 4; i++) {
void* prime_start_address = mem_array;
void* prime_end_address = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array) + len);
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
}
//Run benchmark

Просмотреть файл

@ -171,7 +171,7 @@ void LoadWorker::run() {
//Prime memory
for (uint32_t i = 0; i < 4; i++) {
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
}
//Run the benchmark!

Просмотреть файл

@ -1,7 +1,7 @@
README
------------------------------------------------------------------------------------------------------------
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
------------------------------------------------------------------------------------------------------------
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.

Просмотреть файл

@ -717,12 +717,7 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address,
int32_t xmem::dummy_chasePointers(uintptr_t*, uintptr_t**, size_t len) {
volatile uintptr_t placeholder = 0; //Try to defeat compiler optimizations removing this method
#ifdef USE_SIZE_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024)
#endif
#ifdef HAS_WORD_64
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512)
#endif
placeholder = 0;
#endif
return 0;
@ -734,22 +729,11 @@ int32_t xmem::chasePointers(uintptr_t* first_address, uintptr_t** last_touched_a
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);
@ -2728,26 +2712,33 @@ int32_t xmem::revStride16Write_Word256(void* start_address, void* end_address) {
/* ------------ RANDOM READ --------------*/
#ifndef HAS_WORD_64 //special case: 32-bit machine
int32_t xmem::randomRead_Word32(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);
return 0;
}
#endif
#ifdef HAS_WORD_64
int32_t xmem::randomRead_Word64(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case: 32-bit machine
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
#ifndef HAS_WORD_64 //special case: 32-bit machine
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);

Просмотреть файл

@ -42,7 +42,9 @@
#ifdef __gnu_linux__
#include <unistd.h>
#include <pthread.h>
#ifdef HAS_NUMA
#include <numa.h>
#endif
#include <fstream> //for std::ifstream
#include <vector> //for std::vector
#include <algorithm> //for std::find
@ -163,6 +165,9 @@ void xmem::print_compile_time_options() {
#ifdef ARCH_64BIT
std::cout << "ARCH_64BIT" << std::endl;
#endif
#ifdef HAS_NUMA
std::cout << "HAS_NUMA" << std::endl;
#endif
#ifdef HAS_WORD_64
std::cout << "HAS_WORD_64" << std::endl;
#endif
@ -312,8 +317,18 @@ bool xmem::unlock_thread_to_cpu() {
return (!pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpus));
#endif
}
int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
#ifndef HAS_NUMA
if (numa_node != 0) {
std::cerr << "WARNING: NUMA is not supported on this X-Mem build." << std::endl;
return -1;
}
return cpu_in_node;
#endif
#ifdef HAS_NUMA
int32_t cpu_id = -1;
uint32_t rank_in_node = 0;
#ifdef _WIN32
@ -361,6 +376,7 @@ int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
free(bm_ptr);
#endif
return cpu_id;
#endif
}
size_t xmem::compute_number_of_passes(size_t working_set_size_KB) {
@ -413,6 +429,14 @@ int32_t xmem::query_sys_info() {
retval = GetLogicalProcessorInformation(buffer, &len); //try again
#endif
#ifdef __gnu_linux__
std::ifstream in;
in.open("/proc/cpuinfo");
char line[512];
uint32_t id = 0;
#endif
#ifdef HAS_NUMA
//Get NUMA info
#ifdef _WIN32
curr = buffer;
@ -436,11 +460,7 @@ int32_t xmem::query_sys_info() {
g_num_nodes = numa_max_node()+1;
//Get number of physical packages. This is somewhat convoluted, but not sure of a better way on Linux. Technically there could be on-chip NUMA, so...
std::ifstream in;
in.open("/proc/cpuinfo");
char line[512];
std::vector<uint32_t> phys_package_ids;
uint32_t id = 0;
while (!in.eof()) {
in.getline(line, 512, '\n');
@ -461,7 +481,7 @@ int32_t xmem::query_sys_info() {
}
}
g_num_physical_packages = phys_package_ids.size();
in.close();
#endif
#endif
//Get number of CPUs
@ -495,7 +515,6 @@ int32_t xmem::query_sys_info() {
//Get number of physical CPUs. This is somewhat convoluted, but not sure of a better way on Linux. I don't want to assume anything about HyperThreading-like things.
std::vector<uint32_t> core_ids;
in.open("/proc/cpuinfo");
while (!in.eof()) {
in.getline(line, 512, '\n');
@ -516,7 +535,6 @@ int32_t xmem::query_sys_info() {
}
}
g_num_physical_cpus = core_ids.size() * g_num_physical_packages; //FIXME: currently this assumes each processor package has an equal number of cores. This may not be true in general! Need more complicated /proc/cpuinfo parsing.
in.close();
#endif
//Get number of caches
@ -566,6 +584,7 @@ int32_t xmem::query_sys_info() {
#ifdef __gnu_linux__
g_page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
g_large_page_size = gethugepagesize();
in.close();
#endif
#ifdef _WIN32

Просмотреть файл

@ -105,13 +105,18 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
if (!buildRandomPointerPermutation(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
#ifndef HAS_WORD_64 //special case: 32-bit architectures
CHUNK_32b)) {
#endif
#ifdef HAS_WORD_64
CHUNK_64b)) {
#endif
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
return false;
}

Просмотреть файл

@ -113,6 +113,9 @@ namespace xmem {
uint32_t __num_numa_nodes; /**< Number of NUMA nodes in the system. */
uint32_t __benchmark_num_numa_nodes; /**< Number of NUMA nodes to use in benchmarks. */
std::vector<void*> __mem_arrays; /**< Memory regions to use in benchmarks. One for each benchmarked NUMA node. */
#ifndef HAS_NUMA
void* __orig_malloc_addr; /**< Points to the original address returned by the malloc() for __mem_arrays on non-NUMA machines. Special case. */
#endif
std::vector<size_t> __mem_array_lens; /**< Length of each memory region to use in benchmarks. */
std::vector<ThroughputBenchmark*> __tp_benchmarks; /**< Set of throughput benchmarks. */
std::vector<LatencyBenchmark*> __lat_benchmarks; /**< Set of latency benchmarks. */

Просмотреть файл

@ -1377,7 +1377,7 @@ namespace xmem {
/* ------------ RANDOM WRITE --------------*/
//32-bit machines only
#ifdef HAS_WORD_64
#ifndef HAS_WORD_64
/**
* @brief Walks over the allocated memory in random order by chasing 32-bit pointers. A pointer is read and written back with the same value before chasing to the next pointer. Thus, each memory address is a read followed by immediate write operation.
* @param first_address Starting address to deference.

Просмотреть файл

@ -45,7 +45,7 @@
namespace xmem {
#define VERSION "2.1.12"
#define VERSION "2.1.13"
#if !defined(_WIN32) && !defined(__gnu_linux__)
#error Neither Windows/GNULinux build environments were detected!
@ -63,6 +63,7 @@ namespace xmem {
#define ARCH_INTEL_X86_64
#define ARCH_INTEL
#define ARCH_64BIT
#define HAS_NUMA
#endif
#ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions
@ -106,6 +107,7 @@ namespace xmem {
#define ARCH_INTEL_X86_64
#define ARCH_64BIT
#define ARCH_INTEL
#define HAS_NUMA
#endif
#ifdef __SSE2__ //Intel x86-64 SSE2 extensions
@ -155,10 +157,10 @@ namespace xmem {
#define DEFAULT_LARGE_PAGE_SIZE 2*MB /**< Default platform large page size in bytes. This generally should not be relied on, but is a failsafe. */
#define DEFAULT_WORKING_SET_SIZE_PER_THREAD DEFAULT_PAGE_SIZE /**< Default working set size in bytes. */
#define DEFAULT_NUM_WORKER_THREADS 1 /**< Default number of worker threads to use. */
#define DEFAULT_NUM_NODES 0 /**< Default number of NUMA nodes. */
#define DEFAULT_NUM_PHYSICAL_PACKAGES 0 /**< Default number of physical packages. */
#define DEFAULT_NUM_PHYSICAL_CPUS 0 /**< Default number of physical CPU cores. */
#define DEFAULT_NUM_LOGICAL_CPUS 0 /**< Default number of logical CPU cores. */
#define DEFAULT_NUM_NODES 1 /**< Default number of NUMA nodes. */
#define DEFAULT_NUM_PHYSICAL_PACKAGES 1 /**< Default number of physical packages. */
#define DEFAULT_NUM_PHYSICAL_CPUS 1 /**< Default number of physical CPU cores. */
#define DEFAULT_NUM_LOGICAL_CPUS 1 /**< Default number of logical CPU cores. */
#define DEFAULT_NUM_L1_CACHES 0 /**< Default number of L1 caches. */
#define DEFAULT_NUM_L2_CACHES 0 /**< Default number of L2 caches. */
#define DEFAULT_NUM_L3_CACHES 0 /**< Default number of L3 caches. */
@ -439,7 +441,7 @@ namespace xmem {
* @returns True on success.
*/
bool unlock_thread_to_cpu();
/**
* @brief Gets the CPU ID for a logical CPU of interest in a particular NUMA node.
* For example, if numa_node is 1 and cpu_in_node is 2, and there are 4 logical CPUs per node, then this will give the answer 6 (6th CPU), assuming CPU IDs start at 0.