зеркало из https://github.com/microsoft/X-Mem.git
X-Mem now builds and runs correctly on GNU/Linux for three targets: x86-64 with AVX, x86-64, and x86 (32-bit). In 32-bit mode, NUMA support is disabled at build time. This is because libnuma does not seem to support either 32-bit x86 or ARM, and disabling it saves me a headache for now.
This commit is contained in:
Родитель
f6d824d77d
Коммит
8a52ccd072
2
Doxyfile
2
Doxyfile
|
@ -38,7 +38,7 @@ PROJECT_NAME = X-Mem
|
|||
# could be handy for archiving the generated documentation or if some version
|
||||
# control system is used.
|
||||
|
||||
PROJECT_NUMBER = 2.1.12
|
||||
PROJECT_NUMBER = 2.1.13
|
||||
|
||||
# Using the PROJECT_BRIEF tag one can provide an optional one line description
|
||||
# for a project that appears at the top of each page and should give viewer a
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
README
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
|
||||
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.
|
||||
|
|
|
@ -13,7 +13,9 @@ env.Append(CPPFLAGS = '-Wall -Wno-unused-but-set-variable -Wno-unused-variable -
|
|||
env.Append(CPPPATH = ['src/include', '/usr/include', '/lib'])
|
||||
env.Append(CPPPATH = ['src/include/ext/DelayInjectedLoadedLatencyBenchmark']) # Extension: Delay-injected loaded latency benchmark
|
||||
env.Append(CPPPATH = ['src/include/ext/StreamBenchmark']) # Extension: Stream benchmark
|
||||
env.Append(LIBS = ['pthread', 'numa', 'hugetlbfs'])
|
||||
env.Append(LIBPATH = ['/usr/lib32'])
|
||||
env.Append(LIBS = ['pthread', 'hugetlbfs'])
|
||||
env.Append(LINKFLAGS = ['-m32'])
|
||||
|
||||
# List all C++ source files
|
||||
sources = [
|
||||
|
|
Двоичные данные
X-Mem_Developer_Manual.pdf
Двоичные данные
X-Mem_Developer_Manual.pdf
Двоичный файл не отображается.
Двоичные данные
bin/xmem-linux-x64
Двоичные данные
bin/xmem-linux-x64
Двоичный файл не отображается.
Двоичные данные
bin/xmem-linux-x64_avx
Двоичные данные
bin/xmem-linux-x64_avx
Двоичный файл не отображается.
Двоичный файл не отображается.
|
@ -8,8 +8,10 @@ PYTHON=https://www.python.org/ftp/python/2.7.9/Python-2.7.9.tgz
|
|||
SCONS=http://sourceforge.net/projects/scons/files/scons/2.3.4/scons-2.3.4.tar.gz
|
||||
GCC=http://www.netgull.com/gcc/releases/gcc-4.8.2/gcc-4.8.2.tar.gz
|
||||
DOXYGEN=http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.9.1.linux.bin.tar.gz
|
||||
LIBNUMA=ftp://oss.sgi.com/www/projects/libnuma/download/numactl-2.0.10.tar.gz
|
||||
wget $LIBHUGETLBFS
|
||||
wget $PYTHON
|
||||
wget $SCONS
|
||||
wget $GCC
|
||||
wget $DOXYGEN
|
||||
wget $LIBNUMA
|
||||
|
|
|
@ -101,7 +101,7 @@ bool Benchmark::run() {
|
|||
|
||||
//Write to all of the memory region of interest to make sure
|
||||
//pages are resident in physical memory and are not shared
|
||||
forwSequentialWrite_Word64(_mem_array,
|
||||
forwSequentialWrite_Word32(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + _len));
|
||||
|
||||
bool success = _run_core();
|
||||
|
|
|
@ -57,7 +57,9 @@
|
|||
#endif
|
||||
|
||||
#ifdef __gnu_linux__
|
||||
#ifdef HAS_NUMA
|
||||
#include <numa.h>
|
||||
#endif
|
||||
extern "C" {
|
||||
#include <hugetlbfs.h> //for allocating and freeing huge pages
|
||||
}
|
||||
|
@ -72,6 +74,9 @@ BenchmarkManager::BenchmarkManager(
|
|||
__num_numa_nodes(g_num_nodes),
|
||||
__benchmark_num_numa_nodes(g_num_nodes),
|
||||
__mem_arrays(),
|
||||
#ifndef HAS_NUMA
|
||||
__orig_malloc_addr(NULL),
|
||||
#endif
|
||||
__mem_array_lens(),
|
||||
__tp_benchmarks(),
|
||||
__lat_benchmarks(),
|
||||
|
@ -140,7 +145,12 @@ BenchmarkManager::~BenchmarkManager() {
|
|||
if (__config.useLargePages())
|
||||
free_huge_pages(__mem_arrays[i]);
|
||||
else
|
||||
#ifdef HAS_NUMA
|
||||
numa_free(__mem_arrays[i], __mem_array_lens[i]);
|
||||
#endif
|
||||
#ifndef HAS_NUMA
|
||||
free(__orig_malloc_addr); //this is somewhat of a band-aid
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
//Close results file
|
||||
|
@ -399,8 +409,14 @@ void BenchmarkManager::__setupWorkingSets(size_t working_set_size) {
|
|||
__mem_arrays[numa_node] = VirtualAllocExNuma(GetCurrentProcess(), NULL, allocation_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numa_node); //Windows NUMA allocation. Make the allocation one page bigger than necessary so that we can do alignment.
|
||||
#endif
|
||||
#ifdef __gnu_linux__
|
||||
#ifdef HAS_NUMA
|
||||
numa_set_strict(1); //Enforce NUMA memory allocation to land on specified node or fail otherwise. Alternative node fallback is forbidden.
|
||||
__mem_arrays[numa_node] = numa_alloc_onnode(allocation_size, numa_node);
|
||||
#endif
|
||||
#ifndef HAS_NUMA //special case
|
||||
__mem_arrays[numa_node] = malloc(allocation_size);
|
||||
__orig_malloc_addr = __mem_arrays[numa_node];
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -55,8 +55,11 @@ Configurator::Configurator(
|
|||
__runThroughput(true),
|
||||
__working_set_size_per_thread(DEFAULT_WORKING_SET_SIZE_PER_THREAD),
|
||||
__num_worker_threads(DEFAULT_NUM_WORKER_THREADS),
|
||||
__use_chunk_32b(false),
|
||||
#ifndef HAS_WORD_64
|
||||
__use_chunk_32b(true),
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
__use_chunk_32b(false),
|
||||
__use_chunk_64b(true),
|
||||
#endif
|
||||
#ifdef HAS_WORD_128
|
||||
|
@ -65,7 +68,12 @@ Configurator::Configurator(
|
|||
#ifdef HAS_WORD_256
|
||||
__use_chunk_256b(false),
|
||||
#endif
|
||||
#ifdef HAS_NUMA
|
||||
__numa_enabled(true),
|
||||
#endif
|
||||
#ifndef HAS_NUMA
|
||||
__numa_enabled(false),
|
||||
#endif
|
||||
__iterations(1),
|
||||
__use_random_access_pattern(false),
|
||||
__use_sequential_access_pattern(true),
|
||||
|
@ -285,7 +293,7 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
|
|||
}
|
||||
|
||||
//Check NUMA selection
|
||||
if (options[NUMA_DISABLE])
|
||||
if (options[NUMA_DISABLE]) //NUMA is not supported currently on anything but x86-64 systems anyway.
|
||||
__numa_enabled = false;
|
||||
|
||||
//Check if large pages should be used for allocation of memory under test.
|
||||
|
@ -627,11 +635,13 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
|
|||
std::cout << std::endl;
|
||||
std::cout << "---> Number of worker threads: ";
|
||||
std::cout << __num_worker_threads << std::endl;
|
||||
#ifdef HAS_NUMA
|
||||
std::cout << "---> NUMA enabled: ";
|
||||
if (__numa_enabled)
|
||||
std::cout << "yes" << std::endl;
|
||||
else
|
||||
std::cout << "no" << std::endl;
|
||||
#endif
|
||||
std::cout << "---> Large pages: ";
|
||||
if (__use_large_pages)
|
||||
std::cout << "yes" << std::endl;
|
||||
|
|
|
@ -94,7 +94,8 @@ LatencyBenchmark::LatencyBenchmark(
|
|||
void LatencyBenchmark::report_benchmark_info() const {
|
||||
std::cout << "CPU NUMA Node: " << _cpu_node << std::endl;
|
||||
std::cout << "Memory NUMA Node: " << _mem_node << std::endl;
|
||||
std::cout << "Latency measurement chunk size: 64-bit" << std::endl;
|
||||
std::cout << "Latency measurement chunk size: ";
|
||||
std::cout << sizeof(uintptr_t)*8 << "-bit" << std::endl;
|
||||
std::cout << "Latency measurement access pattern: random read (pointer-chasing)" << std::endl;
|
||||
|
||||
if (_num_worker_threads > 1) {
|
||||
|
@ -223,13 +224,18 @@ bool LatencyBenchmark::_run_core() {
|
|||
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
|
||||
|
||||
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
|
||||
forwSequentialWrite_Word64(_mem_array,
|
||||
forwSequentialWrite_Word32(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
|
||||
|
||||
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
|
||||
if (!buildRandomPointerPermutation(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit architectures
|
||||
CHUNK_32b)) {
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
CHUNK_64b)) {
|
||||
#endif
|
||||
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -130,7 +130,7 @@ void LatencyWorker::run() {
|
|||
for (uint32_t i = 0; i < 4; i++) {
|
||||
void* prime_start_address = mem_array;
|
||||
void* prime_end_address = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array) + len);
|
||||
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
|
||||
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
|
||||
}
|
||||
|
||||
//Run benchmark
|
||||
|
|
|
@ -171,7 +171,7 @@ void LoadWorker::run() {
|
|||
|
||||
//Prime memory
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
|
||||
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
|
||||
}
|
||||
|
||||
//Run the benchmark!
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
README
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
|
||||
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.
|
||||
|
|
|
@ -717,12 +717,7 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address,
|
|||
int32_t xmem::dummy_chasePointers(uintptr_t*, uintptr_t**, size_t len) {
|
||||
volatile uintptr_t placeholder = 0; //Try to defeat compiler optimizations removing this method
|
||||
#ifdef USE_SIZE_BASED_BENCHMARKS
|
||||
#ifndef HAS_WORD_64 //special case for 32-bit architectures
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024)
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512)
|
||||
#endif
|
||||
placeholder = 0;
|
||||
#endif
|
||||
return 0;
|
||||
|
@ -734,22 +729,11 @@ int32_t xmem::chasePointers(uintptr_t* first_address, uintptr_t** last_touched_a
|
|||
volatile uintptr_t* p = first_address;
|
||||
|
||||
#ifdef USE_TIME_BASED_BENCHMARKS
|
||||
#ifndef HAS_WORD_64 //special case for 32-bit architectures
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#endif
|
||||
#ifdef USE_SIZE_BASED_BENCHMARKS
|
||||
#ifndef HAS_WORD_64 //special case for 32-bit architectures
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
|
||||
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
*last_touched_address = const_cast<uintptr_t*>(p);
|
||||
|
@ -2728,26 +2712,33 @@ int32_t xmem::revStride16Write_Word256(void* start_address, void* end_address) {
|
|||
|
||||
/* ------------ RANDOM READ --------------*/
|
||||
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit machine
|
||||
int32_t xmem::randomRead_Word32(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
|
||||
volatile uintptr_t* p = first_address;
|
||||
|
||||
#ifdef USE_TIME_BASED_BENCHMARKS
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#ifdef USE_SIZE_BASED_BENCHMARKS
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
}
|
||||
#endif
|
||||
*last_touched_address = const_cast<uintptr_t*>(p);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_WORD_64
|
||||
int32_t xmem::randomRead_Word64(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
|
||||
volatile uintptr_t* p = first_address;
|
||||
|
||||
#ifdef USE_TIME_BASED_BENCHMARKS
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit machine
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#endif
|
||||
#ifdef USE_SIZE_BASED_BENCHMARKS
|
||||
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit machine
|
||||
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
*last_touched_address = const_cast<uintptr_t*>(p);
|
||||
|
|
|
@ -42,7 +42,9 @@
|
|||
#ifdef __gnu_linux__
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#ifdef HAS_NUMA
|
||||
#include <numa.h>
|
||||
#endif
|
||||
#include <fstream> //for std::ifstream
|
||||
#include <vector> //for std::vector
|
||||
#include <algorithm> //for std::find
|
||||
|
@ -163,6 +165,9 @@ void xmem::print_compile_time_options() {
|
|||
#ifdef ARCH_64BIT
|
||||
std::cout << "ARCH_64BIT" << std::endl;
|
||||
#endif
|
||||
#ifdef HAS_NUMA
|
||||
std::cout << "HAS_NUMA" << std::endl;
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
std::cout << "HAS_WORD_64" << std::endl;
|
||||
#endif
|
||||
|
@ -312,8 +317,18 @@ bool xmem::unlock_thread_to_cpu() {
|
|||
return (!pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpus));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
|
||||
#ifndef HAS_NUMA
|
||||
if (numa_node != 0) {
|
||||
std::cerr << "WARNING: NUMA is not supported on this X-Mem build." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return cpu_in_node;
|
||||
#endif
|
||||
|
||||
#ifdef HAS_NUMA
|
||||
int32_t cpu_id = -1;
|
||||
uint32_t rank_in_node = 0;
|
||||
#ifdef _WIN32
|
||||
|
@ -361,6 +376,7 @@ int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
|
|||
free(bm_ptr);
|
||||
#endif
|
||||
return cpu_id;
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t xmem::compute_number_of_passes(size_t working_set_size_KB) {
|
||||
|
@ -413,6 +429,14 @@ int32_t xmem::query_sys_info() {
|
|||
retval = GetLogicalProcessorInformation(buffer, &len); //try again
|
||||
#endif
|
||||
|
||||
#ifdef __gnu_linux__
|
||||
std::ifstream in;
|
||||
in.open("/proc/cpuinfo");
|
||||
char line[512];
|
||||
uint32_t id = 0;
|
||||
#endif
|
||||
|
||||
#ifdef HAS_NUMA
|
||||
//Get NUMA info
|
||||
#ifdef _WIN32
|
||||
curr = buffer;
|
||||
|
@ -436,11 +460,7 @@ int32_t xmem::query_sys_info() {
|
|||
g_num_nodes = numa_max_node()+1;
|
||||
|
||||
//Get number of physical packages. This is somewhat convoluted, but not sure of a better way on Linux. Technically there could be on-chip NUMA, so...
|
||||
std::ifstream in;
|
||||
in.open("/proc/cpuinfo");
|
||||
char line[512];
|
||||
std::vector<uint32_t> phys_package_ids;
|
||||
uint32_t id = 0;
|
||||
while (!in.eof()) {
|
||||
in.getline(line, 512, '\n');
|
||||
|
||||
|
@ -461,7 +481,7 @@ int32_t xmem::query_sys_info() {
|
|||
}
|
||||
}
|
||||
g_num_physical_packages = phys_package_ids.size();
|
||||
in.close();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//Get number of CPUs
|
||||
|
@ -495,7 +515,6 @@ int32_t xmem::query_sys_info() {
|
|||
|
||||
//Get number of physical CPUs. This is somewhat convoluted, but not sure of a better way on Linux. I don't want to assume anything about HyperThreading-like things.
|
||||
std::vector<uint32_t> core_ids;
|
||||
in.open("/proc/cpuinfo");
|
||||
while (!in.eof()) {
|
||||
in.getline(line, 512, '\n');
|
||||
|
||||
|
@ -516,7 +535,6 @@ int32_t xmem::query_sys_info() {
|
|||
}
|
||||
}
|
||||
g_num_physical_cpus = core_ids.size() * g_num_physical_packages; //FIXME: currently this assumes each processor package has an equal number of cores. This may not be true in general! Need more complicated /proc/cpuinfo parsing.
|
||||
in.close();
|
||||
#endif
|
||||
|
||||
//Get number of caches
|
||||
|
@ -566,6 +584,7 @@ int32_t xmem::query_sys_info() {
|
|||
#ifdef __gnu_linux__
|
||||
g_page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
|
||||
g_large_page_size = gethugepagesize();
|
||||
in.close();
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
|
|
@ -105,13 +105,18 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
|
||||
|
||||
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
|
||||
forwSequentialWrite_Word64(_mem_array,
|
||||
forwSequentialWrite_Word32(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
|
||||
|
||||
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
|
||||
if (!buildRandomPointerPermutation(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit architectures
|
||||
CHUNK_32b)) {
|
||||
#endif
|
||||
#ifdef HAS_WORD_64
|
||||
CHUNK_64b)) {
|
||||
#endif
|
||||
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -113,6 +113,9 @@ namespace xmem {
|
|||
uint32_t __num_numa_nodes; /**< Number of NUMA nodes in the system. */
|
||||
uint32_t __benchmark_num_numa_nodes; /**< Number of NUMA nodes to use in benchmarks. */
|
||||
std::vector<void*> __mem_arrays; /**< Memory regions to use in benchmarks. One for each benchmarked NUMA node. */
|
||||
#ifndef HAS_NUMA
|
||||
void* __orig_malloc_addr; /**< Points to the original address returned by the malloc() for __mem_arrays on non-NUMA machines. Special case. */
|
||||
#endif
|
||||
std::vector<size_t> __mem_array_lens; /**< Length of each memory region to use in benchmarks. */
|
||||
std::vector<ThroughputBenchmark*> __tp_benchmarks; /**< Set of throughput benchmarks. */
|
||||
std::vector<LatencyBenchmark*> __lat_benchmarks; /**< Set of latency benchmarks. */
|
||||
|
|
|
@ -1377,7 +1377,7 @@ namespace xmem {
|
|||
/* ------------ RANDOM WRITE --------------*/
|
||||
|
||||
//32-bit machines only
|
||||
#ifdef HAS_WORD_64
|
||||
#ifndef HAS_WORD_64
|
||||
/**
|
||||
* @brief Walks over the allocated memory in random order by chasing 32-bit pointers. A pointer is read and written back with the same value before chasing to the next pointer. Thus, each memory address is a read followed by immediate write operation.
|
||||
* @param first_address Starting address to dereference.
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
|
||||
namespace xmem {
|
||||
|
||||
#define VERSION "2.1.12"
|
||||
#define VERSION "2.1.13"
|
||||
|
||||
#if !defined(_WIN32) && !defined(__gnu_linux__)
|
||||
#error Neither Windows/GNULinux build environments were detected!
|
||||
|
@ -63,6 +63,7 @@ namespace xmem {
|
|||
#define ARCH_INTEL_X86_64
|
||||
#define ARCH_INTEL
|
||||
#define ARCH_64BIT
|
||||
#define HAS_NUMA
|
||||
#endif
|
||||
|
||||
#ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions
|
||||
|
@ -106,6 +107,7 @@ namespace xmem {
|
|||
#define ARCH_INTEL_X86_64
|
||||
#define ARCH_64BIT
|
||||
#define ARCH_INTEL
|
||||
#define HAS_NUMA
|
||||
#endif
|
||||
|
||||
#ifdef __SSE2__ //Intel x86-64 SSE2 extensions
|
||||
|
@ -155,10 +157,10 @@ namespace xmem {
|
|||
#define DEFAULT_LARGE_PAGE_SIZE 2*MB /**< Default platform large page size in bytes. This generally should not be relied on, but is a failsafe. */
|
||||
#define DEFAULT_WORKING_SET_SIZE_PER_THREAD DEFAULT_PAGE_SIZE /**< Default working set size in bytes. */
|
||||
#define DEFAULT_NUM_WORKER_THREADS 1 /**< Default number of worker threads to use. */
|
||||
#define DEFAULT_NUM_NODES 0 /**< Default number of NUMA nodes. */
|
||||
#define DEFAULT_NUM_PHYSICAL_PACKAGES 0 /**< Default number of physical packages. */
|
||||
#define DEFAULT_NUM_PHYSICAL_CPUS 0 /**< Default number of physical CPU cores. */
|
||||
#define DEFAULT_NUM_LOGICAL_CPUS 0 /**< Default number of logical CPU cores. */
|
||||
#define DEFAULT_NUM_NODES 1 /**< Default number of NUMA nodes. */
|
||||
#define DEFAULT_NUM_PHYSICAL_PACKAGES 1 /**< Default number of physical packages. */
|
||||
#define DEFAULT_NUM_PHYSICAL_CPUS 1 /**< Default number of physical CPU cores. */
|
||||
#define DEFAULT_NUM_LOGICAL_CPUS 1 /**< Default number of logical CPU cores. */
|
||||
#define DEFAULT_NUM_L1_CACHES 0 /**< Default number of L1 caches. */
|
||||
#define DEFAULT_NUM_L2_CACHES 0 /**< Default number of L2 caches. */
|
||||
#define DEFAULT_NUM_L3_CACHES 0 /**< Default number of L3 caches. */
|
||||
|
@ -439,7 +441,7 @@ namespace xmem {
|
|||
* @returns True on success.
|
||||
*/
|
||||
bool unlock_thread_to_cpu();
|
||||
|
||||
|
||||
/**
|
||||
* @brief Gets the CPU ID for a logical CPU of interest in a particular NUMA node.
|
||||
* For example, if numa_node is 1 and cpu_in_node is 2, and there are 4 logical CPUs per node, then this will give the answer 6 (6th CPU), assuming CPU IDs start at 0.
|
||||
|
|
Загрузка…
Ссылка в новой задаче