X-Mem now builds and runs correctly on GNU/Linux x86-64 with AVX, x86-64, and x86 (32-bit). In 32-bit mode, NUMA support is disabled at build time. This is because libnuma does not appear to support 32-bit x86 or ARM, and disabling it avoids that complication for now.

This commit is contained in:
Mark Gottscho 2015-04-22 18:54:25 -07:00
Родитель f6d824d77d
Коммит 8a52ccd072
21 изменённых файлов: 109 добавлений и 53 удалений

Просмотреть файл

@ -38,7 +38,7 @@ PROJECT_NAME = X-Mem
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = 2.1.12
PROJECT_NUMBER = 2.1.13
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a

Просмотреть файл

@ -1,7 +1,7 @@
README
------------------------------------------------------------------------------------------------------------
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
------------------------------------------------------------------------------------------------------------
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.

Просмотреть файл

@ -13,7 +13,9 @@ env.Append(CPPFLAGS = '-Wall -Wno-unused-but-set-variable -Wno-unused-variable -
env.Append(CPPPATH = ['src/include', '/usr/include', '/lib'])
env.Append(CPPPATH = ['src/include/ext/DelayInjectedLoadedLatencyBenchmark']) # Extension: Delay-injected loaded latency benchmark
env.Append(CPPPATH = ['src/include/ext/StreamBenchmark']) # Extension: Stream benchmark
env.Append(LIBS = ['pthread', 'numa', 'hugetlbfs'])
env.Append(LIBPATH = ['/usr/lib32'])
env.Append(LIBS = ['pthread', 'hugetlbfs'])
env.Append(LINKFLAGS = ['-m32'])
# List all C++ source files
sources = [

Двоичные данные
X-Mem_Developer_Manual.pdf

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x64

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x64_avx

Двоичный файл не отображается.

Двоичные данные
bin/xmem-linux-x86 Executable file

Двоичный файл не отображается.

Просмотреть файл

@ -8,8 +8,10 @@ PYTHON=https://www.python.org/ftp/python/2.7.9/Python-2.7.9.tgz
SCONS=http://sourceforge.net/projects/scons/files/scons/2.3.4/scons-2.3.4.tar.gz
GCC=http://www.netgull.com/gcc/releases/gcc-4.8.2/gcc-4.8.2.tar.gz
DOXYGEN=http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.9.1.linux.bin.tar.gz
LIBNUMA=ftp://oss.sgi.com/www/projects/libnuma/download/numactl-2.0.10.tar.gz
wget $LIBHUGETLBFS
wget $PYTHON
wget $SCONS
wget $GCC
wget $DOXYGEN
wget $LIBNUMA

Просмотреть файл

@ -101,7 +101,7 @@ bool Benchmark::run() {
//Write to all of the memory region of interest to make sure
//pages are resident in physical memory and are not shared
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + _len));
bool success = _run_core();

Просмотреть файл

@ -57,7 +57,9 @@
#endif
#ifdef __gnu_linux__
#ifdef HAS_NUMA
#include <numa.h>
#endif
extern "C" {
#include <hugetlbfs.h> //for allocating and freeing huge pages
}
@ -72,6 +74,9 @@ BenchmarkManager::BenchmarkManager(
__num_numa_nodes(g_num_nodes),
__benchmark_num_numa_nodes(g_num_nodes),
__mem_arrays(),
#ifndef HAS_NUMA
__orig_malloc_addr(NULL),
#endif
__mem_array_lens(),
__tp_benchmarks(),
__lat_benchmarks(),
@ -140,7 +145,12 @@ BenchmarkManager::~BenchmarkManager() {
if (__config.useLargePages())
free_huge_pages(__mem_arrays[i]);
else
#ifdef HAS_NUMA
numa_free(__mem_arrays[i], __mem_array_lens[i]);
#endif
#ifndef HAS_NUMA
free(__orig_malloc_addr); //this is somewhat of a band-aid
#endif
#endif
}
//Close results file
@ -399,8 +409,14 @@ void BenchmarkManager::__setupWorkingSets(size_t working_set_size) {
__mem_arrays[numa_node] = VirtualAllocExNuma(GetCurrentProcess(), NULL, allocation_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numa_node); //Windows NUMA allocation. Make the allocation one page bigger than necessary so that we can do alignment.
#endif
#ifdef __gnu_linux__
#ifdef HAS_NUMA
numa_set_strict(1); //Enforce NUMA memory allocation to land on specified node or fail otherwise. Alternative node fallback is forbidden.
__mem_arrays[numa_node] = numa_alloc_onnode(allocation_size, numa_node);
#endif
#ifndef HAS_NUMA //special case
__mem_arrays[numa_node] = malloc(allocation_size);
__orig_malloc_addr = __mem_arrays[numa_node];
#endif
#endif
}

Просмотреть файл

@ -55,8 +55,11 @@ Configurator::Configurator(
__runThroughput(true),
__working_set_size_per_thread(DEFAULT_WORKING_SET_SIZE_PER_THREAD),
__num_worker_threads(DEFAULT_NUM_WORKER_THREADS),
__use_chunk_32b(false),
#ifndef HAS_WORD_64
__use_chunk_32b(true),
#endif
#ifdef HAS_WORD_64
__use_chunk_32b(false),
__use_chunk_64b(true),
#endif
#ifdef HAS_WORD_128
@ -65,7 +68,12 @@ Configurator::Configurator(
#ifdef HAS_WORD_256
__use_chunk_256b(false),
#endif
#ifdef HAS_NUMA
__numa_enabled(true),
#endif
#ifndef HAS_NUMA
__numa_enabled(false),
#endif
__iterations(1),
__use_random_access_pattern(false),
__use_sequential_access_pattern(true),
@ -285,7 +293,7 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
}
//Check NUMA selection
if (options[NUMA_DISABLE])
if (options[NUMA_DISABLE]) //NUMA is not supported currently on anything but x86-64 systems anyway.
__numa_enabled = false;
//Check if large pages should be used for allocation of memory under test.
@ -627,11 +635,13 @@ int32_t Configurator::configureFromInput(int argc, char* argv[]) {
std::cout << std::endl;
std::cout << "---> Number of worker threads: ";
std::cout << __num_worker_threads << std::endl;
#ifdef HAS_NUMA
std::cout << "---> NUMA enabled: ";
if (__numa_enabled)
std::cout << "yes" << std::endl;
else
std::cout << "no" << std::endl;
#endif
std::cout << "---> Large pages: ";
if (__use_large_pages)
std::cout << "yes" << std::endl;

Просмотреть файл

@ -94,7 +94,8 @@ LatencyBenchmark::LatencyBenchmark(
void LatencyBenchmark::report_benchmark_info() const {
std::cout << "CPU NUMA Node: " << _cpu_node << std::endl;
std::cout << "Memory NUMA Node: " << _mem_node << std::endl;
std::cout << "Latency measurement chunk size: 64-bit" << std::endl;
std::cout << "Latency measurement chunk size: ";
std::cout << sizeof(uintptr_t)*8 << "-bit" << std::endl;
std::cout << "Latency measurement access pattern: random read (pointer-chasing)" << std::endl;
if (_num_worker_threads > 1) {
@ -223,13 +224,18 @@ bool LatencyBenchmark::_run_core() {
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
if (!buildRandomPointerPermutation(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
#ifndef HAS_WORD_64 //special case: 32-bit architectures
CHUNK_32b)) {
#endif
#ifdef HAS_WORD_64
CHUNK_64b)) {
#endif
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
return false;
}

Просмотреть файл

@ -130,7 +130,7 @@ void LatencyWorker::run() {
for (uint32_t i = 0; i < 4; i++) {
void* prime_start_address = mem_array;
void* prime_end_address = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array) + len);
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
}
//Run benchmark

Просмотреть файл

@ -171,7 +171,7 @@ void LoadWorker::run() {
//Prime memory
for (uint32_t i = 0; i < 4; i++) {
forwSequentialRead_Word64(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
forwSequentialRead_Word32(prime_start_address, prime_end_address); //dependent reads on the memory, make sure caches are ready, coherence, etc...
}
//Run the benchmark!

Просмотреть файл

@ -1,7 +1,7 @@
README
------------------------------------------------------------------------------------------------------------
X-Mem: Extensible Memory Benchmarking Tool v2.1.12
X-Mem: Extensible Memory Benchmarking Tool v2.1.13
------------------------------------------------------------------------------------------------------------
The flexible open-source research tool for characterizing memory hierarchy throughput, latency, and power.

Просмотреть файл

@ -717,12 +717,7 @@ bool xmem::buildRandomPointerPermutation(void* start_address, void* end_address,
int32_t xmem::dummy_chasePointers(uintptr_t*, uintptr_t**, size_t len) {
volatile uintptr_t placeholder = 0; //Try to defeat compiler optimizations removing this method
#ifdef USE_SIZE_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024)
#endif
#ifdef HAS_WORD_64
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512)
#endif
placeholder = 0;
#endif
return 0;
@ -734,22 +729,11 @@ int32_t xmem::chasePointers(uintptr_t* first_address, uintptr_t** last_touched_a
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case for 32-bit architectures
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);
@ -2728,26 +2712,33 @@ int32_t xmem::revStride16Write_Word256(void* start_address, void* end_address) {
/* ------------ RANDOM READ --------------*/
#ifndef HAS_WORD_64 //special case: 32-bit machine
int32_t xmem::randomRead_Word32(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 1024) {
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);
return 0;
}
#endif
#ifdef HAS_WORD_64
int32_t xmem::randomRead_Word64(uintptr_t* first_address, uintptr_t** last_touched_address, size_t len) {
volatile uintptr_t* p = first_address;
#ifdef USE_TIME_BASED_BENCHMARKS
#ifndef HAS_WORD_64 //special case: 32-bit machine
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#endif
#ifdef USE_SIZE_BASED_BENCHMARKS
for (size_t i = 0; i < len / sizeof(uintptr_t); i += 512) {
#ifndef HAS_WORD_64 //special case: 32-bit machine
UNROLL1024(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
#ifdef HAS_WORD_64
UNROLL512(p = reinterpret_cast<uintptr_t*>(*p);)
#endif
}
#endif
*last_touched_address = const_cast<uintptr_t*>(p);

Просмотреть файл

@ -42,7 +42,9 @@
#ifdef __gnu_linux__
#include <unistd.h>
#include <pthread.h>
#ifdef HAS_NUMA
#include <numa.h>
#endif
#include <fstream> //for std::ifstream
#include <vector> //for std::vector
#include <algorithm> //for std::find
@ -163,6 +165,9 @@ void xmem::print_compile_time_options() {
#ifdef ARCH_64BIT
std::cout << "ARCH_64BIT" << std::endl;
#endif
#ifdef HAS_NUMA
std::cout << "HAS_NUMA" << std::endl;
#endif
#ifdef HAS_WORD_64
std::cout << "HAS_WORD_64" << std::endl;
#endif
@ -312,8 +317,18 @@ bool xmem::unlock_thread_to_cpu() {
return (!pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpus));
#endif
}
int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
#ifndef HAS_NUMA
if (numa_node != 0) {
std::cerr << "WARNING: NUMA is not supported on this X-Mem build." << std::endl;
return -1;
}
return cpu_in_node;
#endif
#ifdef HAS_NUMA
int32_t cpu_id = -1;
uint32_t rank_in_node = 0;
#ifdef _WIN32
@ -361,6 +376,7 @@ int32_t xmem::cpu_id_in_numa_node(uint32_t numa_node, uint32_t cpu_in_node) {
free(bm_ptr);
#endif
return cpu_id;
#endif
}
size_t xmem::compute_number_of_passes(size_t working_set_size_KB) {
@ -413,6 +429,14 @@ int32_t xmem::query_sys_info() {
retval = GetLogicalProcessorInformation(buffer, &len); //try again
#endif
#ifdef __gnu_linux__
std::ifstream in;
in.open("/proc/cpuinfo");
char line[512];
uint32_t id = 0;
#endif
#ifdef HAS_NUMA
//Get NUMA info
#ifdef _WIN32
curr = buffer;
@ -436,11 +460,7 @@ int32_t xmem::query_sys_info() {
g_num_nodes = numa_max_node()+1;
//Get number of physical packages. This is somewhat convoluted, but not sure of a better way on Linux. Technically there could be on-chip NUMA, so...
std::ifstream in;
in.open("/proc/cpuinfo");
char line[512];
std::vector<uint32_t> phys_package_ids;
uint32_t id = 0;
while (!in.eof()) {
in.getline(line, 512, '\n');
@ -461,7 +481,7 @@ int32_t xmem::query_sys_info() {
}
}
g_num_physical_packages = phys_package_ids.size();
in.close();
#endif
#endif
//Get number of CPUs
@ -495,7 +515,6 @@ int32_t xmem::query_sys_info() {
//Get number of physical CPUs. This is somewhat convoluted, but not sure of a better way on Linux. I don't want to assume anything about HyperThreading-like things.
std::vector<uint32_t> core_ids;
in.open("/proc/cpuinfo");
while (!in.eof()) {
in.getline(line, 512, '\n');
@ -516,7 +535,6 @@ int32_t xmem::query_sys_info() {
}
}
g_num_physical_cpus = core_ids.size() * g_num_physical_packages; //FIXME: currently this assumes each processor package has an equal number of cores. This may not be true in general! Need more complicated /proc/cpuinfo parsing.
in.close();
#endif
//Get number of caches
@ -566,6 +584,7 @@ int32_t xmem::query_sys_info() {
#ifdef __gnu_linux__
g_page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
g_large_page_size = gethugepagesize();
in.close();
#endif
#ifdef _WIN32

Просмотреть файл

@ -105,13 +105,18 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
forwSequentialWrite_Word64(_mem_array,
forwSequentialWrite_Word32(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
if (!buildRandomPointerPermutation(_mem_array,
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
#ifndef HAS_WORD_64 //special case: 32-bit architectures
CHUNK_32b)) {
#endif
#ifdef HAS_WORD_64
CHUNK_64b)) {
#endif
std::cerr << "ERROR: Failed to build a random pointer permutation for the latency measurement thread!" << std::endl;
return false;
}

Просмотреть файл

@ -113,6 +113,9 @@ namespace xmem {
uint32_t __num_numa_nodes; /**< Number of NUMA nodes in the system. */
uint32_t __benchmark_num_numa_nodes; /**< Number of NUMA nodes to use in benchmarks. */
std::vector<void*> __mem_arrays; /**< Memory regions to use in benchmarks. One for each benchmarked NUMA node. */
#ifndef HAS_NUMA
void* __orig_malloc_addr; /**< Points to the original address returned by the malloc() for __mem_arrays on non-NUMA machines. Special case. */
#endif
std::vector<size_t> __mem_array_lens; /**< Length of each memory region to use in benchmarks. */
std::vector<ThroughputBenchmark*> __tp_benchmarks; /**< Set of throughput benchmarks. */
std::vector<LatencyBenchmark*> __lat_benchmarks; /**< Set of latency benchmarks. */

Просмотреть файл

@ -1377,7 +1377,7 @@ namespace xmem {
/* ------------ RANDOM WRITE --------------*/
//32-bit machines only
#ifdef HAS_WORD_64
#ifndef HAS_WORD_64
/**
* @brief Walks over the allocated memory in random order by chasing 32-bit pointers. A pointer is read and written back with the same value before chasing to the next pointer. Thus, each memory address is a read followed by immediate write operation.
* @param first_address Starting address to deference.

Просмотреть файл

@ -45,7 +45,7 @@
namespace xmem {
#define VERSION "2.1.12"
#define VERSION "2.1.13"
#if !defined(_WIN32) && !defined(__gnu_linux__)
#error Neither Windows/GNULinux build environments were detected!
@ -63,6 +63,7 @@ namespace xmem {
#define ARCH_INTEL_X86_64
#define ARCH_INTEL
#define ARCH_64BIT
#define HAS_NUMA
#endif
#ifdef _M_IX86_FP //Intel x86-64 SSE2 extensions
@ -106,6 +107,7 @@ namespace xmem {
#define ARCH_INTEL_X86_64
#define ARCH_64BIT
#define ARCH_INTEL
#define HAS_NUMA
#endif
#ifdef __SSE2__ //Intel x86-64 SSE2 extensions
@ -155,10 +157,10 @@ namespace xmem {
#define DEFAULT_LARGE_PAGE_SIZE 2*MB /**< Default platform large page size in bytes. This generally should not be relied on, but is a failsafe. */
#define DEFAULT_WORKING_SET_SIZE_PER_THREAD DEFAULT_PAGE_SIZE /**< Default working set size in bytes. */
#define DEFAULT_NUM_WORKER_THREADS 1 /**< Default number of worker threads to use. */
#define DEFAULT_NUM_NODES 0 /**< Default number of NUMA nodes. */
#define DEFAULT_NUM_PHYSICAL_PACKAGES 0 /**< Default number of physical packages. */
#define DEFAULT_NUM_PHYSICAL_CPUS 0 /**< Default number of physical CPU cores. */
#define DEFAULT_NUM_LOGICAL_CPUS 0 /**< Default number of logical CPU cores. */
#define DEFAULT_NUM_NODES 1 /**< Default number of NUMA nodes. */
#define DEFAULT_NUM_PHYSICAL_PACKAGES 1 /**< Default number of physical packages. */
#define DEFAULT_NUM_PHYSICAL_CPUS 1 /**< Default number of physical CPU cores. */
#define DEFAULT_NUM_LOGICAL_CPUS 1 /**< Default number of logical CPU cores. */
#define DEFAULT_NUM_L1_CACHES 0 /**< Default number of L1 caches. */
#define DEFAULT_NUM_L2_CACHES 0 /**< Default number of L2 caches. */
#define DEFAULT_NUM_L3_CACHES 0 /**< Default number of L3 caches. */
@ -439,7 +441,7 @@ namespace xmem {
* @returns True on success.
*/
bool unlock_thread_to_cpu();
/**
* @brief Gets the CPU ID for a logical CPU of interest in a particular NUMA node.
* For example, if numa_node is 1 and cpu_in_node is 2, and there are 4 logical CPUs per node, then this will give the answer 6 (6th CPU), assuming CPU IDs start at 0.