зеркало из https://github.com/microsoft/X-Mem.git
Finished variable/function renaming for Benchmark, LatencyBenchmark, ThroughputBenchmark, DelayInjectedLoadedLatencyBenchmark classes.
This commit is contained in:
Родитель
de5dff8803
Коммит
babee8e73f
2
Doxyfile
2
Doxyfile
|
@ -38,7 +38,7 @@ PROJECT_NAME = X-Mem
|
|||
# could be handy for archiving the generated documentation or if some version
|
||||
# control system is used.
|
||||
|
||||
PROJECT_NUMBER = 2.4.0
|
||||
PROJECT_NUMBER = 2.4.1
|
||||
|
||||
# Using the PROJECT_BRIEF tag one can provide an optional one line description
|
||||
# for a project that appears at the top of each page and should give viewer a
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
README
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
X-Mem: A Cross-Platform and Extensible Memory Characterization Tool for the Cloud v2.4.0
|
||||
X-Mem: A Cross-Platform and Extensible Memory Characterization Tool for the Cloud v2.4.1
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
X-Mem is a flexible open-source research tool for characterizing memory hierarchy throughput, latency, power, and more. The tool was developed jointly by Microsoft and the UCLA NanoCAD Lab. This project was started by Mark Gottscho (Email: mgottscho@ucla.edu) as a Summer 2014 PhD intern at Microsoft Research. X-Mem is released freely as open-source software under the MIT License. The project is under active development.
|
||||
|
||||
PROJECT REVISION DATE: April 4, 2016
|
||||
PROJECT REVISION DATE: April 5, 2016
|
||||
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
RESEARCH PAPER & ATTRIBUTION
|
||||
|
|
|
@ -75,11 +75,11 @@ Benchmark::Benchmark(
|
|||
metric_on_iter_(),
|
||||
mean_metric_(0),
|
||||
min_metric_(0),
|
||||
25_percentile_metric_(0),
|
||||
percentile_25_metric_(0),
|
||||
median_metric_(0),
|
||||
75_percentile_metric_(0),
|
||||
95_percentile_metric_(0),
|
||||
99_percentile_metric_(0),
|
||||
percentile_75_metric_(0),
|
||||
percentile_95_metric_(0),
|
||||
percentile_99_metric_(0),
|
||||
max_metric_(0),
|
||||
mode_metric_(0),
|
||||
metric_units_(metric_units),
|
||||
|
@ -87,7 +87,7 @@ Benchmark::Benchmark(
|
|||
peak_dram_power_socket_(),
|
||||
name_(name),
|
||||
obj_valid_(false),
|
||||
hasRun_(false),
|
||||
has_run_(false),
|
||||
warning_(false)
|
||||
{
|
||||
|
||||
|
@ -107,7 +107,7 @@ bool Benchmark::run() {
|
|||
|
||||
//Write to all of the memory region of interest to make sure
|
||||
//pages are resident in physical memory and are not shared
|
||||
forwSequentialWrite_Word32(_mem_array,
|
||||
forwSequentialWrite_Word32(mem_array_,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_) + len_));
|
||||
|
||||
bool success = runCore();
|
||||
|
@ -191,7 +191,7 @@ void Benchmark::reportBenchmarkInfo() const {
|
|||
std::cout << "read";
|
||||
break;
|
||||
case WRITE:
|
||||
if (_pattern_mode == RANDOM) //special case
|
||||
if (pattern_mode_ == RANDOM) //special case
|
||||
std::cout << "read+write";
|
||||
else
|
||||
std::cout << "write";
|
||||
|
@ -215,7 +215,7 @@ void Benchmark::reportResults() const {
|
|||
std::cout << "***" << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
if (hasRun_) {
|
||||
if (has_run_) {
|
||||
for (uint32_t i = 0; i < iterations_; i++) {
|
||||
std::printf("Iter #%4d: %0.3f %s", i, metric_on_iter_[i], metric_units_.c_str());
|
||||
//std::cout << "Iter #" << i << ": " << metric_on_iter_[i] << " " << metric_units_;
|
||||
|
@ -237,7 +237,7 @@ void Benchmark::reportResults() const {
|
|||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "25th Percentile: " << 25_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "25th Percentile: " << percentile_25_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
@ -247,17 +247,17 @@ void Benchmark::reportResults() const {
|
|||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "75th Percentile: " << 75_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "75th Percentile: " << percentile_75_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "95th Percentile: " << 95_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "95th Percentile: " << percentile_95_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "99th Percentile: " << 99_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "99th Percentile: " << percentile_99_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
@ -314,7 +314,7 @@ double Benchmark::getMinMetric() const {
|
|||
|
||||
double Benchmark::get25PercentileMetric() const {
|
||||
if (has_run_)
|
||||
return 25_percentile_metric_;
|
||||
return percentile_25_metric_;
|
||||
else //bad call
|
||||
return -1;
|
||||
}
|
||||
|
@ -328,21 +328,21 @@ double Benchmark::getMedianMetric() const {
|
|||
|
||||
double Benchmark::get75PercentileMetric() const {
|
||||
if (has_run_)
|
||||
return 75_percentile_metric_;
|
||||
return percentile_75_metric_;
|
||||
else //bad call
|
||||
return -1;
|
||||
}
|
||||
|
||||
double Benchmark::get95PercentileMetric() const {
|
||||
if (has_run_)
|
||||
return 95_percentile_metric_;
|
||||
return percentile_95_metric_;
|
||||
else //bad call
|
||||
return -1;
|
||||
}
|
||||
|
||||
double Benchmark::get99PercentileMetric() const {
|
||||
if (has_run_)
|
||||
return 99_percentile_metric_;
|
||||
return percentile_99_metric_;
|
||||
else //bad call
|
||||
return -1;
|
||||
}
|
||||
|
@ -433,11 +433,11 @@ void Benchmark::computeMetrics() {
|
|||
|
||||
//Compute percentiles
|
||||
min_metric_ = sortedMetrics.front();
|
||||
25_percentile_metric_ = sortedMetrics[sortedMetrics.size()/4];
|
||||
75_percentile_metric_ = sortedMetrics[sortedMetrics.size()*3/4];
|
||||
percentile_25_metric_ = sortedMetrics[sortedMetrics.size()/4];
|
||||
percentile_75_metric_ = sortedMetrics[sortedMetrics.size()*3/4];
|
||||
median_metric_ = sortedMetrics[sortedMetrics.size()/2];
|
||||
95_percentile_metric_ = sortedMetrics[sortedMetrics.size()*95/100];
|
||||
99_percentile_metric_ = sortedMetrics[sortedMetrics.size()*99/100];
|
||||
percentile_95_metric_ = sortedMetrics[sortedMetrics.size()*95/100];
|
||||
percentile_99_metric_ = sortedMetrics[sortedMetrics.size()*99/100];
|
||||
max_metric_ = sortedMetrics.back();
|
||||
|
||||
//Compute mode
|
||||
|
|
|
@ -192,7 +192,7 @@ bool BenchmarkManager::runThroughputBenchmarks() {
|
|||
|
||||
for (uint32_t i = 0; i < __tp_benchmarks.size(); i++) {
|
||||
__tp_benchmarks[i]->run();
|
||||
__tp_benchmarks[i]->report_results(); //to console
|
||||
__tp_benchmarks[i]->reportResults(); //to console
|
||||
|
||||
//Write to results file if necessary
|
||||
if (__config.useOutputFile()) {
|
||||
|
@ -306,7 +306,7 @@ bool BenchmarkManager::runLatencyBenchmarks() {
|
|||
|
||||
for (uint32_t i = 0; i < __lat_benchmarks.size(); i++) {
|
||||
__lat_benchmarks[i]->run();
|
||||
__lat_benchmarks[i]->report_results(); //to console
|
||||
__lat_benchmarks[i]->reportResults(); //to console
|
||||
|
||||
//Write to results file if necessary
|
||||
if (__config.useOutputFile()) {
|
||||
|
@ -775,7 +775,7 @@ bool BenchmarkManager::runExtDelayInjectedLoadedLatencyBenchmark() {
|
|||
//Run benchmarks
|
||||
for (uint32_t i = 0; i < del_lat_benchmarks.size(); i++) {
|
||||
del_lat_benchmarks[i]->run();
|
||||
del_lat_benchmarks[i]->report_results(); //to console
|
||||
del_lat_benchmarks[i]->reportResults(); //to console
|
||||
|
||||
//Write to results file if necessary
|
||||
if (__config.useOutputFile()) {
|
||||
|
|
|
@ -87,7 +87,7 @@ LatencyBenchmark::LatencyBenchmark(
|
|||
load_metric_on_iter_.push_back(0);
|
||||
}
|
||||
|
||||
void LatencyBenchmark::report_benchmark_info() const {
|
||||
void LatencyBenchmark::reportBenchmarkInfo() const {
|
||||
std::cout << "CPU NUMA Node: " << cpu_node_ << std::endl;
|
||||
std::cout << "Memory NUMA Node: " << mem_node_ << std::endl;
|
||||
std::cout << "Latency measurement chunk size: ";
|
||||
|
@ -168,7 +168,7 @@ void LatencyBenchmark::report_benchmark_info() const {
|
|||
}
|
||||
|
||||
|
||||
void LatencyBenchmark::report_results() const {
|
||||
void LatencyBenchmark::reportResults() const {
|
||||
std::cout << std::endl;
|
||||
std::cout << "*** RESULTS";
|
||||
std::cout << "***" << std::endl;
|
||||
|
@ -196,7 +196,7 @@ void LatencyBenchmark::report_results() const {
|
|||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "25th Percentile: " << 25_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "25th Percentile: " << percentile_25_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
@ -206,17 +206,17 @@ void LatencyBenchmark::report_results() const {
|
|||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "75th Percentile: " << 75_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "75th Percentile: " << percentile_75_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "95th Percentile: " << 95_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "95th Percentile: " << percentile_95_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "99th Percentile: " << 99_percentile_metric_ << " " << metric_units_;
|
||||
std::cout << "99th Percentile: " << percentile_99_metric_ << " " << metric_units_;
|
||||
if (warning_)
|
||||
std::cout << " (WARNING)";
|
||||
std::cout << std::endl;
|
||||
|
@ -260,8 +260,8 @@ double LatencyBenchmark::getMeanLoadMetric() const {
|
|||
return -1;
|
||||
}
|
||||
|
||||
bool LatencyBenchmark::_run_core() {
|
||||
size_t len_per_thread = len_ / _num_worker_threads; //Carve up memory space so each worker has its own area to play in
|
||||
bool LatencyBenchmark::runCore() {
|
||||
size_t len_per_thread = len_ / num_worker_threads_; //Carve up memory space so each worker has its own area to play in
|
||||
|
||||
//Set up latency measurement kernel function pointers
|
||||
RandomFunction lat_kernel_fptr = &chasePointers;
|
||||
|
@ -295,7 +295,7 @@ bool LatencyBenchmark::_run_core() {
|
|||
std::cerr << "ERROR: Failed to find appropriate benchmark kernel." << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else if (_pattern_mode == RANDOM) {
|
||||
} else if (pattern_mode_ == RANDOM) {
|
||||
if (!determineRandomKernel(rw_mode_, chunk_size_, &load_kernel_fptr_ran, &load_kernel_dummy_fptr_ran)) {
|
||||
std::cerr << "ERROR: Failed to find appropriate benchmark kernel." << std::endl;
|
||||
return false;
|
||||
|
@ -395,7 +395,7 @@ bool LatencyBenchmark::_run_core() {
|
|||
tick_t load_total_elapsed_dummy_ticks = 0;
|
||||
uint32_t load_bytes_per_pass = 0;
|
||||
double load_avg_adjusted_ticks = 0;
|
||||
for (uint32_t t = 1; t < _num_worker_threads; t++) {
|
||||
for (uint32_t t = 1; t < num_worker_threads_; t++) {
|
||||
load_total_passes += workers[t]->getPasses();
|
||||
load_total_adjusted_ticks += workers[t]->getAdjustedTicks();
|
||||
load_total_elapsed_dummy_ticks += workers[t]->getElapsedDummyTicks();
|
||||
|
@ -405,7 +405,7 @@ bool LatencyBenchmark::_run_core() {
|
|||
|
||||
//Compute load metrics for this iteration
|
||||
load_avg_adjusted_ticks = static_cast<double>(load_total_adjusted_ticks) / (num_worker_threads_-1);
|
||||
if (_num_worker_threads > 1)
|
||||
if (num_worker_threads_ > 1)
|
||||
load_metric_on_iter_[i] = (((static_cast<double>(load_total_passes) * static_cast<double>(load_bytes_per_pass)) / static_cast<double>(MB))) / ((load_avg_adjusted_ticks * g_ns_per_tick) / 1e9);
|
||||
|
||||
if (iterwarning)
|
||||
|
|
|
@ -96,8 +96,8 @@ bool ThroughputBenchmark::runCore() {
|
|||
|
||||
//Build pointer indices. Note that the pointers for each thread must stay within its respective region, otherwise sharing may occur.
|
||||
for (uint32_t i = 0; i < num_worker_threads_; i++) {
|
||||
if (!buildRandomPointerPermutation(reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + i*len_per_thread), //casts to silence compiler warnings
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + (i+1)*len_per_thread), //casts to silence compiler warnings
|
||||
if (!buildRandomPointerPermutation(reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_) + i*len_per_thread), //casts to silence compiler warnings
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_) + (i+1)*len_per_thread), //casts to silence compiler warnings
|
||||
chunk_size_)) {
|
||||
std::cerr << "ERROR: Failed to build a random pointer permutation for a worker thread!" << std::endl;
|
||||
return false;
|
||||
|
@ -115,7 +115,7 @@ bool ThroughputBenchmark::runCore() {
|
|||
//Start power measurement
|
||||
if (g_verbose)
|
||||
std::cout << "Starting power measurement threads...";
|
||||
if (!_start_power_threads()) {
|
||||
if (!startPowerThreads()) {
|
||||
if (g_verbose)
|
||||
std::cout << "FAIL" << std::endl;
|
||||
std::cerr << "WARNING: Failed to start power measurement threads." << std::endl;
|
||||
|
@ -127,21 +127,21 @@ bool ThroughputBenchmark::runCore() {
|
|||
std::cout << "Running benchmark." << std::endl << std::endl;
|
||||
|
||||
//Do a bunch of iterations of the core benchmark routines
|
||||
for (uint32_t i = 0; i < _iterations; i++) {
|
||||
for (uint32_t i = 0; i < iterations_; i++) {
|
||||
//Create workers and worker threads
|
||||
for (uint32_t t = 0; t < num_worker_threads_; t++) {
|
||||
void* thread_mem_array = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + t * len_per_thread);
|
||||
int32_t cpu_id = cpu_id_in_numa_node(_cpu_node, t);
|
||||
void* threadmem_array_ = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_) + t * len_per_thread);
|
||||
int32_t cpu_id = cpu_id_in_numa_node(cpu_node_, t);
|
||||
if (cpu_id < 0)
|
||||
std::cerr << "WARNING: Failed to find logical CPU " << t << " in NUMA node " << _cpu_node << std::endl;
|
||||
std::cerr << "WARNING: Failed to find logical CPU " << t << " in NUMA node " << cpu_node_ << std::endl;
|
||||
if (pattern_mode_ == SEQUENTIAL)
|
||||
workers.push_back(new LoadWorker(thread_mem_array,
|
||||
workers.push_back(new LoadWorker(threadmem_array_,
|
||||
len_per_thread,
|
||||
kernel_fptr_seq,
|
||||
kernel_dummy_fptr_seq,
|
||||
cpu_id));
|
||||
else if (pattern_mode_ == RANDOM)
|
||||
workers.push_back(new LoadWorker(thread_mem_array,
|
||||
workers.push_back(new LoadWorker(threadmem_array_,
|
||||
len_per_thread,
|
||||
kernel_fptr_ran,
|
||||
kernel_dummy_fptr_ran,
|
||||
|
@ -177,7 +177,7 @@ bool ThroughputBenchmark::runCore() {
|
|||
avg_adjusted_ticks = total_adjusted_ticks / num_worker_threads_;
|
||||
|
||||
if (iter_warning)
|
||||
_warning = true;
|
||||
warning_ = true;
|
||||
|
||||
if (g_verbose ) { //Report duration for this iteration
|
||||
std::cout << "Iter " << i+1 << " had " << total_passes << " passes in total across " << num_worker_threads_ << " threads, with " << bytes_per_pass << " bytes touched per pass:";
|
||||
|
|
|
@ -80,33 +80,33 @@ DelayInjectedLoadedLatencyBenchmark::DelayInjectedLoadedLatencyBenchmark(
|
|||
dram_power_readers,
|
||||
name
|
||||
),
|
||||
__delay(delay)
|
||||
delay_(delay)
|
||||
{
|
||||
}
|
||||
|
||||
void DelayInjectedLoadedLatencyBenchmark::report_benchmark_info() const {
|
||||
LatencyBenchmark::report_benchmark_info();
|
||||
std::cout << "Load worker kernel delay value: " << __delay << std::endl;
|
||||
void DelayInjectedLoadedLatencyBenchmark::reportBenchmarkInfo() const {
|
||||
LatencyBenchmark::reportBenchmarkInfo();
|
||||
std::cout << "Load worker kernel delay value: " << delay_ << std::endl;
|
||||
}
|
||||
|
||||
uint32_t DelayInjectedLoadedLatencyBenchmark::getDelay() const {
|
||||
return __delay;
|
||||
return delay_;
|
||||
}
|
||||
|
||||
bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
||||
size_t len_per_thread = _len / _num_worker_threads; //Carve up memory space so each worker has its own area to play in
|
||||
bool DelayInjectedLoadedLatencyBenchmark::runCore() {
|
||||
size_t len_per_thread = len_ / num_worker_threads_; //Carve up memory space so each worker has its own area to play in
|
||||
|
||||
//Set up latency measurement kernel function pointers
|
||||
RandomFunction lat_kernel_fptr = &chasePointers;
|
||||
RandomFunction lat_kernel_dummy_fptr = &dummy_chasePointers;
|
||||
|
||||
//Initialize memory regions for all threads by writing to them, causing the memory to be physically resident.
|
||||
forwSequentialWrite_Word32(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+_len)); //static casts to silence compiler warnings
|
||||
forwSequentialWrite_Word32(mem_array_,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_)+len_)); //static casts to silence compiler warnings
|
||||
|
||||
//Build pointer indices for random-access latency thread. We assume that latency thread is the first one, so we use beginning of memory region.
|
||||
if (!buildRandomPointerPermutation(_mem_array,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array)+len_per_thread), //static casts to silence compiler warnings
|
||||
if (!buildRandomPointerPermutation(mem_array_,
|
||||
reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_)+len_per_thread), //static casts to silence compiler warnings
|
||||
#ifndef HAS_WORD_64 //special case: 32-bit architectures
|
||||
CHUNK_32b)) {
|
||||
#endif
|
||||
|
@ -120,10 +120,10 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
//Set up load generation kernel function pointers
|
||||
SequentialFunction load_kernel_fptr = NULL;
|
||||
SequentialFunction load_kernel_dummy_fptr = NULL;
|
||||
if (_num_worker_threads > 1) { //If we only have one worker thread, it is used for latency measurement only, and no load threads will be used.
|
||||
switch (_chunk_size) {
|
||||
if (num_worker_threads_ > 1) { //If we only have one worker thread, it is used for latency measurement only, and no load threads will be used.
|
||||
switch (chunk_size_) {
|
||||
case CHUNK_32b:
|
||||
switch (__delay) {
|
||||
switch (delay_) {
|
||||
case 0:
|
||||
load_kernel_fptr = &forwSequentialRead_Word32; //not an extended kernel
|
||||
load_kernel_dummy_fptr = &dummy_forwSequentialLoop_Word32; //not an extended kernel
|
||||
|
@ -179,7 +179,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
break;
|
||||
#ifdef HAS_WORD_64
|
||||
case CHUNK_64b:
|
||||
switch (__delay) {
|
||||
switch (delay_) {
|
||||
case 0:
|
||||
load_kernel_fptr = &forwSequentialRead_Word64; //not an extended kernel
|
||||
load_kernel_dummy_fptr = &dummy_forwSequentialLoop_Word64; //not an extended kernel
|
||||
|
@ -236,7 +236,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
#endif
|
||||
#ifdef HAS_WORD_128
|
||||
case CHUNK_128b:
|
||||
switch (__delay) {
|
||||
switch (delay_) {
|
||||
case 0:
|
||||
load_kernel_fptr = &forwSequentialRead_Word128; //not an extended kernel
|
||||
load_kernel_dummy_fptr = &dummy_forwSequentialLoop_Word128; //not an extended kernel
|
||||
|
@ -293,7 +293,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
#endif
|
||||
#ifdef HAS_WORD_256
|
||||
case CHUNK_256b:
|
||||
switch (__delay) {
|
||||
switch (delay_) {
|
||||
case 0:
|
||||
load_kernel_fptr = &forwSequentialRead_Word256; //not an extended kernel
|
||||
load_kernel_dummy_fptr = &dummy_forwSequentialLoop_Word256; //not an extended kernel
|
||||
|
@ -362,7 +362,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
if (g_verbose)
|
||||
std::cout << "Starting power measurement threads...";
|
||||
|
||||
if (!_start_power_threads()) {
|
||||
if (!startPowerThreads()) {
|
||||
if (g_verbose)
|
||||
std::cout << "FAIL" << std::endl;
|
||||
std::cerr << "WARNING: Failed to start power threads." << std::endl;
|
||||
|
@ -374,22 +374,22 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
std::cout << "Running benchmark." << std::endl << std::endl;
|
||||
|
||||
//Do a bunch of iterations of the core benchmark routine
|
||||
for (uint32_t i = 0; i < _iterations; i++) {
|
||||
for (uint32_t i = 0; i < iterations_; i++) {
|
||||
|
||||
//Create load workers and load worker threads
|
||||
for (uint32_t t = 0; t < _num_worker_threads; t++) {
|
||||
void* thread_mem_array = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(_mem_array) + t*len_per_thread);
|
||||
int32_t cpu_id = cpu_id_in_numa_node(_cpu_node, t);
|
||||
for (uint32_t t = 0; t < num_worker_threads_; t++) {
|
||||
void* threadmem_array_ = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(mem_array_) + t*len_per_thread);
|
||||
int32_t cpu_id = cpu_id_in_numa_node(cpu_node_, t);
|
||||
if (cpu_id < 0)
|
||||
std::cerr << "WARNING: Failed to find logical CPU " << t << " in NUMA node " << _cpu_node << std::endl;
|
||||
std::cerr << "WARNING: Failed to find logical CPU " << t << " in NUMA node " << cpu_node_ << std::endl;
|
||||
if (t == 0) { //special case: thread 0 is always latency thread
|
||||
workers.push_back(new LatencyWorker(thread_mem_array,
|
||||
workers.push_back(new LatencyWorker(threadmem_array_,
|
||||
len_per_thread,
|
||||
lat_kernel_fptr,
|
||||
lat_kernel_dummy_fptr,
|
||||
cpu_id));
|
||||
} else {
|
||||
workers.push_back(new LoadWorker(thread_mem_array,
|
||||
workers.push_back(new LoadWorker(threadmem_array_,
|
||||
len_per_thread,
|
||||
load_kernel_fptr,
|
||||
load_kernel_dummy_fptr,
|
||||
|
@ -399,16 +399,16 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
}
|
||||
|
||||
//Start worker threads! gogogo
|
||||
for (uint32_t t = 0; t < _num_worker_threads; t++)
|
||||
for (uint32_t t = 0; t < num_worker_threads_; t++)
|
||||
worker_threads[t]->create_and_start();
|
||||
|
||||
//Wait for all threads to complete
|
||||
for (uint32_t t = 0; t < _num_worker_threads; t++)
|
||||
for (uint32_t t = 0; t < num_worker_threads_; t++)
|
||||
if (!worker_threads[t]->join())
|
||||
std::cerr << "WARNING: A worker thread failed to complete correctly!" << std::endl;
|
||||
|
||||
//Compute metrics for this iteration
|
||||
bool iter_warning = false;
|
||||
bool iterwarning_ = false;
|
||||
|
||||
//Compute latency metric
|
||||
uint32_t lat_passes = workers[0]->getPasses();
|
||||
|
@ -416,7 +416,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
tick_t lat_elapsed_dummy_ticks = workers[0]->getElapsedDummyTicks();
|
||||
uint32_t lat_bytes_per_pass = workers[0]->getBytesPerPass();
|
||||
uint32_t lat_accesses_per_pass = lat_bytes_per_pass / 8;
|
||||
iter_warning |= workers[0]->hadWarning();
|
||||
iterwarning_ |= workers[0]->hadWarning();
|
||||
|
||||
//Compute throughput generated by load threads
|
||||
uint32_t load_total_passes = 0;
|
||||
|
@ -424,65 +424,65 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
tick_t load_total_elapsed_dummy_ticks = 0;
|
||||
uint32_t load_bytes_per_pass = 0;
|
||||
double load_avg_adjusted_ticks = 0;
|
||||
for (uint32_t t = 1; t < _num_worker_threads; t++) {
|
||||
for (uint32_t t = 1; t < num_worker_threads_; t++) {
|
||||
load_total_passes += workers[t]->getPasses();
|
||||
load_total_adjusted_ticks += workers[t]->getAdjustedTicks();
|
||||
load_total_elapsed_dummy_ticks += workers[t]->getElapsedDummyTicks();
|
||||
load_bytes_per_pass = workers[t]->getBytesPerPass(); //all should be the same.
|
||||
iter_warning |= workers[t]->hadWarning();
|
||||
iterwarning_ |= workers[t]->hadWarning();
|
||||
}
|
||||
|
||||
//Compute load metrics for this iteration
|
||||
load_avg_adjusted_ticks = static_cast<double>(load_total_adjusted_ticks) / (_num_worker_threads-1);
|
||||
if (_num_worker_threads > 1)
|
||||
_loadMetricOnIter[i] = (((static_cast<double>(load_total_passes) * static_cast<double>(load_bytes_per_pass)) / static_cast<double>(MB))) / ((load_avg_adjusted_ticks * g_ns_per_tick) / 1e9);
|
||||
load_avg_adjusted_ticks = static_cast<double>(load_total_adjusted_ticks) / (num_worker_threads_-1);
|
||||
if (num_worker_threads_ > 1)
|
||||
load_metric_on_iter_[i] = (((static_cast<double>(load_total_passes) * static_cast<double>(load_bytes_per_pass)) / static_cast<double>(MB))) / ((load_avg_adjusted_ticks * g_ns_per_tick) / 1e9);
|
||||
|
||||
if (iter_warning)
|
||||
_warning = true;
|
||||
if (iterwarning_)
|
||||
warning_ = true;
|
||||
|
||||
if (g_verbose) { //Report metrics for this iteration
|
||||
//Latency thread
|
||||
std::cout << "Iter " << i+1 << " had " << lat_passes << " latency measurement passes, with " << lat_accesses_per_pass << " accesses per pass:";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...lat clock ticks == " << lat_adjusted_ticks << " (adjusted by -" << lat_elapsed_dummy_ticks << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...lat ns == " << lat_adjusted_ticks * g_ns_per_tick << " (adjusted by -" << lat_elapsed_dummy_ticks * g_ns_per_tick << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...lat sec == " << lat_adjusted_ticks * g_ns_per_tick / 1e9 << " (adjusted by -" << lat_elapsed_dummy_ticks * g_ns_per_tick / 1e9 << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
//Load threads
|
||||
if (_num_worker_threads > 1) {
|
||||
if (num_worker_threads_ > 1) {
|
||||
std::cout << "Iter " << i+1 << " had " << load_total_passes << " total load generation passes, with " << load_bytes_per_pass << " bytes per pass:";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...load total clock ticks across " << _num_worker_threads-1 << " threads == " << load_total_adjusted_ticks << " (adjusted by -" << load_total_elapsed_dummy_ticks << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
std::cout << "...load total clock ticks across " << num_worker_threads_-1 << " threads == " << load_total_adjusted_ticks << " (adjusted by -" << load_total_elapsed_dummy_ticks << ")";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...load total ns across " << _num_worker_threads-1 << " threads == " << load_total_adjusted_ticks * g_ns_per_tick << " (adjusted by -" << load_total_elapsed_dummy_ticks * g_ns_per_tick << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
std::cout << "...load total ns across " << num_worker_threads_-1 << " threads == " << load_total_adjusted_ticks * g_ns_per_tick << " (adjusted by -" << load_total_elapsed_dummy_ticks * g_ns_per_tick << ")";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "...load total sec across " << _num_worker_threads-1 << " threads == " << load_total_adjusted_ticks * g_ns_per_tick / 1e9 << " (adjusted by -" << load_total_elapsed_dummy_ticks * g_ns_per_tick / 1e9 << ")";
|
||||
if (iter_warning) std::cout << " -- WARNING";
|
||||
std::cout << "...load total sec across " << num_worker_threads_-1 << " threads == " << load_total_adjusted_ticks * g_ns_per_tick / 1e9 << " (adjusted by -" << load_total_elapsed_dummy_ticks * g_ns_per_tick / 1e9 << ")";
|
||||
if (iterwarning_) std::cout << " -- WARNING";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
//Compute overall metrics for this iteration
|
||||
_metricOnIter[i] = static_cast<double>(lat_adjusted_ticks * g_ns_per_tick) / static_cast<double>(lat_accesses_per_pass * lat_passes);
|
||||
metric_on_iter_[i] = static_cast<double>(lat_adjusted_ticks * g_ns_per_tick) / static_cast<double>(lat_accesses_per_pass * lat_passes);
|
||||
|
||||
//Clean up workers and threads for this iteration
|
||||
for (uint32_t t = 0; t < _num_worker_threads; t++) {
|
||||
for (uint32_t t = 0; t < num_worker_threads_; t++) {
|
||||
delete worker_threads[t];
|
||||
delete workers[t];
|
||||
}
|
||||
|
@ -496,7 +496,7 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
std::cout << "Stopping power measurement threads...";
|
||||
}
|
||||
|
||||
if (!_stop_power_threads()) {
|
||||
if (!stopPowerThreads()) {
|
||||
if (g_verbose)
|
||||
std::cout << "FAIL" << std::endl;
|
||||
std::cerr << "WARNING: Failed to stop power measurement threads." << std::endl;
|
||||
|
@ -504,13 +504,13 @@ bool DelayInjectedLoadedLatencyBenchmark::_run_core() {
|
|||
std::cout << "done" << std::endl;
|
||||
|
||||
//Run metadata
|
||||
_hasRun = true;
|
||||
has_run_ = true;
|
||||
|
||||
//Get mean load metrics -- these aren't part of Benchmark class thus not covered by _computeMetrics()
|
||||
_computeMetrics();
|
||||
for (uint32_t i = 0; i < _iterations; i++)
|
||||
_meanLoadMetric += _loadMetricOnIter[i];
|
||||
_meanLoadMetric /= static_cast<double>(_iterations);
|
||||
//Get mean load metrics -- these aren't part of Benchmark class thus not covered by computeMetrics()
|
||||
computeMetrics();
|
||||
for (uint32_t i = 0; i < iterations_; i++)
|
||||
mean_load_metric_ += load_metric_on_iter_[i];
|
||||
mean_load_metric_ /= static_cast<double>(iterations_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -319,11 +319,11 @@ namespace xmem {
|
|||
std::vector<double> metric_on_iter_; /**< Metrics for each iteration of the benchmark. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double mean_metric_; /**< Average metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double min_metric_; /**< Minimum metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double 25_percentile_metric_; /**< 25th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double percentile_25_metric_; /**< 25th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double median_metric_; /**< Median metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double 75_percentile_metric_; /**< 75th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double 95_percentile_metric_; /**< 95th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double 99_percentile_metric_; /**< 99th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double percentile_75_metric_; /**< 75th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double percentile_95_metric_; /**< 95th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double percentile_99_metric_; /**< 99th percentile metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double max_metric_; /**< Maximum metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
double mode_metric_; /**< Mode metric over all iterations. Unit-less because any benchmark can set this metric as needed. It is up to the descendant class to interpret units. */
|
||||
std::string metric_units_; /**< String representing the units of measurement for the metric. */
|
||||
|
|
|
@ -31,8 +31,8 @@
|
|||
|
||||
#ifdef EXT_DELAY_INJECTED_LOADED_LATENCY_BENCHMARK
|
||||
|
||||
#ifndef __DELAY_INJECTED_LOADED_LATENCY_BENCHMARK_H
|
||||
#define __DELAY_INJECTED_LOADED_LATENCY_BENCHMARK_H
|
||||
#ifndef DELAY_INJECTED_LOADED_LATENCY_BENCHMARK_H
|
||||
#define DELAY_INJECTED_LOADED_LATENCY_BENCHMARK_H
|
||||
|
||||
//Headers
|
||||
#include <LatencyBenchmark.h>
|
||||
|
@ -74,7 +74,7 @@ namespace xmem {
|
|||
/**
|
||||
* @brief Reports benchmark configuration details to the console.
|
||||
*/
|
||||
virtual void report_benchmark_info() const;
|
||||
virtual void reportBenchmarkInfo() const;
|
||||
|
||||
/**
|
||||
* @brief Gets the delay injection used in load thread kernels. A delay of 5 corresponds to 5 nop instructions.
|
||||
|
@ -83,10 +83,10 @@ namespace xmem {
|
|||
uint32_t getDelay() const;
|
||||
|
||||
protected:
|
||||
virtual bool _run_core();
|
||||
virtual bool runCore();
|
||||
|
||||
private:
|
||||
uint32_t __delay; /**< Number of nops to insert between load thread memory instructions. This is a form of delay injection to reduce memory loading. */
|
||||
uint32_t delay_; /**< Number of nops to insert between load thread memory instructions. This is a form of delay injection to reduce memory loading. */
|
||||
};
|
||||
};
|
||||
|
||||
|
|
|
@ -105,6 +105,7 @@ void xmem::print_welcome_message() {
|
|||
* @brief The main entry point to the program.
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
bool configSuccess = false;
|
||||
try {
|
||||
init_globals();
|
||||
print_welcome_message();
|
||||
|
@ -117,7 +118,7 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
//Configure runtime based on user inputs
|
||||
Configurator config;
|
||||
bool configSuccess = !config.configureFromInput(argc, argv);
|
||||
configSuccess = !config.configureFromInput(argc, argv);
|
||||
|
||||
if (configSuccess) {
|
||||
if (g_verbose) {
|
||||
|
@ -176,5 +177,8 @@ int main(int argc, char* argv[]) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
(configSuccess) ? return EXIT_SUCCESS : return EXIT_FAILURE;
|
||||
if (configSuccess)
|
||||
return EXIT_SUCCESS;
|
||||
else
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче