Benchmarks: Revision - Support flexible warmup and non-random data initialization in cublas-benchmark (#479)
**Description** Revise cublas-benchmark to support flexible warmup and to fill input data with a fixed value for performance tests, improving running efficiency. **Major Revision** - Remove the `num_in_step` factor from warmup so users can configure the warmup length more flexibly. - Add support for generating input filled with a fixed value for performance tests.
This commit is contained in:
Parent
0292366075
Commit
eba298f5f0
|
@ -190,23 +190,26 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
self._parser.add_argument(
|
||||
'--num_warmup',
|
||||
type=int,
|
||||
default=8,
|
||||
default=8 * 1000,
|
||||
required=False,
|
||||
help='The number of warmup step.',
|
||||
help='The number of functions for warmup. By default, the total number of functions to run in warmup ' +
|
||||
'is 8 warmup steps * 1000 num_in_step.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--num_steps',
|
||||
type=int,
|
||||
default=100,
|
||||
required=False,
|
||||
help='The number of test step.',
|
||||
help='The number of test steps. By default, the total number of functions to run in the measured test ' +
|
||||
'is 100 test steps * 1000 num_in_step.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--num_in_step',
|
||||
type=int,
|
||||
default=1000,
|
||||
required=False,
|
||||
help='The number of functions in one step.',
|
||||
help='The number of functions in one step. By default, the total number of functions to run ' +
|
||||
'in each step is 1000.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--random_seed',
|
||||
|
@ -236,6 +239,13 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
required=False,
|
||||
help='The acceptable error bound for correctness check.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--random_data',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='Enable random data generation for performance test. ' +
|
||||
'By default, the data is filled with fixed value for performance test.',
|
||||
)
|
||||
|
||||
def _preprocess(self):
|
||||
"""Preprocess/preparation operations before the benchmarking.
|
||||
|
@ -253,6 +263,7 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
command += (' --random_seed ' + str(self._args.random_seed))
|
||||
command += ' --correctness' if self._args.correctness else ''
|
||||
command += (' --eps ' + str(self._args.eps)) if self._args.eps is not None else ''
|
||||
command += ' --random_data' if self._args.random_data else ''
|
||||
|
||||
try:
|
||||
if not self._args.config_json_str:
|
||||
|
|
|
@ -54,6 +54,7 @@ class CublasFunction {
|
|||
int random_seed; ///< the random seed used to generate random data
|
||||
double eps; ///< the acceptable error bound for numeric stability
|
||||
bool correctness; ///< whether enable correctness check or not
|
||||
bool random_data; ///< whether enable random data generation or not
|
||||
std::string name_; ///< the name of the cublas function
|
||||
int m_; ///< the m dim of matrix
|
||||
int k_; ///< the k dim of matrix
|
||||
|
@ -70,17 +71,17 @@ class CublasFunction {
|
|||
/**
|
||||
* @brief Fill the random data into the input
|
||||
*/
|
||||
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host);
|
||||
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host, bool random = true);
|
||||
/**
|
||||
* @brief Prepare memory and data of the input and output
|
||||
*/
|
||||
template <typename T>
|
||||
void prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0, T **Parameter_0_0_host,
|
||||
T **Parameter_1_0_host);
|
||||
T **Parameter_1_0_host, bool random = true);
|
||||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {}
|
||||
virtual void prepare_tensor(bool random = true) {}
|
||||
/**
|
||||
* @brief Execute the kernel/function
|
||||
*/
|
||||
|
@ -140,11 +141,15 @@ class CublasFunction {
|
|||
* @param eps the acceptable error bound for numeric stability
|
||||
*/
|
||||
void set_eps(double eps) { this->eps = eps; }
|
||||
/**
|
||||
* @brief Set the random data
|
||||
* @param random_data if generate random data
|
||||
*/
|
||||
void set_random_data(bool random_data) { this->random_data = random_data; }
|
||||
/**
|
||||
* @brief Set the params string
|
||||
* @param str the str representing the params of the function
|
||||
*/
|
||||
|
||||
void set_function(std::string &str) { this->function_str_ = str; }
|
||||
/**
|
||||
* @brief Set the name member
|
||||
|
@ -228,39 +233,59 @@ class CublasFunction {
|
|||
/**
|
||||
* @brief Fill the random data into the input in float type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
memset(Parameter_0_0_host, 2,
|
||||
(unsigned long)m_ * (unsigned long)k_ * (unsigned long)batch_count_ * sizeof(float));
|
||||
memset(Parameter_1_0_host, 3,
|
||||
(unsigned long)k_ * (unsigned long)n_ * (unsigned long)batch_count_ * sizeof(float));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Fill the random data into the input in half type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, half(2.0));
|
||||
std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, half(3.0));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Fill the random data into the input in cuComplex type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, make_cuComplex(2.0f, 2.0f));
|
||||
std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, make_cuComplex(3.0f, 3.0f));
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -268,7 +293,7 @@ template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComp
|
|||
*/
|
||||
template <typename T>
|
||||
void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0,
|
||||
T **Parameter_0_0_host, T **Parameter_1_0_host) {
|
||||
T **Parameter_0_0_host, T **Parameter_1_0_host, bool random) {
|
||||
int m = this->m_, n = this->n_, k = this->k_, batch_count = this->batch_count_;
|
||||
// input argument
|
||||
CUDA_SAFE_CALL(cudaMallocHost((void **)Parameter_0_0_host, sizeof(T) * m * k * batch_count_));
|
||||
|
@ -278,7 +303,7 @@ void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_
|
|||
CUDA_SAFE_CALL(cudaMalloc((void **)Parameter_1_0, sizeof(T) * n * k * batch_count_));
|
||||
|
||||
// fill input values
|
||||
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host));
|
||||
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host), random);
|
||||
|
||||
// copy input data from host to device
|
||||
CUDA_SAFE_CALL(
|
||||
|
@ -469,13 +494,12 @@ int CublasFunction::check_result(int batch_count, cuComplex *Result_3_0, std::co
|
|||
*/
|
||||
void CublasFunction::benchmark() {
|
||||
// Malloc memory for input and output data
|
||||
this->prepare_tensor();
|
||||
bool random = this->correctness ? true : this->random_data;
|
||||
this->prepare_tensor(random);
|
||||
|
||||
// Warm up
|
||||
for (int i_ = 0; i_ < warm_up; i_++) {
|
||||
for (int j = 0; j < num_in_step; j++) {
|
||||
this->kernel_entry();
|
||||
}
|
||||
this->kernel_entry();
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
|
||||
|
|
|
@ -39,8 +39,9 @@ class SgemmFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Check the correctness of function calculation result
|
||||
|
@ -107,8 +108,9 @@ class CgemmFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Check the correctness of function calculation result
|
||||
|
@ -169,17 +171,17 @@ class GemmExFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
virtual void prepare_tensor(bool random) {
|
||||
if (this->datatype_.compare("half") == 0) {
|
||||
CublasFunction::prepare_tensor_template<half>(
|
||||
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
|
||||
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host), random);
|
||||
} else if (this->datatype_.compare("float") == 0) {
|
||||
CublasFunction::prepare_tensor_template<float>(
|
||||
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
|
||||
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host), random);
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -265,17 +267,17 @@ class GemmStridedBatchedExFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
virtual void prepare_tensor(bool random) {
|
||||
if (this->datatype_.compare("half") == 0) {
|
||||
prepare_tensor_template<half>(
|
||||
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
|
||||
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host), random);
|
||||
} else if (this->datatype_.compare("float") == 0) {
|
||||
prepare_tensor_template<float>(
|
||||
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
|
||||
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host), random);
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -355,8 +357,9 @@ class SgemmStridedBatchedFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Function calculation on CPU side
|
||||
|
@ -419,8 +422,9 @@ class Cgemm3mStridedBatchedFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Function calculation on CPU side
|
||||
|
|
|
@ -98,6 +98,7 @@ class Options {
|
|||
std::string para_info_json;
|
||||
bool correctness_check;
|
||||
double eps;
|
||||
bool random_data;
|
||||
|
||||
/**
|
||||
* @brief Construct a options object according to cmd or set a default value used to test
|
||||
|
@ -120,6 +121,7 @@ class Options {
|
|||
: para_info_json;
|
||||
correctness_check = get_cmd_line_argument_bool("--correctness");
|
||||
eps = get_cmd_line_argument_double("--eps");
|
||||
random_data = get_cmd_line_argument_bool("--random_data");
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -241,6 +243,7 @@ void run_benchmark(Options &options) {
|
|||
function.set_random_seed(options.random_seed);
|
||||
function.set_correctness(options.correctness_check);
|
||||
function.set_eps(options.eps);
|
||||
function.set_random_data(options.random_data);
|
||||
CublasFunction *p_function = get_cublas_function_pointer(function);
|
||||
p_function->benchmark();
|
||||
delete p_function;
|
||||
|
|
Loading…
Reference in new issue