Benchmarks: Revision - Support flexible warmup and non-random data initialization in cublas-benchmark (#479)

**Description**
Revise cublas-benchmark to support flexible warmup settings and to fill input data with a fixed value for performance tests, which improves running efficiency.

**Major Revision**
- Remove num_in_step from the warmup loop so that users can configure the warmup length more flexibly (see the sketch below)
- Add support for generating input data filled with a fixed value for performance tests
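To make the warmup change concrete, the sketch below contrasts the old and new loop semantics. It is a minimal standalone illustration, not the benchmark's actual code; kernel_entry is a hypothetical stand-in for a single cuBLAS function call.

```cpp
#include <cstdio>

// Hypothetical stand-in for a single cuBLAS function invocation.
static void kernel_entry() { /* one GEMM call */ }

int main() {
    // Before: num_warmup counted warmup steps, each running num_in_step
    // calls, so the warmup length was always a multiple of num_in_step.
    int num_warmup_old = 8, num_in_step = 1000;
    for (int i = 0; i < num_warmup_old; i++) {
        for (int j = 0; j < num_in_step; j++) {
            kernel_entry();
        }
    }

    // After: num_warmup counts individual calls (default 8 * 1000), so any
    // warmup length can be chosen, e.g. --num_warmup 50 for a quick run.
    int num_warmup_new = 50;
    for (int i = 0; i < num_warmup_new; i++) {
        kernel_entry();
    }
    printf("warmup done\n");
}
```

The practical effect is that --num_warmup now counts individual function calls, so short warmups are possible, whereas the old scheme always ran a multiple of num_in_step.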
Yuting Jiang 2023-02-28 06:35:18 +08:00 committed by GitHub
Parent 0292366075
Commit eba298f5f0
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files: 93 additions and 51 deletions


@@ -190,23 +190,26 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
         self._parser.add_argument(
             '--num_warmup',
             type=int,
-            default=8,
+            default=8 * 1000,
             required=False,
-            help='The number of warmup step.',
+            help='The number of functions for warmup. By default, the total number of functions to run in warmup ' +
+            'is 8 warmup steps * 1000 num_in_step.',
         )
         self._parser.add_argument(
             '--num_steps',
             type=int,
             default=100,
             required=False,
-            help='The number of test step.',
+            help='The number of test steps. By default, the total number of functions to run in the measured test ' +
+            'is 100 test steps * 1000 num_in_step.',
         )
         self._parser.add_argument(
             '--num_in_step',
             type=int,
             default=1000,
             required=False,
-            help='The number of functions in one step.',
+            help='The number of functions in one step. By default, the total number of functions to run ' +
+            'in each step is 1000.',
         )
         self._parser.add_argument(
             '--random_seed',
@@ -236,6 +239,13 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
             required=False,
             help='The acceptable error bound for correctness check.',
         )
+        self._parser.add_argument(
+            '--random_data',
+            action='store_true',
+            default=False,
+            help='Enable random data generation for performance test. ' +
+            'By default, the data is filled with fixed value for performance test.',
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -253,6 +263,7 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
         command += (' --random_seed ' + str(self._args.random_seed))
         command += ' --correctness' if self._args.correctness else ''
         command += (' --eps ' + str(self._args.eps)) if self._args.eps is not None else ''
+        command += ' --random_data' if self._args.random_data else ''
 
         try:
             if not self._args.config_json_str:


@@ -54,6 +54,7 @@ class CublasFunction {
     int random_seed;   ///< the random seed used to generate random data
     double eps;        ///< the acceptable error bound for numeric stability
     bool correctness;  ///< whether enable correctness check or not
+    bool random_data;  ///< whether enable random data generation or not
     std::string name_; ///< the name of the cublas function
     int m_;            ///< the m dim of matrix
     int k_;            ///< the k dim of matrix
@@ -70,17 +71,17 @@ class CublasFunction {
     /**
      * @brief Fill the random data into the input
      */
-    template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host);
+    template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host, bool random = true);
     /**
      * @brief Prepare memory and data of the input and output
      */
     template <typename T>
     void prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0, T **Parameter_0_0_host,
-                                 T **Parameter_1_0_host);
+                                 T **Parameter_1_0_host, bool random = true);
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {}
+    virtual void prepare_tensor(bool random = true) {}
     /**
      * @brief Execute the kernel/function
      */
@@ -140,11 +141,15 @@ class CublasFunction {
      * @param eps the acceptable error bound for numeric stability
      */
     void set_eps(double eps) { this->eps = eps; }
+    /**
+     * @brief Set the random data flag
+     * @param random_data whether to generate random data
+     */
+    void set_random_data(bool random_data) { this->random_data = random_data; }
     /**
      * @brief Set the params string
      * @param str the str representing the params of the function
      */
     void set_function(std::string &str) { this->function_str_ = str; }
     /**
      * @brief Set the name member
@@ -228,39 +233,59 @@ class CublasFunction {
 /**
  * @brief Fill the random data into the input in float type
  */
-template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host) {
-    srand(random_seed);
-    for (int i = 0; i < m_ * k_ * batch_count_; i++) {
-        Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
-    }
-    for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
-        Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
+template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host, bool random) {
+    if (random) {
+        srand(random_seed);
+        for (int i = 0; i < m_ * k_ * batch_count_; i++) {
+            Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
+        }
+        for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
+            Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
+        }
+    } else {
+        // memset fills every byte with the given value, producing a fixed
+        // (non-zero) bit pattern in each float rather than the values 2.0f/3.0f
+        memset(Parameter_0_0_host, 2,
+               (unsigned long)m_ * (unsigned long)k_ * (unsigned long)batch_count_ * sizeof(float));
+        memset(Parameter_1_0_host, 3,
+               (unsigned long)k_ * (unsigned long)n_ * (unsigned long)batch_count_ * sizeof(float));
     }
 }
 /**
  * @brief Fill the random data into the input in half type
  */
-template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host) {
-    srand(random_seed);
-    for (int i = 0; i < m_ * k_ * batch_count_; i++) {
-        Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
-    }
-    for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
-        Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
+template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host, bool random) {
+    if (random) {
+        srand(random_seed);
+        for (int i = 0; i < m_ * k_ * batch_count_; i++) {
+            Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
+        }
+        for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
+            Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
+        }
+    } else {
+        // fill the input data with fixed half values
+        std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, half(2.0));
+        std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, half(3.0));
     }
 }
 /**
  * @brief Fill the random data into the input in cuComplex type
  */
-template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host) {
-    srand(random_seed);
-    for (int i = 0; i < m_ * k_ * batch_count_; i++) {
-        Parameter_0_0_host[i] =
-            make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
-    }
-    for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
-        Parameter_1_0_host[i] =
-            make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
+template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host, bool random) {
+    if (random) {
+        srand(random_seed);
+        for (int i = 0; i < m_ * k_ * batch_count_; i++) {
+            Parameter_0_0_host[i] =
+                make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
+        }
+        for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
+            Parameter_1_0_host[i] =
+                make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
+        }
+    } else {
+        // fill the input data with fixed cuComplex values
+        std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, make_cuComplex(2.0f, 2.0f));
+        std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, make_cuComplex(3.0f, 3.0f));
     }
 }
 /**
@@ -268,7 +293,7 @@ template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComp
  */
 template <typename T>
 void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0,
-                                             T **Parameter_0_0_host, T **Parameter_1_0_host) {
+                                             T **Parameter_0_0_host, T **Parameter_1_0_host, bool random) {
     int m = this->m_, n = this->n_, k = this->k_, batch_count = this->batch_count_;
     // input argument
     CUDA_SAFE_CALL(cudaMallocHost((void **)Parameter_0_0_host, sizeof(T) * m * k * batch_count_));
@@ -278,7 +303,7 @@ void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_
     CUDA_SAFE_CALL(cudaMalloc((void **)Parameter_1_0, sizeof(T) * n * k * batch_count_));
 
     // fill input values
-    fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host));
+    fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host), random);
 
     // copy input data from host to device
     CUDA_SAFE_CALL(
@@ -469,13 +494,12 @@ int CublasFunction::check_result(int batch_count, cuComplex *Result_3_0, std::co
  */
 void CublasFunction::benchmark() {
     // Malloc memory for input and output data
-    this->prepare_tensor();
+    bool random = this->correctness ? true : this->random_data;
+    this->prepare_tensor(random);
 
     // Warm up
     for (int i_ = 0; i_ < warm_up; i_++) {
-        for (int j = 0; j < num_in_step; j++) {
-            this->kernel_entry();
-        }
+        this->kernel_entry();
     }
     CUDA_SAFE_CALL(cudaDeviceSynchronize());
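Two details of the fixed-value path above are worth spelling out. First, benchmark() forces random inputs whenever the correctness check is enabled, so the fixed fill applies only to pure performance runs. Second, memset fills bytes rather than float values: memset(p, 2, n) writes 0x02 into every byte, so each float holds the bit pattern 0x02020202 (roughly 9.6e-38), not 2.0f, while the half and cuComplex paths use std::fill to assign actual values. Either way the inputs are deterministic and cheap to produce, which is all the performance path needs. A standalone sketch of the difference:

```cpp
#include <algorithm>
#include <cstdio>
#include <cstring>

int main() {
    float a[4], b[4];

    // Byte fill: every byte becomes 0x02, so each element holds the bit
    // pattern 0x02020202, which is roughly 9.6e-38 as a float, not 2.0f.
    memset(a, 2, sizeof(a));

    // Value fill: each element is assigned the actual float value 2.0f.
    std::fill(b, b + 4, 2.0f);

    printf("memset: %g, std::fill: %g\n", a[0], b[0]);  // ~9.6e-38 vs 2
}
```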


@@ -39,8 +39,9 @@ class SgemmFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
-        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
+    virtual void prepare_tensor(bool random) {
+        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
+                                random);
     }
     /**
      * @brief Check the correctness of function calculation result
@@ -107,8 +108,9 @@ class CgemmFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
-        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
+    virtual void prepare_tensor(bool random) {
+        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
+                                random);
     }
     /**
      * @brief Check the correctness of function calculation result
@@ -169,17 +171,17 @@ class GemmExFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
+    virtual void prepare_tensor(bool random) {
         if (this->datatype_.compare("half") == 0) {
             CublasFunction::prepare_tensor_template<half>(
                 reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
                 reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
-                reinterpret_cast<half **>(&Parameter_1_0_host));
+                reinterpret_cast<half **>(&Parameter_1_0_host), random);
         } else if (this->datatype_.compare("float") == 0) {
             CublasFunction::prepare_tensor_template<float>(
                 reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
                 reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
-                reinterpret_cast<float **>(&Parameter_1_0_host));
+                reinterpret_cast<float **>(&Parameter_1_0_host), random);
         }
     }
     /**
@@ -265,17 +267,17 @@ class GemmStridedBatchedExFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
+    virtual void prepare_tensor(bool random) {
         if (this->datatype_.compare("half") == 0) {
             prepare_tensor_template<half>(
                 reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
                 reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
-                reinterpret_cast<half **>(&Parameter_1_0_host));
+                reinterpret_cast<half **>(&Parameter_1_0_host), random);
         } else if (this->datatype_.compare("float") == 0) {
             prepare_tensor_template<float>(
                 reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
                 reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
-                reinterpret_cast<float **>(&Parameter_1_0_host));
+                reinterpret_cast<float **>(&Parameter_1_0_host), random);
         }
     }
     /**
@@ -355,8 +357,9 @@ class SgemmStridedBatchedFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
-        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
+    virtual void prepare_tensor(bool random) {
+        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
+                                random);
     }
     /**
      * @brief Function calculation on CPU side
@@ -419,8 +422,9 @@ class Cgemm3mStridedBatchedFunction : public CublasFunction {
     /**
      * @brief Prepare memory and data of the input and output for kernel running
      */
-    virtual void prepare_tensor() {
-        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
+    virtual void prepare_tensor(bool random) {
+        prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
+                                random);
     }
     /**
      * @brief Function calculation on CPU side
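A C++ subtlety in the hunks above: the base class declares prepare_tensor(bool random = true) while the overrides take the parameter without a default. Default arguments bind to the static type at the call site, not the dynamic type, so a call without an argument through a CublasFunction pointer would combine the base's default with the derived body; benchmark() sidesteps any ambiguity by always passing random explicitly. A standalone illustration of the rule, not code from this repository:

```cpp
#include <cstdio>

struct Base {
    // The default argument belongs to the static type of the call site.
    virtual void prepare(bool random = true) { printf("Base %d\n", random); }
    virtual ~Base() = default;
};

struct Derived : Base {
    // The override supplies no default of its own.
    void prepare(bool random) override { printf("Derived %d\n", random); }
};

int main() {
    Derived d;
    Base *p = &d;
    p->prepare();      // prints "Derived 1": Base's default, Derived's body
    p->prepare(false); // explicit argument, as benchmark() does
}
```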


@@ -98,6 +98,7 @@ class Options {
     std::string para_info_json;
     bool correctness_check;
     double eps;
+    bool random_data;
 
     /**
      * @brief Construct a options object according to cmd or set a default value used to test
@@ -120,6 +121,7 @@ class Options {
                               : para_info_json;
         correctness_check = get_cmd_line_argument_bool("--correctness");
         eps = get_cmd_line_argument_double("--eps");
+        random_data = get_cmd_line_argument_bool("--random_data");
     }
 };
@@ -241,6 +243,7 @@ void run_benchmark(Options &options) {
     function.set_random_seed(options.random_seed);
     function.set_correctness(options.correctness_check);
     function.set_eps(options.eps);
+    function.set_random_data(options.random_data);
     CublasFunction *p_function = get_cublas_function_pointer(function);
     p_function->benchmark();
     delete p_function;
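For context, get_cmd_line_argument_bool is not shown in this diff. A plausible minimal implementation of such a presence-based flag check (an assumption for illustration, not the benchmark's actual helper) looks like this:

```cpp
#include <cstdio>
#include <cstring>

// Hypothetical presence-based flag check mirroring what
// get_cmd_line_argument_bool("--random_data") presumably does.
static bool has_flag(int argc, char **argv, const char *flag) {
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], flag) == 0) {
            return true;
        }
    }
    return false;
}

int main(int argc, char **argv) {
    bool random_data = has_flag(argc, argv, "--random_data");
    printf("random_data=%d\n", random_data);
}
```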