Benchmarks: Revision - Support flexible warmup and non-random data initialization in cublas-benchmark (#479)
**Description** Revise cublas-benchmark to support flexible warmup and to fill input data with a fixed value for performance tests, improving running efficiency. **Major Revision** - Remove the `num_in_step` factor from warmup so users can configure the warmup length more flexibly. - Add support for generating input filled with a fixed value for performance tests.
This commit is contained in:
Parent
0292366075
Commit
eba298f5f0
|
@ -190,23 +190,26 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
self._parser.add_argument(
|
||||
'--num_warmup',
|
||||
type=int,
|
||||
default=8,
|
||||
default=8 * 1000,
|
||||
required=False,
|
||||
help='The number of warmup step.',
|
||||
help='The number of functions for warmup. By default, the total number of functions to run in warmup ' +
|
||||
'is 8 warmup steps * 1000 num_in_step.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--num_steps',
|
||||
type=int,
|
||||
default=100,
|
||||
required=False,
|
||||
help='The number of test step.',
|
||||
help='The number of test steps. By default, the total number of functions to run in the measured test ' +
|
||||
'is 100 test steps * 1000 num_in_step.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--num_in_step',
|
||||
type=int,
|
||||
default=1000,
|
||||
required=False,
|
||||
help='The number of functions in one step.',
|
||||
help='The number of functions in one step. By default, the total number of functions to run ' +
|
||||
'in each step is 1000.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--random_seed',
|
||||
|
@ -236,6 +239,13 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
required=False,
|
||||
help='The acceptable error bound for correctness check.',
|
||||
)
|
||||
self._parser.add_argument(
|
||||
'--random_data',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='Enable random data generation for performance test. ' +
|
||||
'By default, the data is filled with fixed value for performance test.',
|
||||
)
|
||||
|
||||
def _preprocess(self):
|
||||
"""Preprocess/preparation operations before the benchmarking.
|
||||
|
@ -253,6 +263,7 @@ class CublasBenchmark(MicroBenchmarkWithInvoke):
|
|||
command += (' --random_seed ' + str(self._args.random_seed))
|
||||
command += ' --correctness' if self._args.correctness else ''
|
||||
command += (' --eps ' + str(self._args.eps)) if self._args.eps is not None else ''
|
||||
command += ' --random_data' if self._args.random_data else ''
|
||||
|
||||
try:
|
||||
if not self._args.config_json_str:
|
||||
|
|
|
@ -54,6 +54,7 @@ class CublasFunction {
|
|||
int random_seed; ///< the random seed used to generate random data
|
||||
double eps; ///< the acceptable error bound for numeric stability
|
||||
bool correctness; ///< whether enable correctness check or not
|
||||
bool random_data; ///< whether enable random data generation or not
|
||||
std::string name_; ///< the name of the cublas function
|
||||
int m_; ///< the m dim of matrix
|
||||
int k_; ///< the k dim of matrix
|
||||
|
@ -70,17 +71,17 @@ class CublasFunction {
|
|||
/**
|
||||
* @brief Fill the random data into the input
|
||||
*/
|
||||
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host);
|
||||
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host, bool random = true);
|
||||
/**
|
||||
* @brief Prepare memory and data of the input and output
|
||||
*/
|
||||
template <typename T>
|
||||
void prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0, T **Parameter_0_0_host,
|
||||
T **Parameter_1_0_host);
|
||||
T **Parameter_1_0_host, bool random = true);
|
||||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {}
|
||||
virtual void prepare_tensor(bool random = true) {}
|
||||
/**
|
||||
* @brief Execute the kernel/function
|
||||
*/
|
||||
|
@ -140,11 +141,15 @@ class CublasFunction {
|
|||
* @param eps the acceptable error bound for numeric stability
|
||||
*/
|
||||
void set_eps(double eps) { this->eps = eps; }
|
||||
/**
|
||||
* @brief Set the random data
|
||||
* @param random_data if generate random data
|
||||
*/
|
||||
void set_random_data(bool random_data) { this->random_data = random_data; }
|
||||
/**
|
||||
* @brief Set the params string
|
||||
* @param str the str representing the params of the function
|
||||
*/
|
||||
|
||||
void set_function(std::string &str) { this->function_str_ = str; }
|
||||
/**
|
||||
* @brief Set the name member
|
||||
|
@ -228,39 +233,59 @@ class CublasFunction {
|
|||
/**
|
||||
* @brief Fill the random data into the input in float type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
memset(Parameter_0_0_host, 2,
|
||||
(unsigned long)m_ * (unsigned long)k_ * (unsigned long)batch_count_ * sizeof(float));
|
||||
memset(Parameter_1_0_host, 3,
|
||||
(unsigned long)k_ * (unsigned long)n_ * (unsigned long)batch_count_ * sizeof(float));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Fill the random data into the input in half type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, half(2.0));
|
||||
std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, half(3.0));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Fill the random data into the input in cuComplex type
|
||||
*/
|
||||
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host, bool random) {
|
||||
if (random) {
|
||||
srand(random_seed);
|
||||
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
|
||||
Parameter_0_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
|
||||
Parameter_1_0_host[i] =
|
||||
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
|
||||
}
|
||||
} else {
|
||||
// memset the input data to fixed float value
|
||||
std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, make_cuComplex(2.0f, 2.0f));
|
||||
std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, make_cuComplex(3.0f, 3.0f));
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -268,7 +293,7 @@ template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComp
|
|||
*/
|
||||
template <typename T>
|
||||
void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0,
|
||||
T **Parameter_0_0_host, T **Parameter_1_0_host) {
|
||||
T **Parameter_0_0_host, T **Parameter_1_0_host, bool random) {
|
||||
int m = this->m_, n = this->n_, k = this->k_, batch_count = this->batch_count_;
|
||||
// input argument
|
||||
CUDA_SAFE_CALL(cudaMallocHost((void **)Parameter_0_0_host, sizeof(T) * m * k * batch_count_));
|
||||
|
@ -278,7 +303,7 @@ void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_
|
|||
CUDA_SAFE_CALL(cudaMalloc((void **)Parameter_1_0, sizeof(T) * n * k * batch_count_));
|
||||
|
||||
// fill input values
|
||||
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host));
|
||||
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host), random);
|
||||
|
||||
// copy input data from host to device
|
||||
CUDA_SAFE_CALL(
|
||||
|
@ -469,13 +494,12 @@ int CublasFunction::check_result(int batch_count, cuComplex *Result_3_0, std::co
|
|||
*/
|
||||
void CublasFunction::benchmark() {
|
||||
// Malloc memory for input and output data
|
||||
this->prepare_tensor();
|
||||
bool random = this->correctness ? true : this->random_data;
|
||||
this->prepare_tensor(random);
|
||||
|
||||
// Warm up
|
||||
for (int i_ = 0; i_ < warm_up; i_++) {
|
||||
for (int j = 0; j < num_in_step; j++) {
|
||||
this->kernel_entry();
|
||||
}
|
||||
this->kernel_entry();
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
|
||||
|
|
|
@ -39,8 +39,9 @@ class SgemmFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Check the correctness of function calculation result
|
||||
|
@ -107,8 +108,9 @@ class CgemmFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Check the correctness of function calculation result
|
||||
|
@ -169,17 +171,17 @@ class GemmExFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
virtual void prepare_tensor(bool random) {
|
||||
if (this->datatype_.compare("half") == 0) {
|
||||
CublasFunction::prepare_tensor_template<half>(
|
||||
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
|
||||
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host), random);
|
||||
} else if (this->datatype_.compare("float") == 0) {
|
||||
CublasFunction::prepare_tensor_template<float>(
|
||||
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
|
||||
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host), random);
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -265,17 +267,17 @@ class GemmStridedBatchedExFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
virtual void prepare_tensor(bool random) {
|
||||
if (this->datatype_.compare("half") == 0) {
|
||||
prepare_tensor_template<half>(
|
||||
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
|
||||
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<half **>(&Parameter_1_0_host), random);
|
||||
} else if (this->datatype_.compare("float") == 0) {
|
||||
prepare_tensor_template<float>(
|
||||
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
|
||||
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host));
|
||||
reinterpret_cast<float **>(&Parameter_1_0_host), random);
|
||||
}
|
||||
}
|
||||
/**
|
||||
|
@ -355,8 +357,9 @@ class SgemmStridedBatchedFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Function calculation on CPU side
|
||||
|
@ -419,8 +422,9 @@ class Cgemm3mStridedBatchedFunction : public CublasFunction {
|
|||
/**
|
||||
* @brief Prepare memory and data of the input and output for kernel running
|
||||
*/
|
||||
virtual void prepare_tensor() {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
|
||||
virtual void prepare_tensor(bool random) {
|
||||
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
|
||||
random);
|
||||
}
|
||||
/**
|
||||
* @brief Function calculation on CPU side
|
||||
|
|
|
@ -98,6 +98,7 @@ class Options {
|
|||
std::string para_info_json;
|
||||
bool correctness_check;
|
||||
double eps;
|
||||
bool random_data;
|
||||
|
||||
/**
|
||||
* @brief Construct a options object according to cmd or set a default value used to test
|
||||
|
@ -120,6 +121,7 @@ class Options {
|
|||
: para_info_json;
|
||||
correctness_check = get_cmd_line_argument_bool("--correctness");
|
||||
eps = get_cmd_line_argument_double("--eps");
|
||||
random_data = get_cmd_line_argument_bool("--random_data");
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -241,6 +243,7 @@ void run_benchmark(Options &options) {
|
|||
function.set_random_seed(options.random_seed);
|
||||
function.set_correctness(options.correctness_check);
|
||||
function.set_eps(options.eps);
|
||||
function.set_random_data(options.random_data);
|
||||
CublasFunction *p_function = get_cublas_function_pointer(function);
|
||||
p_function->benchmark();
|
||||
delete p_function;
|
||||
|
|
Loading…
Reference in new issue