// antares/public/hip_convfwd_multialgo_cuda.cc
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
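//
// Build/run sketch (assumes a CUDA toolchain with the cuDNN v7 headers; exact flags are
// illustrative, and the binary must be built without -DNDEBUG since the CUDA/cuDNN calls
// live inside assert()):
//   nvcc -x cu -O2 hip_convfwd_multialgo_cuda.cc -lcudnn -o convfwd
//   N=64 C=3 H=229 W=229 F=32 K=5 ST=1 PD=2 ./convfwd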
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cudnn_v7.h>
#include <string>
#include <vector>
#include <unordered_map>
#define ENABLE_DEFAULT_DECLARE
#define SKIP_MAIN_BODY
// #include "convfwd_autotvm.h"
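// Convolution problem shape: an N x C x H x W input convolved with F filters of size
// K x K at stride ST, padding PD, dilation D (fixed to 1); the output extents HO/WO
// are derived via cuDNN in main(). All other values come from environment variables.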
static long __conv_N, __conv_C, __conv_H, __conv_W, __conv_F, __conv_K, __conv_ST, __conv_PD, __conv_D = 1;
static long __conv_HO, __conv_WO;
#ifdef __INT8B_COMPUTE__
#define T int  // one 32-bit word carries four packed int8 lanes (see in_val below)
#else
#define T float
#endif
// static T A[__conv_N][__conv_C][__conv_H][__conv_W];
// static T B[__conv_N][__conv_F][__conv_HO][__conv_WO];
// static T K[__conv_F][__conv_C][__conv_K][__conv_K];
static std::vector<T> __A, __B, __K;
namespace {
cudaEvent_t hStart, hEnd;
cudaEvent_t hLeft, hRight;
float ms; const int loop_runs = 100;
T *d_m[4];
cudnnHandle_t hCudnn;
}
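// Benchmark harness: `init` prepares algorithm-specific state (e.g. a workspace) and
// `func` launches one forward pass; either may return false to signal that the algorithm
// does not support this shape. The result is validated once via a 4x4 dump plus a
// sum-of-squares digest, then `func` is timed over loop_runs iterations with CUDA events.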
template <class F1, class F2>
static void compute(const char *type, F1 init, F2 func) {
printf("======== For %s: ========\n", type);
if (!init()) {
printf("CUDNN Conv Algorithm %s doesn't support this compute shape.\n", type);
return;
}
assert(0 == cudaMemset(d_m[1], 0, __B.size()));
printf("B = %zd\n", __B.size());
if (!func()) {
printf("CUDNN Conv Algorithm %s doesn't support this compute shape.\n", type);
return;
}
assert(0 == cudaMemcpy(__B.data(), d_m[1], __B.size(), cudaMemcpyDeviceToHost));
  double dig = 0;
  // Dump the top-left 4x4 patch of the output (first image, first channel).
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j)
#ifdef __INT8B_COMPUTE__
      printf("%d ", __B[i * __conv_WO + j]);
#else
      printf("%.1f ", __B[i * __conv_WO + j]);
#endif
    puts("...");
  }
  // Sum-of-squares digest over the whole output buffer.
  for (size_t i = 0; i < __B.size(); ++i)
    dig += __B[i] * __B[i];
printf("... digest_old = %g\n", dig);
assert(0 == cudaStreamSynchronize(0));
assert(0 == cudaEventRecord(hStart, 0));
for (int i = 0; i < loop_runs; ++i)
func();
assert(0 == cudaEventRecord(hEnd, 0));
assert(0 == cudaStreamSynchronize(0));
assert(0 == cudaEventElapsedTime(&ms, hStart, hEnd));
printf(">> GFlops = %g\n", 2LU * __conv_N * __conv_HO * __conv_WO * __conv_C * __conv_F * __conv_K * __conv_K * loop_runs / ms * 1e-6);
printf(">> ms/op = %g\n", ms / loop_runs);
}
int main() {
  __conv_N = getenv("N") ? atol(getenv("N")) : 64;
  __conv_C = getenv("C") ? atol(getenv("C")) : 3;
  __conv_H = getenv("H") ? atol(getenv("H")) : 229;
  __conv_W = getenv("W") ? atol(getenv("W")) : 229;
  __conv_F = getenv("F") ? atol(getenv("F")) : 32;
  __conv_K = getenv("K") ? atol(getenv("K")) : 5;
  __conv_ST = getenv("ST") ? atol(getenv("ST")) : 1;
  __conv_PD = getenv("PD") ? atol(getenv("PD")) : 2;
  printf("convfwd for N=%ld, C=%ld, H=%ld, W=%ld, F=%ld, K=%ld, ST=%ld, PD=%ld\n", __conv_N, __conv_C, __conv_H, __conv_W, __conv_F, __conv_K, __conv_ST, __conv_PD);
  // printf("convfwd for NCHW = (%ld, %ld, %ld, %ld) CO = %ld, K = %ld, S = %ld, P = %ld\n", __conv_N, __conv_C, __conv_H, __conv_W, __conv_F, __conv_K, __conv_ST, __conv_PD);
  assert(0 == cudaSetDevice(0));
  assert(0 == cudnnCreate(&hCudnn));
  cudnnConvolutionDescriptor_t convDesc;
  cudnnTensorDescriptor_t xDesc, yDesc;
  cudnnFilterDescriptor_t kDesc;
  assert(0 == cudnnCreateTensorDescriptor(&xDesc));
  assert(0 == cudnnCreateTensorDescriptor(&yDesc));
  assert(0 == cudnnCreateFilterDescriptor(&kDesc));
  assert(0 == cudnnCreateConvolutionDescriptor(&convDesc));
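  // Describe the problem in NCHW/float32: xDesc = input tensor, kDesc = OIHW filter,
  // convDesc = 2-D cross-correlation with symmetric padding/stride/dilation; yDesc is
  // filled in once the output shape has been derived from the other three descriptors.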
  int dims[4] = {(int)__conv_N, (int)__conv_C, (int)__conv_H, (int)__conv_W};
  int pad[2] = {(int)__conv_PD, (int)__conv_PD}, fstride[2] = {(int)__conv_ST, (int)__conv_ST}, dila[2] = {(int)__conv_D, (int)__conv_D};
  int oihw[4] = {(int)__conv_F, (int)__conv_C, (int)__conv_K, (int)__conv_K};
  assert(0 == cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], dims[1], dims[2], dims[3]));
  assert(0 == cudnnSetFilter4dDescriptor(kDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, oihw[0], oihw[1], oihw[2], oihw[3]));
  assert(0 == cudnnSetConvolutionNdDescriptor(convDesc, 2, pad, fstride, dila, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
  printf("input shape = %d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);
  assert(0 == cudnnGetConvolution2dForwardOutputDim(convDesc, xDesc, kDesc, &dims[0], &dims[1], &dims[2], &dims[3]));
  printf("output shape = %d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);
  assert(__conv_N == dims[0]);
  assert(__conv_F == dims[1]);
  __conv_HO = dims[2];
  __conv_WO = dims[3];
  assert(0 == cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], dims[1], dims[2], dims[3]));
  __B.resize(__conv_N * __conv_F * __conv_HO * __conv_WO);
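  // Fill both input and filter with a constant so every algorithm should reproduce
  // the same digest, making the printed values easy to compare across runs.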
#ifdef __INT8B_COMPUTE__
  T in_val = 0x01010101;  // four packed int8 lanes, each set to 1
#else
  T in_val = 1.0f;
#endif
  __A = std::vector<T>(__conv_N * __conv_C * __conv_H * __conv_W, in_val);
  __K = std::vector<T>(__conv_F * __conv_C * __conv_K * __conv_K, in_val);
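  // Device buffers: d_m[0] = input A, d_m[1] = output B, d_m[2] = filter K,
  // d_m[3] = per-algorithm cuDNN workspace (allocated lazily in the loop below).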
  assert(0 == cudaMalloc((void**)&d_m[0], __A.size() * sizeof(T)));
  assert(0 == cudaMalloc((void**)&d_m[1], __B.size() * sizeof(T)));
  assert(0 == cudaMalloc((void**)&d_m[2], __K.size() * sizeof(T)));
  assert(0 == cudaEventCreate(&hStart));
  assert(0 == cudaEventCreate(&hEnd));
  assert(0 == cudaEventCreate(&hLeft));
  assert(0 == cudaEventCreate(&hRight));
  assert(0 == cudaMemcpy(d_m[0], __A.data(), __A.size() * sizeof(T), cudaMemcpyHostToDevice));
  assert(0 == cudaMemcpy(d_m[2], __K.data(), __K.size() * sizeof(T), cudaMemcpyHostToDevice));
#if 0
  compute("tvm_tune", [&]() {
    return true;  // no per-algorithm setup needed for the generated kernel
  }, [&]() {
    LAUNCH_NAME(0, d_m[0], d_m[2], d_m[1]);
    // __impl__conv2d_fwd_128_3_227_227_96_11_11_4_0_1<<<dim3(1,55,128), dim3(5,1,48), 0, 0>>>(d_m[0], d_m[2], d_m[1]);
    return true;
  });
#endif
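  // Sweep a fixed set of cuDNN forward algorithms: each one first queries and
  // (re)allocates its workspace in `init`, then runs cudnnConvolutionForward in `func`.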
  std::vector<cudnnConvolutionFwdAlgo_t> algos = {
    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
  };
  std::unordered_map<cudnnConvolutionFwdAlgo_t, std::string> algo_names = {
    {CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, "CUDNN_CONVOLUTION_FWD_ALGO_DIRECT"},
    {CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD"},
    {CUDNN_CONVOLUTION_FWD_ALGO_GEMM, "CUDNN_CONVOLUTION_FWD_ALGO_GEMM"},
    {CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM"},
  };
  for (auto algo: algos) {
    size_t ws_size;
    compute(("cudnn::" + algo_names[algo]).c_str(), [&]() {
      if (cudnnGetConvolutionForwardWorkspaceSize(hCudnn, xDesc, kDesc, convDesc, yDesc, algo, &ws_size) != 0)
        return false;
      printf("workspace size = %zu\n", ws_size);
      // Release the previous algorithm's workspace before sizing a new one.
      if (d_m[3]) {
        assert(0 == cudaFree(d_m[3]));
        d_m[3] = NULL;
      }
      if (ws_size)
        assert(0 == cudaMalloc((void**)&d_m[3], ws_size));
      return true;
    }, [&]() {
      float alpha = 1.0f, beta = 0.0f;
      if (cudnnConvolutionForward(hCudnn, &alpha, xDesc, d_m[0], kDesc, d_m[2], convDesc, algo, d_m[3], ws_size, &beta, yDesc, d_m[1]) != 0)
        return false;
      return true;
    });
  }
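  // Descriptors, events and device buffers are not explicitly freed; process exit reclaims them.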
  return 0;
}