sandbox/dan2: Modify CUDA matrix allocation so that it caches freed memory and returns cached answers if they are of the size required... this is to work around the extreme slowness of cudaMalloc and cudaMallocPitch on some platforms.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@3104 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2013-10-20 20:05:51 +00:00
Parent 55b1759573
Commit 38ea57048a
7 changed files: 321 additions and 67 deletions
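The idea of the change, sketched below as standalone C++ for orientation (CachingPool is illustrative and NOT part of this commit; plain malloc/free stand in for cudaMalloc/cudaFree): freed blocks are kept in per-size free lists, and a later request for the same size is served from the cache instead of going back to the slow CUDA allocator.

// caching-pool-sketch.cc -- illustrative only, not code from this commit.
#include <cstdlib>
#include <map>
#include <vector>

class CachingPool {
 public:
  void *Malloc(size_t size) {
    std::vector<void*> &cache = freed_[size];
    if (!cache.empty()) {           // Cache hit: hand back a freed block of
      void *ptr = cache.back();     // exactly this size; no real allocation.
      cache.pop_back();
      size_of_[ptr] = size;
      return ptr;
    }
    void *ptr = std::malloc(size);  // Cache miss: fall through to the slow,
    size_of_[ptr] = size;           // real allocator.
    return ptr;
  }
  void Free(void *ptr) {            // Not really freed; kept for reuse.
    std::map<void*, size_t>::iterator it = size_of_.find(ptr);
    // Assumes ptr came from Malloc() above.
    freed_[it->second].push_back(ptr);
    size_of_.erase(it);
  }
  ~CachingPool() {                  // Really release everything at exit.
    for (std::map<size_t, std::vector<void*> >::iterator it = freed_.begin();
         it != freed_.end(); ++it)
      for (size_t i = 0; i < it->second.size(); i++)
        std::free(it->second[i]);
  }
 private:
  std::map<size_t, std::vector<void*> > freed_;  // size -> cached blocks.
  std::map<void*, size_t> size_of_;              // live ptr -> its size.
};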

View file

@@ -105,8 +105,8 @@ template<class Real>
 void CuBlockMatrix<Real>::FreeCudaData() {
 #if HAVE_CUDA == 1
   if (cu_data_ != NULL) {
     if (CuDevice::Instantiate().Enabled()) {
-      CU_SAFE_CALL(cudaFree(cu_data_));
+      CuDevice::Instantiate().Free(cu_data_);
       cu_data_ = NULL;
     } else {
       KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
@@ -137,7 +137,8 @@ void CuBlockMatrix<Real>::SetCudaData() {
     col_offset += this_mat.NumCols();
   }
   size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
-  CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&cu_data_), size));
+  cu_data_ = static_cast<CuBlockMatrixData*>(
+      CuDevice::Instantiate().Malloc(size));
   CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice));
   CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
 }

View file

@@ -38,20 +38,6 @@
 namespace kaldi {
 
-CuDevice::CuDevice()
-    : active_gpu_id_(-3), verbose_(true)
-    { }
-
-CuDevice::~CuDevice() {
-  if (Enabled()) {
-    CU_SAFE_CALL(cublasShutdown());
-  } else if (active_gpu_id_ == -2) {
-    KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
-  }
-}
-
 /**
@@ -433,8 +419,286 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
 }
 
+
+struct CuAllocatorOptions {
+  int32 count; // Number of times we free and delete a particular size before
+               // we start to cache it.
+  int32 cleanup_interval_bytes;
+  double count_increment; // Each time we allocate a new size, we increment
+                          // count by this much; it's a heuristic to say that
+                          // if we are allocating many different sizes, we
+                          // raise the count-threshold before caching any
+                          // particular size.
+  CuAllocatorOptions(): count(10), cleanup_interval_bytes(1000000),
+                        count_increment(0.5) { }
+};
+
+/// We define class CuAllocator inside the .cc file, because we don't want to
+/// expose it in the header.  Its purpose is to hang on to memory that we have
+/// freed, so that we don't waste time in cudaMalloc and cudaMallocPitch().
+/// For some reason, they are sometimes very slow.
+class CuAllocator {
+ public:
+  CuAllocator(const CuAllocatorOptions &opts, CuDevice *device):
+      device_(device), opts_(opts), count_(opts.count),
+      cleanup_countdown_bytes_(opts.cleanup_interval_bytes) { }
+
+  inline void *Malloc(size_t size);
+
+  inline void *MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
+
+  inline void Free(void *ptr);
+
+  ~CuAllocator();
+
+ private:
+  inline void *MallocInternal(size_t row_bytes, size_t num_rows, size_t *pitch);
+
+  // struct MemInfoForSize stores information associated with a particular size
+  // of allocated memory.  The row_bytes and num_rows refer to the arguments of
+  // a cudaMallocPitch call; for regular, non-pitch allocations with cudaMalloc,
+  // we make "num_rows" zero.
+  struct MemInfoForSize {
+    size_t row_bytes; // or the size, if a regular CudaMalloc, not
+                      // CudaMallocPitch.
+    size_t num_rows; // or zero if it's a regular CudaMalloc call, not
+                     // CudaMallocPitch.
+    size_t pitch; // If CudaMallocPitch, the pitch returned by CudaMallocPitch;
+                  // this code assumes (and checks) that it's a deterministic
+                  // function of row_bytes and num_rows.
+    size_t countdown; // number of frees of this size still to be passed on to
+                      // cudaFree() before we start caching them.
+    size_t currently_used; // number that are "in the wild"... kept for
+                           // diagnostics and error detection.
+    std::vector<void*> freed; // freed and cached...
+    MemInfoForSize(size_t row_bytes,
+                   size_t num_rows,
+                   int32 count):
+        row_bytes(row_bytes),
+        num_rows(num_rows),
+        pitch(0),
+        countdown(count),
+        currently_used(0) { }
+  };
+
+  inline MemInfoForSize *FindMemInfo(size_t row_bytes,
+                                     size_t num_rows) {
+    // set num_rows to 0 for regular, linear allocation.
+    KALDI_ASSERT(row_bytes != 0);
+    std::pair<size_t, size_t> this_pair(row_bytes, num_rows);
+    SizeHashIterator iter = size_to_list_.find(this_pair);
+    if (iter == size_to_list_.end()) {
+      int32 count = count_;
+      count_ += opts_.count_increment; // This is a kind of heuristic: if
+                                       // we're allocating a lot of different
+                                       // sizes, we increase the number of
+                                       // times we free a particular size
+                                       // before we start caching it.
+      return (size_to_list_[this_pair] = new MemInfoForSize(row_bytes, num_rows,
+                                                            count));
+    } else {
+      return iter->second;
+    }
+  }
+
+  void PossiblyCleanup(size_t num_bytes);
+
+  // A periodic housekeeping task...
+  void Cleanup();
+
+  void ReleaseAllCachedMemory();
+
+  CuDevice *device_; // device this is attached to...
+  CuAllocatorOptions opts_;
+
+  unordered_map<void*, MemInfoForSize*> addr_to_list_;
+  typedef unordered_map<std::pair<size_t, size_t>, MemInfoForSize*,
+                        PairHasher<size_t> > SizeHash;
+  typedef SizeHash::iterator SizeHashIterator;
+  SizeHash size_to_list_;
+
+  double count_; // We initialize countdown for each size to this value each
+                 // time we encounter a new size.  We increment this by
+                 // opts_.count_increment each time; this is a heuristic that
+                 // if the program is allocating many different sizes, we put
+                 // a higher threshold for any given size.
+  int32 cleanup_countdown_bytes_; // countdown in bytes, until the next time
+                                  // we check whether we should do cleanup
+};
+
+void* CuAllocator::Malloc(size_t size) {
+  KALDI_ASSERT(size > 0);
+  return MallocInternal(size, 0, NULL);
+}
+
+void* CuAllocator::MallocPitch(size_t row_bytes, size_t num_rows,
+                               size_t *pitch) {
+  KALDI_ASSERT(num_rows > 0 && row_bytes > 0 && pitch != NULL);
+  return MallocInternal(row_bytes, num_rows, pitch);
+}
+
+void* CuAllocator::MallocInternal(size_t row_bytes,
+                                  size_t num_rows,
+                                  size_t *pitch_out) {
+  // We share the code for standard cudaMalloc and cudaMallocPitch
+  // because most of it is the same.  For cudaMalloc, we'll have
+  // num_rows == 0, and row_bytes is just the size to be allocated.
+  KALDI_ASSERT(row_bytes != 0 && (num_rows != 0) == (pitch_out != NULL));
+  MemInfoForSize *info = FindMemInfo(row_bytes, num_rows);
+  if (!info->freed.empty()) { // We can satisfy the request with cached,
+                              // previously-allocated memory.
+    void *ans = info->freed.back();
+    info->freed.pop_back();
+    info->currently_used++;
+    addr_to_list_[ans] = info;
+    if (pitch_out) *pitch_out = info->pitch;
+    return ans;
+  } else {
+    PossiblyCleanup(num_rows == 0 ? row_bytes : row_bytes * num_rows);
+    void *ans;
+    if (num_rows == 0) { // Simple malloc request, not "MallocPitch".
+      size_t size = row_bytes;
+      int32 ret = cudaMalloc(&ans, size);
+      if (ret != 0) {
+        KALDI_WARN << "Allocation of memory block of " << size << " bytes "
+                   << "failed, releasing cached memory and retrying.";
+        ReleaseAllCachedMemory();
+        ret = cudaMalloc(&ans, size);
+        if (ret != 0) {
+          KALDI_WARN << "Allocation failed for the second time.  Printing "
+                     << "device memory usage and exiting.";
+          device_->PrintMemoryUsage();
+          KALDI_ERR << "Memory allocation failure";
+        }
+      }
+    } else {
+      size_t pitch;
+      int32 ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
+      if (ret != 0) { // allocation failed...
+        KALDI_WARN << "Allocation of " << num_rows << " rows, each of size "
+                   << row_bytes << " bytes failed, releasing cached "
+                   << "memory and retrying.";
+        ReleaseAllCachedMemory();
+        ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
+        if (ret != 0) {
+          KALDI_WARN << "Allocation failed for the second time.  Printing "
+                     << "device memory usage and exiting.";
+          device_->PrintMemoryUsage();
+          KALDI_ERR << "Memory allocation failure";
+        }
+      }
+      KALDI_ASSERT(pitch > 0);
+      if (info->pitch == 0) { // First allocation; have not set info->pitch yet.
+        info->pitch = pitch;
+      } else if (pitch != info->pitch) {
+        KALDI_ERR << "Pitch differs between multiple calls with the same "
+                  << "parameters: " << pitch << " vs. " << info->pitch;
+      }
+      *pitch_out = info->pitch;
+    }
+    addr_to_list_[ans] = info;
+    info->currently_used++;
+    return ans;
+  }
+}
+
+void CuAllocator::Free(void *addr) {
+  unordered_map<void*, MemInfoForSize*>::iterator iter =
+      addr_to_list_.find(addr);
+  if (iter == addr_to_list_.end()) {
+    KALDI_ERR << "Attempt to free address " << addr << " that was not allocated "
+              << "by CuDevice::Malloc() (or was previously freed).";
+  }
+  MemInfoForSize *info = iter->second;
+  addr_to_list_.erase(addr); // Erase this element in the addr_to_list_ map.
+  info->currently_used--;
+  if (info->countdown == 0) { // We have freed [i.e. actually freed with
+                              // cudaFree()] enough of these that we think
+                              // we're wasting too much time this way and
+                              // need to start caching them.
+    info->freed.push_back(addr);
+  } else { // Actually free the address, and decrease "countdown".
+    info->countdown--;
+    CU_SAFE_CALL(cudaFree(addr)); // This is how we free, even if allocated
+                                  // with cudaMallocPitch().
+  }
+}
+
+void CuAllocator::ReleaseAllCachedMemory() {
+  for (SizeHashIterator iter = size_to_list_.begin();
+       iter != size_to_list_.end(); ++iter) {
+    MemInfoForSize *info = iter->second;
+    while (!info->freed.empty()) {
+      CU_SAFE_CALL(cudaFree(info->freed.back()));
+      info->freed.pop_back();
+    }
+  }
+}
+
+void CuAllocator::Cleanup() {
+  // TODO: implement this or remove it (and also PossiblyCleanup).
+  // Actually we may never implement this, as just calling
+  // ReleaseAllCachedMemory whenever an allocation fails is probably
+  // sufficient.
+}
+
+void CuAllocator::PossiblyCleanup(size_t num_bytes) {
+  if (static_cast<size_t>(cleanup_countdown_bytes_) <= num_bytes) {
+    Cleanup();
+    cleanup_countdown_bytes_ = opts_.cleanup_interval_bytes;
+  } else {
+    cleanup_countdown_bytes_ -= static_cast<int32>(num_bytes);
+  }
+}
+
+CuAllocator::~CuAllocator() {
+  // Check that nothing was allocated by the user and not freed.
+  std::set<MemInfoForSize*> unfreed_set;
+  typedef unordered_map<void*, MemInfoForSize*>::iterator IterType;
+  for (IterType iter = addr_to_list_.begin(); iter != addr_to_list_.end();
+       ++iter)
+    unfreed_set.insert(iter->second);
+  for (std::set<MemInfoForSize*>::iterator iter = unfreed_set.begin();
+       iter != unfreed_set.end(); ++iter) {
+    MemInfoForSize *info = *iter;
+    KALDI_ASSERT(info->currently_used > 0); // Or it should not be in this set
+                                            // (code error or memory corruption).
+    if (info->num_rows == 0) {
+      KALDI_WARN << info->currently_used << " memory chunks of size "
+                 << info->row_bytes << " were allocated and not freed.";
+    } else {
+      KALDI_WARN << info->currently_used << " memory chunks of size "
+                 << info->row_bytes << " per row, and " << info->num_rows
+                 << " rows, were allocated and not freed.";
+    }
+  }
+}
+
+void CuDevice::Free(void *ptr) { allocator_->Free(ptr); }
+
+void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
+  return allocator_->MallocPitch(row_bytes, num_rows, pitch);
+}
+
+void* CuDevice::Malloc(size_t size) {
+  return allocator_->Malloc(size);
+}
+
+CuDevice::CuDevice(): active_gpu_id_(-3), verbose_(true),
+                      allocator_(new CuAllocator(CuAllocatorOptions(), this))
+    { }
+
+CuDevice::~CuDevice() {
+  if (Enabled()) {
+    CU_SAFE_CALL(cublasShutdown());
+  } else if (active_gpu_id_ == -2) {
+    KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
+  }
+  if (allocator_ != NULL)
+    delete allocator_;
+}
 
 // The instance of the static singleton
 CuDevice CuDevice::global_device_;
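To make the countdown heuristic above concrete, here is a small standalone trace (illustrative, not part of the commit) of how count_ and each new size's initial countdown evolve under the default CuAllocatorOptions (count = 10, count_increment = 0.5), mirroring the arithmetic of FindMemInfo():

// countdown-trace.cc -- illustrative only, not code from this commit.
#include <cstdio>

int main() {
  double count_ = 10.0;                 // opts.count
  const double increment = 0.5;         // opts.count_increment
  for (int nth_size = 1; nth_size <= 4; nth_size++) {
    int countdown = (int)count_;        // initial countdown for a new size
    count_ += increment;
    std::printf("distinct size #%d: first %d frees go to cudaFree(), "
                "later frees are cached\n", nth_size, countdown);
  }
  // Prints countdown = 10, 10, 11, 11: the more distinct sizes the program
  // allocates, the more frees of any one size happen before caching begins.
  return 0;
}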

View file

@@ -44,6 +44,16 @@ class CuDevice {
  public:
   ~CuDevice();
   static inline CuDevice& Instantiate() { return global_device_; }
+
+  // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc,
+  // cudaMallocPitch and cudaFree.  Their function is to cache the results of
+  // previous allocations to avoid the very large overhead that CUDA's
+  // allocation seems to give for some setups.
+  void* Malloc(size_t size);
+
+  void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
+
+  void Free(void *ptr);
 
   /**********************************/
   // Instance interface
@@ -83,8 +93,8 @@ class CuDevice {
  private:
   CuDevice();
-  CuDevice(CuDevice&);
-  CuDevice &operator=(CuDevice&);
+  CuDevice(CuDevice&);  // Disallow.
+  CuDevice &operator=(CuDevice&);  // Disallow.
 
   static CuDevice global_device_;
@@ -114,7 +124,7 @@ class CuDevice {
   bool verbose_;
 
-  //CuAllocator allocator_;
+  CuAllocator *allocator_;
 
 }; // class CuDevice
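A usage sketch of the interface declared above (Example() is a hypothetical caller; assumes HAVE_CUDA == 1 and a GPU already selected):

// usage-sketch.cc -- illustrative only, not code from this commit.
void Example() {
  CuDevice &device = CuDevice::Instantiate();

  // Linear allocation, replacing a raw cudaMalloc():
  float *vec = static_cast<float*>(device.Malloc(1024 * sizeof(float)));

  // Pitched 2-D allocation, replacing cudaMallocPitch().  As with
  // cudaMallocPitch, the returned pitch is the row stride in bytes and may
  // exceed row_bytes for alignment reasons.
  size_t row_bytes = 100 * sizeof(float), num_rows = 50, pitch;
  float *mat = static_cast<float*>(
      device.MallocPitch(row_bytes, num_rows, &pitch));

  // Free() returns the blocks to the cache (once the per-size countdown is
  // exhausted) instead of always calling cudaFree(); a later Malloc or
  // MallocPitch of the same size can then be satisfied without a CUDA call.
  device.Free(mat);
  device.Free(vec);
}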

View file

@@ -60,8 +60,8 @@ void CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols,
     Timer tim;
     MatrixIndexT row_bytes = cols * sizeof(Real);
     size_t pitch;
-    CU_SAFE_CALL(cudaMallocPitch(reinterpret_cast<void**>(&this->data_), &pitch,
-                                 row_bytes, rows));
+    this->data_ = static_cast<Real*>(CuDevice::Instantiate().MallocPitch(
+        row_bytes, rows, &pitch));
     this->num_rows_ = rows;
     this->num_cols_ = cols;
     this->stride_ = pitch / sizeof(Real);
@@ -83,7 +83,7 @@ void CuMatrix<Real>::Destroy() {
   if (CuDevice::Instantiate().Enabled()) {
     if (this->data_ != NULL) {
       Timer tim;
-      CU_SAFE_CALL(cudaFree(this->data_));
+      CuDevice::Instantiate().Free(this->data_);
       CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
     }
   } else
@@ -1005,8 +1005,7 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
       }
     }
     // */
-    void *addr;
-    CU_SAFE_CALL(cudaMalloc( (void**)&addr, sv_labels.size() * sizeof(MatrixElement<Real>)));
+    void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement<Real>));
     CU_SAFE_CALL(cudaMemcpy(addr, sv_labels.data(), sv_labels.size() * sizeof(MatrixElement<Real>), cudaMemcpyHostToDevice));
     Timer tim;
     CuVector<Real> tmp(2, kUndefined);
@@ -1017,7 +1016,7 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
     Vector<Real> tmp_cpu(tmp);
     *tot_objf = tmp_cpu(0);
    *tot_weight = tmp_cpu(1);
-    CU_SAFE_CALL(cudaFree(addr));
+    CuDevice::Instantiate().Free(addr);
    CuDevice::Instantiate().AccuProfile("Comp_Obj_Deriv", tim.Elapsed());
  } else
 #endif

View file

@@ -51,15 +51,16 @@ void CuPackedMatrix<Real>::Resize(MatrixIndexT rows,
   this->Destroy();
   if (rows == 0) return;
 #if HAVE_CUDA == 1
-  if (CuDevice::Instantiate().Enabled()) {
+  CuDevice &device = CuDevice::Instantiate();
+  if (device.Enabled()) {
     Timer tim;
     this->num_rows_ = rows;
     size_t nr = static_cast<size_t>(num_rows_),
         num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&this->data_), num_bytes));
+    this->data_ = static_cast<Real*>(device.Malloc(num_bytes));
 
     if (resize_type == kSetZero) this->SetZero();
-    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Resize", tim.Elapsed());
+    device.AccuProfile("CuPackedMatrix::Resize", tim.Elapsed());
   } else
 #endif
   { // Let the initializer of SpMatrix<Real> handle the allocation,
@@ -85,7 +86,7 @@ void CuPackedMatrix<Real>::Destroy() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (this->data_ != NULL) {
-      CU_SAFE_CALL(cudaFree(this->data_));
+      CuDevice::Instantiate().Free(this->data_);
     }
   } else
 #endif

View file

@@ -41,9 +41,10 @@ void CuRand<Real>::SeedGpu(MatrixIndexT state_size) {
 template<typename Real>
 void CuRand<Real>::SeedBuffer(MatrixIndexT state_size, uint32 **tgt) {
 #if HAVE_CUDA == 1
-  if (CuDevice::Instantiate().Enabled()) {
+  CuDevice &device = CuDevice::Instantiate();
+  if (device.Enabled()) {
     if (*tgt != NULL) {
-      CU_SAFE_CALL(cudaFree(*tgt));
+      device.Free(*tgt);
       *tgt = NULL;
     }
     if (state_size == 0) return; // Nothing to do.
@@ -51,7 +52,7 @@ void CuRand<Real>::SeedBuffer(MatrixIndexT state_size, uint32 **tgt) {
     for (MatrixIndexT i = 0; i < state_size; i++)
       temp_rand_data[i] = RandInt(128, RAND_MAX);
     int32 state_size_in_bytes = state_size * sizeof(uint32);
-    CU_SAFE_CALL(cudaMalloc((void**)tgt, state_size_in_bytes));
+    *tgt = static_cast<uint32*>(device.Malloc(state_size_in_bytes));
     CU_SAFE_CALL(cudaMemcpy(*tgt, &(temp_rand_data[0]),
                             state_size_in_bytes, cudaMemcpyHostToDevice));
   }

View file

@@ -362,17 +362,11 @@ void CuVectorBase<Real>::ApplyLog() {
     int dimBlock(CU1DBLOCK);
     int dimGrid(n_blocks(dim_, CU1DBLOCK));
 
-    Real* device_flag;
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&device_flag), sizeof(Real)));
-    CU_SAFE_CALL(cudaMemset(device_flag, 0, sizeof(Real)));
-    cuda_vec_apply_log(dimGrid, dimBlock, data_, device_flag, dim_);
-    Real host_flag = 0.0;
-    CU_SAFE_CALL(cudaMemcpy(&host_flag, device_flag, sizeof(Real), cudaMemcpyDeviceToHost));
-    if (host_flag > 0)
+    CuVector<Real> flag(1);
+    cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_);
+    if (flag(0) > 0)
       KALDI_ERR << "Trying to take log of a negative number.";
-    CU_SAFE_CALL(cudaFree(device_flag));
     CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim.Elapsed());
   } else
 #endif
   {
@@ -579,13 +573,10 @@ Real CuVectorBase<Real>::Min() const {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    Real* device_value;
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&device_value), sizeof(Real)));
-    CU_SAFE_CALL(cudaMemset(device_value, 0, sizeof(Real)));
-    cuda_vec_min(data_, device_value, dim_);
+    CuVector<Real> ans(1);
+    cuda_vec_min(data_, ans.Data(), dim_);
     CU_SAFE_CALL(cudaGetLastError());
-    CU_SAFE_CALL(cudaMemcpy(&result, device_value, sizeof(Real), cudaMemcpyDeviceToHost));
-    CU_SAFE_CALL(cudaFree(device_value));
+    result = ans(0);
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
 #endif
@@ -601,15 +592,10 @@ Real CuVectorBase<Real>::Max() const {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    Real* device_value;
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&device_value), sizeof(Real)));
-    CU_SAFE_CALL(cudaMemset(device_value, 0, sizeof(Real)));
-    cuda_vec_max(data_, device_value, dim_);
-    CU_SAFE_CALL(cudaGetLastError());
-    CU_SAFE_CALL(cudaMemcpy(&result, device_value, sizeof(Real), cudaMemcpyDeviceToHost));
-    CU_SAFE_CALL(cudaFree(device_value));
+    CuVector<Real> ans(1);
+    cuda_vec_max(data_, ans.Data(), dim_);
+    result = ans(0);
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
 #endif
   {
@@ -776,14 +762,7 @@ void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-#if 0
-    // put a NaN past the end, I did this to try to find extra bugs.  None seen.
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&this->data_), (dim + 1) * sizeof(Real)));
-    Real nan = std::numeric_limits<Real>::infinity() - std::numeric_limits<Real>::infinity();
-    CU_SAFE_CALL(cudaMemcpy(this->data_ + dim, &nan, sizeof(Real), cudaMemcpyHostToDevice));
-#else
-    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&this->data_), dim * sizeof(Real)));
-#endif
+    this->data_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(dim * sizeof(Real)));
     this->dim_ = dim;
     if (t == kSetZero) this->SetZero();
     CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim.Elapsed());
@@ -835,9 +814,8 @@ template<typename Real>
 void CuVector<Real>::Destroy() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
-    if (this->data_ != NULL) {
-      CU_SAFE_CALL(cudaFree(this->data_));
-    }
+    if (this->data_ != NULL)
+      CuDevice::Instantiate().Free(this->data_);
   } else
 #endif
   {