dan2,cudamatrix: GPU selection logic is simplified; the --use-gpu-id=N option is replaced by --use-gpu=yes|no|optional:

"yes" -- Select GPU automatically (or get one by exclusive mode) and die if this fails. 
"optionaly" -- Do as above, but if it fails, back off to CPU
"no"  -- Run on CPU.

The recommended GPU setup is compute-exclusive mode; otherwise the GPU is selected automatically, by the largest proportion of free memory.
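
In practice, each CUDA-enabled binary now registers a string option and passes it straight to CuDevice, as the tool diffs below show. A minimal sketch of the new calling convention (the option name and the SelectGpuId() call come from this commit; the main() scaffolding around them is illustrative only):

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-device.h"

int main(int argc, char *argv[]) {
  using namespace kaldi;
  ParseOptions po("Usage: example [options]");
  std::string use_gpu = "yes";
  po.Register("use-gpu", &use_gpu,
              "yes|no|optional, only has effect if compiled with CUDA");
  po.Read(argc, argv);
#if HAVE_CUDA == 1
  // Must be called once, before the first cudamatrix allocation,
  // or not at all (in which case everything runs on the CPU).
  CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
  // ... computation with CuMatrix/CuVector follows ...
  return 0;
}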



git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@3117 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Karel Vesely 2013-10-25 23:09:58 +00:00
Parent b2ab82374b
Commit 2be9da4cbb
29 changed files: 148 additions and 263 deletions

View file

@@ -25,7 +25,7 @@ scoring_opts="--min-lmwt 4 --max-lmwt 15"
num_threads=1 # if >1, will use latgen-faster-parallel
parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
use_gpu_id=-1 # -1 disable gpu
use_gpu="no" # yes|no|optionaly
# End configuration section.
echo "$0 $@" # Print the command line for logging
@@ -104,7 +104,7 @@ fi
# Run the decoding in the queue
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
--lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

View file

@@ -46,7 +46,6 @@ train_opts= # options, passed to the training script
train_tool= # optionally change the training tool
# OTHER
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
analyze_alignments=true # run the alignment analysis script
seed=777 # seed value used for training data shuffling and initialization
# End configuration.
@@ -258,7 +257,7 @@ else
feature_transform_old=$feature_transform
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
echo "Renormalizing MLP input features into $feature_transform"
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
nnet-forward --use-gpu=yes \
$feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \
ark:- 2>$dir/log/nnet-forward-cmvn.log |\
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
@@ -315,7 +314,6 @@ steps/train_nnet_scheduler.sh \
${train_opts} \
${train_tool:+ --train-tool "$train_tool"} \
${config:+ --config $config} \
${use_gpu_id:+ --use-gpu-id $use_gpu_id} \
$mlp_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1

View file

@@ -21,7 +21,6 @@ learn_rate=0.00001
halving_factor=1.0 # i.e., disable halving
drop_frames=true
verbose=1
use_gpu_id=
seed=777 # seed value used for training data shuffling
# End configuration section
@@ -168,7 +167,6 @@ while [ $x -le $num_iters ]; do
--learn-rate=$learn_rate \
--drop-frames=$drop_frames \
--verbose=$verbose \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
fi
cur_mdl=$dir/$x.nnet

View file

@@ -21,7 +21,6 @@ halving_factor=1.0 # i.e., disable halving
do_smbr=true
use_silphones=false # if true, pass the list of silence phones to nnet-mpe
verbose=1
use_gpu_id=
seed=777 # seed value used for training data shuffling
# End configuration section
@@ -151,7 +150,6 @@ while [ $x -le $num_iters ]; do
--do-smbr=$do_smbr \
--verbose=$verbose \
$mpe_silphones_arg \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
fi
cur_mdl=$dir/$x.nnet

View file

@@ -25,8 +25,6 @@ end_halving_inc=0.1
halving_factor=0.5
# misc.
verbose=1
# gpu
use_gpu_id=
# tool
train_tool="nnet-train-xent-hardlab-frmshuff"
@@ -73,7 +71,6 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
$train_tool --cross-validate=true \
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$mlp_best "$feats_cv" "$labels_cv" \
2> $dir/log/prerun.log || exit 1;
@@ -97,7 +94,6 @@ for iter in $(seq -w $max_iters); do
--learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
--bunchsize=$bunch_size --cachesize=$cache_size --randomize=true --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
${seed:+ --seed=$seed} \
$mlp_best "$feats_tr" "$labels_tr" $mlp_next \
2> $dir/log/iter$iter.log || exit 1;
@@ -110,7 +106,6 @@ for iter in $(seq -w $max_iters); do
$train_tool --cross-validate=true \
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$mlp_next "$feats_cv" "$labels_cv" \
2>>$dir/log/iter$iter.log || exit 1;

View file

@@ -102,9 +102,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no");
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes");
#endif
//kaldi::UnitTestCuArray<float>();

View file

@@ -212,9 +212,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
kaldi::CuBlockMatrixUnitTest<float>();

View file

@@ -27,6 +27,7 @@
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <string>
#include <vector>
#include <algorithm>
#include <dlfcn.h>
@@ -40,105 +41,113 @@
namespace kaldi {
/**
* SelectGpuId(gpu_id)
* SelectGpuId(use_gpu)
*
* The argument 'gpu_id' meaning: 0..N selects a GPU,
* -1 disables CUDA, -2 performs GPU auto-detection.
* There are 3 'use_gpu' modes for GPU selection:
* "yes" -- Select GPU automatically (or get one by exclusive mode)
* and die if this fails.
* "optional" -- Do as above, but if it fails, back off to CPU.
* "no" -- Run on CPU.
*
* If there is no GPU in the system, and we have GPU auto-detection,
* or GPU is manually disabled the computation will run on CPU.
* In other cases it is an error (manual selection).
* In compute-exclusive mode, the GPU is selected by the OS.
*
* In case of Compute exclusive mode, the GPU is selected by OS,
* this has priority over manual/auto selection of GPU.
* Otherwise the GPU is selected by the largest proportion of free memory.
* This can eventually lead to multiple processes computing on a single GPU,
* which is slow. It is more practical to use "compute exclusive mode".
*
* Since the autoselection of GPU is not perfect, it may still
* happen that two processes compute on single GPU, which is slow.
* The users are advised to use manual selection or exclusive mode.
*
* This method must be called at the very beginning of the program
* (before the cudamatrix objects allocate memory for the data),
* or not at all (when we intentionally want to run on the CPU).
* This method is to be called at the very beginning of the program
* (before the first allocation in cudamatrix), or not at all (defaulting to the CPU).
*
*/
void CuDevice::SelectGpuId(int32 gpu_id, bool abort_on_error) {
void CuDevice::SelectGpuId(std::string use_gpu) {
// Possible modes
if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optionaly") {
KALDI_ERR << "Please choose : --use-gpu=yes|no|optionaly, passed '" << use_gpu << "'";
}
// Make sure this function is not called twice!
if (Enabled()) {
KALDI_ERR << "There is already an active GPU " << active_gpu_id_
<< ", cannot change it on the fly!";
}
// Allow the GPU to stay disabled
if(!Enabled() && gpu_id == -1) {
KALDI_LOG << "Selected device: " << gpu_id
<< ", we don't even try to get a GPU. We run on CPU.";
active_gpu_id_ = -1;
if(!Enabled() && use_gpu == "no") {
KALDI_LOG << "Manually selected to compute on CPU.";
return;
}
// Check that we have a gpu available
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) {
// If we do automatic selection and no GPU is found, we run on a CPU
if (abort_on_error) {
KALDI_ERR << "No CUDA capable GPU was detected";
} else {
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
active_gpu_id_ = -2;
if (use_gpu == "yes") {
KALDI_ERR << "No CUDA GPU detected!";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! No CUDA GPU detected...";
return;
}
}
// Now we know that there is a GPU in the system,
// and we don't want to have it disabled.
//
// For the GPU selection there are 3 possibilities,
// with priorities according to the order:
// Create a CUDA context: in compute-exclusive mode the OS selects the gpu_id,
// otherwise the default gpu_id=0 is used. If no GPU is free, the context
// cannot be created (compute-exclusive mode).
//
// 1.) We have compute exclusive mode on (GPU is selected by OS)
// 2.) User did not specify the GPU-id (default value -2),
// we will do automatic selection.
// 3.) User specified the GPU to run on, so we select it.
bool error;
if (IsComputeExclusive(&error)) {
FinalizeActiveGpu();
return;
}
if (error) { // There was some error detecting compute-exclusive status
// (perhaps no GPU available). Sleep a bit and retry.
cudaError_t e;
e = cudaThreadSynchronize(); //<< CUDA context gets created here.
if (e != cudaSuccess) {
// So far we don't have a context; sleep a bit and retry.
int32 sec_sleep = 2;
KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
<< " seconds.";
sleep(sec_sleep);
if (IsComputeExclusive(&error)) {
FinalizeActiveGpu();
return;
} else {
if (abort_on_error) {
KALDI_ERR << "Error acquiring GPU in exclusive mode.";
} else {
KALDI_WARN << "Error selecting GPU. CUDA will NOT be used!!!.";
active_gpu_id_ = -2;
//
e = cudaThreadSynchronize(); //<< 2nd attempt to get a CUDA context.
if (e != cudaSuccess) {
if (use_gpu == "yes") {
KALDI_ERR << "Failed to create CUDA context, no more unused GPUs?";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
return;
}
}
}
bool ans = (gpu_id == -2 ? SelectGpuIdAuto() : SelectGpuIdManual(gpu_id));
if (ans) {
// Make sure we still have the context
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
// Check if the machine uses compute-exclusive mode
if (IsComputeExclusive()) {
FinalizeActiveGpu();
return;
} else {
if (abort_on_error) {
KALDI_ERR << "Error acquiring GPU.";
} else {
KALDI_WARN << "Error selecting GPU. CUDA will NOT be used!!!.";
active_gpu_id_ = -2;
// Suggest using compute-exclusive mode if there are multiple GPUs
if(n_gpu > 1) {
KALDI_WARN << "Hint: It is practical to set the GPUs into ``compute exclusive mode''."
<< " Selection of free GPUs would be done by OS automatically.";
}
// And select the GPU according to proportion of free memory
if(SelectGpuIdAuto()) {
FinalizeActiveGpu();
return;
} else {
// Could not get GPU, after previously having the CUDA context?
// Strange but not impossible...
if (use_gpu == "yes") {
KALDI_ERR << "Error acquiring GPU.";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
return;
}
}
}
}
void CuDevice::FinalizeActiveGpu() {
// The device at this point should have active GPU, so we can query its name
// and memory stats and notify user which GPU is finally used.
@@ -171,41 +180,6 @@ void CuDevice::FinalizeActiveGpu() {
return;
}
bool CuDevice::SelectGpuIdManual(int32 gpu_id) {
// The user selected a particular GPU using --use-gpu-id=X; try to select
// that one.
int32 ret = cudaSetDevice(gpu_id);
//handle the possible errors (no recovery!!!)
switch(ret) {
case cudaSuccess : {
//create the GPU context
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
KALDI_WARN << "Failed to create CUDA context on a GPU.";
return false;
}
//this was okay, so we are done!
KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
return true;
}
case cudaErrorInvalidDevice : {
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
KALDI_WARN << "cudaSetDevice(" << gpu_id << "):"
<< " '" << gpu_id << "' is not a VALID CUDA device! "
<< " (system has " << n_gpu << " GPUs,"
<< " valid IDs 0.." << n_gpu-1 << ")";
return false;
}
default :
KALDI_WARN << "cudaSetDevice(" << gpu_id << "): "
<< "returned " << ret << ", "
<< cudaGetErrorString((cudaError_t)ret);
return false;
}
}
bool CuDevice::DoublePrecisionSupported() {
if (!Enabled()) return true;
@@ -214,45 +188,20 @@ bool CuDevice::DoublePrecisionSupported() {
}
bool CuDevice::IsComputeExclusive(bool *error) {
// check that we have a gpu
*error = false;
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) {
KALDI_LOG << "No CUDA devices found";
return false;
}
// Create a GPU context
// This will be kept if we detect compute exclusive mode
// or released in the other case.
//
// It does not harm if the function gets called twice,
// and the context is already created.
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if (e != cudaSuccess) {
KALDI_WARN << "Failed to create CUDA context on a GPU. No more unused GPUs "
<< "in compute exclusive mode?";
*error = true;
return false;
}
bool CuDevice::IsComputeExclusive() {
// Assume we already have a CUDA context created
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
// get the device-id and its device-properties
int32 gpu_id = -1;
e = cudaGetDevice(&gpu_id);
cudaError_t e = cudaGetDevice(&gpu_id);
if(e != cudaSuccess) {
KALDI_WARN << "Failed to get current device";
*error = true;
return false;
KALDI_ERR << "Failed to get current device";
}
struct cudaDeviceProp gpu_prop;
e = cudaGetDeviceProperties(&gpu_prop, gpu_id);
if(e != cudaSuccess) {
KALDI_WARN << "Failed to get device properties";
*error = true;
return false;
KALDI_ERR << "Failed to get device properties";
}
// find out whether compute exclusive mode is used
switch (gpu_prop.computeMode) {
@@ -271,9 +220,7 @@ bool CuDevice::IsComputeExclusive(bool *error) {
// in this case we release the GPU context...
e = cudaThreadExit(); //deprecated, but kept for legacy reasons instead of cudaDeviceReset
if(e != cudaSuccess) {
KALDI_WARN << "Failed to release CUDA context on a GPU";
*error = true;
return false;
KALDI_ERR << "Failed to release CUDA context on a GPU";
}
return false;
}
@@ -281,7 +228,7 @@ bool CuDevice::IsComputeExclusive(bool *error) {
bool CuDevice::SelectGpuIdAuto() {
// check that we have at least one gpu
// Check that we have at least one gpu
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) {
@@ -291,7 +238,7 @@ bool CuDevice::SelectGpuIdAuto() {
// The GPU is selected according to maximal free memory ratio
std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
//get ratios of memory use, if possible
// Get ratios of memory use, if possible
KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
for(int32 n = 0; n < n_gpu; n++) {
int32 ret = cudaSetDevice(n);
@@ -337,10 +284,7 @@ bool CuDevice::SelectGpuIdAuto() {
if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
}
//the free_mem_ratio should be bigger than zero
if(!free_mem_ratio[max_id] > 0.0) {
KALDI_WARN << "No device could be selected (this should never happen)";
return false;
}
KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);
//finally select the GPU
KALDI_LOG << "Selected device: " << max_id << " (automatically)";
@@ -738,7 +682,7 @@ void* CuDevice::Malloc(size_t size) {
return allocator_->Malloc(size);
}
CuDevice::CuDevice(): active_gpu_id_(-3), verbose_(true),
CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true),
allocator_(new CuAllocator(CuAllocatorOptions(), this))
{ }
@@ -746,8 +690,8 @@ CuDevice::CuDevice(): active_gpu_id_(-3), verbose_(true),
CuDevice::~CuDevice() {
if (Enabled()) {
CU_SAFE_CALL(cublasShutdown());
} else if (active_gpu_id_ == -2) {
KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
} else if (active_gpu_id_ == -1) {
KALDI_WARN << "CUDA was NOT used....";
}
if (allocator_ != NULL)
delete allocator_;
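
Because the new control flow above is spread across interleaved old and new lines, here is a condensed, compilable sketch of what SelectGpuId() now does. The four stub functions are readability assumptions standing in for the real CUDA runtime calls, and FinalizeActiveGpu() is elided:

#include <cstdlib>
#include <iostream>
#include <string>
#include <unistd.h>  // sleep()

// Stubs standing in for the CUDA calls used in cu-device.cc (assumptions):
static bool AnyGpuPresent()      { return true; }   // cudaGetDeviceCount(&n), n > 0
static bool CreateCudaContext()  { return true; }   // cudaThreadSynchronize() == cudaSuccess
static bool InComputeExclusive() { return false; }  // gpu_prop.computeMode is exclusive
static bool SelectGpuByFreeMem() { return true; }   // SelectGpuIdAuto()

static void Die(const std::string &msg) { std::cerr << msg << "\n"; std::exit(1); }

void SelectGpuIdSketch(const std::string &use_gpu) {
  if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional")
    Die("Please choose: --use-gpu=yes|no|optional");
  if (use_gpu == "no") return;                   // run on the CPU by request
  if (!AnyGpuPresent()) {                        // no GPU in the system
    if (use_gpu == "yes") Die("No CUDA GPU detected!");
    return;                                      // "optional": back off to CPU
  }
  if (!CreateCudaContext()) {                    // all GPUs taken (exclusive mode)?
    sleep(2);                                    // wait a moment and retry once
    if (!CreateCudaContext()) {
      if (use_gpu == "yes") Die("Failed to create CUDA context");
      return;                                    // "optional": back off to CPU
    }
  }
  if (InComputeExclusive()) return;              // the OS already picked a free GPU
  // Default compute mode: pick the GPU with the largest free-memory ratio.
  if (!SelectGpuByFreeMem() && use_gpu == "yes") Die("Error acquiring GPU.");
}

int main() { SelectGpuIdSketch("optional"); }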

View file

@@ -55,18 +55,18 @@ class CuDevice {
void Free(void *ptr);
/**********************************/
// Instance interface
/// Check if the CUDA device is selected for use
/// Select a GPU for computation, the 'use_gpu' modes are:
/// "yes" -- Select GPU automatically and die if this fails.
/// "optional" -- Do as above, but if it fails, back off to CPU.
/// "no" -- Run on CPU.
/// (more comments in cu-device.cc)
void SelectGpuId(std::string use_gpu);
/// Check if the CUDA GPU is selected for use
bool Enabled() const {
return (active_gpu_id_ > -1);
}
/// Manually select GPU by id (more comments in cu-device.cc)
void SelectGpuId(int32 gpu_id,
bool abort_on_failure = true);
/// Get the active GPU id
int32 ActiveGpuId() {
return active_gpu_id_;
@@ -105,7 +105,7 @@ class CuDevice {
/// otherwise. Assumes a CUDA context has already been created
/// (see SelectGpuId()); failure to query the device or its
/// properties here is fatal.
bool IsComputeExclusive(bool *error);
bool IsComputeExclusive();
/// Automatically select GPU and get CUDA context. Returns true on success.
bool SelectGpuIdAuto();

View file

@@ -151,9 +151,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
srand(time(NULL));
kaldi::CudaMathUnitTest<float>();

View file

@@ -176,7 +176,7 @@ template<typename Real> void CudaMatrixSpeedTest() {
int main() {
//Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(-2); //-2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); //-2 .. automatic selection
#endif
kaldi::CudaMatrixSpeedTest<float>();

View file

@@ -1733,9 +1733,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
kaldi::CudaMatrixUnitTest<float>();

View file

@@ -244,8 +244,7 @@ int main() {
using namespace kaldi;
#if HAVE_CUDA == 1
// Select the GPU
kaldi::int32 use_gpu_id = -2;
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId("yes");
#endif
kaldi::CudaPackedMatrixUnitTest<float>();
#if HAVE_CUDA == 1

View file

@@ -167,7 +167,7 @@ template<typename Real> void CuSpMatrixSpeedTest() {
int main() {
//Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(-2); //-2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); //-2 .. automatic selection
#endif
kaldi::CuSpMatrixSpeedTest<float>();

View file

@@ -403,9 +403,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
kaldi::CudaSpMatrixUnitTest<float>();

View file

@@ -558,8 +558,7 @@ static void CuMatrixUnitTest() {
int main() {
using namespace kaldi;
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id = -2; // -2 means automatic selection.
kaldi::CuDevice::Instantiate().SelectGpuId(use_gpu_id);
kaldi::CuDevice::Instantiate().SelectGpuId("yes");
#endif
kaldi::CuMatrixUnitTest<float>();

View file

@@ -191,9 +191,9 @@ int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(-2); // -2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
kaldi::CudaTpMatrixUnitTest<float>();
#if HAVE_CUDA == 1

View file

@@ -151,7 +151,7 @@ template<typename Real> void CudaVectorSpeedTest() {
int main() {
//Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(-2); //-2 .. automatic selection
CuDevice::Instantiate().SelectGpuId("yes"); //-2 .. automatic selection
#endif
kaldi::CudaVectorSpeedTest<float>();

View file

@@ -695,10 +695,8 @@ int main(int argc, char *argv[]) {
const char *usage = "Usage: cu-vector-test [options]";
ParseOptions po(usage);
int32 use_gpu_id = -2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic "
"selection, -1 disable GPU, 0..N select GPU). Only has effect if compiled "
"with CUDA");
std::string use_gpu = "yes";
po.Register("use-gpu", &use_gpu, "yes|no|optional");
po.Read(argc, argv);
if (po.NumArgs() != 0) {
@@ -709,9 +707,9 @@ int main(int argc, char *argv[]) {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId(-1); // -1 means no GPU
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

View file

@@ -47,13 +47,11 @@ int main(int argc, char *argv[]) {
bool binary_write = true;
NnetCombineFastConfig combine_config;
int32 use_gpu_id = -2;
std::string use_gpu = "yes";
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic "
"selection, -1 disable GPU, 0..N select GPU). Only has effect if compiled "
"with CUDA and --num-threads=1");
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
combine_config.Register(&po);
@@ -71,7 +69,7 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
if (combine_config.num_threads == 1)
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

View file

@@ -45,13 +45,11 @@ int main(int argc, char *argv[]) {
int32 num_segments = 1;
int32 batch_size = 1024;
int32 use_gpu_id = -2;
std::string use_gpu = "yes";
po.Register("num-segments", &num_segments,
"Number of line segments used for computing derivatives");
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic "
"selection, -1 disable GPU, 0..N select GPU). Only has effect if compiled "
"with CUDA");
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -61,7 +59,7 @@ int main(int argc, char *argv[]) {
}
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
std::string nnet1_rxfilename = po.GetArg(1),

View file

@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
int32 use_gpu_id = -2;
std::string use_gpu="yes";
NnetSimpleTrainerConfig train_config;
ParseOptions po(usage);
@@ -57,9 +57,7 @@ int main(int argc, char *argv[]) {
po.Register("srand", &srand_seed, "Seed for random number generator "
"(relevant if you have layers of type AffineComponentPreconditioned "
"with l2-penalty != 0.0");
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic "
"selection, -1 disable GPU, 0..N select GPU). Only has effect if compiled "
"with CUDA");
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
train_config.Register(&po);
@@ -72,7 +70,7 @@ int main(int argc, char *argv[]) {
srand(srand_seed);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
std::string nnet_rxfilename = po.GetArg(1),

View file

@@ -51,13 +51,8 @@ int main(int argc, char *argv[]) {
bool apply_log = false;
po.Register("apply-log", &apply_log, "Transform MLP output to logscale");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="no";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -76,7 +71,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -128,16 +128,9 @@ int main(int argc, char *argv[]) {
po.Register("drop-frames", &drop_frames,
"Drop frames, where is zero den-posterior under numerator path "
"(ie. path not in lattice)");
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -162,7 +155,7 @@ int main(int argc, char *argv[]) {
// Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -129,15 +129,9 @@ int main(int argc, char *argv[]) {
po.Register("do-smbr", &do_smbr, "Use state-level accuracies instead of "
"phone accuracies.");
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
if (po.NumArgs() != 6) {
@@ -164,7 +158,7 @@ int main(int argc, char *argv[]) {
// Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -61,13 +61,8 @@ int main(int argc, char *argv[]) {
po.Register("cachesize", &cachesize, "Size of cache for frame level shuffling (max 8388479)");
po.Register("seed", &seed, "Seed value for srand, sets fixed order of frame-shuffling");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -94,7 +89,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -60,13 +60,8 @@ int main(int argc, char *argv[]) {
kaldi::int32 max_frames = 6000; // Allow segments maximum of one minute by default
po.Register("max-frames",&max_frames, "Maximum number of frames a segment can have to be processed");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -93,7 +88,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -49,13 +49,8 @@ int main(int argc, char *argv[]) {
std::string feature_transform;
po.Register("feature-transform", &feature_transform, "Feature transform in Nnet format");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -80,7 +75,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

View file

@@ -65,13 +65,8 @@ int main(int argc, char *argv[]) {
BaseFloat drop_data = 0.0;
po.Register("drop-data", &drop_data, "Threshold for random dropping of the data (0 no-drop, 1 drop-all)");
#if HAVE_CUDA==1
int32 use_gpu_id=-2 ;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optionaly, only has effect if compiled with CUDA");
po.Read(argc, argv);
@@ -93,7 +88,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet rbm_transf;