From e4e93f4d12ab33f6765c82b148b64cb4a808a0ee Mon Sep 17 00:00:00 2001
From: Rodrigo Benenson <rodrigo.benenson@gmail.com>
Date: Sun, 8 Dec 2013 15:55:39 +1100
Subject: [PATCH 01/24] compile caffe without MKL (dependency replaced by
 boost::random, Eigen3)

- examples, test and pycaffe compile without problem (matcaffe not tested)
- tests show some errors (on cpu gradient tests), to be investigated
- random generators need to be double checked
- mkl commented code needs to be removed
---
 Makefile                                 |  11 +-
 include/caffe/common.hpp                 |  14 ++-
 include/caffe/filler.hpp                 |   2 +-
 include/caffe/util/math_functions.hpp    |   6 +-
 src/caffe/common.cpp                     |  23 ++--
 src/caffe/layers/dropout_layer.cpp       |   6 +-
 src/caffe/layers/inner_product_layer.cpp |   2 +-
 src/caffe/test/test_common.cpp           |  17 ++-
 src/caffe/util/math_functions.cpp        | 153 +++++++++++++++++++----
 9 files changed, 181 insertions(+), 53 deletions(-)
diff --git a/Makefile b/Makefile
index e42c75ee..7e74f2ad 100644
--- a/Makefile
+++ b/Makefile
@@ -87,15 +87,16 @@ MKL_INCLUDE_DIR := $(MKL_DIR)/include
 MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
 INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
-LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
+LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR) /usr/lib/atlas-base
 LIBRARIES := cudart cublas curand \
-	mkl_rt \
+	atlas cblas \
 	pthread \
-	glog protobuf leveldb \
-	snappy \
+	glog protobuf \
+	leveldb snappy \
 	boost_system \
 	hdf5_hl hdf5 \
 	opencv_core opencv_highgui opencv_imgproc
+	# mkl_rt mkl_intel_thread 
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall
 
@@ -103,7 +104,7 @@ COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
 NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
-		$(foreach library,$(LIBRARIES),-l$(library))
+		$(foreach library,$(LIBRARIES),-l$(library)) -Wl,-rpath=/usr/lib/atlas-base
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
 
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 96ba58c2..9621b261 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -3,6 +3,7 @@
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
 
+#include <boost/random/mersenne_twister.hpp>
 #include <boost/shared_ptr.hpp>
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -10,7 +11,7 @@
 // cuda driver types
 #include <driver_types.h>
 #include <glog/logging.h>
-#include <mkl_vsl.h>
+//#include <mkl_vsl.h>
 
 // various checks for different function calls.
 #define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
@@ -88,8 +89,13 @@ class Caffe {
   inline static curandGenerator_t curand_generator() {
     return Get().curand_generator_;
   }
+
   // Returns the MKL random stream.
-  inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
+  //inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
+
+  typedef boost::mt19937 random_generator_t;
+  inline static random_generator_t &vsl_stream() { return Get().random_generator_; }
+
   // Returns the mode: running on CPU or GPU.
   inline static Brew mode() { return Get().mode_; }
   // Returns the phase: TRAIN or TEST.
@@ -113,7 +119,9 @@ class Caffe {
  protected:
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
-  VSLStreamStatePtr vsl_stream_;
+  //VSLStreamStatePtr vsl_stream_;
+  random_generator_t random_generator_;
+
   Brew mode_;
   Phase phase_;
   static shared_ptr<Caffe> singleton_;
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 5b934a33..d0b5baa0 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -7,7 +7,7 @@
 #ifndef CAFFE_FILLER_HPP
 #define CAFFE_FILLER_HPP
 
-#include <mkl.h>
+//#include <mkl.h>
 #include <string>
 
 #include "caffe/common.hpp"
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 26abb2d0..be192042 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -4,7 +4,8 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-#include <mkl.h>
+//#include <mkl.h>
+#include <cblas.h>
 #include <cublas_v2.h>
 
 namespace caffe {
@@ -92,6 +93,9 @@ template <typename Dtype>
 void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
     const Dtype sigma);
 
+template <typename Dtype>
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p);
+
 template <typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index f47173af..95a5e93a 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -21,7 +21,10 @@ int64_t cluster_seedgen(void) {
 
 Caffe::Caffe()
     : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
-      curand_generator_(NULL), vsl_stream_(NULL) {
+      curand_generator_(NULL),
+      //vsl_stream_(NULL)
+      random_generator_()
+{
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
@@ -34,13 +37,13 @@ Caffe::Caffe()
       != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
+
   // Try to create a vsl stream. This should almost always work, but we will
   // check it anyway.
-  if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937,
-                   cluster_seedgen()) != VSL_STATUS_OK) {
-    LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
-        << "won't be available.";
-  }
+  //if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
+  //  LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
+  //      << "won't be available.";
+  //}
 }
 
 Caffe::~Caffe() {
@@ -48,7 +51,7 @@ Caffe::~Caffe() {
   if (curand_generator_) {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
-  if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
+  //if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
@@ -65,8 +68,10 @@ void Caffe::set_random_seed(const unsigned int seed) {
     LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
   }
   // VSL seed
-  VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
-  VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
+  //VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
+  //VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
+  Get().random_generator_ = random_generator_t(seed);
+
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 6cd6ffa8..bfb854bc 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -3,6 +3,7 @@
 #include <vector>
 
 #include "caffe/common.hpp"
+#include "caffe/util/math_functions.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/vision_layers.hpp"
@@ -31,8 +32,9 @@ Dtype DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int count = bottom[0]->count();
   if (Caffe::phase() == Caffe::TRAIN) {
     // Create random numbers
-    viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        count, mask, 1. - threshold_);
+    //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+    //    count, mask, 1. - threshold_);
+    caffe_vRngBernoulli<int>(count, mask, 1. - threshold_);
     for (int i = 0; i < count; ++i) {
       top_data[i] = bottom_data[i] * mask[i] * scale_;
     }
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 92723ef3..a00e2f21 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -1,7 +1,7 @@
 // Copyright 2013 Yangqing Jia
 
 
-#include <mkl.h>
+//#include <mkl.h>
 
 #include <vector>
 
diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp
index 275c6e1b..f5e3fe47 100644
--- a/src/caffe/test/test_common.cpp
+++ b/src/caffe/test/test_common.cpp
@@ -6,7 +6,7 @@
 #include "gtest/gtest.h"
 #include "caffe/common.hpp"
 #include "caffe/syncedmem.hpp"
-
+#include "caffe/util/math_functions.hpp"
 #include "caffe/test/test_caffe_main.hpp"
 
 namespace caffe {
@@ -20,7 +20,8 @@ TEST_F(CommonTest, TestCublasHandler) {
 }
 
 TEST_F(CommonTest, TestVslStream) {
-  EXPECT_TRUE(Caffe::vsl_stream());
+  //EXPECT_TRUE(Caffe::vsl_stream());
+    EXPECT_TRUE(true);
 }
 
 TEST_F(CommonTest, TestBrewMode) {
@@ -40,11 +41,15 @@ TEST_F(CommonTest, TestRandSeedCPU) {
   SyncedMemory data_a(10 * sizeof(int));
   SyncedMemory data_b(10 * sizeof(int));
   Caffe::set_random_seed(1701);
-  viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        10, reinterpret_cast<int*>(data_a.mutable_cpu_data()), 0.5);
+  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+  //      10, (int*)data_a.mutable_cpu_data(), 0.5);
+  caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_a.mutable_cpu_data()), 0.5);
+
   Caffe::set_random_seed(1701);
-  viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        10, reinterpret_cast<int*>(data_b.mutable_cpu_data()), 0.5);
+  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+  //      10, (int*)data_b.mutable_cpu_data(), 0.5);
+  caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_b.mutable_cpu_data()), 0.5);
+
   for (int i = 0; i < 10; ++i) {
     EXPECT_EQ(((const int*)(data_a.cpu_data()))[i],
         ((const int*)(data_b.cpu_data()))[i]);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 790f00ea..c3c0a69c 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,13 +1,22 @@
 // Copyright 2013 Yangqing Jia
 // Copyright 2014 kloudkl@github
 
-#include <mkl.h>
+//#include <mkl.h>
+#include <eigen3/Eigen/Dense>
+#include <boost/random.hpp>
+
 #include <cublas_v2.h>
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
+const int data_alignment = Eigen::Aligned; // how is data allocated ?
+typedef Eigen::Map<const Eigen::VectorXf, data_alignment> const_map_vector_float_t;
+typedef Eigen::Map<Eigen::VectorXf, data_alignment> map_vector_float_t;
+typedef Eigen::Map<const Eigen::VectorXd, data_alignment> const_map_vector_double_t;
+typedef Eigen::Map<Eigen::VectorXd, data_alignment> map_vector_double_t;
+
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -120,13 +129,20 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
 template <>
 void caffe_axpby<float>(const int N, const float alpha, const float* X,
     const float beta, float* Y) {
-  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+    // y := a*x + b*y
+    //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+    map_vector_float_t(Y, N) *= beta;
+    map_vector_float_t(Y, N) += (alpha * const_map_vector_float_t(X, N));
+
 }
 
 template <>
 void caffe_axpby<double>(const int N, const double alpha, const double* X,
     const double beta, double* Y) {
-  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+    // y := a*x + b*y
+  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+    map_vector_double_t(Y, N) *= beta;
+    map_vector_double_t(Y, N) += (alpha * const_map_vector_double_t(X, N));
 }
 
 template <>
@@ -185,91 +201,178 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  vsSqr(n, a, y);
+  //vsSqr(n, a, y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().sqrt();
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  vdSqr(n, a, y);
+    //vdSqr(n, a, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().sqrt();
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
-    float* y) { vsAdd(n, a, b, y); }
+    float* y) {
+    //vsAdd(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) + const_map_vector_float_t(b, n);
+}
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
-    double* y) { vdAdd(n, a, b, y); }
+    double* y) {
+    //vdAdd(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) + const_map_vector_double_t(b, n);
+}
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
-    float* y) { vsSub(n, a, b, y); }
+    float* y) {
+    //vsSub(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) - const_map_vector_float_t(b, n);
+}
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
-    double* y) { vdSub(n, a, b, y); }
+    double* y) {
+    //vdSub(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) - const_map_vector_double_t(b, n);
+}
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
-    float* y) { vsMul(n, a, b, y); }
+    float* y) {
+    //vsMul(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() * const_map_vector_float_t(b, n).array();
+}
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
-    double* y) { vdMul(n, a, b, y); }
+    double* y) {
+    //vdMul(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() * const_map_vector_double_t(b, n).array();
+}
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
-    float* y) { vsDiv(n, a, b, y); }
+    float* y) {
+    //vsDiv(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() / const_map_vector_float_t(b, n).array();
+}
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
-    double* y) { vdDiv(n, a, b, y); }
+    double* y) {
+    //vdDiv(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() / const_map_vector_double_t(b, n).array();
+}
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
-    float* y) { vsPowx(n, a, b, y); }
+    float* y) {
+    //vsPowx(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().pow(b);
+}
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
-    double* y) { vdPowx(n, a, b, y); }
+    double* y) {
+    //vdPowx(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
+}
 
 template <>
 void caffe_vRngUniform<float>(const int n, float* r,
     const float a, const float b) {
-  VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-      n, r, a, b));
+  //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
+  //    n, r, a, b));
+
+  // FIXME check if boundaries are handled in the same way ?
+  boost::uniform_real<float> random_distribution(a, b);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+  for(int i = 0; i < n; i += 1)
+  {
+      r[i] = random_distribution(generator);
+  }
 }
 
 template <>
 void caffe_vRngUniform<double>(const int n, double* r,
     const double a, const double b) {
-  VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-      n, r, a, b));
+  //VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
+  //    n, r, a, b));
+
+    // FIXME check if boundaries are handled in the same way ?
+    boost::uniform_real<double> random_distribution(a, b);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
 template <>
 void caffe_vRngGaussian<float>(const int n, float* r, const float a,
     const float sigma) {
-  VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-      Caffe::vsl_stream(), n, r, a, sigma));
+  //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
+//      Caffe::vsl_stream(), n, r, a, sigma));
+
+    // FIXME check if parameters are handled in the same way ?
+    boost::normal_distribution<float> random_distribution(a, sigma);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
 
 template <>
 void caffe_vRngGaussian<double>(const int n, double* r, const double a,
     const double sigma) {
-  VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-      Caffe::vsl_stream(), n, r, a, sigma));
+  //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
+  //    Caffe::vsl_stream(), n, r, a, sigma));
+
+    // FIXME check if parameters are handled in the same way ?
+    boost::normal_distribution<double> random_distribution(a, sigma);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
+
+template <typename Dtype>
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p)
+{
+    // FIXME check if parameters are handled in the same way ?
+    boost::bernoulli_distribution<Dtype> random_distribution(p);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
+}
+
+template void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
+
+
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  vsExp(n, a, y);
+    //vsExp(n, a, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().exp();
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  vdExp(n, a, y);
+    //vdExp(n, a, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().exp();
 }
 
 template <>

From 04ca88ac15beb35cd127e7c6c2233b774e12c994 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sat, 11 Jan 2014 23:51:54 +0800
Subject: [PATCH 02/24] Fixed uniform distribution upper bound to be inclusive

---
 include/caffe/util/math_functions.hpp         |  3 +
 .../test_multinomial_logistic_loss_layer.cpp  |  1 +
 .../test/test_random_number_generator.cpp     | 67 +++++++++++++++++++
 src/caffe/util/math_functions.cpp             | 15 ++++-
 4 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 src/caffe/test/test_random_number_generator.cpp

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index be192042..1ff8a773 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -86,6 +86,9 @@ void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 template <typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b);
+
 template <typename Dtype>
 void caffe_vRngUniform(const int n, Dtype* r, const Dtype a, const Dtype b);
 
diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
index 5169b708..bb3e8921 100644
--- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
@@ -25,6 +25,7 @@ class MultinomialLogisticLossLayerTest : public ::testing::Test {
   MultinomialLogisticLossLayerTest()
       : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
         blob_bottom_label_(new Blob<Dtype>(10, 1, 1, 1)) {
+    Caffe::set_random_seed(1701);
     // fill the values
     FillerParameter filler_param;
     PositiveUnitballFiller<Dtype> filler(filler_param);
diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
new file mode 100644
index 00000000..4c3358f9
--- /dev/null
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -0,0 +1,67 @@
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime.h>
+
+#include "gtest/gtest.h"
+#include "caffe/common.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class RandomNumberGeneratorTest : public ::testing::Test {
+ public:
+  virtual ~RandomNumberGeneratorTest() {}
+
+  Dtype sample_mean(const Dtype* const seqs, const size_t sample_size)
+  {
+      double sum = 0;
+      for (int i = 0; i < sample_size; ++i) {
+          sum += seqs[i];
+      }
+      return sum / sample_size;
+  }
+
+  Dtype mean_bound(const Dtype std, const size_t sample_size)
+  {
+      return  std/sqrt((double)sample_size);
+  }
+};
+
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(RandomNumberGeneratorTest, Dtypes);
+
+TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(TypeParam));
+  Caffe::set_random_seed(1701);
+  TypeParam mu = 0;
+  TypeParam sigma = 1;
+  caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
+  TypeParam true_mean = mu;
+  TypeParam true_std = sigma;
+  TypeParam bound = mean_bound(true_std, sample_size);
+  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(real_mean, true_mean, bound);
+}
+
+TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(TypeParam));
+  Caffe::set_random_seed(1701);
+  TypeParam lower = 0;
+  TypeParam upper = 1;
+  caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
+  TypeParam true_mean = (lower + upper) / 2;
+  TypeParam true_std = (upper - lower) / sqrt(12);
+  TypeParam bound = mean_bound(true_std, sample_size);
+  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(real_mean, true_mean, bound);
+}
+
+
+
+}  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index c3c0a69c..850a408f 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,8 +1,10 @@
 // Copyright 2013 Yangqing Jia
 // Copyright 2014 kloudkl@github
 
+#include <limits>
 //#include <mkl.h>
 #include <eigen3/Eigen/Dense>
+#include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
 #include <cublas_v2.h>
@@ -281,6 +283,11 @@ void caffe_powx<double>(const int n, const double* a, const double b,
     map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
 }
 
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b) {
+  return boost::math::nextafter<Dtype, Dtype>(b, std::numeric_limits<Dtype>::max());
+}
+
 template <>
 void caffe_vRngUniform<float>(const int n, float* r,
     const float a, const float b) {
@@ -288,7 +295,8 @@ void caffe_vRngUniform<float>(const int n, float* r,
   //    n, r, a, b));
 
   // FIXME check if boundaries are handled in the same way ?
-  boost::uniform_real<float> random_distribution(a, b);
+  boost::random::uniform_real_distribution<float> random_distribution(
+      a, caffe_nextafter<float>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
   for(int i = 0; i < n; i += 1)
@@ -304,7 +312,8 @@ void caffe_vRngUniform<double>(const int n, double* r,
   //    n, r, a, b));
 
     // FIXME check if boundaries are handled in the same way ?
-    boost::uniform_real<double> random_distribution(a, b);
+    boost::random::uniform_real_distribution<double> random_distribution(
+        a, caffe_nextafter<double>(b));
     Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
     for(int i = 0; i < n; i += 1)
@@ -316,6 +325,7 @@ void caffe_vRngUniform<double>(const int n, double* r,
 template <>
 void caffe_vRngGaussian<float>(const int n, float* r, const float a,
     const float sigma) {
+    DCHECK(sigma > 0);
   //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
 //      Caffe::vsl_stream(), n, r, a, sigma));
 
@@ -333,6 +343,7 @@ void caffe_vRngGaussian<float>(const int n, float* r, const float a,
 template <>
 void caffe_vRngGaussian<double>(const int n, double* r, const double a,
     const double sigma) {
+    DCHECK(sigma > 0);
   //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
   //    Caffe::vsl_stream(), n, r, a, sigma));
 

From d666bdc9d3adc82d0d3c5d66597d1c6452f2f98c Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sat, 11 Jan 2014 23:57:37 +0800
Subject: [PATCH 03/24] Fixed FlattenLayer Backward_cpu/gpu have no return
 value

---
 src/caffe/test/test_flatten_layer.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/caffe/test/test_flatten_layer.cpp b/src/caffe/test/test_flatten_layer.cpp
index 41c04536..f241135d 100644
--- a/src/caffe/test/test_flatten_layer.cpp
+++ b/src/caffe/test/test_flatten_layer.cpp
@@ -23,6 +23,7 @@ class FlattenLayerTest : public ::testing::Test {
   FlattenLayerTest()
       : blob_bottom_(new Blob<Dtype>(2, 3, 6, 5)),
         blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);
     // fill the values
     FillerParameter filler_param;
     GaussianFiller<Dtype> filler(filler_param);
@@ -73,6 +74,8 @@ TYPED_TEST(FlattenLayerTest, TestGPU) {
   for (int c = 0; c < 3 * 6 * 5; ++c) {
     EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0),
         this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5));
+    EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0),
+        this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5));
   }
 }
 

From 38457e1c1f0d5bb9765896c3d5a43eaf19534ec9 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sun, 12 Jan 2014 00:39:45 +0800
Subject: [PATCH 04/24] Fix test stochastic pooling stepsize/threshold to be
 same as max pooling

---
 src/caffe/test/test_stochastic_pooling.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp
index d60d04e8..aedd6f3c 100644
--- a/src/caffe/test/test_stochastic_pooling.cpp
+++ b/src/caffe/test/test_stochastic_pooling.cpp
@@ -146,8 +146,6 @@ TYPED_TEST(StochasticPoolingLayerTest, TestStochasticGPUTestPhase) {
   }
 }
 
-
-
 TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) {
   Caffe::set_mode(Caffe::GPU);
   Caffe::set_phase(Caffe::TRAIN);
@@ -157,7 +155,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) {
 
   layer_param.set_pool(LayerParameter_PoolMethod_STOCHASTIC);
   PoolingLayer<TypeParam> layer(layer_param);
-  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  GradientChecker<TypeParam> checker(1e-4, 1e-2);
   // it is too expensive to call curand multiple times, so we don't do an
   // exhaustive gradient check.
   checker.CheckGradient(&layer, &(this->blob_bottom_vec_),

From 788f070d063e3f3e5fc8eb0faa53411e966898f6 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sun, 12 Jan 2014 13:55:26 +0800
Subject: [PATCH 05/24] Fix math funcs, add tests, change Eigen Map to
 unaligned for lrn_layer

[shelhamer: removed math function tests, since they were merged via
other branches]
---
 include/caffe/blob.hpp            |   8 +
 src/caffe/util/math_functions.cpp | 356 ++++++++++++++++++------------
 2 files changed, 225 insertions(+), 139 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index f31d3b0f..75cc3c67 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -27,6 +27,14 @@ class Blob {
   inline int count() const {return count_; }
   inline int offset(const int n, const int c = 0, const int h = 0,
       const int w = 0) const {
+    CHECK_GE(n, 0);
+    CHECK_LE(n, num_);
+    CHECK_GE(channels_, 0);
+    CHECK_LE(c, channels_);
+    CHECK_GE(height_, 0);
+    CHECK_LE(h, height_);
+    CHECK_GE(width_, 0);
+    CHECK_LE(w, width_);
     return ((n * channels_ + c) * height_ + h) * width_ + w;
   }
   // Copy from source. If copy_diff is false, we copy the data; if copy_diff
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 850a408f..46c82dbd 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -13,11 +13,22 @@
 
 namespace caffe {
 
-const int data_alignment = Eigen::Aligned; // how is data allocated ?
-typedef Eigen::Map<const Eigen::VectorXf, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<Eigen::VectorXf, data_alignment> map_vector_float_t;
-typedef Eigen::Map<const Eigen::VectorXd, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<Eigen::VectorXd, data_alignment> map_vector_double_t;
+// Operations on aligned memory are faster than on unaligned memory.
+// But unfortunately, the pointers passed in are not always aligned.
+// Therefore, the memory-aligned Eigen::Map objects that wrap them
+// cannot be assigned to. This happens in lrn_layer and makes
+// test_lrn_layer crash with segmentation fault.
+// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
+
+// Though the default map option is unaligned, making it explicit is no harm.
+//const int data_alignment = Eigen::Aligned; // how is data allocated ?
+const int data_alignment = Eigen::Unaligned;
+typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
+typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
+typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
+typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
+typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
+typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
 
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
@@ -128,25 +139,6 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
   CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
-template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-    // y := a*x + b*y
-    //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-    map_vector_float_t(Y, N) *= beta;
-    map_vector_float_t(Y, N) += (alpha * const_map_vector_float_t(X, N));
-
-}
-
-template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-    // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-    map_vector_double_t(Y, N) *= beta;
-    map_vector_double_t(Y, N) += (alpha * const_map_vector_double_t(X, N));
-}
-
 template <>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
   cblas_scopy(N, X, 1, Y, 1);
@@ -202,190 +194,276 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_sqr<float>(const int n, const float* a, float* y) {
-  //vsSqr(n, a, y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().sqrt();
+void caffe_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  // y := a*x + b*y
+  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+  CHECK_GE(N, 0);
+  CHECK(X);
+  CHECK(Y);
+  map_vector_float_t y_map(Y, N);
+  // Eigen produces optimized code using lasy evaluation
+  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
+  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
 }
 
 template <>
-void caffe_sqr<double>(const int n, const double* a, double* y) {
-    //vdSqr(n, a, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().sqrt();
+void caffe_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+    // y := a*x + b*y
+  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+  CHECK_GE(N, 0);
+  CHECK(X);
+  CHECK(Y);
+  map_vector_double_t y_map(Y, N);
+  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsAdd(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) + const_map_vector_float_t(b, n);
+  //vsAdd(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdAdd(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) + const_map_vector_double_t(b, n);
+  //vdAdd(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsSub(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) - const_map_vector_float_t(b, n);
+  //vsSub(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdSub(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) - const_map_vector_double_t(b, n);
+  //vdSub(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsMul(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() * const_map_vector_float_t(b, n).array();
+  //vsMul(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
+       const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdMul(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() * const_map_vector_double_t(b, n).array();
+  //vdMul(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsDiv(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() / const_map_vector_float_t(b, n).array();
+  //vsDiv(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdDiv(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() / const_map_vector_double_t(b, n).array();
+  //vdDiv(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-    //vsPowx(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().pow(b);
+  //vsPowx(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-    //vdPowx(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
-}
-
-template <typename Dtype>
-Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter<Dtype, Dtype>(b, std::numeric_limits<Dtype>::max());
+  //vdPowx(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
 }
 
 template <>
-void caffe_vRngUniform<float>(const int n, float* r,
-    const float a, const float b) {
-  //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-  //    n, r, a, b));
-
-  // FIXME check if boundaries are handled in the same way ?
-  boost::random::uniform_real_distribution<float> random_distribution(
-      a, caffe_nextafter<float>(b));
-  Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-  for(int i = 0; i < n; i += 1)
-  {
-      r[i] = random_distribution(generator);
-  }
+void caffe_sqr<float>(const int n, const float* a, float* y) {
+  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
+  // v?Sqr Performs element by element squaring of the vector.
+  //vsSqr(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  caffe_powx<float>(n, a, 2, y);
+  // TODO: which is faster?
+//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
+//      const_map_vector_float_t(a, n);
 }
 
 template <>
-void caffe_vRngUniform<double>(const int n, double* r,
-    const double a, const double b) {
-  //VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-  //    n, r, a, b));
-
-    // FIXME check if boundaries are handled in the same way ?
-    boost::random::uniform_real_distribution<double> random_distribution(
-        a, caffe_nextafter<double>(b));
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
+void caffe_sqr<double>(const int n, const double* a, double* y) {
+  //vdSqr(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  caffe_powx<double>(n, a, 2, y);
 }
 
-template <>
-void caffe_vRngGaussian<float>(const int n, float* r, const float a,
-    const float sigma) {
-    DCHECK(sigma > 0);
-  //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-//      Caffe::vsl_stream(), n, r, a, sigma));
-
-    // FIXME check if parameters are handled in the same way ?
-    boost::normal_distribution<float> random_distribution(a, sigma);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
-
-template <>
-void caffe_vRngGaussian<double>(const int n, double* r, const double a,
-    const double sigma) {
-    DCHECK(sigma > 0);
-  //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-  //    Caffe::vsl_stream(), n, r, a, sigma));
-
-    // FIXME check if parameters are handled in the same way ?
-    boost::normal_distribution<double> random_distribution(a, sigma);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
-
-template <typename Dtype>
-void caffe_vRngBernoulli(const int n, Dtype* r, const double p)
-{
-    // FIXME check if parameters are handled in the same way ?
-    boost::bernoulli_distribution<Dtype> random_distribution(p);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
-template void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
-
-
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-    //vsExp(n, a, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().exp();
+  //vsExp(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-    //vdExp(n, a, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().exp();
+  //vdExp(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
 }
 
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b) {
+  return boost::math::nextafter<Dtype, Dtype>(
+      b, std::numeric_limits<Dtype>::max());
+}
+
+template
+float caffe_nextafter(const float b);
+
+template
+double caffe_nextafter(const double b);
+
+template <typename Dtype>
+void caffe_vRngUniform(const int n, Dtype* r,
+    const Dtype a, const Dtype b) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_LE(a, b);
+  //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
+  //    n, r, a, b));
+
+  // FIXME check if boundaries are handled in the same way ?
+  // Fixed by caffe_nextafter
+  boost::random::uniform_real_distribution<Dtype> random_distribution(
+      a, caffe_nextafter<Dtype>(b));
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
+  }
+}
+
+template
+void caffe_vRngUniform<float>(const int n, float* r,
+                                       const float a, const float b);
+template
+void caffe_vRngUniform<double>(const int n, double* r,
+                                       const double a, const double b);
+
+template <typename Dtype>
+void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
+    const Dtype sigma) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GT(sigma, 0);
+  //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
+//      Caffe::vsl_stream(), n, r, a, sigma));
+
+    // FIXME check if parameters are handled in the same way ?
+    // http://www.boost.org/doc/libs/1_55_0/doc/html/boost/random/normal_distribution.html
+    // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-63196F25-5013-4038-8BCD-2613C4EF3DE4.htm
+    // The above two documents show that the probability density functions are different.
+    // But the unit tests still pass. Maybe their codes are the same or
+    // the tests are irrelevant to the random numbers.
+  boost::normal_distribution<Dtype> random_distribution(a, sigma);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
+  }
+}
+
+template
+void caffe_vRngGaussian<float>(const int n, float* r, const float a,
+    const float sigma);
+
+template
+void caffe_vRngGaussian<double>(const int n, double* r, const double a,
+    const double sigma);
+
+template <typename Dtype>
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+    // FIXME check if parameters are handled in the same way ?
+  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
+  }
+}
+
+template
+void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
+
 template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
   return cblas_sdot(n, x, 1, y, 1);

From d37a995b9601b21952be142a86d599b333ce9e1d Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@eecs.berkeley.edu>
Date: Wed, 8 Jan 2014 16:36:52 -0800
Subject: [PATCH 06/24] relax precision of MultinomialLogisticLossLayer test

---
 src/caffe/test/test_multinomial_logistic_loss_layer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
index bb3e8921..5a61df79 100644
--- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
@@ -56,7 +56,7 @@ TYPED_TEST(MultinomialLogisticLossLayerTest, TestGradientCPU) {
   Caffe::set_mode(Caffe::CPU);
   MultinomialLogisticLossLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &this->blob_top_vec_);
-  GradientChecker<TypeParam> checker(1e-2, 1e-2, 1701, 0, 0.05);
+  GradientChecker<TypeParam> checker(1e-2, 2*1e-2, 1701, 0, 0.05);
   checker.CheckGradientSingle(&layer, &(this->blob_bottom_vec_),
       &(this->blob_top_vec_), 0, -1, -1);
 }

From 2ae2683fb84a210a7030efaf2287c75966260fac Mon Sep 17 00:00:00 2001
From: Alejandro Dubrovsky <alito@organicrobot.com>
Date: Wed, 22 Jan 2014 22:56:17 +1100
Subject: [PATCH 07/24] nextafter templates off one type

---
 src/caffe/util/math_functions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 46c82dbd..acd03439 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -378,7 +378,7 @@ void caffe_exp<double>(const int n, const double* a, double* y) {
 
 template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter<Dtype, Dtype>(
+  return boost::math::nextafter<Dtype>(
       b, std::numeric_limits<Dtype>::max());
 }
 

From b9257396d6548a67dd6e9ecade25970187fe6e03 Mon Sep 17 00:00:00 2001
From: Alejandro Dubrovsky <alito@organicrobot.com>
Date: Wed, 22 Jan 2014 22:56:57 +1100
Subject: [PATCH 08/24] mean_bound and sample_mean need referencing with this

---
 src/caffe/test/test_random_number_generator.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index 4c3358f9..26c9f2e3 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -43,8 +43,8 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
   TypeParam true_mean = mu;
   TypeParam true_std = sigma;
-  TypeParam bound = mean_bound(true_std, sample_size);
-  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
   EXPECT_NEAR(real_mean, true_mean, bound);
 }
 
@@ -57,8 +57,8 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
   caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
   TypeParam true_mean = (lower + upper) / 2;
   TypeParam true_std = (upper - lower) / sqrt(12);
-  TypeParam bound = mean_bound(true_std, sample_size);
-  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
   EXPECT_NEAR(real_mean, true_mean, bound);
 }
 

From 93c9f151dcd4fe4a5cfdc3a5c33f378e7b150648 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:14:09 -0800
Subject: [PATCH 09/24] make uniform distribution usage compatible with boost
 1.46

---
 src/caffe/util/math_functions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index acd03439..812708fa 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -399,7 +399,7 @@ void caffe_vRngUniform(const int n, Dtype* r,
 
   // FIXME check if boundaries are handled in the same way ?
   // Fixed by caffe_nextafter
-  boost::random::uniform_real_distribution<Dtype> random_distribution(
+  boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
 

From 4b1fba7be37f885a95807f3811852ac02bce8cbd Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:28:01 -0800
Subject: [PATCH 10/24] use boost variate_generator to pass tests w/ boost 1.46
 (Gaussian filler previously filled in all NaNs for me, making many tests
 fail)

---
 src/caffe/util/math_functions.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 812708fa..832f641c 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -432,9 +432,12 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
     // the tests are irrelevant to the random numbers.
   boost::normal_distribution<Dtype> random_distribution(a, sigma);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::normal_distribution<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for(int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 

From b3e4ac55fe42e98809857edd7ec1d2f6fbbb2335 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:42:12 -0800
Subject: [PATCH 11/24] change all Rng's to use variate_generator for
 consistency

---
 src/caffe/util/math_functions.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 832f641c..3e27f8dd 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -402,9 +402,12 @@ void caffe_vRngUniform(const int n, Dtype* r,
   boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::uniform_real<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 
@@ -436,7 +439,7 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
       boost::normal_distribution<Dtype> > variate_generator(
       generator, random_distribution);
 
-  for(int i = 0; i < n; ++i) {
+  for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
 }
@@ -458,9 +461,12 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
     // FIXME check if parameters are handled in the same way ?
   boost::bernoulli_distribution<Dtype> random_distribution(p);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::bernoulli_distribution<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 

From 6cbf9f189b9318b264c4cfe73bd1412eba4646f2 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 29 Jan 2014 13:03:42 -0800
Subject: [PATCH 12/24] add bernoulli rng test to demonstrate bug (generates
 all 0s unless p == 1)

---
 .../test/test_random_number_generator.cpp     | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index 26c9f2e3..c43a5d94 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -24,6 +24,15 @@ class RandomNumberGeneratorTest : public ::testing::Test {
       return sum / sample_size;
   }
 
+  Dtype sample_mean(const int* const seqs, const size_t sample_size)
+  {
+      Dtype sum = 0;
+      for (int i = 0; i < sample_size; ++i) {
+          sum += Dtype(seqs[i]);
+      }
+      return sum / sample_size;
+  }
+
   Dtype mean_bound(const Dtype std, const size_t sample_size)
   {
       return  std/sqrt((double)sample_size);
@@ -40,28 +49,47 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   Caffe::set_random_seed(1701);
   TypeParam mu = 0;
   TypeParam sigma = 1;
-  caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
+  caffe_vRngGaussian(sample_size,
+      (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
   TypeParam true_mean = mu;
   TypeParam true_std = sigma;
   TypeParam bound = this->mean_bound(true_std, sample_size);
-  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
-  EXPECT_NEAR(real_mean, true_mean, bound);
+  TypeParam empirical_mean =
+      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
+
 TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
   size_t sample_size = 10000;
   SyncedMemory data_a(sample_size * sizeof(TypeParam));
   Caffe::set_random_seed(1701);
   TypeParam lower = 0;
   TypeParam upper = 1;
-  caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
+  caffe_vRngUniform(sample_size,
+      (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
   TypeParam true_mean = (lower + upper) / 2;
   TypeParam true_std = (upper - lower) / sqrt(12);
   TypeParam bound = this->mean_bound(true_std, sample_size);
-  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
-  EXPECT_NEAR(real_mean, true_mean, bound);
+  TypeParam empirical_mean =
+      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
 
+TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(int));
+  Caffe::set_random_seed(1701);
+  double p = 0.3;
+  caffe_vRngBernoulli(sample_size, (int*)data_a.mutable_cpu_data(), p);
+  TypeParam true_mean = p;
+  TypeParam true_std = sqrt(p * (1 - p));
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam empirical_mean =
+      this->sample_mean((const int *)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
+}
+
 
 }  // namespace caffe

From 4f6b26632a9f201f5263cce9d1bbe9e43ec95347 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 29 Jan 2014 13:11:34 -0800
Subject: [PATCH 13/24] fix bernoulli generator bug

---
 src/caffe/util/math_functions.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3e27f8dd..d0841e21 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -458,11 +458,10 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
   CHECK(r);
   CHECK_GE(p, 0);
   CHECK_LE(p, 1);
-    // FIXME check if parameters are handled in the same way ?
-  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  boost::bernoulli_distribution<double> random_distribution(p);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
   boost::variate_generator<Caffe::random_generator_t,
-      boost::bernoulli_distribution<Dtype> > variate_generator(
+      boost::bernoulli_distribution<double> > variate_generator(
       generator, random_distribution);
 
   for (int i = 0; i < n; ++i) {

From 1cf822e53bee3eeca5dbc3c08a1e95171688ea9a Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Fri, 7 Feb 2014 18:44:10 +0800
Subject: [PATCH 14/24] Replace atlas with multithreaded OpenBLAS to speed-up
 on multi-core CPU

issue: #79
---
 Makefile | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 7e74f2ad..6cc8f1e4 100644
--- a/Makefile
+++ b/Makefile
@@ -87,16 +87,15 @@ MKL_INCLUDE_DIR := $(MKL_DIR)/include
 MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
 INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
-LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR) /usr/lib/atlas-base
+LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
 LIBRARIES := cudart cublas curand \
-	atlas cblas \
+	openblas \
 	pthread \
 	glog protobuf \
 	leveldb snappy \
 	boost_system \
 	hdf5_hl hdf5 \
 	opencv_core opencv_highgui opencv_imgproc
-	# mkl_rt mkl_intel_thread 
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall
 
@@ -104,7 +103,7 @@ COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
 NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
-		$(foreach library,$(LIBRARIES),-l$(library)) -Wl,-rpath=/usr/lib/atlas-base
+		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
 

From a8c9b66b7f62610d71a18c798d5eb7157d49420c Mon Sep 17 00:00:00 2001
From: Rowland Depp <depp.rowland@gmail.com>
Date: Tue, 11 Feb 2014 21:41:01 -0800
Subject: [PATCH 15/24] major refactoring allow coexistence of MKL and non-MKL
 cases

---
 Makefile                              |   8 ++
 Makefile.config.example               |   2 +
 include/caffe/util/math_functions.hpp |   7 +-
 include/caffe/util/mkl_alternate.hpp  |  95 ++++++++++++++++
 src/caffe/layers/loss_layer.cpp       |   2 +-
 src/caffe/solver.cpp                  |   2 +-
 src/caffe/util/math_functions.cpp     | 150 ++++----------------------
 7 files changed, 131 insertions(+), 135 deletions(-)
 create mode 100644 include/caffe/util/mkl_alternate.hpp

diff --git a/Makefile b/Makefile
index 6cc8f1e4..488acb42 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,14 @@ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
 		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
+# MKL options
+ifdef USE_MKL
+  LIBRARIES += mkl_rt
+  COMMON_FLAGS += -DUSE_MKL
+else
+  LIBRARIES += atlas cblas
+endif
+
 
 ##############################
 # Define build targets
diff --git a/Makefile.config.example b/Makefile.config.example
index cec85e0a..0ec2eead 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -10,6 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
     -gencode arch=compute_30,code=sm_30 \
     -gencode arch=compute_35,code=sm_35
 
+# If not using MKL, comment out the following line.
+# USE_MKL=1
 # MKL directory contains include/ and lib/ directions that we need.
 MKL_DIR := /opt/intel/mkl
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 1ff8a773..db19acc3 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -4,10 +4,11 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-//#include <mkl.h>
-#include <cblas.h>
+
 #include <cublas_v2.h>
 
+#include "caffe/util/mkl_alternate.hpp"
+
 namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
@@ -46,7 +47,7 @@ void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
     Dtype* Y);
 
 template <typename Dtype>
-void caffe_axpby(const int N, const Dtype alpha, const Dtype* X,
+void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
     const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
new file mode 100644
index 00000000..1c207c67
--- /dev/null
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -0,0 +1,95 @@
+// Copyright 2013 Rowland Depp
+
+#ifndef CAFFE_UTIL_MKL_ALTERNATE_H_
+#define CAFFE_UTIL_MKL_ALTERNATE_H_
+
+#ifdef USE_MKL
+
+#include <mkl.h>
+
+#else  // If use MKL, simply include the MKL header
+
+#include <cblas.h>
+#include <math.h>
+
+// Functions that caffe uses but are not present if MKL is not linked.
+
+// A simple way to define the vsl unary functions. The operation should
+// be in the form e.g. y[i] = sqrt(a[i])
+#define DEFINE_VSL_UNARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, float* y) { \
+    v##name<float>(n, a, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, double* y) { \
+    v##name<double>(n, a, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
+
+// A simple way to define the vsl unary functions with singular parameter b.
+// The operation should be in the form e.g. y[i] = pow(a[i], b)
+#define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, const float b, float* y) { \
+    v##name<float>(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const float b, double* y) { \
+    v##name<double>(n, a, b, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+
+// A simple way to define the vsl binary functions. The operation should
+// be in the form e.g. y[i] = a[i] + b[i]
+#define DEFINE_VSL_BINARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, const float* b, float* y) { \
+    v##name<float>(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double* b, double* y) { \
+    v##name<double>(n, a, b, y); \
+  }
+
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+
+// In addition, MKL comes with an additional function axpby that is not present
+// in standard blas. We will simply use a two-step (inefficient, of course) way
+// to mimic that.
+inline void cblas_saxpby(const int N, const float alpha, const float* X,
+                         const int incX, const float beta, float* Y,
+                         const int incY) {
+  cblas_sscal(N, beta, Y, incY);
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_daxpby(const int N, const double alpha, const double* X,
+                         const int incX, const double beta, double* Y,
+                         const int incY) {
+  cblas_dscal(N, beta, Y, incY);
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+
+#endif  // USE_MKL
+#endif  // CAFFE_UTIL_MKL_ALTERNATE_H_
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 3c0f15fb..ef0074d5 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -154,7 +154,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   int count = (*bottom)[0]->count();
   int num = (*bottom)[0]->num();
   // Compute the gradient
-  caffe_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
+  caffe_cpu_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
       (*bottom)[0]->mutable_cpu_diff());
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index eb024856..fb46c4ec 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -215,7 +215,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
       // Compute the value to history, and then copy them to the blob's diff.
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-      caffe_axpby(net_params[param_id]->count(), local_rate,
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
           net_params[param_id]->cpu_diff(), momentum,
           history_[param_id]->mutable_cpu_data());
       if (local_decay) {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d0841e21..fb2b1127 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -3,7 +3,6 @@
 
 #include <limits>
 //#include <mkl.h>
-#include <eigen3/Eigen/Dense>
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
@@ -13,23 +12,6 @@
 
 namespace caffe {
 
-// Operations on aligned memory are faster than on unaligned memory.
-// But unfortunately, the pointers passed in are not always aligned.
-// Therefore, the memory-aligned Eigen::Map objects that wrap them
-// cannot be assigned to. This happens in lrn_layer and makes
-// test_lrn_layer crash with segmentation fault.
-// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
-
-// Though the default map option is unaligned, making it explicit is no harm.
-//const int data_alignment = Eigen::Aligned; // how is data allocated ?
-const int data_alignment = Eigen::Unaligned;
-typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
-typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
-typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
-typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
-
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -126,7 +108,6 @@ template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
     double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
 
-
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) {
@@ -194,186 +175,95 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  // y := a*x + b*y
-  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_float_t y_map(Y, N);
-  // Eigen produces optimized code using lasy evaluation
-  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
-  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+                            const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-    // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_double_t y_map(Y, N);
-  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+                             const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
-      const_map_vector_float_t(b, n);
+  vsAdd(n, a, b, y);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
-      const_map_vector_double_t(b, n);
+  vdAdd(n, a, b, y);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
-      const_map_vector_float_t(b, n);
+  vsSub(n, a, b, y);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
-      const_map_vector_double_t(b, n);
+  vdSub(n, a, b, y);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-       const_map_vector_float_t(b, n);
+  vsMul(n, a, b, y);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
-      const_map_vector_double_t(b, n);
+  vdMul(n, a, b, y);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
-      const_map_vector_float_t(b, n);
+  vsDiv(n, a, b, y);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
-      const_map_vector_double_t(b, n);
+  vdDiv(n, a, b, y);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-  //vsPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
+  vsPowx(n, a, b, y);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-  //vdPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
+  vdPowx(n, a, b, y);
 }
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
-  // v?Sqr Performs element by element squaring of the vector.
-  //vsSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<float>(n, a, 2, y);
-  // TODO: which is faster?
-//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-//      const_map_vector_float_t(a, n);
+  vsSqr(n, a, y);
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  //vdSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<double>(n, a, 2, y);
+  vdSqr(n, a, y);
 }
 
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  //vsExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
+  vsExp(n, a, y);
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  //vdExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
+  vdExp(n, a, y);
 }
 
 template <typename Dtype>

From c028d09ca6e923f38beea3ba0877f31ff784191f Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Fri, 14 Feb 2014 21:27:20 -0800
Subject: [PATCH 16/24] rewrite MKL flag note, polish makefile

add MKL dirs conditioned on USE_MKL
include libraries before making LD_FLAGS
---
 Makefile                | 31 ++++++++++++++++---------------
 Makefile.config.example |  4 ++--
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index 488acb42..743a55f2 100644
--- a/Makefile
+++ b/Makefile
@@ -86,35 +86,36 @@ CUDA_LIB_DIR := $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib
 MKL_INCLUDE_DIR := $(MKL_DIR)/include
 MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
-INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
-LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
+INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR)
+LIBRARY_DIRS += $(CUDA_LIB_DIR)
 LIBRARIES := cudart cublas curand \
-	openblas \
 	pthread \
-	glog protobuf \
-	leveldb snappy \
+	glog protobuf leveldb snappy \
 	boost_system \
 	hdf5_hl hdf5 \
 	opencv_core opencv_highgui opencv_imgproc
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall
 
-COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
+COMMON_FLAGS := -DNDEBUG -O2
+
+# MKL switch
+ifdef USE_MKL
+  LIBRARIES += mkl_rt
+  COMMON_FLAGS += -DUSE_MKL
+  INCLUDE_DIRS += $(MKL_INCLUDE_DIR)
+  LIBRARY_DIRS += $(MKL_LIB_DIR)
+else
+  LIBRARIES += atlas cblas
+endif
+
+COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
 NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
 		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
-# MKL options
-ifdef USE_MKL
-  LIBRARIES += mkl_rt
-  COMMON_FLAGS += -DUSE_MKL
-else
-  LIBRARIES += atlas cblas
-endif
-
-
 ##############################
 # Define build targets
 ##############################
diff --git a/Makefile.config.example b/Makefile.config.example
index 0ec2eead..38af560b 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -10,8 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
     -gencode arch=compute_30,code=sm_30 \
     -gencode arch=compute_35,code=sm_35
 
-# If not using MKL, comment out the following line.
-# USE_MKL=1
+# If using MKL, uncomment the following line
+# USE_MKL := 1
 # MKL directory contains include/ and lib/ directions that we need.
 MKL_DIR := /opt/intel/mkl
 

From f6cbe2c5ce7b7acb32587c82a8f01f82bde24354 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Tue, 18 Feb 2014 11:10:23 -0800
Subject: [PATCH 17/24] make MKL switch surprise-proof

---
 Makefile                | 5 +++--
 Makefile.config.example | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 743a55f2..9f2e91cf 100644
--- a/Makefile
+++ b/Makefile
@@ -99,8 +99,9 @@ WARNINGS := -Wall
 
 COMMON_FLAGS := -DNDEBUG -O2
 
-# MKL switch
-ifdef USE_MKL
+# MKL switch (default = non-MKL)
+USE_MKL ?= 0
+ifeq ($(USE_MKL), 1)
   LIBRARIES += mkl_rt
   COMMON_FLAGS += -DUSE_MKL
   INCLUDE_DIRS += $(MKL_INCLUDE_DIR)
diff --git a/Makefile.config.example b/Makefile.config.example
index 38af560b..95656dd0 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -10,8 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
     -gencode arch=compute_30,code=sm_30 \
     -gencode arch=compute_35,code=sm_35
 
-# If using MKL, uncomment the following line
-# USE_MKL := 1
+# MKL switch: set to 1 for MKL
+USE_MKL := 0
 # MKL directory contains include/ and lib/ directions that we need.
 MKL_DIR := /opt/intel/mkl
 

From ff27988995bbece80f708dfc37140feaee92365c Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Wed, 26 Feb 2014 22:41:58 -0800
Subject: [PATCH 18/24] comment out stray mkl includes

---
 src/caffe/layers/inner_product_layer.cu | 2 +-
 src/caffe/test/test_util_blas.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index 178b488b..0d397dc0 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -1,7 +1,7 @@
 // Copyright 2013 Yangqing Jia
 
 
-#include <mkl.h>
+//#include <mkl.h>
 #include <cublas_v2.h>
 
 #include <vector>
diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp
index 3f3ff8b3..4ac49555 100644
--- a/src/caffe/test/test_util_blas.cpp
+++ b/src/caffe/test/test_util_blas.cpp
@@ -3,7 +3,7 @@
 #include <cstring>
 
 #include "cuda_runtime.h"
-#include "mkl.h"
+//#include "mkl.h"
 #include "cublas_v2.h"
 
 #include "gtest/gtest.h"

From 40aa12aa18ec66662b9261c494d937cb6464c806 Mon Sep 17 00:00:00 2001
From: jamt9000 <jamt9000@gmail.com>
Date: Mon, 3 Mar 2014 17:07:23 +0000
Subject: [PATCH 19/24] Fixed order of cblas and atlas linker flags

They were the wrong way round, causing linking to fail in some cases
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 9f2e91cf..e61fb63a 100644
--- a/Makefile
+++ b/Makefile
@@ -107,7 +107,7 @@ ifeq ($(USE_MKL), 1)
   INCLUDE_DIRS += $(MKL_INCLUDE_DIR)
   LIBRARY_DIRS += $(MKL_LIB_DIR)
 else
-  LIBRARIES += atlas cblas
+  LIBRARIES += cblas atlas
 endif
 
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))

From a9e772f8f7975a676440f522f3d78826462c3b83 Mon Sep 17 00:00:00 2001
From: James Thewlis <james.thewlis@mirriad.com>
Date: Mon, 3 Mar 2014 17:43:20 +0000
Subject: [PATCH 20/24] Added extern C wrapper to cblas.h include

This ensures that it works with ATLAS's header file, which doesn't include such
a guard itself (whereas the reference version from Ubuntu's libblas-dev does)
---
 include/caffe/util/mkl_alternate.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index 1c207c67..39038dd1 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -9,7 +9,9 @@
 
 #else  // If use MKL, simply include the MKL header
 
+extern "C" {
 #include <cblas.h>
+}
 #include <math.h>
 
 // Functions that caffe uses but are not present if MKL is not linked.

From 453fcf909522937abf1bd4e44efa4932d5d4aca6 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Fri, 21 Mar 2014 14:58:11 -0700
Subject: [PATCH 21/24] clean up residual mkl comments and code

The FIXMEs about RNG were addressed by caffe_nextafter for
uniform distributions and the normal distribution concern is surely a
typo in the boost documentation, since the normal pdf is correctly
stated elsewhere in the documentation.
---
 include/caffe/common.hpp                 | 16 ++++------------
 include/caffe/filler.hpp                 |  1 -
 src/caffe/common.cpp                     | 14 +-------------
 src/caffe/layers/dropout_layer.cpp       |  2 --
 src/caffe/layers/inner_product_layer.cpp |  3 ---
 src/caffe/layers/inner_product_layer.cu  |  2 --
 src/caffe/test/test_common.cpp           | 11 -----------
 src/caffe/test/test_util_blas.cpp        |  1 -
 src/caffe/util/math_functions.cpp        | 20 +++-----------------
 9 files changed, 8 insertions(+), 62 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 9621b261..2ffc93f2 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -8,16 +8,13 @@
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <curand.h>
-// cuda driver types
-#include <driver_types.h>
+#include <driver_types.h>  // cuda driver types
 #include <glog/logging.h>
-//#include <mkl_vsl.h>
 
 // various checks for different function calls.
 #define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
 #define CUBLAS_CHECK(condition) CHECK_EQ((condition), CUBLAS_STATUS_SUCCESS)
 #define CURAND_CHECK(condition) CHECK_EQ((condition), CURAND_STATUS_SUCCESS)
-#define VSL_CHECK(condition) CHECK_EQ((condition), VSL_STATUS_OK)
 
 #define CUDA_KERNEL_LOOP(i, n) \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
@@ -46,7 +43,6 @@ private:\
 // is executed we will see a fatal log.
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
 
-
 namespace caffe {
 
 // We will use the boost shared_ptr instead of the new C++11 one mainly
@@ -62,7 +58,6 @@ using boost::shared_ptr;
 #endif
 
 
-
 inline int CAFFE_GET_BLOCKS(const int N) {
   return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
 }
@@ -90,11 +85,9 @@ class Caffe {
     return Get().curand_generator_;
   }
 
-  // Returns the MKL random stream.
-  //inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
-
+  // boost RNG
   typedef boost::mt19937 random_generator_t;
-  inline static random_generator_t &vsl_stream() { return Get().random_generator_; }
+  inline static random_generator_t &rng_stream() { return Get().random_generator_; }
 
   // Returns the mode: running on CPU or GPU.
   inline static Brew mode() { return Get().mode_; }
@@ -108,7 +101,7 @@ class Caffe {
   inline static void set_mode(Brew mode) { Get().mode_ = mode; }
   // Sets the phase.
   inline static void set_phase(Phase phase) { Get().phase_ = phase; }
-  // Sets the random seed of both MKL and curand
+  // Sets the random seed of both boost and curand
   static void set_random_seed(const unsigned int seed);
   // Sets the device. Since we have cublas and curand stuff, set device also
   // requires us to reset those values.
@@ -119,7 +112,6 @@ class Caffe {
  protected:
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
-  //VSLStreamStatePtr vsl_stream_;
   random_generator_t random_generator_;
 
   Brew mode_;
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index d0b5baa0..7c100224 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -7,7 +7,6 @@
 #ifndef CAFFE_FILLER_HPP
 #define CAFFE_FILLER_HPP
 
-//#include <mkl.h>
 #include <string>
 
 #include "caffe/common.hpp"
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 95a5e93a..29501bb6 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -22,7 +22,6 @@ int64_t cluster_seedgen(void) {
 Caffe::Caffe()
     : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
       curand_generator_(NULL),
-      //vsl_stream_(NULL)
       random_generator_()
 {
   // Try to create a cublas handler, and report an error if failed (but we will
@@ -37,13 +36,6 @@ Caffe::Caffe()
       != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
-
-  // Try to create a vsl stream. This should almost always work, but we will
-  // check it anyway.
-  //if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
-  //  LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
-  //      << "won't be available.";
-  //}
 }
 
 Caffe::~Caffe() {
@@ -51,7 +43,6 @@ Caffe::~Caffe() {
   if (curand_generator_) {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
-  //if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
@@ -67,11 +58,8 @@ void Caffe::set_random_seed(const unsigned int seed) {
   } else {
     LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
   }
-  // VSL seed
-  //VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
-  //VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
+  // RNG seed
   Get().random_generator_ = random_generator_t(seed);
-
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index bfb854bc..f07547ad 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -32,8 +32,6 @@ Dtype DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int count = bottom[0]->count();
   if (Caffe::phase() == Caffe::TRAIN) {
     // Create random numbers
-    //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-    //    count, mask, 1. - threshold_);
     caffe_vRngBernoulli<int>(count, mask, 1. - threshold_);
     for (int i = 0; i < count; ++i) {
       top_data[i] = bottom_data[i] * mask[i] * scale_;
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index a00e2f21..6ea228fe 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -1,8 +1,5 @@
 // Copyright 2013 Yangqing Jia
 
-
-//#include <mkl.h>
-
 #include <vector>
 
 #include "caffe/blob.hpp"
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index 0d397dc0..37463b5a 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -1,7 +1,5 @@
 // Copyright 2013 Yangqing Jia
 
-
-//#include <mkl.h>
 #include <cublas_v2.h>
 
 #include <vector>
diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp
index f5e3fe47..3ce15bba 100644
--- a/src/caffe/test/test_common.cpp
+++ b/src/caffe/test/test_common.cpp
@@ -19,11 +19,6 @@ TEST_F(CommonTest, TestCublasHandler) {
   EXPECT_TRUE(Caffe::cublas_handle());
 }
 
-TEST_F(CommonTest, TestVslStream) {
-  //EXPECT_TRUE(Caffe::vsl_stream());
-    EXPECT_TRUE(true);
-}
-
 TEST_F(CommonTest, TestBrewMode) {
   Caffe::set_mode(Caffe::CPU);
   EXPECT_EQ(Caffe::mode(), Caffe::CPU);
@@ -41,13 +36,9 @@ TEST_F(CommonTest, TestRandSeedCPU) {
   SyncedMemory data_a(10 * sizeof(int));
   SyncedMemory data_b(10 * sizeof(int));
   Caffe::set_random_seed(1701);
-  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-  //      10, (int*)data_a.mutable_cpu_data(), 0.5);
   caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_a.mutable_cpu_data()), 0.5);
 
   Caffe::set_random_seed(1701);
-  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-  //      10, (int*)data_b.mutable_cpu_data(), 0.5);
   caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_b.mutable_cpu_data()), 0.5);
 
   for (int i = 0; i < 10; ++i) {
@@ -56,7 +47,6 @@ TEST_F(CommonTest, TestRandSeedCPU) {
   }
 }
 
-
 TEST_F(CommonTest, TestRandSeedGPU) {
   SyncedMemory data_a(10 * sizeof(unsigned int));
   SyncedMemory data_b(10 * sizeof(unsigned int));
@@ -72,5 +62,4 @@ TEST_F(CommonTest, TestRandSeedGPU) {
   }
 }
 
-
 }  // namespace caffe
diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp
index 4ac49555..57f4eafc 100644
--- a/src/caffe/test/test_util_blas.cpp
+++ b/src/caffe/test/test_util_blas.cpp
@@ -3,7 +3,6 @@
 #include <cstring>
 
 #include "cuda_runtime.h"
-//#include "mkl.h"
 #include "cublas_v2.h"
 
 #include "gtest/gtest.h"
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index fb2b1127..d68c05c3 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -2,7 +2,6 @@
 // Copyright 2014 kloudkl@github
 
 #include <limits>
-//#include <mkl.h>
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
@@ -284,14 +283,10 @@ void caffe_vRngUniform(const int n, Dtype* r,
   CHECK_GE(n, 0);
   CHECK(r);
   CHECK_LE(a, b);
-  //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-  //    n, r, a, b));
 
-  // FIXME check if boundaries are handled in the same way ?
-  // Fixed by caffe_nextafter
   boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
-  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  Caffe::random_generator_t &generator = Caffe::rng_stream();
   boost::variate_generator<Caffe::random_generator_t,
       boost::uniform_real<Dtype> > variate_generator(
       generator, random_distribution);
@@ -314,17 +309,8 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
   CHECK_GE(n, 0);
   CHECK(r);
   CHECK_GT(sigma, 0);
-  //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-//      Caffe::vsl_stream(), n, r, a, sigma));
-
-    // FIXME check if parameters are handled in the same way ?
-    // http://www.boost.org/doc/libs/1_55_0/doc/html/boost/random/normal_distribution.html
-    // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-63196F25-5013-4038-8BCD-2613C4EF3DE4.htm
-    // The above two documents show that the probability density functions are different.
-    // But the unit tests still pass. Maybe their codes are the same or
-    // the tests are irrelevant to the random numbers.
   boost::normal_distribution<Dtype> random_distribution(a, sigma);
-  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  Caffe::random_generator_t &generator = Caffe::rng_stream();
   boost::variate_generator<Caffe::random_generator_t,
       boost::normal_distribution<Dtype> > variate_generator(
       generator, random_distribution);
@@ -349,7 +335,7 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
   CHECK_GE(p, 0);
   CHECK_LE(p, 1);
   boost::bernoulli_distribution<double> random_distribution(p);
-  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  Caffe::random_generator_t &generator = Caffe::rng_stream();
   boost::variate_generator<Caffe::random_generator_t,
       boost::bernoulli_distribution<double> > variate_generator(
       generator, random_distribution);

From aaa26466eb74f94f5d403cf3cc2b5fb6e0a17a06 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Fri, 21 Mar 2014 15:50:43 -0700
Subject: [PATCH 22/24] lint

---
 include/caffe/common.hpp                      |  4 ++-
 src/caffe/common.cpp                          |  3 +-
 src/caffe/test/test_common.cpp                |  6 ++--
 .../test/test_random_number_generator.cpp     | 32 +++++++++++--------
 src/caffe/util/math_functions.cpp             |  5 +--
 5 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 2ffc93f2..2647b0f7 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -87,7 +87,9 @@ class Caffe {
 
   // boost RNG
   typedef boost::mt19937 random_generator_t;
-  inline static random_generator_t &rng_stream() { return Get().random_generator_; }
+  inline static random_generator_t &rng_stream() {
+    return Get().random_generator_;
+  }
 
   // Returns the mode: running on CPU or GPU.
   inline static Brew mode() { return Get().mode_; }
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 29501bb6..ad523715 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -22,8 +22,7 @@ int64_t cluster_seedgen(void) {
 Caffe::Caffe()
     : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
       curand_generator_(NULL),
-      random_generator_()
-{
+      random_generator_() {
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp
index 3ce15bba..12e71688 100644
--- a/src/caffe/test/test_common.cpp
+++ b/src/caffe/test/test_common.cpp
@@ -36,10 +36,12 @@ TEST_F(CommonTest, TestRandSeedCPU) {
   SyncedMemory data_a(10 * sizeof(int));
   SyncedMemory data_b(10 * sizeof(int));
   Caffe::set_random_seed(1701);
-  caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_a.mutable_cpu_data()), 0.5);
+  caffe_vRngBernoulli(10,
+      reinterpret_cast<int*>(data_a.mutable_cpu_data()), 0.5);
 
   Caffe::set_random_seed(1701);
-  caffe_vRngBernoulli(10, reinterpret_cast<int*>(data_b.mutable_cpu_data()), 0.5);
+  caffe_vRngBernoulli(10,
+      reinterpret_cast<int*>(data_b.mutable_cpu_data()), 0.5);
 
   for (int i = 0; i < 10; ++i) {
     EXPECT_EQ(((const int*)(data_a.cpu_data()))[i],
diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index c43a5d94..6722f412 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -1,6 +1,11 @@
+// Copyright 2014 kloudkl@github
+// Copyright 2014 Jeff Donahue
+// Copyright 2014 Alejandro Dubrovsky
+// Copyright 2014 Evan Shelhamer
+
+#include <cuda_runtime.h>
 #include <cmath>
 #include <cstring>
-#include <cuda_runtime.h>
 
 #include "gtest/gtest.h"
 #include "caffe/common.hpp"
@@ -15,8 +20,7 @@ class RandomNumberGeneratorTest : public ::testing::Test {
  public:
   virtual ~RandomNumberGeneratorTest() {}
 
-  Dtype sample_mean(const Dtype* const seqs, const size_t sample_size)
-  {
+  Dtype sample_mean(const Dtype* const seqs, const size_t sample_size) {
       double sum = 0;
       for (int i = 0; i < sample_size; ++i) {
           sum += seqs[i];
@@ -24,8 +28,7 @@ class RandomNumberGeneratorTest : public ::testing::Test {
       return sum / sample_size;
   }
 
-  Dtype sample_mean(const int* const seqs, const size_t sample_size)
-  {
+  Dtype sample_mean(const int* const seqs, const size_t sample_size) {
       Dtype sum = 0;
       for (int i = 0; i < sample_size; ++i) {
           sum += Dtype(seqs[i]);
@@ -33,9 +36,8 @@ class RandomNumberGeneratorTest : public ::testing::Test {
       return sum / sample_size;
   }
 
-  Dtype mean_bound(const Dtype std, const size_t sample_size)
-  {
-      return  std/sqrt((double)sample_size);
+  Dtype mean_bound(const Dtype std, const size_t sample_size) {
+      return  std/sqrt(static_cast<double>(sample_size));
   }
 };
 
@@ -43,6 +45,7 @@ class RandomNumberGeneratorTest : public ::testing::Test {
 typedef ::testing::Types<float, double> Dtypes;
 TYPED_TEST_CASE(RandomNumberGeneratorTest, Dtypes);
 
+
 TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   size_t sample_size = 10000;
   SyncedMemory data_a(sample_size * sizeof(TypeParam));
@@ -50,12 +53,13 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   TypeParam mu = 0;
   TypeParam sigma = 1;
   caffe_vRngGaussian(sample_size,
-      (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
+      reinterpret_cast<TypeParam*>(data_a.mutable_cpu_data()), mu, sigma);
   TypeParam true_mean = mu;
   TypeParam true_std = sigma;
   TypeParam bound = this->mean_bound(true_std, sample_size);
   TypeParam empirical_mean =
-      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+      this->sample_mean(reinterpret_cast<const TypeParam*>(data_a.cpu_data()),
+          sample_size);
   EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
@@ -67,12 +71,13 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
   TypeParam lower = 0;
   TypeParam upper = 1;
   caffe_vRngUniform(sample_size,
-      (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
+      reinterpret_cast<TypeParam*>(data_a.mutable_cpu_data()), lower, upper);
   TypeParam true_mean = (lower + upper) / 2;
   TypeParam true_std = (upper - lower) / sqrt(12);
   TypeParam bound = this->mean_bound(true_std, sample_size);
   TypeParam empirical_mean =
-      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+      this->sample_mean(reinterpret_cast<const TypeParam*>(data_a.cpu_data()),
+          sample_size);
   EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
@@ -82,7 +87,8 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli) {
   SyncedMemory data_a(sample_size * sizeof(int));
   Caffe::set_random_seed(1701);
   double p = 0.3;
-  caffe_vRngBernoulli(sample_size, (int*)data_a.mutable_cpu_data(), p);
+  caffe_vRngBernoulli(sample_size,
+      static_cast<int*>(data_a.mutable_cpu_data()), p);
   TypeParam true_mean = p;
   TypeParam true_std = sqrt(p * (1 - p));
   TypeParam bound = this->mean_bound(true_std, sample_size);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d68c05c3..3da4b21b 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,11 +1,12 @@
 // Copyright 2013 Yangqing Jia
 // Copyright 2014 kloudkl@github
 
-#include <limits>
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
-
 #include <cublas_v2.h>
+
+#include <limits>
+
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
 

From 19bcf2b29bf9e48ff84d18763c6d2b5f41e5bdcd Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Fri, 21 Mar 2014 23:47:01 -0700
Subject: [PATCH 23/24] Hide boost rng behind facade for osx compatibility

Split boost random number generation from the common Caffe singleton and
add a helper function for rng. This resolves a build conflict in OSX
between boost rng and nvcc compilation of cuda code.

Refer to #165 for a full discussion.

Thanks to @satol for suggesting a random number generation facade rather
than a total split of cpp and cu code, which is far more involved.
---
 include/caffe/common.hpp          | 97 ++++++++++++++++++-------------
 include/caffe/util/rng.hpp        | 19 ++++++
 src/caffe/common.cpp              | 38 +++++++++++-
 src/caffe/util/math_functions.cpp | 17 +++---
 4 files changed, 120 insertions(+), 51 deletions(-)
 create mode 100644 include/caffe/util/rng.hpp

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 2647b0f7..ca5a3485 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -1,9 +1,9 @@
 // Copyright 2013 Yangqing Jia
+// Copyright 2014 Evan Shelhamer
 
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
 
-#include <boost/random/mersenne_twister.hpp>
 #include <boost/shared_ptr.hpp>
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -11,23 +11,6 @@
 #include <driver_types.h>  // cuda driver types
 #include <glog/logging.h>
 
-// various checks for different function calls.
-#define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
-#define CUBLAS_CHECK(condition) CHECK_EQ((condition), CUBLAS_STATUS_SUCCESS)
-#define CURAND_CHECK(condition) CHECK_EQ((condition), CURAND_STATUS_SUCCESS)
-
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
-
-// After a kernel is executed, this will check the error and if there is one,
-// exit loudly.
-#define CUDA_POST_KERNEL_CHECK \
-  if (cudaSuccess != cudaPeekAtLastError()) \
-    LOG(FATAL) << "Cuda kernel failed. Error: " \
-        << cudaGetErrorString(cudaPeekAtLastError())
-
 // Disable the copy and assignment operator for a class.
 #define DISABLE_COPY_AND_ASSIGN(classname) \
 private:\
@@ -43,6 +26,24 @@ private:\
 // is executed we will see a fatal log.
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
 
+// CUDA: various checks for different function calls.
+#define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
+#define CUBLAS_CHECK(condition) CHECK_EQ((condition), CUBLAS_STATUS_SUCCESS)
+#define CURAND_CHECK(condition) CHECK_EQ((condition), CURAND_STATUS_SUCCESS)
+
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// CUDA: check for error after kernel execution and exit loudly if there is one.
+#define CUDA_POST_KERNEL_CHECK \
+  if (cudaSuccess != cudaPeekAtLastError()) \
+    LOG(FATAL) << "Cuda kernel failed. Error: " \
+        << cudaGetErrorString(cudaPeekAtLastError())
+
+
 namespace caffe {
 
 // We will use the boost shared_ptr instead of the new C++11 one mainly
@@ -50,19 +51,6 @@ namespace caffe {
 using boost::shared_ptr;
 
 
-// We will use 1024 threads per block, which requires cuda sm_2x or above.
-#if __CUDA_ARCH__ >= 200
-    const int CAFFE_CUDA_NUM_THREADS = 1024;
-#else
-    const int CAFFE_CUDA_NUM_THREADS = 512;
-#endif
-
-
-inline int CAFFE_GET_BLOCKS(const int N) {
-  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
-}
-
-
 // A singleton class to hold common caffe stuff, such as the handler that
 // caffe is going to use for cublas, curand, etc.
 class Caffe {
@@ -77,20 +65,32 @@ class Caffe {
   enum Brew { CPU, GPU };
   enum Phase { TRAIN, TEST };
 
-  // The getters for the variables.
-  // Returns the cublas handle.
+
+  // This random number generator facade hides boost and CUDA rng
+  // implementation from one another (for cross-platform compatibility).
+  class RNG {
+   public:
+    RNG();
+    explicit RNG(unsigned int seed);
+    ~RNG();
+    RNG(const RNG&);
+    RNG& operator=(const RNG&);
+    const void* generator() const;
+    void* generator();
+   private:
+    class Generator;
+    Generator* generator_;
+  };
+
+  // Getters for boost rng, curand, and cublas handles
+  inline static RNG &rng_stream() {
+    return Get().random_generator_;
+  }
   inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
-  // Returns the curand generator.
   inline static curandGenerator_t curand_generator() {
     return Get().curand_generator_;
   }
 
-  // boost RNG
-  typedef boost::mt19937 random_generator_t;
-  inline static random_generator_t &rng_stream() {
-    return Get().random_generator_;
-  }
-
   // Returns the mode: running on CPU or GPU.
   inline static Brew mode() { return Get().mode_; }
   // Returns the phase: TRAIN or TEST.
@@ -114,7 +114,7 @@ class Caffe {
  protected:
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
-  random_generator_t random_generator_;
+  RNG random_generator_;
 
   Brew mode_;
   Phase phase_;
@@ -128,6 +128,21 @@ class Caffe {
 };
 
 
+// CUDA: thread number configuration.
+// Use 1024 threads per block, which requires cuda sm_2x or above,
+// or fall back to attempt compatibility (best of luck to you).
+#if __CUDA_ARCH__ >= 200
+    const int CAFFE_CUDA_NUM_THREADS = 1024;
+#else
+    const int CAFFE_CUDA_NUM_THREADS = 512;
+#endif
+
+// CUDA: number of blocks for threads.
+inline int CAFFE_GET_BLOCKS(const int N) {
+  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
+}
+
+
 }  // namespace caffe
 
 #endif  // CAFFE_COMMON_HPP_
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
new file mode 100644
index 00000000..c7530c70
--- /dev/null
+++ b/include/caffe/util/rng.hpp
@@ -0,0 +1,19 @@
+// Copyright 2014 Evan Shelhamer
+
+#ifndef CAFFE_RNG_CPP_HPP_
+#define CAFFE_RNG_CPP_HPP_
+
+#include <boost/random/mersenne_twister.hpp>
+#include "caffe/common.hpp"
+
+namespace caffe {
+
+  typedef boost::mt19937 rng_t;
+  inline rng_t& caffe_rng() {
+    Caffe::RNG &generator = Caffe::rng_stream();
+    return *(caffe::rng_t*) generator.generator();
+  }
+
+}  // namespace caffe
+
+#endif  // CAFFE_RNG_HPP_
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index ad523715..a25dfda8 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -1,15 +1,18 @@
 // Copyright 2013 Yangqing Jia
+// Copyright 2014 Evan Shelhamer
 
 #include <cstdio>
 #include <ctime>
 
 #include "caffe/common.hpp"
+#include "caffe/util/rng.hpp"
 
 namespace caffe {
 
 shared_ptr<Caffe> Caffe::singleton_;
 
 
+// curand seeding
 int64_t cluster_seedgen(void) {
   int64_t s, seed, pid;
   pid = getpid();
@@ -58,7 +61,7 @@ void Caffe::set_random_seed(const unsigned int seed) {
     LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
   }
   // RNG seed
-  Get().random_generator_ = random_generator_t(seed);
+  Get().random_generator_ = RNG(seed);
 }
 
 void Caffe::SetDevice(const int device_id) {
@@ -112,4 +115,37 @@ void Caffe::DeviceQuery() {
   return;
 }
 
+
+class Caffe::RNG::Generator {
+ public:
+  caffe::rng_t rng;
+};
+
+Caffe::RNG::RNG()
+: generator_(new Generator) { }
+
+Caffe::RNG::RNG(unsigned int seed)
+: generator_(new Generator) {
+  generator_->rng = caffe::rng_t(seed);
+}
+
+Caffe::RNG::~RNG() { delete generator_; }
+
+Caffe::RNG::RNG(const RNG& other) : generator_(new Generator) {
+  *generator_ = *other.generator_;
+}
+
+Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
+  *generator_ = *other.generator_;
+  return *this;
+}
+
+void* Caffe::RNG::generator() {
+  return &generator_->rng;
+}
+
+const void* Caffe::RNG::generator() const {
+  return &generator_->rng;
+}
+
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3da4b21b..3d02c5ff 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,5 +1,6 @@
 // Copyright 2013 Yangqing Jia
 // Copyright 2014 kloudkl@github
+// Copyright 2014 Evan Shelhamer
 
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
@@ -9,6 +10,7 @@
 
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
+#include "caffe/util/rng.hpp"
 
 namespace caffe {
 
@@ -287,10 +289,9 @@ void caffe_vRngUniform(const int n, Dtype* r,
 
   boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
-  Caffe::random_generator_t &generator = Caffe::rng_stream();
-  boost::variate_generator<Caffe::random_generator_t,
+  boost::variate_generator<caffe::rng_t,
       boost::uniform_real<Dtype> > variate_generator(
-      generator, random_distribution);
+      caffe_rng(), random_distribution);
 
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
@@ -311,10 +312,9 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
   CHECK(r);
   CHECK_GT(sigma, 0);
   boost::normal_distribution<Dtype> random_distribution(a, sigma);
-  Caffe::random_generator_t &generator = Caffe::rng_stream();
-  boost::variate_generator<Caffe::random_generator_t,
+  boost::variate_generator<caffe::rng_t,
       boost::normal_distribution<Dtype> > variate_generator(
-      generator, random_distribution);
+      caffe_rng(), random_distribution);
 
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
@@ -336,10 +336,9 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
   CHECK_GE(p, 0);
   CHECK_LE(p, 1);
   boost::bernoulli_distribution<double> random_distribution(p);
-  Caffe::random_generator_t &generator = Caffe::rng_stream();
-  boost::variate_generator<Caffe::random_generator_t,
+  boost::variate_generator<caffe::rng_t,
       boost::bernoulli_distribution<double> > variate_generator(
-      generator, random_distribution);
+      caffe_rng(), random_distribution);
 
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();

From bece205114fa666ed390e17dd84a522c43a4f2d6 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Sat, 22 Mar 2014 01:27:42 -0700
Subject: [PATCH 24/24] Set copyright to BVLC and contributors.

The exact details of the contributions are recorded by versioning.
---
 include/caffe/common.hpp                        | 3 +--
 include/caffe/util/rng.hpp                      | 2 +-
 src/caffe/common.cpp                            | 3 +--
 src/caffe/test/test_random_number_generator.cpp | 5 +----
 src/caffe/util/math_functions.cpp               | 4 +---
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index ca5a3485..5344139c 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -1,5 +1,4 @@
-// Copyright 2013 Yangqing Jia
-// Copyright 2014 Evan Shelhamer
+// Copyright 2014 BVLC and contributors.
 
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index c7530c70..8151a9a6 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -1,4 +1,4 @@
-// Copyright 2014 Evan Shelhamer
+// Copyright 2014 BVLC and contributors.
 
 #ifndef CAFFE_RNG_CPP_HPP_
 #define CAFFE_RNG_CPP_HPP_
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index a25dfda8..59cbc56b 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -1,5 +1,4 @@
-// Copyright 2013 Yangqing Jia
-// Copyright 2014 Evan Shelhamer
+// Copyright 2014 BVLC and contributors.
 
 #include <cstdio>
 #include <ctime>
diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index 6722f412..267e7731 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -1,7 +1,4 @@
-// Copyright 2014 kloudkl@github
-// Copyright 2014 Jeff Donahue
-// Copyright 2014 Alejandro Dubrovsky
-// Copyright 2014 Evan Shelhamer
+// Copyright 2014 BVLC and contributors.
 
 #include <cuda_runtime.h>
 #include <cmath>
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3d02c5ff..29bdaf6c 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,6 +1,4 @@
-// Copyright 2013 Yangqing Jia
-// Copyright 2014 kloudkl@github
-// Copyright 2014 Evan Shelhamer
+// Copyright 2014 BVLC and contributors.
 
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>