diff --git a/Makefile b/Makefile
index a81e2338b..33753d12c 100644
--- a/Makefile
+++ b/Makefile
@@ -132,6 +132,13 @@ ifeq ("$(MATHLIB)","mkl")
   CPPFLAGS += -DUSE_MKL
 endif
 
+ifeq ("$(MATHLIB)","openblas")
+  INCLUDEPATH += $(OPENBLAS_PATH)/include
+  LIBPATH += $(OPENBLAS_PATH)/lib
+  LIBS += -lopenblas -lm -lpthread
+  CPPFLAGS += -DUSE_OPENBLAS
+endif
+
 ifdef KALDI_PATH
 ########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
index 309c0c0f7..0584d0ef0 100644
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@@ -25,9 +25,6 @@
 #define NOMINMAX
 #include "Windows.h"
 #else
-#ifndef max
-#define max(a, b) (((a) > (b)) ? (a) : (b))
-#endif
 #include <cfloat>
 #endif
 
@@ -38,20 +35,27 @@
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 #pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
 
-#ifndef USE_MKL
-// use ACML as default.
+#ifdef USE_ACML
 // Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
 // Install the ifort64_mp variant (compiled with intel compiler) of the library
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.1 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -878,7 +882,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
         foreach_column (j, us)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
 #else
             cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
@@ -892,7 +896,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
         {
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
 #else
                 cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
@@ -1316,9 +1320,9 @@ ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
         const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
 
         if (signs[i] * grad_sign > 0)
-            steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
+            steps[i] = std::min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
         else
-            steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
+            steps[i] = std::max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
 
         a = steps[i] / sqrt(avars[i] + floor);
         curr_grad[i] *= a;
@@ -2237,7 +2241,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<El
@@ -2810,7 +2814,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
 #else
        return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
@@ -2817,7 +2821,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
        return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
 #else
        return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
@@ -2990,7 +2994,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3003,7 +3007,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3020,7 +3024,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
@@ -3033,7 +3037,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
@@ -3073,7 +3077,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_row (i, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(0, j) = v;
         }
@@ -3088,7 +3092,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_column (j, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(i, 0) = v;
         }
@@ -3282,7 +3286,7 @@ ElemType CPUMatrix<ElemType>::MatrixNormInf() const
     {
 #pragma omp critical
         {
-            v = max(v, abs(us(i, j)));
+            v = std::max(v, abs(us(i, j)));
         }
     }
     return v;
@@ -3866,8 +3870,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignMaxPoolingResult(const CPUMatrix
             for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
             {
                 const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels];
-                maxVal = max(maxVal, val);
-                minVal = min(minVal, val);
+                maxVal = std::max(maxVal, val);
+                minVal = std::min(minVal, val);
                 rowInInput += (long) channels;
             }
         }
@@ -4040,7 +4044,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     int m, n, k, l;
     int lda, ldb, ldc;
-#ifndef USE_MKL
+#ifdef USE_ACML
     char transA, transB;
 #else
     CBLAS_TRANSPOSE mklTransA;
@@ -4052,7 +4056,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumCols();
         k = (int) a.GetNumRows();
         lda = k;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::Trans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasTrans;
@@ -4063,7 +4067,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumRows();
         k = (int) a.GetNumCols();
         lda = m;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::NoTrans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4075,7 +4079,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumCols();
         n = (int) b.GetNumRows();
         ldb = n;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::Trans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasTrans;
@@ -4086,7 +4090,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumRows();
         n = (int) b.GetNumCols();
         ldb = l;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::NoTrans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4107,7 +4111,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
 #else
         cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
@@ -4116,7 +4120,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
 #else
         cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
@@ -4160,34 +4164,42 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
     ldu = m;
     ldvt = n;
     U.Resize(m, m);
-    SIGMA.Resize(min(m, n), 1);
+    SIGMA.Resize(std::min(m, n), 1);
     VT.Resize(n, n);
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         double wkopt;
         int lwork = -1;
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, reinterpret_cast<double*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray),
+                              reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
     else
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
 #pragma warning(suppress : 4244)
         sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         float wkopt;
         int lwork = -1;
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, reinterpret_cast<float*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray),
+                              reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
@@ -4383,7 +4395,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         if (sizeof(ElemType) == sizeof(double))
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
 #else
             cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
@@ -4392,7 +4404,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         else
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
 #else
             cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
@@ -4433,7 +4445,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_column (j, c)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
                 cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4446,7 +4458,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_column (j, c)
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
                 cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4467,7 +4479,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_row (i, c)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
 #else
                 cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
@@ -4480,7 +4492,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_row (i, c)
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
 #else
                 cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
@@ -4682,7 +4694,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
 #else
         cblas_dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
@@ -4691,7 +4703,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
 #else
         cblas_sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
@@ -4741,7 +4753,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4754,7 +4766,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4771,7 +4783,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -4784,7 +4796,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -4813,7 +4825,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
@@ -4822,7 +4834,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
@@ -5052,7 +5064,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5062,7 +5074,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5074,7 +5086,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5084,7 +5096,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5104,7 +5116,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -5117,7 +5129,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -5527,7 +5539,7 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     int mthreads = (int) std::thread::hardware_concurrency();
 
     if (numThreads <= 0)
-        numThreads = max(1, mthreads + numThreads);
+        numThreads = std::max(1, mthreads + numThreads);
     if (numThreads > mthreads)
         numThreads = mthreads;
 
@@ -5535,10 +5547,12 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     omp_set_num_threads(numThreads);
     numThreads = omp_get_max_threads();
 
-#ifndef USE_MKL
+#ifdef USE_ACML
     acmlsetnumthreads(numThreads);
-#else
+#elif defined(USE_MKL)
     mkl_set_num_threads(numThreads);
+#elif defined(USE_OPENBLAS)
+    openblas_set_num_threads(numThreads);
 #endif
 #endif
     return numThreads;
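Reviewer note on the SVD change: the OpenBLAS path is the only branch that differs structurally from MKL, because LAPACKE manages the work array internally, so the `lwork` query disappears and a `superb` output buffer is passed instead. The standalone sketch below (not part of the patch) exercises the same `LAPACKE_dgesvd` call pattern; it assumes an OpenBLAS install with LAPACKE enabled, built with something like `g++ svd_check.cpp -lopenblas`:

```cpp
// svd_check.cpp -- standalone check of the LAPACKE_dgesvd call pattern used
// in the new OpenBLAS branch of CPUMatrix<ElemType>::SVD above.
#include <algorithm>
#include <cstdio>
#include <vector>
#include <lapacke.h>

int main()
{
    int m = 3, n = 2; // A is m x n, stored column-major as in CPUMatrix
    std::vector<double> A = {1, 2, 3,  // column 0
                             4, 5, 6}; // column 1
    std::vector<double> S(std::min(m, n));
    std::vector<double> U(m * m), VT(n * n);
    // LAPACKE allocates the work array internally; "superb" receives the
    // unconverged superdiagonal elements (same sizing as in the diff).
    std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
    int info = LAPACKE_dgesvd(LAPACK_COL_MAJOR, 'A', 'A', m, n, A.data(), m,
                              S.data(), U.data(), m, VT.data(), n, superb.data());
    if (info != 0)
    {
        std::printf("dgesvd failed, info = %d\n", info);
        return 1;
    }
    std::printf("singular values: %g %g\n", S[0], S[1]);
    return 0;
}
```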
diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp
index dd4e1b718..b33a4c1ed 100644
--- a/Source/Math/CPUSparseMatrix.cpp
+++ b/Source/Math/CPUSparseMatrix.cpp
@@ -23,7 +23,7 @@
 
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 
-#ifndef USE_MKL
+#ifdef USE_ACML
 // use ACML as default.
 // Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
@@ -31,9 +31,17 @@
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.0 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
 // This is an example of an exported variable
@@ -45,7 +53,7 @@
 //    return 42;
 //}
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -1185,7 +1193,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
 #else
         return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
@@ -1194,7 +1202,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
 #else
         return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj
index f86cfaed7..4edd79aea 100644
--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@@ -63,7 +63,7 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
@@ -105,7 +105,7 @@
       <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
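Note that the Windows project files simply pin `USE_ACML`, so only the GNU Makefile build exercises the new OpenBLAS branch. For reference, this is the shape of the three-way dispatch the patch introduces throughout (a condensed sketch of the `SetNumThreads` hunk above; compile with exactly one of `-DUSE_ACML`, `-DUSE_MKL`, `-DUSE_OPENBLAS`):

```cpp
// Condensed sketch of the math-library dispatch used throughout this patch.
#ifdef USE_ACML
#include <acml.h>  // ACML's Fortran-style entry points (dgemm, dasum, ...)
#elif defined(USE_MKL)
#include <mkl.h>   // MKL's CBLAS/LAPACK plus service functions
#else
#include <cblas.h> // OpenBLAS's cblas.h also declares openblas_set_num_threads
#endif

void SetMathLibThreads(int numThreads)
{
#ifdef USE_ACML
    acmlsetnumthreads(numThreads);
#elif defined(USE_MKL)
    mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
    openblas_set_num_threads(numThreads);
#endif
}
```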
diff --git a/configure b/configure
index 307bf28db..901042ae7 100755
--- a/configure
+++ b/configure
@@ -16,6 +16,11 @@ have_mkl=no
 mkl_path=
 mkl_check=mkl/include/mkl.h
 
+# Experimental OpenBLAS support.
+have_openblas=no
+openblas_path=
+openblas_check=include/openblas_config.h
+
 have_kaldi=no
 kaldi_path=
 kaldi_check=src/kaldi.mk
@@ -45,11 +50,12 @@ default_use_1bitsgd=no
 enable_1bitsgd=$default_use_1bitsgd
 
 # List from best to worst choice
-default_path_list="/usr /usr/local /opt /opt/local"
+default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
 
 # List from best to worst choice
 default_acmls="acml5.3.1/ifort64_mp"
 default_mkls=""
+default_openblas=""
 
 # NOTE: Will get compilation errors with cuda-6.0
 default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
@@ -80,13 +86,15 @@ function check_dir ()
 # $2 is some file that must exist in $1
 function find_dir ()
 {
-    for tail in $1
+    # Loop over list of tails to search, including empty (just default_path + search file)
+    for tail in $1 ''
     do
         for head in $(default_paths)
        do
-            if test x$(check_dir "$head/$tail" $2) = xyes
+            [ -n "$tail" ] && search_path="$head/$tail" || search_path=$head
+            if test x$(check_dir "$search_path" $2) = xyes
             then
-                echo $head/$tail
+                echo $search_path
                 return 0
             fi
         done
@@ -103,6 +111,11 @@ function find_mkl ()
     find_dir "$default_mkls" "$mkl_check"
 }
 
+function find_openblas ()
+{
+    find_dir "$default_openblas" "$openblas_check"
+}
+
 function find_cuda ()
 {
     find_dir "$default_cudas" "$cuda_check"
@@ -179,6 +192,7 @@ function show_help ()
     echo "  --with-cudnn[=directory] $(show_default $(find_cudnn))"
     echo "  --with-acml[=directory] $(show_default $(find_acml))"
     echo "  --with-mkl[=directory] $(show_default $(find_mkl))"
+    echo "  --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
     echo "  --with-buildtype=(debug|release) $(show_default $default_buildtype)"
     echo "  --with-kaldi[=directory] $(show_default $(find_kaldi))"
     echo "  --with-opencv[=directory] $(show_default $(find_opencv))"
@@ -333,7 +347,7 @@
             acml_path=$(find_acml)
             if test x$acml_path = x
             then
-                echo "Cannot fine acml directory"
+                echo "Cannot find acml directory"
                 echo "Please specify a value for --with-acml"
                 exit 1
             fi
@@ -355,7 +369,7 @@
             mkl_path=$(find_mkl)
             if test x$mkl_path = x
             then
-                echo "Cannot fine mkl directory"
+                echo "Cannot find mkl directory"
                 echo "Please specify a value for --with-mkl"
                 exit 1
             fi
@@ -369,6 +383,28 @@
             fi
         fi
         ;;
+    --with-openblas*)
+        have_openblas=yes
+        mathlib=openblas
+        if test x$optarg = x
+        then
+            openblas_path=$(find_openblas)
+            if test x$openblas_path = x
+            then
+                echo "Cannot find openblas directory"
+                echo "Please specify a value for --with-openblas"
+                exit 1
+            fi
+        else
+            if test $(check_dir $optarg $openblas_check) = yes
+            then
+                openblas_path=$optarg
+            else
+                echo "Invalid openblas directory $optarg"
+                exit 1
+            fi
+        fi
+        ;;
     --with-buildtype*)
         have_buildtype=yes
         case $optarg in
@@ -439,7 +475,7 @@ then
 fi
 
 # If no math library was specified, search for acml and then mkl
-if test x$have_acml = xno && test x$have_mkl = xno
+if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
 then
     acml_path=$(find_acml)
     if test x$acml_path = x
@@ -448,7 +484,7 @@ then
         if test x$mkl_path = x
         then
             echo "Cannot find a CPU math library."
-            echo "Please specify --with-acml or --with-mkl with a path."
+            echo "Please specify --with-acml, --with-mkl, or --with-openblas with a path."
             exit 1
         else
             mathlib=mkl
@@ -530,6 +566,9 @@ case $mathlib in
     mkl)
         echo MKL_PATH=$mkl_path >> $config
         ;;
+    openblas)
+        echo OPENBLAS_PATH=$openblas_path >> $config
+        ;;
 esac
 if test $enable_cuda = yes ; then
     echo CUDA_PATH=$cuda_path >> $config
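With these configure changes, an OpenBLAS build is driven like the existing ACML/MKL ones. A hypothetical session (the `/opt/OpenBLAS` prefix is only an example; any directory containing `include/openblas_config.h` passes the check, and MATHLIB is assumed to be emitted by the same code path that already handles acml/mkl):

```sh
# Point configure at an OpenBLAS install prefix explicitly...
./configure --with-openblas=/opt/OpenBLAS
# ...or let it probe the default path list ("/usr /usr/local /opt /opt/local /opt/intel").
./configure --with-openblas

# configure records OPENBLAS_PATH (and MATHLIB=openblas) in the generated
# config; the new Makefile block turns those into -I/-L/-lopenblas flags.
make -j "$(nproc)"
```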