Add experimental support for OpenBLAS library

* Add 'openblas' as a mathlib option in configure. It is not part of the
auto-search, so it must be specified explicitly with --with-openblas
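
A minimal sketch of the new option (the /usr/local prefix is illustrative;
any directory containing include/openblas_config.h works):

    # explicit path
    ./configure --with-openblas=/usr/local
    # or let configure probe the default path list for include/openblas_config.h
    ./configure --with-openblas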

* The configure script now also searches an empty tail so that libraries installed
directly under a default_path_list root (i.e. /usr/local + include/openblas_config.h)
are found, as illustrated below
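
Concretely, with openblas_check=include/openblas_config.h and the new empty
tail, find_dir now also probes each root itself (paths illustrative):

    # tails searched: "$default_openblas" plus the empty tail ''
    # head=/usr/local, tail=''  ->  test -f /usr/local/include/openblas_config.h
    #                               found: echo /usr/local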

* Treat ACML as the odd library out in the ifdefs, since it does not conform to the
typical CBLAS interface. Other libraries such as ATLAS should be able to share the
OpenBLAS/MKL variants. Add a default USE_ACML define in the VS projects to match;
the resulting dispatch is sketched below
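
The include dispatch, condensed from the CPUMatrix changes below:

    #ifdef USE_ACML        // non-CBLAS Fortran-style interface, the odd one out
    #include <acml.h>
    #elif defined(USE_MKL) // CBLAS plus an extra matrix-order parameter
    #include <mkl.h>
    #else                  // OpenBLAS, ATLAS, or any other CBLAS/LAPACKE library
    #include <cblas.h>
    #include <lapacke.h>
    #endif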

* Fix the 'max' macro definition colliding with C++ std::max once the OpenBLAS
headers are included (see the example below)
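
The collision in miniature (a sketch, not the exact failing translation unit):

    // old fallback defined in the matrix sources (non-Windows branch):
    #define max(a, b) (((a) > (b)) ? (a) : (b))
    // any later call spelled std::max(x, y) is macro-expanded to
    //   std::(((x) > (y)) ? (x) : (y))
    // which does not compile; the fix removes the macro and calls
    // std::max / std::min explicitly at every call site.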

Usage Notes:

* For best performance, build OpenBLAS with USE_OPENMP=1. When running CNTK, set the
OPENBLAS_NUM_THREADS environment variable or the numCPUThreads CNTK config variable
to the physical core count, or performance will suffer
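
For example, on a machine with 8 physical cores (the count is illustrative):

    # environment variable route
    export OPENBLAS_NUM_THREADS=8
    # or equivalently in the CNTK config:
    # numCPUThreads=8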

* Tested with OpenBLAS 2.16 (git HEAD) on Linux with GCC 4.8.4, and with OpenBLAS 2.15
(pre-built binary release plus MinGW 64-bit support DLLs) on Windows

* On Windows, in Math.vcxproj, replace libacml_mp_dll.lib with libopenblas.dll.a and
change the USE_ACML define to USE_OPENBLAS. Point the ACML_PATH environment variable
at your OpenBLAS path. Modify openblas_config.h as described in
https://github.com/xianyi/OpenBLAS/issues/708. A sketch of the edits follows
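
The Math.vcxproj edits, sketched; the exact property layout depends on your
local project settings:

    <PreprocessorDefinitions>USE_OPENBLAS; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
    <AdditionalDependencies>libopenblas.dll.a; %(AdditionalDependencies)</AdditionalDependencies>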

* On current-generation Intel processors, OpenBLAS measures slightly faster than
AMD ACML and slower than Intel MKL on MNIST and other examples
Ross Wightman 2016-01-29 15:44:37 -08:00
Parent 9386b8d310
Commit f0d8a23b26
5 changed files with 142 additions and 74 deletions

View file

@@ -132,6 +132,13 @@ ifeq ("$(MATHLIB)","mkl")
CPPFLAGS += -DUSE_MKL
endif
ifeq ("$(MATHLIB)","openblas")
INCLUDEPATH += $(OPENBLAS_PATH)/include
LIBPATH += $(OPENBLAS_PATH)/lib
LIBS += -lopenblas -lm -lpthread
CPPFLAGS += -DUSE_OPENBLAS
endif
ifdef KALDI_PATH
########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########

View file

@@ -25,9 +25,6 @@
#define NOMINMAX
#include "Windows.h"
#else
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
#include <cfloat>
#endif
@@ -38,20 +35,27 @@
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifndef USE_MKL
// use ACML as default.
#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
// Install the ifort64_mp variant (compiled with intel compiler) of the library
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.1 and above
#else
#elif defined(USE_MKL)
// requires MKL 10.0 and above
#include <mkl.h>
#else
#ifdef _MSC_VER
// Visual Studio doesn't define standard complex types properly
#define HAVE_LAPACK_CONFIG_H
#define LAPACK_COMPLEX_STRUCTURE
#endif
#include <cblas.h>
#include <lapacke.h>
#endif
#ifndef USE_MKL // MKL has one additional parameter for different matrix order
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -878,7 +882,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
#pragma omp parallel for
foreach_column (j, us)
{
#ifndef USE_MKL
#ifdef USE_ACML
dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
#else
cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
@@ -892,7 +896,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
{
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
#else
cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
@@ -1316,9 +1320,9 @@ ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
if (signs[i] * grad_sign > 0)
steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
steps[i] = std::min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
else
steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
steps[i] = std::max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
a = steps[i] / sqrt(avars[i] + floor);
curr_grad[i] *= a;
@@ -2237,7 +2241,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
// we need to extract max before applying exp to avoid overflow
ElemType maxV = a(0, j);
foreach_row (i, a)
maxV = max(maxV, a(i, j));
maxV = std::max(maxV, a(i, j));
ElemType sum = 0;
foreach_row (i, a)
@@ -2255,7 +2259,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
// we need to extract max before applying exp to avoid overflow
ElemType maxV = a(i, 0);
foreach_column (j, a)
maxV = max(maxV, a(i, j));
maxV = std::max(maxV, a(i, j));
ElemType sum = 0;
foreach_column (j, a)
@@ -2808,7 +2812,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
#else
return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
@@ -2817,7 +2821,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
#else
return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
@@ -2990,7 +2994,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3003,7 +3007,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
#else
c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3020,7 +3024,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
#else
c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
@@ -3033,7 +3037,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
#else
c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
@@ -3073,7 +3077,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
ElemType v = 0;
foreach_row (i, us)
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
c(0, j) = v;
}
@@ -3088,7 +3092,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
ElemType v = 0;
foreach_column (j, us)
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
c(i, 0) = v;
}
@@ -3282,7 +3286,7 @@ ElemType CPUMatrix<ElemType>::MatrixNormInf() const
{
#pragma omp critical
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
}
return v;
@@ -3866,8 +3870,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignMaxPoolingResult(const CPUMatrix
for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
{
const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels];
maxVal = max(maxVal, val);
minVal = min(minVal, val);
maxVal = std::max(maxVal, val);
minVal = std::min(minVal, val);
rowInInput += (long) channels;
}
}
@@ -4040,7 +4044,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
int m, n, k, l;
int lda, ldb, ldc;
#ifndef USE_MKL
#ifdef USE_ACML
char transA, transB;
#else
CBLAS_TRANSPOSE mklTransA;
@@ -4052,7 +4056,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
m = (int) a.GetNumCols();
k = (int) a.GetNumRows();
lda = k;
#ifndef USE_MKL
#ifdef USE_ACML
transA = (char) MatrixTranspose::Trans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasTrans;
@@ -4063,7 +4067,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
m = (int) a.GetNumRows();
k = (int) a.GetNumCols();
lda = m;
#ifndef USE_MKL
#ifdef USE_ACML
transA = (char) MatrixTranspose::NoTrans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4075,7 +4079,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
l = (int) b.GetNumCols();
n = (int) b.GetNumRows();
ldb = n;
#ifndef USE_MKL
#ifdef USE_ACML
transB = (char) MatrixTranspose::Trans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasTrans;
@@ -4086,7 +4090,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
l = (int) b.GetNumRows();
n = (int) b.GetNumCols();
ldb = l;
#ifndef USE_MKL
#ifdef USE_ACML
transB = (char) MatrixTranspose::NoTrans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4107,7 +4111,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
#else
cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
@@ -4116,7 +4120,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
#else
cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
@@ -4160,34 +4164,42 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
ldu = m;
ldvt = n;
U.Resize(m, m);
SIGMA.Resize(min(m, n), 1);
SIGMA.Resize(std::min(m, n), 1);
VT.Resize(n, n);
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &info);
#else
#elif defined(USE_MKL)
double wkopt;
int lwork = -1;
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
lwork = (int) wkopt;
W.Resize(lwork, 1);
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, reinterpret_cast<double*>(W.m_pArray), &lwork, &info);
#else
std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray),
reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &superb[0]);
#endif
}
else
{
#ifndef USE_MKL
#ifdef USE_ACML
#pragma warning(suppress : 4244)
sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &info);
#else
#elif defined(USE_MKL)
float wkopt;
int lwork = -1;
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
lwork = (int) wkopt;
W.Resize(lwork, 1);
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, reinterpret_cast<float*>(W.m_pArray), &lwork, &info);
#else
std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray),
reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &superb[0]);
#endif
}
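In short, the three SVD variants above differ in workspace handling; a
condensed reading of the double-precision branch:

    // ACML:    dgesvd('A', 'A', ..., &info);                      allocates its workspace internally
    // MKL:     dgesvd("All", "All", ..., &wkopt, &lwork, &info);  lwork = -1 queries the optimal size,
    //          then the call is repeated with the real workspace W
    // LAPACKE: LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', ..., &superb[0]);  superb holds the unconverged
    //          superdiagonal, sized max(min(m, n) - 1, 1)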
@@ -4383,7 +4395,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
#else
cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
@@ -4392,7 +4404,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
#else
cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
@@ -4433,7 +4445,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4446,7 +4458,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4467,7 +4479,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
#else
cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
@@ -4480,7 +4492,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
#else
cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
@@ -4682,7 +4694,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
#else
cblas_dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
@@ -4691,7 +4703,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
#else
cblas_sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
@@ -4741,7 +4753,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4754,7 +4766,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4771,7 +4783,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -4784,7 +4796,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -4813,7 +4825,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
#else
return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
@@ -4822,7 +4834,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
#else
return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
@@ -5052,7 +5064,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5062,7 +5074,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5074,7 +5086,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5084,7 +5096,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5104,7 +5116,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#else
c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -5117,7 +5129,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -5527,7 +5539,7 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
int mthreads = (int) std::thread::hardware_concurrency();
if (numThreads <= 0)
numThreads = max(1, mthreads + numThreads);
numThreads = std::max(1, mthreads + numThreads);
if (numThreads > mthreads)
numThreads = mthreads;
@@ -5535,10 +5547,12 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
omp_set_num_threads(numThreads);
numThreads = omp_get_max_threads();
#ifndef USE_MKL
#ifdef USE_ACML
acmlsetnumthreads(numThreads);
#else
#elif defined(USE_MKL)
mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
openblas_set_num_threads(numThreads);
#endif
#endif
return numThreads;
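
SetNumThreads above is what the numCPUThreads config value ultimately feeds:
it clamps the request against std::thread::hardware_concurrency(), sets the
OpenMP pool, and then the matching BLAS pool so the two stay in sync. A usage
sketch (the static-call form is an assumption):

    // numCPUThreads=8  -> OpenMP and OpenBLAS both run 8 threads
    // numCPUThreads=-2 -> hardware_concurrency() - 2 threads
    CPUMatrix<float>::SetNumThreads(8);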

View file

@@ -23,7 +23,7 @@
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#ifndef USE_MKL
#ifdef USE_ACML
// use ACML as default.
// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
@@ -31,9 +31,17 @@
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.0 and above
#else
#elif defined(USE_MKL)
// requires MKL 10.0 and above
#include <mkl.h>
#else
#ifdef _MSC_VER
// Visual Studio doesn't define standard complex types properly
#define HAVE_LAPACK_CONFIG_H
#define LAPACK_COMPLEX_STRUCTURE
#endif
#include <cblas.h>
#include <lapacke.h>
#endif
// This is an example of an exported variable
@@ -45,7 +53,7 @@
// return 42;
//}
#ifndef USE_MKL // MKL has one additional parameter for different matrix order
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -1185,7 +1193,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
#else
return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
@@ -1194,7 +1202,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
#else
return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);

View file

@@ -63,7 +63,7 @@
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<PreprocessorDefinitions>NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -105,7 +105,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>

configure (vendored)
View file

@@ -16,6 +16,11 @@ have_mkl=no
mkl_path=
mkl_check=mkl/include/mkl.h
# Experimental OpenBLAS support.
have_openblas=no
openblas_path=
openblas_check=include/openblas_config.h
have_kaldi=no
kaldi_path=
kaldi_check=src/kaldi.mk
@@ -45,11 +50,12 @@ default_use_1bitsgd=no
enable_1bitsgd=$default_use_1bitsgd
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local"
default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
# List from best to worst choice
default_acmls="acml5.3.1/ifort64_mp"
default_mkls=""
default_openblas=""
# NOTE: Will get compilation errors with cuda-6.0
default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
@@ -80,13 +86,15 @@ function check_dir ()
# $2 is some file that must exist in $1
function find_dir ()
{
for tail in $1
# Loop over list of tails to search, including empty (just default_path + search file)
for tail in $1 ''
do
for head in $(default_paths)
do
if test x$(check_dir "$head/$tail" $2) = xyes
[ -n "$tail" ] && search_path="$head/$tail" || search_path=$head
if test x$(check_dir "$search_path" $2) = xyes
then
echo $head/$tail
echo $search_path
return 0
fi
done
@@ -103,6 +111,11 @@ function find_mkl ()
find_dir "$default_mkls" "$mkl_check"
}
function find_openblas ()
{
find_dir "$default_openblas" "$openblas_check"
}
function find_cuda ()
{
find_dir "$default_cudas" "$cuda_check"
@@ -179,6 +192,7 @@ function show_help ()
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
echo " --with-kaldi[=directory] $(show_default $(find_kaldi))"
echo " --with-opencv[=directory] $(show_default $(find_opencv))"
@@ -333,7 +347,7 @@ do
acml_path=$(find_acml)
if test x$acml_path = x
then
echo "Cannot fine acml directory"
echo "Cannot find acml directory"
echo "Please specify a value for --with-acml"
exit 1
fi
@@ -355,7 +369,7 @@ do
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot fine mkl directory"
echo "Cannot find mkl directory"
echo "Please specify a value for --with-mkl"
exit 1
fi
@@ -369,6 +383,28 @@ do
fi
fi
;;
--with-openblas*)
have_openblas=yes
mathlib=openblas
if test x$optarg = x
then
openblas_path=$(find_openblas)
if test x$openblas_path = x
then
echo "Cannot find openblas directory"
echo "Please specify a value for --with-openblas"
exit 1
fi
else
if test $(check_dir $optarg $openblas_check) = yes
then
openblas_path=$optarg
else
echo "Invalid openblas directory $optarg"
exit 1
fi
fi
;;
--with-buildtype*)
have_buildtype=yes
case $optarg in
@@ -439,7 +475,7 @@ then
fi
# If no math library was specified, search for acml and then mkl
if test x$have_acml = xno && test x$have_mkl = xno
if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
then
acml_path=$(find_acml)
if test x$acml_path = x
@@ -448,7 +484,7 @@ then
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-acml or --with-mkl with a path."
echo "Please specify --with-acml, --with-mkl, --with-openblas with a path."
exit 1
else
mathlib=mkl
@@ -530,6 +566,9 @@ case $mathlib in
mkl)
echo MKL_PATH=$mkl_path >> $config
;;
openblas)
echo OPENBLAS_PATH=$openblas_path >> $config
;;
esac
if test $enable_cuda = yes ; then
echo CUDA_PATH=$cuda_path >> $config
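
With that in place, a successful --with-openblas run records the selection in
the generated build config consumed by the Makefile; assuming the /usr/local
example from the notes above, the relevant lines would read:

    MATHLIB=openblas
    OPENBLAS_PATH=/usr/local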