diff --git a/Makefile b/Makefile
index a81e2338b..33753d12c 100644
--- a/Makefile
+++ b/Makefile
@@ -132,6 +132,13 @@ ifeq ("$(MATHLIB)","mkl")
   CPPFLAGS += -DUSE_MKL
 endif
 
+ifeq ("$(MATHLIB)","openblas")
+  INCLUDEPATH += $(OPENBLAS_PATH)/include
+  LIBPATH += $(OPENBLAS_PATH)/lib
+  LIBS += -lopenblas -lm -lpthread
+  CPPFLAGS += -DUSE_OPENBLAS
+endif
+
 ifdef KALDI_PATH
 ########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
index 309c0c0f7..0584d0ef0 100644
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@@ -25,9 +25,6 @@
 #define NOMINMAX
 #include "Windows.h"
 #else
-#ifndef max
-#define max(a, b) (((a) > (b)) ? (a) : (b))
-#endif
 #include <cfloat>
 #endif
 
@@ -38,20 +35,27 @@
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 #pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
 
-#ifndef USE_MKL
-// use ACML as default.
+#ifdef USE_ACML
 // Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
 // Install the ifort64_mp variant (compiled with intel compiler) of the library
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.1 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -878,7 +882,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
         foreach_column (j, us)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
 #else
             cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
@@ -892,7 +896,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
         {
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
 #else
                 cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
@@ -1316,9 +1320,9 @@ ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
         const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
 
         if (signs[i] * grad_sign > 0)
-            steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
+            steps[i] = std::min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
         else
-            steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
+            steps[i] = std::max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
 
         a = steps[i] / sqrt(avars[i] + floor);
         curr_grad[i] *= a;
@@ -2237,7 +2241,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<El
@@ -2810,7 +2814,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
 #else
        return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
@@ -2817,7 +2821,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
        return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
 #else
        return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
@@ -2990,7 +2994,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3003,7 +3007,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3020,7 +3024,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
@@ -3033,7 +3037,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
@@ -3073,7 +3077,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_row (i, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(0, j) = v;
         }
@@ -3088,7 +3092,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_column (j, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(i, 0) = v;
         }
@@ -3282,7 +3286,7 @@ ElemType CPUMatrix<ElemType>::MatrixNormInf() const
     {
 #pragma omp critical
         {
-            v = max(v, abs(us(i, j)));
+            v = std::max(v, abs(us(i, j)));
         }
     }
     return v;
@@ -3866,8 +3870,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignMaxPoolingResult(const CPUMatrix
             for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
             {
                 const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels];
-                maxVal = max(maxVal, val);
-                minVal = min(minVal, val);
+                maxVal = std::max(maxVal, val);
+                minVal = std::min(minVal, val);
                 rowInInput += (long) channels;
             }
         }
@@ -4040,7 +4044,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     int m, n, k, l;
     int lda, ldb, ldc;
-#ifndef USE_MKL
+#ifdef USE_ACML
     char transA, transB;
 #else
     CBLAS_TRANSPOSE mklTransA;
@@ -4052,7 +4056,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumCols();
         k = (int) a.GetNumRows();
         lda = k;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::Trans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasTrans;
@@ -4063,7 +4067,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumRows();
         k = (int) a.GetNumCols();
         lda = m;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::NoTrans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4075,7 +4079,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumCols();
         n = (int) b.GetNumRows();
         ldb = n;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::Trans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasTrans;
@@ -4086,7 +4090,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumRows();
         n = (int) b.GetNumCols();
         ldb = l;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::NoTrans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4107,7 +4111,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
 #else
         cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
@@ -4116,7 +4120,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
 #else
         cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
@@ -4160,34 +4164,42 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
     ldu = m;
     ldvt = n;
     U.Resize(m, m);
-    SIGMA.Resize(min(m, n), 1);
+    SIGMA.Resize(std::min(m, n), 1);
     VT.Resize(n, n);
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         double wkopt;
         int lwork = -1;
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, reinterpret_cast<double*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray),
+                              reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
     else
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
 #pragma warning(suppress : 4244)
         sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         float wkopt;
         int lwork = -1;
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, reinterpret_cast<float*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray),
+                              reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
@@ -4383,7 +4395,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         if (sizeof(ElemType) == sizeof(double))
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
 #else
             cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
@@ -4392,7 +4404,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         else
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
 #else
             cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
@@ -4433,7 +4445,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_column (j, c)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
                 cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4446,7 +4458,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_column (j, c)
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
                 cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4467,7 +4479,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_row (i, c)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
 #else
                 cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
@@ -4480,7 +4492,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_row (i, c)
             {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
                 saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
 #else
                 cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
@@ -4682,7 +4694,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
 #else
         cblas_dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
@@ -4691,7 +4703,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
 #else
         cblas_sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
@@ -4741,7 +4753,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4754,7 +4766,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4771,7 +4783,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -4784,7 +4796,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -4813,7 +4825,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
@@ -4822,7 +4834,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
@@ -5052,7 +5064,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5062,7 +5074,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5074,7 +5086,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5084,7 +5096,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5104,7 +5116,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -5117,7 +5129,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -5527,7 +5539,7 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     int mthreads = (int) std::thread::hardware_concurrency();
 
     if (numThreads <= 0)
-        numThreads = max(1, mthreads + numThreads);
+        numThreads = std::max(1, mthreads + numThreads);
     if (numThreads > mthreads)
         numThreads = mthreads;
 
@@ -5535,10 +5547,12 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     omp_set_num_threads(numThreads);
     numThreads = omp_get_max_threads();
 
-#ifndef USE_MKL
+#ifdef USE_ACML
     acmlsetnumthreads(numThreads);
-#else
+#elif defined(USE_MKL)
     mkl_set_num_threads(numThreads);
+#elif defined(USE_OPENBLAS)
+    openblas_set_num_threads(numThreads);
 #endif
 #endif
     return numThreads;
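Reviewer note on the SVD change: the OpenBLAS path is the only branch that differs structurally from MKL, because LAPACKE manages the work array internally, so the `lwork` query disappears and a `superb` output buffer is passed instead. The standalone sketch below (not part of the patch) exercises the same `LAPACKE_dgesvd` call pattern; it assumes an OpenBLAS install with LAPACKE enabled, built with something like `g++ svd_check.cpp -lopenblas`:

```cpp
// svd_check.cpp -- standalone check of the LAPACKE_dgesvd call pattern used
// in the new OpenBLAS branch of CPUMatrix<ElemType>::SVD above.
#include <algorithm>
#include <cstdio>
#include <vector>
#include <lapacke.h>

int main()
{
    int m = 3, n = 2; // A is m x n, stored column-major as in CPUMatrix
    std::vector<double> A = {1, 2, 3,  // column 0
                             4, 5, 6}; // column 1
    std::vector<double> S(std::min(m, n));
    std::vector<double> U(m * m), VT(n * n);
    // LAPACKE allocates the work array internally; "superb" receives the
    // unconverged superdiagonal elements (same sizing as in the diff).
    std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
    int info = LAPACKE_dgesvd(LAPACK_COL_MAJOR, 'A', 'A', m, n, A.data(), m,
                              S.data(), U.data(), m, VT.data(), n, superb.data());
    if (info != 0)
    {
        std::printf("dgesvd failed, info = %d\n", info);
        return 1;
    }
    std::printf("singular values: %g %g\n", S[0], S[1]);
    return 0;
}
```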
diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp
index dd4e1b718..b33a4c1ed 100644
--- a/Source/Math/CPUSparseMatrix.cpp
+++ b/Source/Math/CPUSparseMatrix.cpp
@@ -23,7 +23,7 @@
 
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 
-#ifndef USE_MKL
+#ifdef USE_ACML
 // use ACML as default.
 // Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
@@ -31,9 +31,17 @@
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.0 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
 // This is an example of an exported variable
@@ -45,7 +53,7 @@
 //    return 42;
 //}
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -1185,7 +1193,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
 #else
         return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
@@ -1194,7 +1202,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
 #else
         return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj
index f86cfaed7..4edd79aea 100644
--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@@ -63,7 +63,7 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
@@ -105,7 +105,7 @@
       <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
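Note that the Windows project files simply pin `USE_ACML`, so only the GNU Makefile build exercises the new OpenBLAS branch. For reference, this is the shape of the three-way dispatch the patch introduces throughout (a condensed sketch of the `SetNumThreads` hunk above; compile with exactly one of `-DUSE_ACML`, `-DUSE_MKL`, `-DUSE_OPENBLAS`):

```cpp
// Condensed sketch of the math-library dispatch used throughout this patch.
#ifdef USE_ACML
#include <acml.h>  // ACML's Fortran-style entry points (dgemm, dasum, ...)
#elif defined(USE_MKL)
#include <mkl.h>   // MKL's CBLAS/LAPACK plus service functions
#else
#include <cblas.h> // OpenBLAS's cblas.h also declares openblas_set_num_threads
#endif

void SetMathLibThreads(int numThreads)
{
#ifdef USE_ACML
    acmlsetnumthreads(numThreads);
#elif defined(USE_MKL)
    mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
    openblas_set_num_threads(numThreads);
#endif
}
```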
diff --git a/configure b/configure
index 307bf28db..901042ae7 100755
--- a/configure
+++ b/configure
@@ -16,6 +16,11 @@ have_mkl=no
 mkl_path=
 mkl_check=mkl/include/mkl.h
 
+# Experimental OpenBLAS support.
+have_openblas=no
+openblas_path=
+openblas_check=include/openblas_config.h
+
 have_kaldi=no
 kaldi_path=
 kaldi_check=src/kaldi.mk
@@ -45,11 +50,12 @@ default_use_1bitsgd=no
 enable_1bitsgd=$default_use_1bitsgd
 
 # List from best to worst choice
-default_path_list="/usr /usr/local /opt /opt/local"
+default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
 
 # List from best to worst choice
 default_acmls="acml5.3.1/ifort64_mp"
 default_mkls=""
+default_openblas=""
 
 # NOTE: Will get compilation errors with cuda-6.0
 default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
@@ -80,13 +86,15 @@ function check_dir ()
 # $2 is some file that must exist in $1
 function find_dir ()
 {
-    for tail in $1
+    # Loop over list of tails to search, including empty (just default_path + search file)
+    for tail in $1 ''
     do
         for head in $(default_paths)
        do
-            if test x$(check_dir "$head/$tail" $2) = xyes
+            [ -n "$tail" ] && search_path="$head/$tail" || search_path=$head
+            if test x$(check_dir "$search_path" $2) = xyes
             then
-                echo $head/$tail
+                echo $search_path
                 return 0
             fi
         done
@@ -103,6 +111,11 @@ function find_mkl ()
     find_dir "$default_mkls" "$mkl_check"
 }
 
+function find_openblas ()
+{
+    find_dir "$default_openblas" "$openblas_check"
+}
+
 function find_cuda ()
 {
     find_dir "$default_cudas" "$cuda_check"
@@ -179,6 +192,7 @@ function show_help ()
     echo "  --with-cudnn[=directory] $(show_default $(find_cudnn))"
     echo "  --with-acml[=directory] $(show_default $(find_acml))"
     echo "  --with-mkl[=directory] $(show_default $(find_mkl))"
+    echo "  --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
     echo "  --with-buildtype=(debug|release) $(show_default $default_buildtype)"
     echo "  --with-kaldi[=directory] $(show_default $(find_kaldi))"
     echo "  --with-opencv[=directory] $(show_default $(find_opencv))"
@@ -333,7 +347,7 @@
             acml_path=$(find_acml)
             if test x$acml_path = x
             then
-                echo "Cannot fine acml directory"
+                echo "Cannot find acml directory"
                 echo "Please specify a value for --with-acml"
                 exit 1
             fi
@@ -355,7 +369,7 @@
             mkl_path=$(find_mkl)
             if test x$mkl_path = x
             then
-                echo "Cannot fine mkl directory"
+                echo "Cannot find mkl directory"
                 echo "Please specify a value for --with-mkl"
                 exit 1
             fi
@@ -369,6 +383,28 @@
             fi
         fi
         ;;
+    --with-openblas*)
+        have_openblas=yes
+        mathlib=openblas
+        if test x$optarg = x
+        then
+            openblas_path=$(find_openblas)
+            if test x$openblas_path = x
+            then
+                echo "Cannot find openblas directory"
+                echo "Please specify a value for --with-openblas"
+                exit 1
+            fi
+        else
+            if test $(check_dir $optarg $openblas_check) = yes
+            then
+                openblas_path=$optarg
+            else
+                echo "Invalid openblas directory $optarg"
+                exit 1
+            fi
+        fi
+        ;;
     --with-buildtype*)
         have_buildtype=yes
         case $optarg in
@@ -439,7 +475,7 @@ then
 fi
 
 # If no math library was specified, search for acml and then mkl
-if test x$have_acml = xno && test x$have_mkl = xno
+if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
 then
     acml_path=$(find_acml)
     if test x$acml_path = x
@@ -448,7 +484,7 @@ then
         if test x$mkl_path = x
         then
             echo "Cannot find a CPU math library."
-            echo "Please specify --with-acml or --with-mkl with a path."
+            echo "Please specify --with-acml, --with-mkl, or --with-openblas with a path."
             exit 1
         else
             mathlib=mkl
@@ -530,6 +566,9 @@ case $mathlib in
     mkl)
         echo MKL_PATH=$mkl_path >> $config
         ;;
+    openblas)
+        echo OPENBLAS_PATH=$openblas_path >> $config
+        ;;
 esac
 if test $enable_cuda = yes ; then
     echo CUDA_PATH=$cuda_path >> $config
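With these configure changes, an OpenBLAS build is driven like the existing ACML/MKL ones. A hypothetical session (the `/opt/OpenBLAS` prefix is only an example; any directory containing `include/openblas_config.h` passes the check, and MATHLIB is assumed to be emitted by the same code path that already handles acml/mkl):

```sh
# Point configure at an OpenBLAS install prefix explicitly...
./configure --with-openblas=/opt/OpenBLAS
# ...or let it probe the default path list ("/usr /usr/local /opt /opt/local /opt/intel").
./configure --with-openblas

# configure records OPENBLAS_PATH (and MATHLIB=openblas) in the generated
# config; the new Makefile block turns those into -I/-L/-lopenblas flags.
make -j "$(nproc)"
```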