Add experimental support for OpenBLAS library

* Add 'openblas' as a mathlib option in configure. It is not part of the
auto-search, so it must be specified explicitly with --with-openblas
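
A minimal sketch of the new option (the /usr/local prefix is illustrative;
any directory containing include/openblas_config.h works):

    # explicit path
    ./configure --with-openblas=/usr/local
    # or let configure probe the default path list for include/openblas_config.h
    ./configure --with-openblas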

* The configure script now also searches an empty tail so that libraries installed
directly under a default_path_list root (i.e. /usr/local + include/openblas_config.h)
are found, as illustrated below
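
Concretely, with openblas_check=include/openblas_config.h and the new empty
tail, find_dir now also probes each root itself (paths illustrative):

    # tails searched: "$default_openblas" plus the empty tail ''
    # head=/usr/local, tail=''  ->  test -f /usr/local/include/openblas_config.h
    #                               found: echo /usr/local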

* Treat ACML as the odd library out in the ifdefs, since it does not conform to the
typical CBLAS interface. Other libraries such as ATLAS should be able to share the
OpenBLAS/MKL variants. Add a default USE_ACML define in the VS projects to match;
the resulting dispatch is sketched below
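
The include dispatch, condensed from the CPUMatrix changes below:

    #ifdef USE_ACML        // non-CBLAS Fortran-style interface, the odd one out
    #include <acml.h>
    #elif defined(USE_MKL) // CBLAS plus an extra matrix-order parameter
    #include <mkl.h>
    #else                  // OpenBLAS, ATLAS, or any other CBLAS/LAPACKE library
    #include <cblas.h>
    #include <lapacke.h>
    #endif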

* Fix the 'max' macro definition colliding with C++ std::max once the OpenBLAS
headers are included (see the example below)
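
The collision in miniature (a sketch, not the exact failing translation unit):

    // old fallback defined in the matrix sources (non-Windows branch):
    #define max(a, b) (((a) > (b)) ? (a) : (b))
    // any later call spelled std::max(x, y) is macro-expanded to
    //   std::(((x) > (y)) ? (x) : (y))
    // which does not compile; the fix removes the macro and calls
    // std::max / std::min explicitly at every call site.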

Usage Notes:

* For best performance, build OpenBLAS with USE_OPENMP=1. When running CNTK, set the
OPENBLAS_NUM_THREADS environment variable or the numCPUThreads CNTK config variable
to the physical core count, or performance will suffer
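
For example, on a machine with 8 physical cores (the count is illustrative):

    # environment variable route
    export OPENBLAS_NUM_THREADS=8
    # or equivalently in the CNTK config:
    # numCPUThreads=8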

* Tested with OpenBLAS 2.16 (git HEAD) on Linux with GCC 4.8.4, and with OpenBLAS 2.15
(pre-built binary release plus MinGW 64-bit support DLLs) on Windows

* On Windows, in Math.vcxproj, replace libacml_mp_dll.lib with libopenblas.dll.a and
change the USE_ACML define to USE_OPENBLAS. Point the ACML_PATH environment variable
at your OpenBLAS path. Modify openblas_config.h as described in
https://github.com/xianyi/OpenBLAS/issues/708. A sketch of the edits follows
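
The Math.vcxproj edits, sketched; the exact property layout depends on your
local project settings:

    <PreprocessorDefinitions>USE_OPENBLAS; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
    <AdditionalDependencies>libopenblas.dll.a; %(AdditionalDependencies)</AdditionalDependencies>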

* On current-generation Intel processors, OpenBLAS measures slightly faster than
AMD ACML and slower than Intel MKL on MNIST and other examples
Ross Wightman 2016-01-29 15:44:37 -08:00
Parent 9386b8d310
Commit f0d8a23b26
5 changed files with 142 additions and 74 deletions

View file

@@ -132,6 +132,13 @@ ifeq ("$(MATHLIB)","mkl")
CPPFLAGS += -DUSE_MKL
endif
ifeq ("$(MATHLIB)","openblas")
INCLUDEPATH += $(OPENBLAS_PATH)/include
LIBPATH += $(OPENBLAS_PATH)/lib
LIBS += -lopenblas -lm -lpthread
CPPFLAGS += -DUSE_OPENBLAS
endif
ifdef KALDI_PATH
########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########

View file

@@ -25,9 +25,6 @@
#define NOMINMAX
#include "Windows.h"
#else
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
#include <cfloat>
#endif
@@ -38,20 +35,27 @@
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifndef USE_MKL
// use ACML as default.
#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
// Install the ifort64_mp variant (compiled with intel compiler) of the library
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.1 and above
#else
#elif defined(USE_MKL)
// requires MKL 10.0 and above
#include <mkl.h>
#else
#ifdef _MSC_VER
// Visual Studio doesn't define standard complex types properly
#define HAVE_LAPACK_CONFIG_H
#define LAPACK_COMPLEX_STRUCTURE
#endif
#include <cblas.h>
#include <lapacke.h>
#endif
#ifndef USE_MKL // MKL has one additional parameter for different matrix order
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -878,7 +882,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
#pragma omp parallel for
foreach_column (j, us)
{
#ifndef USE_MKL
#ifdef USE_ACML
dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
#else
cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
@@ -892,7 +896,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
{
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
#else
cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
@@ -1316,9 +1320,9 @@ ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
if (signs[i] * grad_sign > 0)
steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
steps[i] = std::min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
else
steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
steps[i] = std::max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
a = steps[i] / sqrt(avars[i] + floor);
curr_grad[i] *= a;
@@ -2237,7 +2241,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
// we need to extract max before applying exp to avoid overflow
ElemType maxV = a(0, j);
foreach_row (i, a)
maxV = max(maxV, a(i, j));
maxV = std::max(maxV, a(i, j));
ElemType sum = 0;
foreach_row (i, a)
@@ -2255,7 +2259,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
// we need to extract max before applying exp to avoid overflow
ElemType maxV = a(i, 0);
foreach_column (j, a)
maxV = max(maxV, a(i, j));
maxV = std::max(maxV, a(i, j));
ElemType sum = 0;
foreach_column (j, a)
@@ -2808,7 +2812,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
#else
return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
@@ -2817,7 +2821,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
#else
return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
@@ -2990,7 +2994,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3003,7 +3007,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
#else
c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3020,7 +3024,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
#else
c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
@@ -3033,7 +3037,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
#else
c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
@@ -3073,7 +3077,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
ElemType v = 0;
foreach_row (i, us)
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
c(0, j) = v;
}
@@ -3088,7 +3092,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
ElemType v = 0;
foreach_column (j, us)
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
c(i, 0) = v;
}
@@ -3282,7 +3286,7 @@ ElemType CPUMatrix<ElemType>::MatrixNormInf() const
{
#pragma omp critical
{
v = max(v, abs(us(i, j)));
v = std::max(v, abs(us(i, j)));
}
}
return v;
@@ -3866,8 +3870,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignMaxPoolingResult(const CPUMatrix
for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
{
const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels];
maxVal = max(maxVal, val);
minVal = min(minVal, val);
maxVal = std::max(maxVal, val);
minVal = std::min(minVal, val);
rowInInput += (long) channels;
}
}
@@ -4040,7 +4044,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
int m, n, k, l;
int lda, ldb, ldc;
#ifndef USE_MKL
#ifdef USE_ACML
char transA, transB;
#else
CBLAS_TRANSPOSE mklTransA;
@@ -4052,7 +4056,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
m = (int) a.GetNumCols();
k = (int) a.GetNumRows();
lda = k;
#ifndef USE_MKL
#ifdef USE_ACML
transA = (char) MatrixTranspose::Trans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasTrans;
@@ -4063,7 +4067,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
m = (int) a.GetNumRows();
k = (int) a.GetNumCols();
lda = m;
#ifndef USE_MKL
#ifdef USE_ACML
transA = (char) MatrixTranspose::NoTrans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4075,7 +4079,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
l = (int) b.GetNumCols();
n = (int) b.GetNumRows();
ldb = n;
#ifndef USE_MKL
#ifdef USE_ACML
transB = (char) MatrixTranspose::Trans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasTrans;
@@ -4086,7 +4090,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
l = (int) b.GetNumRows();
n = (int) b.GetNumCols();
ldb = l;
#ifndef USE_MKL
#ifdef USE_ACML
transB = (char) MatrixTranspose::NoTrans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4107,7 +4111,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
#else
cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
@@ -4116,7 +4120,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
#else
cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
@@ -4160,34 +4164,42 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
ldu = m;
ldvt = n;
U.Resize(m, m);
SIGMA.Resize(min(m, n), 1);
SIGMA.Resize(std::min(m, n), 1);
VT.Resize(n, n);
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &info);
#else
#elif defined(USE_MKL)
double wkopt;
int lwork = -1;
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
lwork = (int) wkopt;
W.Resize(lwork, 1);
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, reinterpret_cast<double*>(W.m_pArray), &lwork, &info);
#else
std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray),
reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &superb[0]);
#endif
}
else
{
#ifndef USE_MKL
#ifdef USE_ACML
#pragma warning(suppress : 4244)
sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &info);
#else
#elif defined(USE_MKL)
float wkopt;
int lwork = -1;
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
lwork = (int) wkopt;
W.Resize(lwork, 1);
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, reinterpret_cast<float*>(W.m_pArray), &lwork, &info);
#else
std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray),
reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &superb[0]);
#endif
}
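In short, the three SVD variants above differ in workspace handling; a
condensed reading of the double-precision branch:

    // ACML:    dgesvd('A', 'A', ..., &info);                      allocates its workspace internally
    // MKL:     dgesvd("All", "All", ..., &wkopt, &lwork, &info);  lwork = -1 queries the optimal size,
    //          then the call is repeated with the real workspace W
    // LAPACKE: LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', ..., &superb[0]);  superb holds the unconverged
    //          superdiagonal, sized max(min(m, n) - 1, 1)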
@@ -4383,7 +4395,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
#else
cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
@@ -4392,7 +4404,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
#else
cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
@@ -4433,7 +4445,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4446,7 +4458,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4467,7 +4479,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
#else
cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
@@ -4480,7 +4492,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
#else
cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
@@ -4682,7 +4694,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
#else
cblas_dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
@@ -4691,7 +4703,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
#else
cblas_sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
@@ -4741,7 +4753,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_column (j, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4754,7 +4766,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4771,7 +4783,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -4784,7 +4796,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -4813,7 +4825,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
#else
return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
@@ -4822,7 +4834,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
#else
return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
@@ -5052,7 +5064,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5062,7 +5074,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5074,7 +5086,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5084,7 +5096,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5104,7 +5116,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
#pragma omp parallel for
foreach_row (i, c)
{
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#else
c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -5117,7 +5129,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -5527,7 +5539,7 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
int mthreads = (int) std::thread::hardware_concurrency();
if (numThreads <= 0)
numThreads = max(1, mthreads + numThreads);
numThreads = std::max(1, mthreads + numThreads);
if (numThreads > mthreads)
numThreads = mthreads;
@@ -5535,10 +5547,12 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
omp_set_num_threads(numThreads);
numThreads = omp_get_max_threads();
#ifndef USE_MKL
#ifdef USE_ACML
acmlsetnumthreads(numThreads);
#else
#elif defined(USE_MKL)
mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
openblas_set_num_threads(numThreads);
#endif
#endif
return numThreads;
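
SetNumThreads above is what the numCPUThreads config value ultimately feeds:
it clamps the request against std::thread::hardware_concurrency(), sets the
OpenMP pool, and then the matching BLAS pool so the two stay in sync. A usage
sketch (the static-call form is an assumption):

    // numCPUThreads=8  -> OpenMP and OpenBLAS both run 8 threads
    // numCPUThreads=-2 -> hardware_concurrency() - 2 threads
    CPUMatrix<float>::SetNumThreads(8);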

View file

@@ -23,7 +23,7 @@
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#ifndef USE_MKL
#ifdef USE_ACML
// use ACML as default.
// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
@@ -31,9 +31,17 @@
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.0 and above
#else
#elif defined(USE_MKL)
// requires MKL 10.0 and above
#include <mkl.h>
#else
#ifdef _MSC_VER
// Visual Studio doesn't define standard complex types properly
#define HAVE_LAPACK_CONFIG_H
#define LAPACK_COMPLEX_STRUCTURE
#endif
#include <cblas.h>
#include <lapacke.h>
#endif
// This is an example of an exported variable
@@ -45,7 +53,7 @@
// return 42;
//}
#ifndef USE_MKL // MKL has one additional parameter for different matrix order
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -1185,7 +1193,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifndef USE_MKL
#ifdef USE_ACML
return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
#else
return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
@@ -1194,7 +1202,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
else
{
#pragma warning(suppress : 4244)
#ifndef USE_MKL
#ifdef USE_ACML
return sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
#else
return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);

View file

@@ -63,7 +63,7 @@
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<PreprocessorDefinitions>NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -105,7 +105,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>

configure (vendored)
View file

@@ -16,6 +16,11 @@ have_mkl=no
mkl_path=
mkl_check=mkl/include/mkl.h
# Experimental OpenBLAS support.
have_openblas=no
openblas_path=
openblas_check=include/openblas_config.h
have_kaldi=no
kaldi_path=
kaldi_check=src/kaldi.mk
@@ -45,11 +50,12 @@ default_use_1bitsgd=no
enable_1bitsgd=$default_use_1bitsgd
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local"
default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
# List from best to worst choice
default_acmls="acml5.3.1/ifort64_mp"
default_mkls=""
default_openblas=""
# NOTE: Will get compilation errors with cuda-6.0
default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
@@ -80,13 +86,15 @@ function check_dir ()
# $2 is some file that must exist in $1
function find_dir ()
{
for tail in $1
# Loop over list of tails to search, including empty (just default_path + search file)
for tail in $1 ''
do
for head in $(default_paths)
do
if test x$(check_dir "$head/$tail" $2) = xyes
[ -n "$tail" ] && search_path="$head/$tail" || search_path=$head
if test x$(check_dir "$search_path" $2) = xyes
then
echo $head/$tail
echo $search_path
return 0
fi
done
@@ -103,6 +111,11 @@ function find_mkl ()
find_dir "$default_mkls" "$mkl_check"
}
function find_openblas ()
{
find_dir "$default_openblas" "$openblas_check"
}
function find_cuda ()
{
find_dir "$default_cudas" "$cuda_check"
@@ -179,6 +192,7 @@ function show_help ()
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
echo " --with-kaldi[=directory] $(show_default $(find_kaldi))"
echo " --with-opencv[=directory] $(show_default $(find_opencv))"
@@ -333,7 +347,7 @@ do
acml_path=$(find_acml)
if test x$acml_path = x
then
echo "Cannot fine acml directory"
echo "Cannot find acml directory"
echo "Please specify a value for --with-acml"
exit 1
fi
@@ -355,7 +369,7 @@ do
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot fine mkl directory"
echo "Cannot find mkl directory"
echo "Please specify a value for --with-mkl"
exit 1
fi
@@ -369,6 +383,28 @@ do
fi
fi
;;
--with-openblas*)
have_openblas=yes
mathlib=openblas
if test x$optarg = x
then
openblas_path=$(find_openblas)
if test x$openblas_path = x
then
echo "Cannot find openblas directory"
echo "Please specify a value for --with-openblas"
exit 1
fi
else
if test $(check_dir $optarg $openblas_check) = yes
then
openblas_path=$optarg
else
echo "Invalid openblas directory $optarg"
exit 1
fi
fi
;;
--with-buildtype*)
have_buildtype=yes
case $optarg in
@@ -439,7 +475,7 @@ then
fi
# If no math library was specified, search for acml and then mkl
if test x$have_acml = xno && test x$have_mkl = xno
if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
then
acml_path=$(find_acml)
if test x$acml_path = x
@@ -448,7 +484,7 @@ then
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-acml or --with-mkl with a path."
echo "Please specify --with-acml, --with-mkl, --with-openblas with a path."
exit 1
else
mathlib=mkl
@@ -530,6 +566,9 @@ case $mathlib in
mkl)
echo MKL_PATH=$mkl_path >> $config
;;
openblas)
echo OPENBLAS_PATH=$openblas_path >> $config
;;
esac
if test $enable_cuda = yes ; then
echo CUDA_PATH=$cuda_path >> $config
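
With that in place, a successful --with-openblas run records the selection in
the generated build config consumed by the Makefile; assuming the /usr/local
example from the notes above, the relevant lines would read:

    MATHLIB=openblas
    OPENBLAS_PATH=/usr/local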