Add experimental support for OpenBLAS library
* Add 'openblas' as a mathlib option in configure. It is not added to the auto-search, so it must be specified using --with-openblas.
* The configure script also searches an empty tail so that libraries located at default_path_list roots (i.e., /usr/local + include/openblas_config.h) are found.
* Treat ACML as the odd library out in the ifdefs, since it doesn't conform to the typical BLAS standard; other libraries like ATLAS should be able to share the OpenBLAS/MKL variants (see the sketch following these notes). Add a default USE_ACML define in the VS projects to match.
* Fix the 'max' macro define colliding with C++ std::max once the OpenBLAS headers are included.

Usage Notes:

* For best performance, build OpenBLAS with USE_OPENMP=1. When running CNTK, set the OPENBLAS_NUM_THREADS environment variable or the numCPUThreads CNTK config variable to the physical core count, or performance will suffer.
* OpenBLAS 2.16 (git HEAD) tested on Linux with GCC 4.8.4; OpenBLAS 2.15 (pre-built binary release + MinGW 64-bit support DLLs) tested on Windows.
* For Windows, in Math.vcxproj, replace libacml_mp_dll.lib with libopenblas.dll.a and change the USE_ACML define to USE_OPENBLAS. Change the ACML_PATH environment variable to your OpenBLAS path. Modify openblas_config.h as per https://github.com/xianyi/OpenBLAS/issues/708.
* On current-generation Intel processors, OpenBLAS measures a little faster than AMD ACML and slower than Intel MKL on MNIST and other examples.
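As a rough sketch of the include dispatch this commit moves to (condensed from the hunks below, not a complete header): ACML becomes the one explicitly tested case, and everything else falls through to the CBLAS/LAPACKE path that OpenBLAS shares.

    #ifdef USE_ACML                  // the odd one out: no CBLAS/LAPACKE interface
    #include <acml.h>
    #elif defined(USE_MKL)
    #include <mkl.h>
    #else                            // OpenBLAS (and, in principle, ATLAS)
    #ifdef _MSC_VER
    #define HAVE_LAPACK_CONFIG_H     // Visual Studio doesn't define standard complex types properly
    #define LAPACK_COMPLEX_STRUCTURE
    #endif
    #include <cblas.h>
    #include <lapacke.h>
    #endif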
This commit is contained in:
Parent: 9386b8d310
Commit: f0d8a23b26
Makefile | 7 +++++++
@@ -132,6 +132,13 @@ ifeq ("$(MATHLIB)","mkl")
   CPPFLAGS += -DUSE_MKL
 endif
 
+ifeq ("$(MATHLIB)","openblas")
+  INCLUDEPATH += $(OPENBLAS_PATH)/include
+  LIBPATH += $(OPENBLAS_PATH)/lib
+  LIBS += -lopenblas -lm -lpthread
+  CPPFLAGS += -DUSE_OPENBLAS
+endif
+
 
 ifdef KALDI_PATH
 ########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########
@@ -25,9 +25,6 @@
 #define NOMINMAX
 #include "Windows.h"
 #else
-#ifndef max
-#define max(a, b) (((a) > (b)) ? (a) : (b))
-#endif
 #include <cfloat>
 #endif
 
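Why the macro has to go rather than stay guarded: once cblas.h pulls in openblas_config.h, any function-like max macro breaks calls to std::max, because the preprocessor rewrites the call before overload resolution ever happens. A minimal illustration (a hypothetical standalone file, not part of the diff):

    #include <algorithm>
    #define max(a, b) (((a) > (b)) ? (a) : (b)) // the macro removed above
    // std::max(1, 2);  // expands to std::(((1) > (2)) ? (1) : (2)) -> does not compile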
@@ -38,20 +35,27 @@
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 #pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
 
-#ifndef USE_MKL
-// use ACML as default.
+#ifdef USE_ACML
 // Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
 // Install the ifort64_mp variant (compiled with intel compiler) of the library
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.1 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -878,7 +882,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
     foreach_column (j, us)
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
 #else
         cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
@@ -892,7 +896,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
     {
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
 #else
             cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
@@ -1316,9 +1320,9 @@ ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
         const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
 
         if (signs[i] * grad_sign > 0)
-            steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
+            steps[i] = std::min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
         else
-            steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
+            steps[i] = std::max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
 
         a = steps[i] / sqrt(avars[i] + floor);
         curr_grad[i] *= a;
@@ -2237,7 +2241,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
         // we need to extract max before applying exp to avoid overflow
         ElemType maxV = a(0, j);
         foreach_row (i, a)
-            maxV = max(maxV, a(i, j));
+            maxV = std::max(maxV, a(i, j));
 
         ElemType sum = 0;
         foreach_row (i, a)
@@ -2255,7 +2259,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<Ele
         // we need to extract max before applying exp to avoid overflow
         ElemType maxV = a(i, 0);
         foreach_column (j, a)
-            maxV = max(maxV, a(i, j));
+            maxV = std::max(maxV, a(i, j));
 
         ElemType sum = 0;
         foreach_column (j, a)
@@ -2808,7 +2812,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
 #else
         return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
@@ -2817,7 +2821,7 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
 #else
         return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
@@ -2990,7 +2994,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3003,7 +3007,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
 #else
             c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(us.m_pArray + us.LocateColumn(j)), 1);
@@ -3020,7 +3024,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(us.m_pArray + i), m);
@@ -3033,7 +3037,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
 #else
             c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(us.m_pArray + i), m);
@@ -3073,7 +3077,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_row (i, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(0, j) = v;
         }
@@ -3088,7 +3092,7 @@ void CPUMatrix<ElemType>::VectorNormInf(CPUMatrix<ElemType>& c, const bool isCol
             ElemType v = 0;
             foreach_column (j, us)
             {
-                v = max(v, abs(us(i, j)));
+                v = std::max(v, abs(us(i, j)));
             }
             c(i, 0) = v;
         }
@@ -3282,7 +3286,7 @@ ElemType CPUMatrix<ElemType>::MatrixNormInf() const
     {
 #pragma omp critical
         {
-            v = max(v, abs(us(i, j)));
+            v = std::max(v, abs(us(i, j)));
         }
     }
     return v;
@@ -3866,8 +3870,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignMaxPoolingResult(const CPUMatrix
                 for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
                 {
                     const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels];
-                    maxVal = max(maxVal, val);
-                    minVal = min(minVal, val);
+                    maxVal = std::max(maxVal, val);
+                    minVal = std::min(minVal, val);
                     rowInInput += (long) channels;
                 }
             }
@@ -4040,7 +4044,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
 
     int m, n, k, l;
     int lda, ldb, ldc;
-#ifndef USE_MKL
+#ifdef USE_ACML
     char transA, transB;
 #else
     CBLAS_TRANSPOSE mklTransA;
@@ -4052,7 +4056,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumCols();
         k = (int) a.GetNumRows();
         lda = k;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::Trans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasTrans;
@@ -4063,7 +4067,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         m = (int) a.GetNumRows();
         k = (int) a.GetNumCols();
         lda = m;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transA = (char) MatrixTranspose::NoTrans;
 #else
         mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4075,7 +4079,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumCols();
         n = (int) b.GetNumRows();
         ldb = n;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::Trans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasTrans;
@@ -4086,7 +4090,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumRows();
         n = (int) b.GetNumCols();
         ldb = l;
-#ifndef USE_MKL
+#ifdef USE_ACML
         transB = (char) MatrixTranspose::NoTrans;
 #else
         mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
@@ -4107,7 +4111,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
 
     if (sizeof(ElemType) == sizeof(double))
    {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
 #else
         cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.m_pArray), lda, reinterpret_cast<double*>(b.m_pArray), ldb, beta, reinterpret_cast<double*>(c.m_pArray), ldc);
@@ -4116,7 +4120,7 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
 #else
         cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.m_pArray), lda, reinterpret_cast<float*>(b.m_pArray), ldb, beta, reinterpret_cast<float*>(c.m_pArray), ldc);
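A note on the BLAS_COLMAJOR trick visible in these gemm hunks: ACML's Fortran-style entry points (dgemm/sgemm) are column-major by definition and take char transpose flags, while the CBLAS entry points take a leading matrix-order argument. Defining BLAS_COLMAJOR as empty for ACML and as "(int) MatrixOrder::ColMajor," otherwise lets one call line serve both conventions.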
@@ -4160,34 +4164,42 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
     ldu = m;
     ldvt = n;
     U.Resize(m, m);
-    SIGMA.Resize(min(m, n), 1);
+    SIGMA.Resize(std::min(m, n), 1);
     VT.Resize(n, n);
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         double wkopt;
         int lwork = -1;
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.m_pArray), &lda, reinterpret_cast<double*>(SIGMA.m_pArray), reinterpret_cast<double*>(U.m_pArray), &ldu, reinterpret_cast<double*>(VT.m_pArray), &ldvt, reinterpret_cast<double*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.m_pArray), (int) lda, reinterpret_cast<double*>(SIGMA.m_pArray),
+                              reinterpret_cast<double*>(U.m_pArray), (int) ldu, reinterpret_cast<double*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
     else
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
 #pragma warning(suppress : 4244)
         sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &info);
-#else
+#elif defined(USE_MKL)
         float wkopt;
         int lwork = -1;
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, &wkopt, &lwork, &info);
         lwork = (int) wkopt;
         W.Resize(lwork, 1);
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.m_pArray), &lda, reinterpret_cast<float*>(SIGMA.m_pArray), reinterpret_cast<float*>(U.m_pArray), &ldu, reinterpret_cast<float*>(VT.m_pArray), &ldvt, reinterpret_cast<float*>(W.m_pArray), &lwork, &info);
+#else
+        std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
+        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.m_pArray), (int) lda, reinterpret_cast<float*>(SIGMA.m_pArray),
+                              reinterpret_cast<float*>(U.m_pArray), (int) ldu, reinterpret_cast<float*>(VT.m_pArray), (int) ldvt, &superb[0]);
 #endif
     }
 
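On the new LAPACKE path above: unlike the Fortran-style dgesvd/sgesvd, LAPACKE_dgesvd manages its own workspace, so the query/resize dance of the MKL branch disappears. Per LAPACKE's documented interface, the superb array (length at least min(m, n) - 1, hence the std::max guard) receives the unconverged superdiagonal elements if the algorithm fails to converge.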
@@ -4383,7 +4395,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
 #else
         cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
@@ -4392,7 +4404,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
 #else
         cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
@@ -4433,7 +4445,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
             cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4446,7 +4458,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
 #else
             cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
@@ -4467,7 +4479,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
 #else
             cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
@@ -4480,7 +4492,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
 #else
             cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
@@ -4682,7 +4694,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
 #else
         cblas_dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
@@ -4691,7 +4703,7 @@ void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
 #else
         cblas_sscal(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx);
@@ -4741,7 +4753,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4754,7 +4766,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
             c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -4771,7 +4783,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -4784,7 +4796,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -4813,7 +4825,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
@@ -4822,7 +4834,7 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
 #else
         return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
@@ -5052,7 +5064,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5062,7 +5074,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5074,7 +5086,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long j = 0; j < n; j++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
 #else
                 c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
@@ -5084,7 +5096,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifndef USE_MKL
+#ifdef USE_ACML
                 c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
 #else
                 c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
@@ -5104,7 +5116,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
 #else
             c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
@@ -5117,7 +5129,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
             c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
 #else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
@@ -5527,7 +5539,7 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     int mthreads = (int) std::thread::hardware_concurrency();
 
     if (numThreads <= 0)
-        numThreads = max(1, mthreads + numThreads);
+        numThreads = std::max(1, mthreads + numThreads);
     if (numThreads > mthreads)
         numThreads = mthreads;
 
@@ -5535,10 +5547,12 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     omp_set_num_threads(numThreads);
     numThreads = omp_get_max_threads();
 
-#ifndef USE_MKL
+#ifdef USE_ACML
     acmlsetnumthreads(numThreads);
-#else
+#elif defined(USE_MKL)
     mkl_set_num_threads(numThreads);
+#elif defined(USE_OPENBLAS)
+    openblas_set_num_threads(numThreads);
 #endif
 #endif
     return numThreads;
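Tying the hunk above to the threading note in the commit message, a quick way to confirm what OpenBLAS will actually use (a hypothetical standalone check, not part of the commit; openblas_get_num_threads is assumed available, as in recent OpenBLAS releases):

    #include <cstdio>
    extern "C" {
    void openblas_set_num_threads(int n);
    int openblas_get_num_threads(void);
    }
    int main()
    {
        openblas_set_num_threads(4); // the same call SetNumThreads now routes to
        std::printf("OpenBLAS threads: %d\n", openblas_get_num_threads());
        return 0;
    }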
@@ -23,7 +23,7 @@
 
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
 
-#ifndef USE_MKL
+#ifdef USE_ACML
 // use ACML as default.
 // Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
 // from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
@@ -31,9 +31,17 @@
 // Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
 // to point to your folder for the include file and link library
 #include <acml.h> // requires ACML 5.3.0 and above
-#else
+#elif defined(USE_MKL)
 // requires MKL 10.0 and above
 #include <mkl.h>
+#else
+#ifdef _MSC_VER
+// Visual Studio doesn't define standard complex types properly
+#define HAVE_LAPACK_CONFIG_H
+#define LAPACK_COMPLEX_STRUCTURE
+#endif
+#include <cblas.h>
+#include <lapacke.h>
 #endif
 
 // This is an example of an exported variable
@@ -45,7 +53,7 @@
 //    return 42;
 //}
 
-#ifndef USE_MKL // MKL has one additional parameter for different matrix order
+#ifdef USE_ACML // MKL has one additional parameter for different matrix order
 #define BLAS_COLMAJOR
 #else
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
@@ -1185,7 +1193,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifndef USE_MKL
+#ifdef USE_ACML
         return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
 #else
         return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
@@ -1194,7 +1202,7 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
     else
     {
 #pragma warning(suppress : 4244)
-#ifndef USE_MKL
+#ifdef USE_ACML
         return sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
 #else
         return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
@@ -63,7 +63,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -105,7 +105,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>..\Common\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -16,6 +16,11 @@ have_mkl=no
 mkl_path=
 mkl_check=mkl/include/mkl.h
 
+# Experimental OpenBLAS support.
+have_openblas=no
+openblas_path=
+openblas_check=include/openblas_config.h
+
 have_kaldi=no
 kaldi_path=
 kaldi_check=src/kaldi.mk
@@ -45,11 +50,12 @@ default_use_1bitsgd=no
 enable_1bitsgd=$default_use_1bitsgd
 
 # List from best to worst choice
-default_path_list="/usr /usr/local /opt /opt/local"
+default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
 
 # List from best to worst choice
 default_acmls="acml5.3.1/ifort64_mp"
 default_mkls=""
+default_openblas=""
 
 # NOTE: Will get compilation errors with cuda-6.0
 default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
@@ -80,13 +86,15 @@ function check_dir ()
 # $2 is some file that must exist in $1
 function find_dir ()
 {
-    for tail in $1
+    # Loop over list of tails to search, including empty (just default_path + search file)
+    for tail in $1 ''
     do
         for head in $(default_paths)
         do
-            if test x$(check_dir "$head/$tail" $2) = xyes
+            [ -n "$tail" ] && search_path="$head/$tail" || search_path=$head
+            if test x$(check_dir "$search_path" $2) = xyes
             then
-                echo $head/$tail
+                echo $search_path
                 return 0
             fi
         done
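Usage note on the empty tail: because default_openblas is empty, --with-openblas with no explicit path can now match an install whose check file sits directly under a default path root; for example, /usr/local/include/openblas_config.h resolves the search to /usr/local.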
@@ -103,6 +111,11 @@ function find_mkl ()
     find_dir "$default_mkls" "$mkl_check"
 }
 
+function find_openblas ()
+{
+    find_dir "$default_openblas" "$openblas_check"
+}
+
 function find_cuda ()
 {
     find_dir "$default_cudas" "$cuda_check"
@@ -179,6 +192,7 @@ function show_help ()
     echo "  --with-cudnn[=directory] $(show_default $(find_cudnn))"
     echo "  --with-acml[=directory] $(show_default $(find_acml))"
     echo "  --with-mkl[=directory] $(show_default $(find_mkl))"
+    echo "  --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
     echo "  --with-buildtype=(debug|release) $(show_default $default_buildtype)"
     echo "  --with-kaldi[=directory] $(show_default $(find_kaldi))"
     echo "  --with-opencv[=directory] $(show_default $(find_opencv))"
@@ -333,7 +347,7 @@ do
                 acml_path=$(find_acml)
                 if test x$acml_path = x
                 then
-                    echo "Cannot fine acml directory"
+                    echo "Cannot find acml directory"
                     echo "Please specify a value for --with-acml"
                     exit 1
                 fi
@@ -355,7 +369,7 @@ do
                 mkl_path=$(find_mkl)
                 if test x$mkl_path = x
                 then
-                    echo "Cannot fine mkl directory"
+                    echo "Cannot find mkl directory"
                     echo "Please specify a value for --with-mkl"
                     exit 1
                 fi
@@ -369,6 +383,28 @@ do
                 fi
             fi
             ;;
+        --with-openblas*)
+            have_openblas=yes
+            mathlib=openblas
+            if test x$optarg = x
+            then
+                openblas_path=$(find_openblas)
+                if test x$openblas_path = x
+                then
+                    echo "Cannot find openblas directory"
+                    echo "Please specify a value for --with-openblas"
+                    exit 1
+                fi
+            else
+                if test $(check_dir $optarg $openblas_check) = yes
+                then
+                    openblas_path=$optarg
+                else
+                    echo "Invalid openblas directory $optarg"
+                    exit 1
+                fi
+            fi
+            ;;
         --with-buildtype*)
             have_buildtype=yes
             case $optarg in
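For example, ./configure --with-openblas=/opt/OpenBLAS (path hypothetical) takes the explicit-path branch above, validates the directory against openblas_check, and selects mathlib=openblas; the OPENBLAS_PATH line is then written to the generated config, as the final hunk below shows.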
@@ -439,7 +475,7 @@ then
 fi
 
 # If no math library was specified, search for acml and then mkl
-if test x$have_acml = xno && test x$have_mkl = xno
+if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
 then
     acml_path=$(find_acml)
     if test x$acml_path = x
@@ -448,7 +484,7 @@ then
     if test x$mkl_path = x
     then
         echo "Cannot find a CPU math library."
-        echo "Please specify --with-acml or --with-mkl with a path."
+        echo "Please specify --with-acml, --with-mkl, --with-openblas with a path."
         exit 1
     else
         mathlib=mkl
@@ -530,6 +566,9 @@ case $mathlib in
     mkl)
         echo MKL_PATH=$mkl_path >> $config
         ;;
+    openblas)
+        echo OPENBLAS_PATH=$openblas_path >> $config
+        ;;
 esac
 if test $enable_cuda = yes ; then
     echo CUDA_PATH=$cuda_path >> $config