Wolfgang Manousek 2016-08-16 15:08:08 +02:00
Parent 40d247841a
Commit 79cfcf7d4f
6 changed files with 27 additions and 259 deletions

View File

@@ -37,14 +37,8 @@
#pragma warning(disable : 4244) // conversion from 'double' to 'float'; possible loss of data
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
// Install the ifort64_mp variant (compiled with intel compiler) of the library
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.1 and above
#elif defined(USE_MKL)
#ifdef USE_MKL
// requires MKL 10.0 and above
#include <mkl.h>
#else
@@ -57,12 +51,6 @@
#include <lapacke.h>
#endif
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
#endif
#define SWAP(a, b) \
{ \
(a) ^= (b); \
@@ -912,11 +900,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
#pragma omp parallel for
foreach_column (j, us)
{
#ifdef USE_ACML
dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
#else
cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
#endif
}
}
else
@@ -926,11 +910,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
{
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
#else
cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
#endif
}
}
}
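For context on the surviving cblas_dcopy/cblas_scopy path: reading with a stride of numCols gathers one logical column out of the row-major source and writes it contiguously into column-major storage. A minimal standalone sketch of the same pattern (not part of this commit; assumes an OpenBLAS-style <cblas.h>):

#include <cblas.h>
#include <cstdio>

int main()
{
    // 2x3 source in row-major order (rows contiguous).
    double rowMajor[6] = {1, 2, 3,
                          4, 5, 6};
    double colMajor[6]; // destination: columns contiguous
    const int numRows = 2, numCols = 3;
    for (int j = 0; j < numCols; j++)
    {
        // Read column j of the row-major source (stride = numCols),
        // write it densely into the column-major destination.
        cblas_dcopy(numRows, rowMajor + j, numCols, colMajor + j * numRows, 1);
    }
    for (int i = 0; i < 6; i++)
        printf("%g ", colMajor[i]); // prints: 1 4 2 5 3 6
    return 0;
}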
@@ -2844,20 +2824,12 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
#else
return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
#endif
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
return sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
#else
return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
#endif
}
}
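The reduction above maps directly onto cblas_dasum/cblas_sasum, which sum absolute values over a strided vector. Call shape, as an illustrative sketch only:

#include <cblas.h>
#include <cstdio>

int main()
{
    // Sum of absolute values over a flat buffer, unit stride.
    double x[4] = {1.5, -2.0, 3.0, -0.5};
    printf("%g\n", cblas_dasum(4, x, 1)); // prints 7
    return 0;
}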
@@ -3028,11 +3000,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_column (j, c)
{
#ifdef USE_ACML
c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
#endif
}
}
else
@@ -3041,11 +3009,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
c(0, j) = snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
#else
c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
#endif
}
}
}
@@ -3058,11 +3022,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
#pragma omp parallel for
foreach_row (i, c)
{
#ifdef USE_ACML
c(i, 0) = dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
#else
c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
#endif
}
}
else
@@ -3071,11 +3031,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
c(i, 0) = snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
#else
c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
#endif
}
}
}
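A note on the two VectorNorm2 branches: in column-major storage a column is contiguous (stride 1), while row i starts at offset i and advances by the row count m, so either case is a single cblas_dnrm2/cblas_snrm2 call. A small sketch (not from this commit):

#include <cblas.h>
#include <cstdio>

int main()
{
    // 2x3 column-major matrix [1 2 3; 4 5 6].
    double a[6] = {1, 4, 2, 5, 3, 6};
    const int m = 2, n = 3;
    double col0 = cblas_dnrm2(m, a + 0 * m, 1); // column 0: sqrt(1^2 + 4^2)
    double row0 = cblas_dnrm2(n, a + 0, m);     // row 0:    sqrt(1^2 + 2^2 + 3^2)
    printf("col0=%g row0=%g\n", col0, row0);
    return 0;
}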
@@ -4486,34 +4442,22 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
int m, n, k, l;
int lda, ldb, ldc;
#ifdef USE_ACML
char transA, transB;
#else
CBLAS_TRANSPOSE mklTransA;
CBLAS_TRANSPOSE mklTransB;
#endif
if (transposeA)
{
m = (int) a.GetNumCols();
k = (int) a.GetNumRows();
lda = k;
#ifdef USE_ACML
transA = (char) MatrixTranspose::Trans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasTrans;
#endif
}
else
{
m = (int) a.GetNumRows();
k = (int) a.GetNumCols();
lda = m;
#ifdef USE_ACML
transA = (char) MatrixTranspose::NoTrans;
#else
mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
#endif
}
if (transposeB)
@@ -4521,22 +4465,14 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
l = (int) b.GetNumCols();
n = (int) b.GetNumRows();
ldb = n;
#ifdef USE_ACML
transB = (char) MatrixTranspose::Trans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasTrans;
#endif
}
else
{
l = (int) b.GetNumRows();
n = (int) b.GetNumCols();
ldb = l;
#ifdef USE_ACML
transB = (char) MatrixTranspose::NoTrans;
#else
mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
#endif
}
assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow
@@ -4553,20 +4489,12 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
#else
cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
#endif
cblas_dgemm((CBLAS_ORDER) (int) MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
#else
cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
#endif
cblas_sgemm((CBLAS_ORDER) (int) MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
}
}
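The BLAS_COLMAJOR macro being deleted here existed only because ACML's Fortran-style gemm has no order argument; the CBLAS interface that remains takes the order explicitly, with leading dimensions equal to the stored row counts. A self-contained sketch of the resulting call shape (illustrative, assuming <cblas.h>):

#include <cblas.h>
#include <cstdio>

int main()
{
    // C = 1.0 * A * B + 0.0 * C, everything column-major.
    // A: 2x3 = [1 2 3; 4 5 6], B: 3x2 = [1 0; 0 1; 0 0].
    double A[6] = {1, 4, 2, 5, 3, 6};
    double B[6] = {1, 0, 0, 0, 1, 0};
    double C[4];
    const int m = 2, n = 2, k = 3;
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, /*alpha=*/1.0, A, /*lda=*/m, B, /*ldb=*/k,
                /*beta=*/0.0, C, /*ldc=*/m);
    printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); // prints: 1 4 2 5
    return 0;
}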
@@ -4611,9 +4539,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &info);
#elif defined(USE_MKL)
#ifdef USE_MKL
double wkopt;
int lwork = -1;
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@@ -4622,16 +4548,13 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, reinterpret_cast<double*>(W.Data()), &lwork, &info);
#else
std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
info = LAPACKE_dgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &superb[0]);
#endif
}
else
{
#ifdef USE_ACML
#pragma warning(suppress : 4244)
sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &info);
#elif defined(USE_MKL)
#ifdef USE_MKL
float wkopt;
int lwork = -1;
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@@ -4640,7 +4563,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, reinterpret_cast<float*>(W.Data()), &lwork, &info);
#else
std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
info = LAPACKE_sgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &superb[0]);
#endif
}
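Unlike the MKL branch above, which does the classic two-step workspace query (lwork = -1, then the real call), the LAPACKE wrapper in the OpenBLAS branch manages its own work array and only needs the superb buffer for superdiagonal elements that fail to converge. A minimal sketch of that call (not part of this commit):

#include <lapacke.h>
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int m = 3, n = 2;
    // 3x2 column-major matrix with columns {1,0,0} and {0,2,0}.
    std::vector<double> a = {1, 0, 0, 0, 2, 0};
    std::vector<double> s(std::min(m, n));
    std::vector<double> u(m * m), vt(n * n);
    std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
    int info = LAPACKE_dgesvd(LAPACK_COL_MAJOR, 'A', 'A', m, n,
                              a.data(), /*lda=*/m, s.data(),
                              u.data(), /*ldu=*/m, vt.data(), /*ldvt=*/n,
                              superb.data());
    printf("info=%d singular values: %g %g\n", info, s[0], s[1]); // 2 and 1
    return 0;
}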
@@ -4837,20 +4760,12 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
#else
cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
#endif
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
#else
cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
#endif
}
}
else if (a.GetNumElements() == 1) // scalar, add to all elements
@@ -4889,11 +4804,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_column (j, c)
{
#ifdef USE_ACML
daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
#else
cblas_daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
#endif
}
}
else
@@ -4902,11 +4813,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
#else
cblas_saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
#endif
}
}
}
@@ -4925,11 +4832,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
#pragma omp parallel for
foreach_row (i, c)
{
#ifdef USE_ACML
daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
#else
cblas_daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
#endif
}
}
else
@@ -4938,11 +4841,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
#else
cblas_saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
#endif
}
}
}
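The ScaleAndAdd broadcasting above is plain axpy in a loop: adding a column vector to every column of c walks both buffers with unit stride, while adding a row vector to every row writes the destination with stride m. A compact sketch of the column case (illustrative only):

#include <cblas.h>
#include <cstdio>

int main()
{
    // Add column vector a to every column of the 2x3 column-major C.
    double a[2] = {10, 20};
    double C[6] = {1, 4, 2, 5, 3, 6}; // [1 2 3; 4 5 6]
    const int m = 2, n = 3;
    for (int j = 0; j < n; j++)
        cblas_daxpy(m, 1.0, a, 1, C + j * m, 1);
    printf("%g %g %g %g %g %g\n", C[0], C[1], C[2], C[3], C[4], C[5]);
    // prints: 11 24 12 25 13 26
    return 0;
}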
@@ -5163,20 +5062,12 @@ template <class ElemType>
}
else if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx); // TODO: Use overloads.
#else
cblas_dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx);
#endif
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
#else
cblas_sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
#endif
}
}
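Scale likewise reduces to one in-place scal call over the flattened buffer. Sketch (illustrative):

#include <cblas.h>
#include <cstdio>

int main()
{
    double x[3] = {1, 2, 3};
    cblas_dscal(3, 0.5, x, 1); // x := 0.5 * x
    printf("%g %g %g\n", x[0], x[1], x[2]); // prints: 0.5 1 1.5
    return 0;
}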
@@ -5224,11 +5115,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_column (j, c)
{
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
#endif
}
}
else
@@ -5237,11 +5124,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_column (j, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
#endif
}
}
}
@@ -5256,11 +5139,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
#pragma omp parallel for
foreach_row (i, c)
{
#ifdef USE_ACML
c(i, 0) = ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
#else
c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
#endif
}
}
else
@@ -5269,11 +5148,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
#endif
}
}
}
@@ -5298,20 +5173,12 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
#else
return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
#endif
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
#else
return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
#endif
}
}
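InnerProductOfMatrices treats both operands as flat unit-stride vectors, so the whole Frobenius-style reduction <A, B> = sum_ij A_ij * B_ij is one dot call. A sketch (not from this commit):

#include <cblas.h>
#include <cstdio>

int main()
{
    double A[4] = {1, 2, 3, 4};
    double B[4] = {5, 6, 7, 8};
    printf("%g\n", cblas_ddot(4, A, 1, B, 1)); // 5 + 12 + 21 + 32 = 70
    return 0;
}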
@@ -5539,21 +5406,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifdef USE_ACML
c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
#endif
}
for (long j = 0; j < n; j++)
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifdef USE_ACML
c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
#endif
}
}
}
@@ -5561,21 +5420,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
{
for (long j = 0; j < n; j++)
{
#ifdef USE_ACML
c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
#else
c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
#endif
}
for (long j = 0; j < n; j++)
{
for (long i = 1; i < negnumber + 1; i++)
{
#ifdef USE_ACML
c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
#endif
}
}
}
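The modular index (j + shift + i - 1) % n above pairs column j of a with a rotated column of b; rows i >= 1 of the result hold the inner products against the shifted (negative-sample) columns. A small sketch of the i = 1 case (illustrative; assumes <cblas.h>):

#include <cblas.h>
#include <cstdio>

int main()
{
    const int m = 2, n = 3, shift = 1;
    double A[6] = {1, 0, 0, 1, 1, 1}; // 2x3 col-major: cols {1,0},{0,1},{1,1}
    double B[6] = {1, 2, 3, 4, 5, 6}; // cols {1,2},{3,4},{5,6}
    for (int j = 0; j < n; j++)
    {
        int jb = (j + shift + 1 - 1) % n; // i = 1
        double v = cblas_ddot(m, A + j * m, 1, B + jb * m, 1);
        printf("c(1,%d) = %g\n", j, v); // 3, 6, 3
    }
    return 0;
}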
@@ -5593,11 +5444,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
#pragma omp parallel for
foreach_row (i, c)
{
#ifdef USE_ACML
c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
#else
c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
#endif
}
}
else
@@ -5606,9 +5453,6 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
foreach_row (i, c)
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
#else
c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
#endif
}
@@ -6025,13 +5869,11 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
omp_set_num_threads(numThreads);
numThreads = omp_get_max_threads();
#ifdef USE_ACML
acmlsetnumthreads(numThreads);
#elif defined(USE_MKL)
mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
openblas_set_num_threads(numThreads);
#endif
#ifdef USE_MKL
mkl_set_num_threads(numThreads);
#elif defined(USE_OPENBLAS)
openblas_set_num_threads(numThreads);
#endif
#endif
return numThreads;
}

View File

@@ -23,15 +23,7 @@
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#ifdef USE_ACML
// use ACML as default.
// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
// Install the ifort64 variant (compiled with intel compiler) of the library
// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
// to point to your folder for the include file and link library
#include <acml.h> // requires ACML 5.3.0 and above
#elif defined(USE_MKL)
#ifdef USE_MKL
// requires MKL 10.0 and above
#include <mkl.h>
#else
@@ -53,12 +45,6 @@
// return 42;
//}
#ifdef USE_ACML // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
#endif
// TODO: Move to CommonMatrix.h
#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
@@ -1340,20 +1326,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
if (sizeof(ElemType) == sizeof(double))
{
#ifdef USE_ACML
return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
#else
return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
#endif
}
else
{
#pragma warning(suppress : 4244)
#ifdef USE_ACML
return sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
#else
return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
#endif
}
}

View File

@@ -227,6 +227,5 @@
<Target Name="CheckDependencies">
<Error Condition="'$(MathLibrary)' == 'MKL' And '$(CNTK_MKL_PATH)' == ''" Text="CNTK custom MKL location not specified, see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
<Error Condition="'$(MathLibrary)' == 'MKL' And !Exists('$(CNTKCustomMKLPath)')" Text="CNTK custom MKL not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
<Error Condition="'$(MathLibrary)' == 'ACML' And !Exists('$(ACML_PATH)')" Text="ACML not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#acml for instructions." />
</Target>
</Project>

View File

@@ -17,7 +17,7 @@ RANDOM_OUTPUT=0
CODE_COVERAGE=no
FLAVORS="debug:release"
TARGETS="cpu:gpu"
MATH_LIBRARY="acml"
MATH_LIBRARY="mkl"
TESTTARGETS="cpu:gpu"
# parsing command line arguments:
@@ -76,9 +76,6 @@ case $key in
;;
-m|--math-library)
case ${2,,} in
acml)
MATH_LIBRARY_OPTION="--with-acml=$ACML_PATH"
;;
mkl)
MATH_LIBRARY_OPTION="--with-mkl=$MKL_PATH"
;;
@@ -134,12 +131,6 @@ if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
exit 1
fi
if [[ $ACML_PATH == "" ]]; then
echo "============ ACML path not set ============"
echo "============ ACML libraries are needed to successfully build CNTK ============"
exit 1
fi
if [[ "${TARGETS,,}" =~ "1bitsgd" && "${TARGETS,,}" =~ "gpu" ]]; then
echo "============ Cannot specify both GPU and 1bit-SGD as targets, please choose one ============"
exit 1

View File

@@ -11,7 +11,7 @@
# BUILDTYPE (release/debug)
# BUILDTARGET (GPU/CPU-only)
# WITH_1BITSGD (whether 1bit-SGD support was enabled)
# MATHLIB (MKL/ACML)
# MATHLIB (MKL)
# CUDA_PATH (if exists, i.e., for GPU builds)
# CUB_PATH (if exists, i.e., for GPU builds)
# CUDNN_PATH (if exists, i.e., only for GPU builds)

configure (vendored, 64 changed lines)
View File

@@ -15,10 +15,6 @@ cuda_path=
cuda_check=include/cuda.h
enable_cuda=
have_acml=no
acml_path=
acml_check=include/acml.h
# CNTK Custom MKL Version
cntk_custom_mkl_version=1
@@ -79,7 +75,6 @@ enable_code_coverage=$default_use_code_coverage
default_path_list="/usr /usr/local /opt /opt/local"
# List from best to worst choice
default_acmls="acml5.3.1/ifort64_mp"
default_mkls="CNTKCustomMKL"
default_openblas=""
@@ -131,11 +126,6 @@ function find_dir ()
done
}
function find_acml ()
{
find_dir "$default_acmls" "$acml_check"
}
function find_mkl ()
{
find_dir "$default_mkls" "$mkl_check"
@@ -237,7 +227,6 @@ function show_help ()
echo " --with-gdk-include[=directory] $(show_default $(find_gdk_include))"
echo " --with-gdk-nvml-lib[=directory] $(show_default $(find_gdk_nvml_lib))"
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-mkl-sequential[=directory] $(show_default $(find_mkl))"
echo " --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
@@ -422,28 +411,6 @@
fi
fi
;;
--with-acml*)
have_acml=yes
mathlib=acml
if test x$optarg = x
then
acml_path=$(find_acml)
if test x$acml_path = x
then
echo "Cannot find acml directory"
echo "Please specify a value for --with-acml"
exit 1
fi
else
if test $(check_dir $optarg $acml_check) = yes
then
acml_path=$optarg
else
echo "Invalid acml directory $optarg"
exit 1
fi
fi
;;
--with-mkl*)
have_mkl=yes
mathlib=mkl
@@ -603,24 +570,18 @@
echo Defaulting to --with-buildtype=release
fi
# If no math library was specified, search for acml and then mkl
if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
# If no math library was specified, search for mkl
if test x$have_mkl = xno && test x$have_openblas = xno
then
acml_path=$(find_acml)
if test x$acml_path = x
then
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-acml, --with-mkl, --with-mkl-sequential, --with-openblas with a path."
exit 1
else
mathlib=mkl
fi
else
mathlib=acml
fi
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-mkl, --with-mkl-sequential, --with-openblas with a path."
exit 1
else
mathlib=mkl
fi
fi
# If no cuda library specified, search for one
@@ -735,9 +696,6 @@ echo "#Configuration file for cntk" > $config
echo BUILDTYPE=$buildtype >> $config
echo MATHLIB=$mathlib >> $config
case $mathlib in
acml)
echo ACML_PATH=$acml_path >> $config
;;
mkl)
echo MKL_PATH=$mkl_path >> $config
echo MKL_THREADING=$mkl_threading >> $config