more acml removal

2016-08-16 15:08:08 +02:00 · 2016-08-16 15:08:08 +02:00 · 79cfcf7d4f
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@ -37,14 +37,8 @@
 #pragma warning(disable : 4244) // unreachable code; triggered for unknown reasons
 #pragma warning(disable : 4702) // conversion from 'double' to 'float'

-#ifdef USE_ACML
-// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64_mp variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.1 and above
-#elif defined(USE_MKL)
+
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@ -57,12 +51,6 @@
 #include <lapacke.h>
 #endif

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 #define SWAP(a, b)  \
    {               \
        (a) ^= (b); \
@ -912,11 +900,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
                    foreach_column (j, us)
                    {
-#ifdef USE_ACML
-                        dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#else
                        cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#endif
                    }
                }
                else
@ -926,11 +910,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
                    {
                        {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                            scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#else
                            cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#endif
                        }
                    }
                }
@ -2844,20 +2824,12 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#else
        return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#else
        return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#endif
    }
 }

@ -3028,11 +3000,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -3041,11 +3009,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(0, j) = snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#else
                c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -3058,11 +3022,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#else
                c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#endif
            }
        }
        else
@ -3071,11 +3031,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#else
                c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#endif
            }
        }
    }
@ -4486,34 +4442,22 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix

    int m, n, k, l;
    int lda, ldb, ldc;
-#ifdef USE_ACML
-    char transA, transB;
-#else
    CBLAS_TRANSPOSE mklTransA;
    CBLAS_TRANSPOSE mklTransB;
-#endif

    if (transposeA)
    {
        m = (int) a.GetNumCols();
        k = (int) a.GetNumRows();
        lda = k;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::Trans;
-#else
        mklTransA = CBLAS_TRANSPOSE::CblasTrans;
-#endif
    }
    else
    {
        m = (int) a.GetNumRows();
        k = (int) a.GetNumCols();
        lda = m;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::NoTrans;
-#else
        mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
    }

    if (transposeB)
@ -4521,22 +4465,14 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
        l = (int) b.GetNumCols();
        n = (int) b.GetNumRows();
        ldb = n;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::Trans;
-#else
        mklTransB = CBLAS_TRANSPOSE::CblasTrans;
-#endif
    }
    else
    {
        l = (int) b.GetNumRows();
        n = (int) b.GetNumCols();
        ldb = l;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::NoTrans;
-#else
        mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
    }

    assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow
@ -4553,20 +4489,12 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#else
-        cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#endif
+        cblas_dgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#else
-        cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#endif
+        cblas_sgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
    }
 }

@ -4611,9 +4539,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
        double wkopt;
        int lwork = -1;
        dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@ -4622,16 +4548,13 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
        dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, reinterpret_cast<double*>(W.Data()), &lwork, &info);
 #else
        std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
+        info = LAPACKE_dgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
            reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
    }
    else
    {
-#ifdef USE_ACML
-#pragma warning(suppress : 4244)
-        sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
        float wkopt;
        int lwork = -1;
        sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@ -4640,7 +4563,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
        sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, reinterpret_cast<float*>(W.Data()), &lwork, &info);
 #else
        std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
+        info = LAPACKE_sgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
            reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
    }
@ -4837,20 +4760,12 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&

        if (sizeof(ElemType) == sizeof(double))
        {
-#ifdef USE_ACML
-            daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#else
            cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#endif
        }
        else
        {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#else
            cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#endif
        }
    }
    else if (a.GetNumElements() == 1) // scalar, add to all elements
@ -4889,11 +4804,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                cblas_daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -4902,11 +4813,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                cblas_saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -4925,11 +4832,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#else
                cblas_daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#endif
            }
        }
        else
@ -4938,11 +4841,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#else
                cblas_saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#endif
            }
        }
    }
@ -5163,20 +5062,12 @@ template <class ElemType>
    }
    else if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx); // TODO: Use overloads.
-#else
        cblas_dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#else
        cblas_sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#endif
    }
 }

@ -5224,11 +5115,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -5237,11 +5124,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -5256,11 +5139,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
            }
        }
        else
@ -5269,11 +5148,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
            }
        }
    }
@ -5298,20 +5173,12 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#else
        return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#else
        return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#endif
    }
 }

@ -5539,21 +5406,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
        {
            for (long j = 0; j < n; j++)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < negnumber + 1; i++)
                {
-#ifdef USE_ACML
-                    c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                    c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
                }
            }
        }
@ -5561,21 +5420,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
        {
            for (long j = 0; j < n; j++)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < negnumber + 1; i++)
                {
-#ifdef USE_ACML
-                    c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                    c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
                }
            }
        }
@ -5593,11 +5444,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
                c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
            }
        }
        else
@ -5606,9 +5453,5 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
            }
@ -6025,13 +5868,11 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
    omp_set_num_threads(numThreads);
    numThreads = omp_get_max_threads();

-#ifdef USE_ACML
-    acmlsetnumthreads(numThreads);
-#elif defined(USE_MKL)
-    mkl_set_num_threads(numThreads);
-#elif defined(USE_OPENBLAS)
-    openblas_set_num_threads(numThreads);
-#endif
+#ifdef USE_MKL
+    mkl_set_num_threads(numThreads);
+#elif defined(USE_OPENBLAS)
+    openblas_set_num_threads(numThreads);
+#endif
 #endif
    return numThreads;
 }
--- a/Source/Math/CPUSparseMatrix.cpp
+++ b/Source/Math/CPUSparseMatrix.cpp
@ -23,15 +23,7 @@

 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this

-#ifdef USE_ACML
-// use ACML as default.
-// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64 variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.0 and above
-#elif defined(USE_MKL)
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@ -53,12 +45,6 @@
 //    return 42;
 //}

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 // TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing

@ -1340,20 +1326,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#else
        return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#else
        return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#endif
    }
 }

--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@ -227,6 +227,5 @@
  <Target Name="CheckDependencies">
    <Error Condition="'$(MathLibrary)' == 'MKL' And '$(CNTK_MKL_PATH)' == ''" Text="CNTK custom MKL location not specified, see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
    <Error Condition="'$(MathLibrary)' == 'MKL' And !Exists('$(CNTKCustomMKLPath)')" Text="CNTK custom MKL not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
-    <Error Condition="'$(MathLibrary)' == 'ACML' And !Exists('$(ACML_PATH)')" Text="ACML not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#acml for instructions." />
  </Target>
 </Project>
--- a/Tools/build-and-test
+++ b/Tools/build-and-test
@ -17,7 +17,7 @@ RANDOM_OUTPUT=0
 CODE_COVERAGE=no
 FLAVORS="debug:release"
 TARGETS="cpu:gpu"
-MATH_LIBRARY="acml"
+MATH_LIBRARY="mkl"
 TESTTARGETS="cpu:gpu"

 # parsing command line arguments:
@ -76,9 +76,6 @@ case $key in
    ;;
    -m|--math-library)
    case ${2,,} in
-        acml)
-        MATH_LIBRARY_OPTION="--with-acml=$ACML_PATH"
-        ;;
        mkl)
        MATH_LIBRARY_OPTION="--with-mkl=$MKL_PATH"
        ;;
@ -134,12 +131,6 @@ if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
        exit 1
    fi

-    if [[ $ACML_PATH == "" ]]; then
-        echo "============ ACML path not set  ============"
-        echo "============ ACML libraries are needed to successfully build CNTK ============"
-        exit 1
-    fi
-
    if [[ "${TARGETS,,}" =~ "1bitsgd" && "${TARGETS,,}" =~ "gpu" ]]; then
        echo "============ Cannot specify both GPU and 1bit-SGD as targets, please choose one ============"
        exit 1
--- a/Tools/generate_build_info
+++ b/Tools/generate_build_info
@ -11,7 +11,7 @@
 #   BUILDTYPE       (release/debug)
 #   BUILDTYPE       (GPU/CPU-only)
 #   WITH_1BITSGD    (whether 1bit-SGD support was enabled)
-#   MATHLIB         (MKL/ACML)
+#   MATHLIB         (MKL)
 #   CUDA_PATH       (if exists, i.e., for GPU builds)
 #   CUB_PATH        (if exists, i.e., for GPU builds)
 #   CUDNN_PATH      (if exists, i.e., only for GPU builds)
--- a/64
+++ b/64
@ -15,10 +15,6 @@ cuda_path=
 cuda_check=include/cuda.h
 enable_cuda=

-have_acml=no
-acml_path=
-acml_check=include/acml.h
-
 # CNTK Custom MKL Version
 cntk_custom_mkl_version=1

@ -79,7 +75,6 @@ enable_code_coverage=$default_use_code_coverage
 default_path_list="/usr /usr/local /opt /opt/local"

 # List from best to worst choice
-default_acmls="acml5.3.1/ifort64_mp"
 default_mkls="CNTKCustomMKL"
 default_openblas=""

@ -131,11 +126,6 @@ function find_dir ()
    done
 }

-function find_acml ()
-{
-    find_dir "$default_acmls" "$acml_check"
-}
-
 function find_mkl ()
 {
    find_dir "$default_mkls" "$mkl_check"
@ -237,7 +227,6 @@ function show_help ()
    echo "  --with-gdk-include[=directory] $(show_default $(find_gdk_include))"
    echo "  --with-gdk-nvml-lib[=directory] $(show_default $(find_gdk_nvml_lib))"
    echo "  --with-cudnn[=directory] $(show_default $(find_cudnn))"
-    echo "  --with-acml[=directory] $(show_default $(find_acml))"
    echo "  --with-mkl[=directory] $(show_default $(find_mkl))"
    echo "  --with-mkl-sequential[=directory] $(show_default $(find_mkl))"
    echo "  --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
@ -422,28 +411,6 @@ do
                fi
            fi
            ;;
-        --with-acml*)
-            have_acml=yes
-            mathlib=acml
-            if test x$optarg = x
-            then
-                acml_path=$(find_acml)
-                if test x$acml_path = x
-                then
-                    echo "Cannot find acml directory"
-                    echo "Please specify a value for --with-acml"
-                    exit 1
-                fi
-            else
-                if test $(check_dir $optarg $acml_check) = yes
-                then
-                    acml_path=$optarg
-                else
-                    echo "Invalid acml directory $optarg"
-                    exit 1
-                fi
-            fi
-            ;;
        --with-mkl*)
            have_mkl=yes
            mathlib=mkl
@ -603,24 +570,18 @@ then
    echo Defaulting to --with-buildtype=release
 fi

-# If no math library was specified, search for acml and then mkl
-if test x$have_acml = xno && test x$have_mkl = xno && test x$have_openblas = xno
+# If no math library was specified, search for mkl
+if test x$have_mkl = xno && test x$have_openblas = xno
 then
-    acml_path=$(find_acml)
-    if test x$acml_path = x
-    then
-        mkl_path=$(find_mkl)
-        if test x$mkl_path = x
-        then
-            echo "Cannot find a CPU math library."
-            echo "Please specify --with-acml, --with-mkl, --with-mkl-sequential, --with-openblas with a path."
-            exit 1
-        else
-            mathlib=mkl
-        fi
-    else
-        mathlib=acml
-    fi
+    mkl_path=$(find_mkl)
+    if test x$mkl_path = x
+    then
+        echo "Cannot find a CPU math library."
+        echo "Please specify --with-mkl, --with-mkl-sequential, --with-openblas with a path."
+        exit 1
+    else
+        mathlib=mkl
+    fi
 fi

 # If no cuda library specified, search for one
@ -735,9 +696,6 @@ echo "#Configuration file for cntk" > $config
 echo BUILDTYPE=$buildtype >> $config
 echo MATHLIB=$mathlib >> $config
 case $mathlib in
-    acml)
-        echo ACML_PATH=$acml_path >> $config
-        ;;
    mkl)
        echo MKL_PATH=$mkl_path >> $config
        echo MKL_THREADING=$mkl_threading >> $config