diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu
index 474f41e4b..7e5157026 100644
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -4279,10 +4279,12 @@ template <class ElemType>
 static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
 {
     // using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
-    // And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
-    static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
-    if (deviceId >= _countof(onesCache))
-        LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
+    // And using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues.
+    // Using a plain array would lead the destructor to be called for every element in the array
+    const int CacheSize = 32;
+    static shared_ptr<GPUMatrix<ElemType>>* onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
+    if (deviceId >= CacheSize)
+        LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
     auto p = onesCache[deviceId];
     if (!p || p->GetNumRows() < N) // must (re-)allocate
     {
diff --git a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp
index ac7b26889..0361326f0 100644
--- a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp
+++ b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp
@@ -19,160 +19,6 @@
 using namespace Microsoft::MSR::CNTK;
 using namespace std;
 
-template <class ElemType>
-void SetToInitStateValueForResetSeg(const Matrix<ElemType>& sentenceBegin,
-                                    size_t nStream, ElemType initStateValue, Matrix<ElemType>& newprevstate)
-{
-    Matrix<ElemType> colSeg(sentenceBegin.GetDeviceId());
-    colSeg.Resize(nStream, nStream);
-    size_t nStateRow = newprevstate.GetNumRows();
-
-    assert(nStream == sentenceBegin.GetNumRows());
-
-    // only set state to init state value for segmentation = 0, and -1
-    // e.g., -1 0 1 -> 0 0 1 -> 0 0 -1 -> 1 1 0
-
-    Matrix<ElemType> colPos(sentenceBegin.GetDeviceId());
-    colPos.SetValue(sentenceBegin);                                                     // -1 0 1
-    colPos.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies
-    Matrix<ElemType>::Scale((ElemType) -1.0, colPos);
-    colPos += 0; // (int)MinibatchPackingFlags::None; // TODO: these flags no longer exist, this test probably no longer applies
-    colSeg.SetDiagonalValue(colPos);
-    Matrix<ElemType> ones(sentenceBegin.GetDeviceId());
-    ones.Resize(nStateRow, nStream);
-    ones.SetValue((ElemType) 1);
-    // add default state value if it is for reset
-    Matrix<ElemType>::MultiplyAndWeightedAdd(initStateValue, ones, false, colSeg, false, 1.0, newprevstate); // += [0 initStateValue 0 ]
-}
-
-template <class ElemType>
-void rnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& colBegin, const Matrix<ElemType>& needToCompute)
-{
-    size_t ncol = functionValues.GetNumCols();
-    size_t ntime = ncol / mNbr;
-    Matrix<ElemType> out = functionValues.ColumnSlice(0, mNbr);
-    Matrix<ElemType> inp((DEVICEID_TYPE) functionValues.GetDeviceId());
-
-    for (size_t d = 0; d < ntime; d++)
-    {
-        if (d == 0)
-            inp = pastActivity.ColumnSlice(d, mNbr);
-        else
-            inp = inputFunctionValues.ColumnSlice(d, mNbr);
-
-        if (needToCompute.ColumnSlice(d, 1).Get00Element() == 1)
-        {
-            Matrix<ElemType> colSegPastActivity((DEVICEID_TYPE) functionValues.GetDeviceId());
-            Matrix<ElemType> colSeg((DEVICEID_TYPE) functionValues.GetDeviceId());
-            colSeg.Resize(mNbr, mNbr);
-            colSeg.SetValue(0);
-            colSegPastActivity.SetValue(colBegin);
-            colSegPastActivity.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies
-            colSeg.SetDiagonalValue(colSegPastActivity);
-            Matrix<ElemType>::Multiply(inp, false, colSeg, false, out);
-            ElemType initStateValue = (ElemType) 0.1;
-            SetToInitStateValueForResetSeg<ElemType>(colBegin, mNbr, initStateValue, out);
-        }
-    }
-}
-
-template <class ElemType>
-void oldRnnForwardPropSRP(Matrix<ElemType>& functionValues, size_t mNbr, Matrix<ElemType>& pastActivity, Matrix<ElemType>& inputFunctionValues)
-{
-    size_t ncol = functionValues.GetNumCols();
-    size_t ntime = ncol / mNbr;
-    for (size_t timeIdxInSeq = 0; timeIdxInSeq < ntime; timeIdxInSeq++)
-    {
-        for (size_t i = 0; i < mNbr; i++)
-        {
-            bool reset = false;
-
-            if (timeIdxInSeq == 0)
-            {
-                reset = true;
-            }
-            oldRNNForwardPropSRP<ElemType>(timeIdxInSeq, 1, reset, (ElemType) 0.1, functionValues, pastActivity, inputFunctionValues, i, mNbr);
-        }
-    }
-}
-
-template <class ElemType>
-void oldRNNForwardPropSRP(const size_t timeIdxInSeq, const int delay, const bool reset, const ElemType default_activity, Matrix<ElemType>& functionValues, const Matrix<ElemType>& pastActivity, const Matrix<ElemType>& inputFunctionValues, const size_t indexInBatch, const size_t mNbr)
-{
-    assert(delay > 0);
-
-    if (functionValues.GetNumRows() != inputFunctionValues.GetNumRows() ||
-        functionValues.GetNumCols() != inputFunctionValues.GetNumCols())
-        functionValues.Resize(inputFunctionValues.GetNumRows(),
-                              inputFunctionValues.GetNumCols());
-
-    int iPastIndex = (int) ((int) timeIdxInSeq - (int) delay) * (int) mNbr;
-    int d = iPastIndex;
-    if (d < 0)
-        d = (int) functionValues.Mod((float) iPastIndex, (float) pastActivity.GetNumCols());
-    // this can point to the past activity of the previous mninibatch
-
-    Matrix<ElemType> out = functionValues.ColumnSlice(timeIdxInSeq * mNbr + indexInBatch, 1);
-    Matrix<ElemType> inp((DEVICEID_TYPE) functionValues.GetDeviceId());
-
-    if (reset)
-        out.SetValue(default_activity);
-    else
-    {
-        if (iPastIndex < 0)
-            inp = pastActivity.ColumnSlice(d + indexInBatch, 1);
-        else
-            inp = inputFunctionValues.ColumnSlice(d + indexInBatch, 1);
-        out.AssignValuesOf(inp);
-    }
-}
-
-/**
-The new way of resetting RNN state. 
-*/
-template <class ElemType>
-void TestRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
-{
-    Matrix<ElemType> functionValues(deviceID);
-    Matrix<ElemType> colBegin(deviceID);
-    Matrix<ElemType> pastActivity(deviceID);
-    Matrix<ElemType> inputFunctionValues(deviceID);
-    Matrix<ElemType> needToCompute(deviceID);
-
-    functionValues.Resize(nRow, nCol);
-    colBegin.Resize(mNbr, 1);
-    pastActivity.Resize(nRow, nCol);
-    inputFunctionValues.Resize(nRow, nCol);
-    needToCompute.Resize(1, nCol / mNbr);
-    needToCompute.SetValue(0);
-    needToCompute.ColumnSlice(0, 1).SetValue(1);
-    auto t_start = clock();
-    rnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues, colBegin, needToCompute);
-    auto t_end = clock();
-    std::cout << "testRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl;
-}
-
-/**
-The old way of resetting RNN state, which used if statement. Also only supports up to two sentences within a minibatch
-*/
-template <class ElemType>
-void TestOldRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0)
-{
-    Matrix<ElemType> functionValues(deviceID);
-    Matrix<ElemType> colBegin(deviceID);
-    Matrix<ElemType> pastActivity(deviceID);
-    Matrix<ElemType> inputFunctionValues(deviceID);
-
-    functionValues.Resize(nRow, nCol);
-    colBegin.Resize(mNbr, 1);
-    pastActivity.Resize(nRow, nCol);
-    inputFunctionValues.Resize(nRow, nCol);
-    auto t_start = clock();
-    oldRnnForwardPropSRP<ElemType>(functionValues, mNbr, pastActivity, inputFunctionValues);
-    auto t_end = clock();
-    std::cout << "TestOldRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl;
-}
-
 template <class ElemType>
 void randomInitializeCPUMatrix(CPUMatrix<ElemType>& M, float min = -10, float max = 10)
 {
@@ -253,77 +99,6 @@ void AddMultiplyAndInplaceSigmoidTest(int n, int k, int m)
     std::cout << "Matrix in: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
 }
 
-template <class ElemType>
-void ColumnSliceMultAndAddTest(int n, int k, int m, DEVICEID_TYPE deviceID)
-{
-    cout << "Testing Matrix" << endl;
-
-    Matrix<ElemType> AG((size_t) n, (size_t) k, deviceID);
-    AG.SetUniformRandomValue(-1, 1);
-
-    Matrix<ElemType> BG((size_t) k, (size_t) m, deviceID);
-    BG.SetUniformRandomValue(-1, 1);
-
-    Matrix<ElemType> CG((size_t) n, (size_t) m, deviceID);
-    Matrix<ElemType> DG((size_t) n, (size_t) m, deviceID);
-
-    auto t_startG = clock();
-    Matrix<ElemType>::MultiplyAndAdd(AG, false, BG, false, CG);
-    auto t_endG = clock();
-    std::cout << "MultiplyAndAdd Directly: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-
-    t_startG = clock();
-    for (int i = 0; i < m; i++)
-    {
-        Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
-        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
-        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
-    }
-    t_endG = clock();
-    std::cout << "MultiplyAndAdd With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-
-    t_startG = clock();
-    for (int i = 0; i < m; i++)
-    {
-        Matrix<ElemType> col_BG = BG.ColumnSlice(i, 1);
-        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
-        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
-    }
-    t_endG = clock();
-    std::cout << "MultiplyAndAdd With ColumnSlice&: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-
-    Matrix<ElemType> col_BG1(0), col_CG1(0);
-    t_startG = clock();
-    for (int i = 0; i < m; i++)
-    {
-        col_BG1.AssignColumnSlice(BG, i, 1);
-        col_CG1.AssignColumnSlice(CG, i, 1);
-        Matrix<ElemType>::MultiplyAndAdd(AG, false, col_BG1, false, col_CG1);
-    }
-    t_endG = clock();
-    std::cout << "MultiplyAndAdd With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-
-    t_startG = clock();
-    for (int i = 0; i < m; i++)
-    {
-        Matrix<ElemType> col_CG = CG.ColumnSlice(i, 1);
-        Matrix<ElemType> col_DG = DG.ColumnSlice(i, 1);
-        col_DG.AssignSigmoidOf(col_CG);
-    }
-    t_endG = clock();
-    std::cout << "AssignSigmoidOf With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-
-    t_startG = clock();
-    for (int i = 0; i < m; i++)
-    {
-        col_BG1.AssignColumnSlice(BG, i, 1);
-        col_CG1.AssignColumnSlice(CG, i, 1);
-        col_BG1.AssignSigmoidOf(col_CG1);
-    }
-    t_endG = clock();
-    std::cout << "AssignSigmoidOf With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl;
-}
-
 template <class ElemType>
 void SquareMultiplyAndAdd10TimesAvgTest(int n, int count)
 {
@@ -381,129 +156,6 @@ void SquareMultiplyAndAdd10TimesAvgTest(int n, int count)
     cout << "CPUMatrix/Matrix ratio is: " << cpu_avg / m_avg << " seconds" << endl;
 }
 
-// simple test suite for TensorView
-//  - this is meant for performance optimization
-//  - correctness is defined as same result between GPU and CPU
-template <class ElemType>
-struct TensorTest
-{
-    // helper to create a randomly initialized tensor object
-    static TensorView<ElemType> CreateTensor(TensorShape shape, int randomSeed, DEVICEID_TYPE deviceId, bool isResult = false)
-    {
-        let numElements = shape.GetNumElements();
-
-        if (isResult)
-            cout << " ->";
-        cout << " [" << string(shape) << "]";
-        if (isResult)
-            cout << " \t// " << (deviceId < 0 ? "C" : "G") << "PU\n   " << flush;
-
-        // random init
-        mt19937 rng(randomSeed);
-        uniform_real_distribution<float> nd(-1, 1);
-        vector<ElemType> init(numElements);
-        generate(begin(init), end(init), [&] { return nd(rng); });
-
-        // create storage object (one-column matrix)
-        let sob = make_shared<Matrix<ElemType>>(numElements/*rows*/, 1/*cols*/, init.data(), deviceId);
-
-        // create TensorView
-        return TensorView<ElemType>(sob, shape);
-    }
-
-    // test bias gradient (reduction)
-    static TensorView<ElemType> BiasGradientTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId)
-    {
-        int randomSeed = 1;
-        let  gradient = CreateTensor(layerShape, randomSeed++, deviceId);
-        auto bias = CreateTensor(biasShape, randomSeed++, deviceId, true);
-        //gradient.GetSOB().Print("incoming gradient", 0, 9, 0, 9);
-        //bias.GetSOB().Print("bias gradient", 0, 9, 0, 9);
-        bias.DoCopyOf(1, gradient, 1);
-        //bias.GetSOB().Print("updated bias gradient", 0, 9, 0, 9);
-        return bias;
-    }
-
-    // test broadcast summation gradient
-    static TensorView<ElemType> BroadcastingTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId)
-    {
-        int randomSeed = 1;
-        let  input  = CreateTensor(layerShape, randomSeed++, deviceId);
-        auto bias   = CreateTensor(biasShape,  randomSeed++, deviceId);
-        //input.GetSOB().Print("input data", 0, 9, 0, 9);
-        //bias.GetSOB().Print("bias", 0, 9, 0, 9);
-        auto result = CreateTensor(layerShape, randomSeed++, deviceId, true);
-        result.AssignSumOf(input, bias);
-        return result;
-    }
-
-    // run one test for both GPU and CPU and verify they are the same
-    template<typename FN>
-    static void OneTensorTest(const char* what, double tolerance, const FN& fn)
-    {
-        cout << "===== Tensor test '" << what << "'\n   ";
-
-        // run on GPU and CPU
-        let resultGPU = fn(0);
-        let resultCPU = fn(-1);
-
-        // dump top corner of the result to get a feel for the error
-        resultGPU.GetSOB().Print("GPU result", 0, 7, 0, 9);
-        resultGPU.GetSOB().TransferToDeviceIfNotThere(-1, true, false, true);
-        resultCPU.GetSOB().Print("CPU result", 0, 7, 0, 9);
-
-        // compare
-        let isSame = resultGPU.GetSOB().IsEqualTo(resultCPU.GetSOB(), (ElemType)tolerance);
-        cout << (isSame ? " --> SUCCEEDED. =====\n" : " --> FAILED (GPU and CPU results differ). =====\n") << endl << flush;
-        if (!isSame)
-            sin(1.0);  // set breakpoint here
-    }
-
-    // main entry point (misusing the constructor)
-    /*void*/ TensorTest()
-    {
-        // --- elementwise
-
-        // elementwise sum
-        OneTensorTest("elementwise addition", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BroadcastingTest(TensorShape{ 512, 256 }, TensorShape({ 512, 256 }), deviceId);
-        });
-
-        // --- broadcasting
-
-        // simple broadcasting
-        OneTensorTest("addition wth simple broadcasting", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BroadcastingTest(TensorShape{ 3, 2 }, TensorShape({ 3, 1 }), deviceId);
-        });
-        // typical bias for convolutional layer
-        OneTensorTest("bias addition (broadcasting)", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BroadcastingTest(TensorShape{ 28, 28, 128, 32 }, TensorShape({ 1, 1, 128 }), deviceId);
-        });
-        // BUGBUG: This test is strange--Print() shows different values with depth 128 instead of 64, but IsEqual() does not fail with 1e-3 tolerance.
-        //         Something fishy going on. Dimension overflow?
-        OneTensorTest("bias addition (broadcasting)", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BroadcastingTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId);
-        });
-
-        // --- reduction
-
-        // typical bias gradient (reduction) for FF-DNN
-        OneTensorTest("bias gradient (reduction)", 1e-4, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BiasGradientTest(TensorShape{ 2048, 1024 }, TensorShape(2048), deviceId);
-        });
-        // typical bias gradient (reduction) for convolutional layer
-        OneTensorTest("bias gradient (reduction)", 1e-1, [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
-        {
-            return BiasGradientTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId);
-        });
-    }
-};
-
 template <class ElemType>
 void MandSTest(int count, int devId)
 {
@@ -563,14 +215,6 @@ void MandSTest(int count, int devId)
 
 int wmain()
 {
-    //TensorTest<float>();
-
-    ColumnSliceMultAndAddTest<float>(2048, 2048, 256, 0);
-
-    TestRnnForwardPropSRP<float>();
-
-    TestOldRnnForwardPropSRP<float>();
-
     // MandSTest<float>(100, 2);
 
     /*cout<<endl<<"********************Matrix SquareMultiplyAndWeightedAdd10TimesAvg TEST********************"<<endl;