diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 474f41e4b..7e5157026 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -4279,10 +4279,12 @@ template static shared_ptr> GetOnesVector(size_t N, DEVICEID_TYPE deviceId) { // using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable. - // And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues. - static shared_ptr> onesCache[32]; // cache of objects - if (deviceId >= _countof(onesCache)) - LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1); + // And using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues. + // Using a plain array would lead the destructor to be called for every element in the array + const int CacheSize = 32; + static shared_ptr>* onesCache = new shared_ptr>[CacheSize]; // cache of objects + if (deviceId >= CacheSize) + LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1); auto p = onesCache[deviceId]; if (!p || p->GetNumRows() < N) // must (re-)allocate { diff --git a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp index ac7b26889..0361326f0 100644 --- a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp +++ b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp @@ -19,160 +19,6 @@ using namespace Microsoft::MSR::CNTK; using namespace std; -template -void SetToInitStateValueForResetSeg(const Matrix& sentenceBegin, - size_t nStream, ElemType initStateValue, Matrix& newprevstate) -{ - Matrix colSeg(sentenceBegin.GetDeviceId()); - colSeg.Resize(nStream, nStream); - size_t nStateRow = newprevstate.GetNumRows(); - - assert(nStream == sentenceBegin.GetNumRows()); - - // only set state to init state value for segmentation = 0, and -1 - // e.g., -1 0 1 -> 0 0 1 -> 0 0 -1 -> 1 1 0 - - Matrix colPos(sentenceBegin.GetDeviceId()); - colPos.SetValue(sentenceBegin); // -1 0 1 - colPos.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies - Matrix::Scale((ElemType) -1.0, colPos); - colPos += 0; // (int)MinibatchPackingFlags::None; // TODO: these flags no longer exist, this test probably no longer applies - colSeg.SetDiagonalValue(colPos); - Matrix ones(sentenceBegin.GetDeviceId()); - ones.Resize(nStateRow, nStream); - ones.SetValue((ElemType) 1); - // add default state value if it is for reset - Matrix::MultiplyAndWeightedAdd(initStateValue, ones, false, colSeg, false, 1.0, newprevstate); // += [0 initStateValue 0 ] -} - -template -void rnnForwardPropSRP(Matrix& functionValues, size_t mNbr, Matrix& pastActivity, Matrix& inputFunctionValues, Matrix& colBegin, const Matrix& needToCompute) -{ - size_t ncol = functionValues.GetNumCols(); - size_t ntime = ncol / mNbr; - Matrix out = functionValues.ColumnSlice(0, mNbr); - Matrix inp((DEVICEID_TYPE) functionValues.GetDeviceId()); - - for (size_t d = 0; d < ntime; d++) - { - if (d == 0) - inp = pastActivity.ColumnSlice(d, mNbr); - else - inp = inputFunctionValues.ColumnSlice(d, mNbr); - - if (needToCompute.ColumnSlice(d, 1).Get00Element() == 1) - { - Matrix colSegPastActivity((DEVICEID_TYPE) functionValues.GetDeviceId()); - Matrix colSeg((DEVICEID_TYPE) functionValues.GetDeviceId()); - colSeg.Resize(mNbr, mNbr); - colSeg.SetValue(0); - colSegPastActivity.SetValue(colBegin); - colSegPastActivity.InplaceTruncateBottom(1 << 0 /*(int)MinibatchPackingFlags::SequenceStart*/); // TODO: these flags no longer exist, this test probably no longer applies - colSeg.SetDiagonalValue(colSegPastActivity); - Matrix::Multiply(inp, false, colSeg, false, out); - ElemType initStateValue = (ElemType) 0.1; - SetToInitStateValueForResetSeg(colBegin, mNbr, initStateValue, out); - } - } -} - -template -void oldRnnForwardPropSRP(Matrix& functionValues, size_t mNbr, Matrix& pastActivity, Matrix& inputFunctionValues) -{ - size_t ncol = functionValues.GetNumCols(); - size_t ntime = ncol / mNbr; - for (size_t timeIdxInSeq = 0; timeIdxInSeq < ntime; timeIdxInSeq++) - { - for (size_t i = 0; i < mNbr; i++) - { - bool reset = false; - - if (timeIdxInSeq == 0) - { - reset = true; - } - oldRNNForwardPropSRP(timeIdxInSeq, 1, reset, (ElemType) 0.1, functionValues, pastActivity, inputFunctionValues, i, mNbr); - } - } -} - -template -void oldRNNForwardPropSRP(const size_t timeIdxInSeq, const int delay, const bool reset, const ElemType default_activity, Matrix& functionValues, const Matrix& pastActivity, const Matrix& inputFunctionValues, const size_t indexInBatch, const size_t mNbr) -{ - assert(delay > 0); - - if (functionValues.GetNumRows() != inputFunctionValues.GetNumRows() || - functionValues.GetNumCols() != inputFunctionValues.GetNumCols()) - functionValues.Resize(inputFunctionValues.GetNumRows(), - inputFunctionValues.GetNumCols()); - - int iPastIndex = (int) ((int) timeIdxInSeq - (int) delay) * (int) mNbr; - int d = iPastIndex; - if (d < 0) - d = (int) functionValues.Mod((float) iPastIndex, (float) pastActivity.GetNumCols()); - // this can point to the past activity of the previous mninibatch - - Matrix out = functionValues.ColumnSlice(timeIdxInSeq * mNbr + indexInBatch, 1); - Matrix inp((DEVICEID_TYPE) functionValues.GetDeviceId()); - - if (reset) - out.SetValue(default_activity); - else - { - if (iPastIndex < 0) - inp = pastActivity.ColumnSlice(d + indexInBatch, 1); - else - inp = inputFunctionValues.ColumnSlice(d + indexInBatch, 1); - out.AssignValuesOf(inp); - } -} - -/** -The new way of resetting RNN state. -*/ -template -void TestRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0) -{ - Matrix functionValues(deviceID); - Matrix colBegin(deviceID); - Matrix pastActivity(deviceID); - Matrix inputFunctionValues(deviceID); - Matrix needToCompute(deviceID); - - functionValues.Resize(nRow, nCol); - colBegin.Resize(mNbr, 1); - pastActivity.Resize(nRow, nCol); - inputFunctionValues.Resize(nRow, nCol); - needToCompute.Resize(1, nCol / mNbr); - needToCompute.SetValue(0); - needToCompute.ColumnSlice(0, 1).SetValue(1); - auto t_start = clock(); - rnnForwardPropSRP(functionValues, mNbr, pastActivity, inputFunctionValues, colBegin, needToCompute); - auto t_end = clock(); - std::cout << "testRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl; -} - -/** -The old way of resetting RNN state, which used if statement. Also only supports up to two sentences within a minibatch -*/ -template -void TestOldRnnForwardPropSRP(size_t nRow = 100, size_t nCol = 1000, size_t mNbr = 10, DEVICEID_TYPE deviceID = 0) -{ - Matrix functionValues(deviceID); - Matrix colBegin(deviceID); - Matrix pastActivity(deviceID); - Matrix inputFunctionValues(deviceID); - - functionValues.Resize(nRow, nCol); - colBegin.Resize(mNbr, 1); - pastActivity.Resize(nRow, nCol); - inputFunctionValues.Resize(nRow, nCol); - auto t_start = clock(); - oldRnnForwardPropSRP(functionValues, mNbr, pastActivity, inputFunctionValues); - auto t_end = clock(); - std::cout << "TestOldRnnForwardPropSRP: " << 1.0 * (t_end - t_start) / CLOCKS_PER_SEC << " seconds" << endl; -} - template void randomInitializeCPUMatrix(CPUMatrix& M, float min = -10, float max = 10) { @@ -253,77 +99,6 @@ void AddMultiplyAndInplaceSigmoidTest(int n, int k, int m) std::cout << "Matrix in: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; } -template -void ColumnSliceMultAndAddTest(int n, int k, int m, DEVICEID_TYPE deviceID) -{ - cout << "Testing Matrix" << endl; - - Matrix AG((size_t) n, (size_t) k, deviceID); - AG.SetUniformRandomValue(-1, 1); - - Matrix BG((size_t) k, (size_t) m, deviceID); - BG.SetUniformRandomValue(-1, 1); - - Matrix CG((size_t) n, (size_t) m, deviceID); - Matrix DG((size_t) n, (size_t) m, deviceID); - - auto t_startG = clock(); - Matrix::MultiplyAndAdd(AG, false, BG, false, CG); - auto t_endG = clock(); - std::cout << "MultiplyAndAdd Directly: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; - - t_startG = clock(); - for (int i = 0; i < m; i++) - { - Matrix col_BG = BG.ColumnSlice(i, 1); - Matrix col_CG = CG.ColumnSlice(i, 1); - Matrix::MultiplyAndAdd(AG, false, col_BG, false, col_CG); - } - t_endG = clock(); - std::cout << "MultiplyAndAdd With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; - - t_startG = clock(); - for (int i = 0; i < m; i++) - { - Matrix col_BG = BG.ColumnSlice(i, 1); - Matrix col_CG = CG.ColumnSlice(i, 1); - Matrix::MultiplyAndAdd(AG, false, col_BG, false, col_CG); - } - t_endG = clock(); - std::cout << "MultiplyAndAdd With ColumnSlice&: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; - - Matrix col_BG1(0), col_CG1(0); - t_startG = clock(); - for (int i = 0; i < m; i++) - { - col_BG1.AssignColumnSlice(BG, i, 1); - col_CG1.AssignColumnSlice(CG, i, 1); - Matrix::MultiplyAndAdd(AG, false, col_BG1, false, col_CG1); - } - t_endG = clock(); - std::cout << "MultiplyAndAdd With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; - - t_startG = clock(); - for (int i = 0; i < m; i++) - { - Matrix col_CG = CG.ColumnSlice(i, 1); - Matrix col_DG = DG.ColumnSlice(i, 1); - col_DG.AssignSigmoidOf(col_CG); - } - t_endG = clock(); - std::cout << "AssignSigmoidOf With ColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; - - t_startG = clock(); - for (int i = 0; i < m; i++) - { - col_BG1.AssignColumnSlice(BG, i, 1); - col_CG1.AssignColumnSlice(CG, i, 1); - col_BG1.AssignSigmoidOf(col_CG1); - } - t_endG = clock(); - std::cout << "AssignSigmoidOf With AssignColumnSlice: " << 1.0 * (t_endG - t_startG) / CLOCKS_PER_SEC << " seconds" << endl; -} - template void SquareMultiplyAndAdd10TimesAvgTest(int n, int count) { @@ -381,129 +156,6 @@ void SquareMultiplyAndAdd10TimesAvgTest(int n, int count) cout << "CPUMatrix/Matrix ratio is: " << cpu_avg / m_avg << " seconds" << endl; } -// simple test suite for TensorView -// - this is meant for performance optimization -// - correctness is defined as same result between GPU and CPU -template -struct TensorTest -{ - // helper to create a randomly initialized tensor object - static TensorView CreateTensor(TensorShape shape, int randomSeed, DEVICEID_TYPE deviceId, bool isResult = false) - { - let numElements = shape.GetNumElements(); - - if (isResult) - cout << " ->"; - cout << " [" << string(shape) << "]"; - if (isResult) - cout << " \t// " << (deviceId < 0 ? "C" : "G") << "PU\n " << flush; - - // random init - mt19937 rng(randomSeed); - uniform_real_distribution nd(-1, 1); - vector init(numElements); - generate(begin(init), end(init), [&] { return nd(rng); }); - - // create storage object (one-column matrix) - let sob = make_shared>(numElements/*rows*/, 1/*cols*/, init.data(), deviceId); - - // create TensorView - return TensorView(sob, shape); - } - - // test bias gradient (reduction) - static TensorView BiasGradientTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId) - { - int randomSeed = 1; - let gradient = CreateTensor(layerShape, randomSeed++, deviceId); - auto bias = CreateTensor(biasShape, randomSeed++, deviceId, true); - //gradient.GetSOB().Print("incoming gradient", 0, 9, 0, 9); - //bias.GetSOB().Print("bias gradient", 0, 9, 0, 9); - bias.DoCopyOf(1, gradient, 1); - //bias.GetSOB().Print("updated bias gradient", 0, 9, 0, 9); - return bias; - } - - // test broadcast summation gradient - static TensorView BroadcastingTest(TensorShape layerShape, TensorShape biasShape, DEVICEID_TYPE deviceId) - { - int randomSeed = 1; - let input = CreateTensor(layerShape, randomSeed++, deviceId); - auto bias = CreateTensor(biasShape, randomSeed++, deviceId); - //input.GetSOB().Print("input data", 0, 9, 0, 9); - //bias.GetSOB().Print("bias", 0, 9, 0, 9); - auto result = CreateTensor(layerShape, randomSeed++, deviceId, true); - result.AssignSumOf(input, bias); - return result; - } - - // run one test for both GPU and CPU and verify they are the same - template - static void OneTensorTest(const char* what, double tolerance, const FN& fn) - { - cout << "===== Tensor test '" << what << "'\n "; - - // run on GPU and CPU - let resultGPU = fn(0); - let resultCPU = fn(-1); - - // dump top corner of the result to get a feel for the error - resultGPU.GetSOB().Print("GPU result", 0, 7, 0, 9); - resultGPU.GetSOB().TransferToDeviceIfNotThere(-1, true, false, true); - resultCPU.GetSOB().Print("CPU result", 0, 7, 0, 9); - - // compare - let isSame = resultGPU.GetSOB().IsEqualTo(resultCPU.GetSOB(), (ElemType)tolerance); - cout << (isSame ? " --> SUCCEEDED. =====\n" : " --> FAILED (GPU and CPU results differ). =====\n") << endl << flush; - if (!isSame) - sin(1.0); // set breakpoint here - } - - // main entry point (misusing the constructor) - /*void*/ TensorTest() - { - // --- elementwise - - // elementwise sum - OneTensorTest("elementwise addition", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BroadcastingTest(TensorShape{ 512, 256 }, TensorShape({ 512, 256 }), deviceId); - }); - - // --- broadcasting - - // simple broadcasting - OneTensorTest("addition wth simple broadcasting", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BroadcastingTest(TensorShape{ 3, 2 }, TensorShape({ 3, 1 }), deviceId); - }); - // typical bias for convolutional layer - OneTensorTest("bias addition (broadcasting)", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BroadcastingTest(TensorShape{ 28, 28, 128, 32 }, TensorShape({ 1, 1, 128 }), deviceId); - }); - // BUGBUG: This test is strange--Print() shows different values with depth 128 instead of 64, but IsEqual() does not fail with 1e-3 tolerance. - // Something fishy going on. Dimension overflow? - OneTensorTest("bias addition (broadcasting)", 1e-8, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BroadcastingTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId); - }); - - // --- reduction - - // typical bias gradient (reduction) for FF-DNN - OneTensorTest("bias gradient (reduction)", 1e-4, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BiasGradientTest(TensorShape{ 2048, 1024 }, TensorShape(2048), deviceId); - }); - // typical bias gradient (reduction) for convolutional layer - OneTensorTest("bias gradient (reduction)", 1e-1, [](DEVICEID_TYPE deviceId) -> TensorView - { - return BiasGradientTest(TensorShape{ 256, 256, 64, 32 }, TensorShape({ 1, 1, 64 }), deviceId); - }); - } -}; - template void MandSTest(int count, int devId) { @@ -563,14 +215,6 @@ void MandSTest(int count, int devId) int wmain() { - //TensorTest(); - - ColumnSliceMultAndAddTest(2048, 2048, 256, 0); - - TestRnnForwardPropSRP(); - - TestOldRnnForwardPropSRP(); - // MandSTest(100, 2); /*cout<