diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 8efd26cb1..348253603 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -418,21 +418,6 @@ public: TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUint16OpTable") END_TEST_METHOD() - BEGIN_TEST_METHOD(WaveMatrixLoadStoreTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - - BEGIN_TEST_METHOD(WaveMatrixScalarTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - - BEGIN_TEST_METHOD(WaveMatrixMathTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - BEGIN_TEST_METHOD(DotTest) TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable") END_TEST_METHOD() @@ -486,6 +471,18 @@ public: TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#PackUnpackOpTable") END_TEST_METHOD() + BEGIN_TEST_METHOD(WaveMatrixLoadStoreTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(WaveMatrixScalarTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(WaveMatrixMathTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + dxc::DxcDllSupport m_support; bool m_D3DInitCompleted = false; @@ -8654,9 +8651,8 @@ template void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, PCWSTR Validation_type, - double tolerance) { + dxc::DxcDllSupport &support, + PCWSTR Validation_type, double tolerance) { using namespace DirectX::PackedVector; using namespace WMMA; std::string dataTypeInShader = TypeIdToHlsl(); @@ -8850,9 +8846,6 @@ void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, std::fill(Data.begin(), Data.end(), (BYTE)0); } - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; - argsStream2.str(""); argsStream2 << initialArgsString; @@ -8994,8 +8987,8 @@ void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, template void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, PCWSTR Validation_type, double tolerance) { + dxc::DxcDllSupport &support, + PCWSTR Validation_type, double tolerance) { using namespace WMMA; using namespace DirectX::PackedVector; DXASSERT_NOMSG(sizeof(T) == sizeof(T2)); @@ -9237,9 +9230,8 @@ void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, expectedRowCols.size() * expectedRowCols[0].size() * sizeof(expectedRowCols[0][0])); } - // use shader from data table - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; + + // update compilation arguments pShaderOp->Shaders.at(0).Arguments = arguments.c_str(); }, ShaderOpSet); @@ -9317,8 +9309,8 @@ void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, template void WaveMatrixScalarTest(int DIM_M, int DIM_N, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, std::string dataTypeInShader, + dxc::DxcDllSupport &support, 
+ std::string dataTypeInShader, PCWSTR Validation_type, double tolerance, std::vector &floatScalars) { using namespace DirectX::PackedVector; @@ -9520,9 +9512,7 @@ void WaveMatrixScalarTest(int DIM_M, int DIM_N, CComPtr pDevice, std::fill(Data.begin(), Data.end(), (BYTE)0); } - // use shader from data table - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; + // update compilation arguments pShaderOp->Shaders.at(0).Arguments = arguments.c_str(); }, ShaderOpSet); @@ -9629,19 +9619,13 @@ TEST_F(ExecutionTest, WaveMatrixLoadStoreTests) { std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; + PCWSTR validationType = L"epsilon"; double tolerance = 0; // 0 tolerance for load store std::vector memTypes = {BUFFER, GROUPSHARED}; @@ -9675,26 +9659,24 @@ TEST_F(ExecutionTest, WaveMatrixLoadStoreTests) { L"Wmma_DisableLoadStoreTests", disableLoadStoreTests); if (disableLoadStoreTests == 0) { - CW2A LoadStoreText( - handler.GetTableParamByName(L"LoadStoreShaderOp.Text")->m_str); for (int dimM : dimMs) { for (int dimN : dimNs) { for (int memType : memTypes) { WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); } } } @@ -9708,20 +9690,14 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { std::vector dimMs; std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; - double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double; + PCWSTR validationType 
= L"epsilon"; + double tolerance = 0.008; ////////// // SCALAR @@ -9732,20 +9708,13 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { L"Wmma_DisableScalarTests", disableScalarTests); if (disableScalarTests == 0) { - CW2A ScalarText{handler.GetTableParamByName(L"ScalarShaderOp.Text")->m_str}; - std::vector *Validation_Scalar = - &handler.GetTableParamByName(L"ScalarValidation.Scalar")->m_StringTable; - - std::vector scalars(Validation_Scalar->size()); - for (size_t i = 0; i < Validation_Scalar->size(); ++i) { - VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_Scalar)[i], scalars[i])); - } + std::vector scalars = { -100.0f, 20.0f, -50.0f, -0.0f, 0.0f, 42.0f }; for (uint32_t dimM : dimMs) { for (uint32_t dimN : dimNs) { std::string hlslType = "float32_t"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); // hlslType is used for the CheckFeatureSupport query. @@ -9753,20 +9722,20 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { // accumulator precision returned by CheckFeatureSupport. hlslType = "float16_t"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); hlslType = "uint8_t4_packed"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, - m_support, Target, ScalarText, hlslType, + m_support, hlslType, validationType, tolerance, scalars); hlslType = "int8_t4_packed"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, - m_support, Target, ScalarText, hlslType, + m_support, hlslType, validationType, tolerance, scalars); } } @@ -9780,20 +9749,14 @@ TEST_F(ExecutionTest, WaveMatrixMathTests) { std::vector dimMs; std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; - double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double; + PCWSTR validationType = L"epsilon"; + double tolerance = 0.008; ////////// // MATH TEST @@ -9804,30 +9767,28 @@ TEST_F(ExecutionTest, WaveMatrixMathTests) { L"Wmma_DisableMathTests", disableMathTests); if (disableMathTests == 0) { - CW2A MathShaderText{ - handler.GetTableParamByName(L"MathShaderOp.Text")->m_str}; for (uint32_t dimM : dimMs) { for (uint32_t dimN : dimNs) { WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, 
ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); } } diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index afe5be9a1..0afa3532d 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -1298,7 +1298,410 @@ (j); + } + } + GroupMemoryBarrierWithGroupSync(); + } + + + void FillDest(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOut.Store(start + i * sizeof(DATATYPE), gsharedArr[i]); + // Also clear output so we don't write garbage if the whole buffer is not filled + gsharedArr[i] = 0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + #elif TEST_LOAD_STORE_ACCUMULATOR + groupshared TYPE_ACC gsharedArrAccumulator[MAX_NUM_ELEMENTS]; + + void ClearGShared(uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + gsharedArrAccumulator[i] = (TYPE_ACC)0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillSource(uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + uint j = 0; + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i, j += sizeof(TYPE_ACC)) + { + gsharedArrAccumulator[i] = g_bufInAccum.Load(j); + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillDest(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOutAccumulator.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); + gsharedArrAccumulator[i] = 0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillDestRowCol(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOutRowCol.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); + } + } + ClearGShared(threadX); + } + #endif + + #define LOAD_SOURCE gsharedArr + #define LOAD_SOURCE_ACCUM gsharedArrAccumulator + #define STORE_DEST gsharedArr + #define STORE_DEST_ROWCOL gsharedArrAccumulator + #define STORE_DEST_ACCUM gsharedArrAccumulator + + // Start/Stride/Offset are all given in bytes, and converted to array elements in the macros. 
+ + #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + mat.Store(g_bufOut, destOffset, lStride, false); + + #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + mat.Store(g_bufOut, destOffset, rStride, false); + + #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); + + #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); + + #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ + mata.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ + mata.Store(g_bufOutAccumulator, destOffset, aStride, false); + + #define TEST_STORE_LEFT(matl, k, stride, offset, transp, dest, destOffset) \ + matl.Load(g_bufIn, 0, lStride, false); \ + matl.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + FillDest(destOffset, groupThreadID.x); + + #define TEST_STORE_RIGHT(matr, k, stride, offset, transp, dest, destOffset) \ + matr.Load(g_bufIn, 0, rStride, false); \ + matr.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + FillDest(destOffset, groupThreadID.x); + + #define TEST_STORE_LEFT_COL(mat, k, stride, offset, dest, destOffset) \ + mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ + mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + FillDestRowCol(destOffset, groupThreadID.x); + + #define TEST_STORE_RIGHT_ROW(mat, k, stride, offset, dest, destOffset) \ + mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ + mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + FillDestRowCol(destOffset, groupThreadID.x); + + #define TEST_STORE_ACCUMULATOR(mata, k, stride, offset, transp, dest, destOffset) \ + mata.Load(g_bufInAccum, 0, aStride, false); \ + mata.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ + FillDest(destOffset, groupThreadID.x); + + #else + #define LOAD_SOURCE g_bufIn + #define LOAD_SOURCE_ACCUM g_bufInAccum + #define STORE_DEST g_bufOut + #define STORE_DEST_ROWCOL g_bufOutRowCol + #define STORE_DEST_ACCUM g_bufOutAccumulator + + void FillSource(uint threadX) {} // no-op + void FillDest(uint start, uint threadX) {} + void FillDestRowCol(uint start, uint threadX) {} + void ClearGShared(uint threadX) {} + + #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ + mat.Store(dest, destOffset, lStride, false); + + #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ + mat.Store(dest, destOffset, rStride, false); + + #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ + mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); + + #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, 
alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ + mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); + + #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ + mata.Load(LOAD_SOURCE_ACCUM, start, stride, transp, alignment); \ + mata.Store(dest, destOffset, aStride, false); + + #define TEST_STORE_LEFT(matl, k, stride, alignment, transp, dest, destOffset) \ + matl.Load(LOAD_SOURCE, 0, lStride, false); \ + matl.Store(dest, destOffset, stride, transp, alignment); + + #define TEST_STORE_RIGHT(matr, k, stride, alignment, transp, dest, destOffset) \ + matr.Load(LOAD_SOURCE, 0, rStride, false); \ + matr.Store(dest, destOffset, stride, transp, alignment); + + #define TEST_STORE_LEFT_COL(mat, k, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ + mat.Store(dest, destOffset, stride, alignment); + + #define TEST_STORE_RIGHT_ROW(mat, k, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ + mat.Store(dest, destOffset, stride, alignment); + + #define TEST_STORE_ACCUMULATOR(mata, k, stride, alignment, transp, dest, destOffset) \ + mata.Load(LOAD_SOURCE_ACCUM, 0, aStride, false); \ + mata.Store(dest, destOffset, stride, transp, alignment); + + #endif // GROUPSHARED if/else + + [WaveSize(NUM_LANES)] + #ifdef GROUPSHARED + [numthreads(NUM_LANES,1,1)] + #else + [numthreads(NUM_LANES * 2,1,1)] + #endif + void main(uint3 groupThreadID : SV_GroupThreadID) + { + uint rowColSize = 64 * 64 * sizeof(TYPE_ACC); + uint size = 2 * 64 * 64 * ELEMENTSIZE; + + // Calculate strides and offsets in bytes. + uint s = 16 * ELEMENTSIZE; // start + uint lStride = (DIM_K * ELEMENTSIZE); + uint rStride = (DIM_N * ELEMENTSIZE); + uint ltStride = (DIM_M * ELEMENTSIZE); + uint rtStride = (DIM_K * ELEMENTSIZE); + uint a = 4; // Alignment. For groupshared, tests store offset. 
+ + // For accumulator + uint sizeAcc = 2 * 64 * 64 * sizeof(TYPE_ACC); + uint s2 = 16 * sizeof(TYPE_ACC); // start + uint aStride = (DIM_N * sizeof(TYPE_ACC)); + uint atStride = (DIM_M * sizeof(TYPE_ACC)); + uint accElemStride = sizeof(TYPE_ACC); + + uint groupOffset = (groupThreadID.x/NUM_LANES) * 22; + + uint LOAD_LEFT_START = 0 + groupOffset; + uint LOAD_RIGHT_START = 1 + groupOffset; + uint LOAD_LEFT_STRIDE_P4 = 2 + groupOffset; + uint LOAD_RIGHT_STRIDE_P4 = 3 + groupOffset; + uint LOAD_LEFT_STRIDE_X2 = 4 + groupOffset; + uint LOAD_RIGHT_STRIDE_X2 = 5 + groupOffset; + uint LOAD_LEFT_ALIGNMENT = 6 + groupOffset; + uint LOAD_RIGHT_ALIGNMENT = 7 + groupOffset; + uint LOAD_LEFT_TRANSPOSE = 8 + groupOffset; + uint LOAD_RIGHT_TRANSPOSE = 9 + groupOffset; + uint LOAD_LEFT_ALLPARAMS = 10 + groupOffset; + uint LOAD_RIGHT_ALLPARAMS = 11 + groupOffset; + uint STORE_LEFT_STRIDE_P4 = 12 + groupOffset; + uint STORE_RIGHT_STRIDE_P4 = 13 + groupOffset; + uint STORE_LEFT_STRIDE_X2 = 14 + groupOffset; + uint STORE_RIGHT_STRIDE_X2 = 15 + groupOffset; + uint STORE_LEFT_ALIGNMENT = 16 + groupOffset; + uint STORE_RIGHT_ALIGNMENT = 17 + groupOffset; + uint STORE_LEFT_TRANSPOSE = 18 + groupOffset; + uint STORE_RIGHT_TRANSPOSE = 19 + groupOffset; + uint STORE_LEFT_ALLPARAMS = 20 + groupOffset; + uint STORE_RIGHT_ALLPARAMS = 21 + groupOffset; + +#if TEST_LOAD_STORE_LR + WaveMatrixLeft matLeft; + WaveMatrixRight matRight; + + if (groupThreadID.x == 0) + { + g_bufOutMatrixDepth.Store(0, matLeft.MatrixDepth()); + g_bufOutMatrixDepth.Store(0 + sizeof(uint), matRight.MatrixDepth()); + } + + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + ///////////////////////// + // Left/Right Matrices // + ///////////////////////// + TEST_LOAD_LEFT(matLeft, DIM_K, s, lStride , 0, false, STORE_DEST, LOAD_LEFT_START * size); + TEST_LOAD_RIGHT(matRight, DIM_K, s, rStride , 0, false, STORE_DEST, LOAD_RIGHT_START * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride + 4, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_P4 * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride + 4, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_P4 * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride * 2, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_X2 * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride * 2, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_X2 * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride , a, false, STORE_DEST, LOAD_LEFT_ALIGNMENT * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride , a, false, STORE_DEST, LOAD_RIGHT_ALIGNMENT * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, ltStride , 0, true , STORE_DEST, LOAD_LEFT_TRANSPOSE * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rtStride , 0, true , STORE_DEST, LOAD_RIGHT_TRANSPOSE * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, s, ltStride + 4, a, true , STORE_DEST, LOAD_LEFT_ALLPARAMS * size); + TEST_LOAD_RIGHT(matRight, DIM_K, s, rtStride + 4, a, true , STORE_DEST, LOAD_RIGHT_ALLPARAMS * size); + + ClearGShared(groupThreadID.x); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride + 4, 0, false, STORE_DEST, STORE_LEFT_STRIDE_P4 * size); + TEST_STORE_RIGHT(matRight, DIM_K, rStride + 4, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_P4 * size); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride * 2, 0, false, STORE_DEST, STORE_LEFT_STRIDE_X2 * size); + TEST_STORE_RIGHT(matRight, DIM_K, rStride * 2, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_X2 * size); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride , a, false, STORE_DEST, STORE_LEFT_ALIGNMENT * size); + TEST_STORE_RIGHT(matRight, DIM_K, rStride , a, false, STORE_DEST, 
STORE_RIGHT_ALIGNMENT * size); + + TEST_STORE_LEFT(matLeft, DIM_K, ltStride , 0, true , STORE_DEST, STORE_LEFT_TRANSPOSE * size); + TEST_STORE_RIGHT(matRight, DIM_K, rtStride , 0, true , STORE_DEST, STORE_RIGHT_TRANSPOSE * size); + + TEST_STORE_LEFT(matLeft, DIM_K, ltStride + 4, a, true , STORE_DEST, STORE_LEFT_ALLPARAMS * size); + TEST_STORE_RIGHT(matRight, DIM_K, rtStride + 4, a, true , STORE_DEST, STORE_RIGHT_ALLPARAMS * size); + +#endif +#if TEST_LOAD_STORE_ACCUMULATOR + /////////////////////// + // Accumulator Types // + /////////////////////// + WaveMatrixLeftColAcc matLeftColAcc; + WaveMatrixRightRowAcc matRightRowAcc; + WaveMatrixAccumulator matAccum; + #if FRAGS_ENABLE + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_START * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_START * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_P4 * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_P4 * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_X2 * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_X2 * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_LEFT_ALIGNMENT * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALIGNMENT * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_TRANSPOSE * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_TRANSPOSE * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_LEFT_ALLPARAMS * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALLPARAMS * rowColSize); + + ClearGShared(groupThreadID.x); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_P4 * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_P4 * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_X2 * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_X2 * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_LEFT_ALIGNMENT * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_RIGHT_ALIGNMENT * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_LEFT_TRANSPOSE * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_RIGHT_TRANSPOSE * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_LEFT_ALLPARAMS * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_RIGHT_ALLPARAMS * rowColSize); + #endif // #if FRAGS_ENABLE + + groupOffset = (groupThreadID.x/NUM_LANES) * 11; + uint LOAD_START = 0 + groupOffset; + uint LOAD_STRIDE_P4 = 1 + groupOffset; + 
uint LOAD_STRIDE_X2 = 2 + groupOffset; + uint LOAD_ALIGNMENT = 3 + groupOffset; + uint LOAD_TRANSPOSE = 4 + groupOffset; + uint LOAD_ALLPARAMS = 5 + groupOffset; + uint STORE_STRIDE_P4 = 6 + groupOffset; + uint STORE_STRIDE_X2 = 7 + groupOffset; + uint STORE_ALIGNMENT = 8 + groupOffset; + uint STORE_TRANSPOSE = 9 + groupOffset; + uint STORE_ALLPARAMS = 10 + groupOffset; + + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, aStride , 0, false, g_bufOutAccumulator, LOAD_START * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride + 4, 0, false, g_bufOutAccumulator, LOAD_STRIDE_P4 * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride * 2, 0, false, g_bufOutAccumulator, LOAD_STRIDE_X2 * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride , a, false, g_bufOutAccumulator, LOAD_ALIGNMENT * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , atStride , 0, true , g_bufOutAccumulator, LOAD_TRANSPOSE * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, atStride + 4, a, true , g_bufOutAccumulator, LOAD_ALLPARAMS * sizeAcc); + + ClearGShared(groupThreadID.x); + + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride + 4, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_P4 * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride * 2, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_X2 * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride , a, false, STORE_DEST_ACCUM, STORE_ALIGNMENT * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride , 0, true , STORE_DEST_ACCUM, STORE_TRANSPOSE * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride + 4, a, true , STORE_DEST_ACCUM, STORE_ALLPARAMS * sizeAcc); +#endif // #if TEST_LOAD_STORE_ACCUMULATOR + }; ]]> @@ -1323,7 +1726,124 @@ leftCol; + WaveMatrixRightRowAcc rightRow; + WaveMatrixAccumulator accumulator; + + TYPE_ACC scalar = g_bufInScalar.Load(groupID.x * sizeof(TYPE_ACC)); + + const uint lStride = (uint)(DIM_K * sizeof(TYPE_ACC)); + const uint rStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + + /////////// + // Accumulator + /////////// + + accumulator.Load(g_bufInAccumulator, scalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarMultiply(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarDivide(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarAdd(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarSubtract(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.Fill(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + +#if FRAGS_ENABLE + + /////////// + // Left Col + /////////// + + // We load and store the left col transposed (as a row) to save space + leftCol.Load (g_bufInLeftColAcc, 
scalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarMultiply(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarDivide(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarAdd(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarSubtract(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.Fill(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + /////////// + // Right Row + /////////// + + rightRow.Load (g_bufInRightRowAcc, scalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarMultiply(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarDivide(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarAdd(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarSubtract(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.Fill(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); +#endif // #if FRAGS_ENABLE + }; ]]> @@ -1340,7 +1860,66 @@ leftMatrix; + WaveMatrixRight rightMatrix; + WaveMatrixLeftColAcc leftCol; + WaveMatrixRightRowAcc rightRow; + WaveMatrixAccumulator accumulator; + WaveMatrixAccumulator outAccumulator; + + const uint lStride = (uint)(DIM_K * ELEMENTSIZE); + const uint rStride = (uint)(DIM_N * ELEMENTSIZE); + const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + + leftMatrix.Load(g_bufInMatrices, 0, lStride, false); + rightMatrix.Load(g_bufInMatrices, MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, rStride, false); + accumulator.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, aStride, false); + + outAccumulator.Multiply(leftMatrix, rightMatrix); + outAccumulator.Store(g_bufOutMatrices, outMulMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(42); + outAccumulator.MultiplyAccumulate(leftMatrix, rightMatrix); + outAccumulator.Store(g_bufOutMatrices, outMulAccumulateMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(42); + outAccumulator.Add(accumulator); + outAccumulator.Store(g_bufOutMatrices, outAddMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); +#if FRAGS_ENABLE + leftCol.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); + rightRow.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * 
ELEMENTSIZE, (int)sizeof(TYPE_ACC)); + + outAccumulator.Fill(0); + outAccumulator.Add(leftCol); + outAccumulator.Store(g_bufOutMatrices, outBroadcastAddColMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(0); + outAccumulator.Add(rightRow); + outAccumulator.Store(g_bufOutMatrices, outBroadcastAddRowMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + leftCol.SumAccumulate(leftMatrix); + rightRow.SumAccumulate(rightMatrix); + leftCol.Store(g_bufOutRowCols, outRowColOffset, (int)sizeof(TYPE_ACC)); + rightRow.Store(g_bufOutRowCols, outRowColOffset + 64 * sizeof(TYPE_ACC), (int)sizeof(TYPE_ACC)); +#endif //#if FRAGS_ENABLE + }; ]]> diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml b/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml index 7de0a7b5b..380d28557 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml @@ -4750,623 +4750,6 @@ -enable-16bit-types - - - String - double - String - String - String - String - - - epsilon - 0.008 - cs_6_8 - - (j); - } - } - GroupMemoryBarrierWithGroupSync(); - } - - - void FillDest(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOut.Store(start + i * sizeof(DATATYPE), gsharedArr[i]); - // Also clear output so we don't write garbage if the whole buffer is not filled - gsharedArr[i] = 0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - #elif TEST_LOAD_STORE_ACCUMULATOR - groupshared TYPE_ACC gsharedArrAccumulator[MAX_NUM_ELEMENTS]; - - void ClearGShared(uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - gsharedArrAccumulator[i] = (TYPE_ACC)0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillSource(uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - uint j = 0; - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i, j += sizeof(TYPE_ACC)) - { - gsharedArrAccumulator[i] = g_bufInAccum.Load(j); - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillDest(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOutAccumulator.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); - gsharedArrAccumulator[i] = 0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillDestRowCol(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOutRowCol.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); - } - } - ClearGShared(threadX); - } - #endif - - #define LOAD_SOURCE gsharedArr - #define LOAD_SOURCE_ACCUM gsharedArrAccumulator - #define STORE_DEST gsharedArr - #define STORE_DEST_ROWCOL gsharedArrAccumulator - #define STORE_DEST_ACCUM gsharedArrAccumulator - - // Start/Stride/Offset are all given in bytes, and converted to array elements in the macros. 
- - #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - mat.Store(g_bufOut, destOffset, lStride, false); - - #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - mat.Store(g_bufOut, destOffset, rStride, false); - - #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); - - #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); - - #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ - mata.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ - mata.Store(g_bufOutAccumulator, destOffset, aStride, false); - - #define TEST_STORE_LEFT(matl, k, stride, offset, transp, dest, destOffset) \ - matl.Load(g_bufIn, 0, lStride, false); \ - matl.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - FillDest(destOffset, groupThreadID.x); - - #define TEST_STORE_RIGHT(matr, k, stride, offset, transp, dest, destOffset) \ - matr.Load(g_bufIn, 0, rStride, false); \ - matr.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - FillDest(destOffset, groupThreadID.x); - - #define TEST_STORE_LEFT_COL(mat, k, stride, offset, dest, destOffset) \ - mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ - mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - FillDestRowCol(destOffset, groupThreadID.x); - - #define TEST_STORE_RIGHT_ROW(mat, k, stride, offset, dest, destOffset) \ - mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ - mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - FillDestRowCol(destOffset, groupThreadID.x); - - #define TEST_STORE_ACCUMULATOR(mata, k, stride, offset, transp, dest, destOffset) \ - mata.Load(g_bufInAccum, 0, aStride, false); \ - mata.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ - FillDest(destOffset, groupThreadID.x); - - #else - #define LOAD_SOURCE g_bufIn - #define LOAD_SOURCE_ACCUM g_bufInAccum - #define STORE_DEST g_bufOut - #define STORE_DEST_ROWCOL g_bufOutRowCol - #define STORE_DEST_ACCUM g_bufOutAccumulator - - void FillSource(uint threadX) {} // no-op - void FillDest(uint start, uint threadX) {} - void FillDestRowCol(uint start, uint threadX) {} - void ClearGShared(uint threadX) {} - - #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ - mat.Store(dest, destOffset, lStride, false); - - #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ - mat.Store(dest, destOffset, rStride, false); - - #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ - mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); - - #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, 
alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ - mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); - - #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ - mata.Load(LOAD_SOURCE_ACCUM, start, stride, transp, alignment); \ - mata.Store(dest, destOffset, aStride, false); - - #define TEST_STORE_LEFT(matl, k, stride, alignment, transp, dest, destOffset) \ - matl.Load(LOAD_SOURCE, 0, lStride, false); \ - matl.Store(dest, destOffset, stride, transp, alignment); - - #define TEST_STORE_RIGHT(matr, k, stride, alignment, transp, dest, destOffset) \ - matr.Load(LOAD_SOURCE, 0, rStride, false); \ - matr.Store(dest, destOffset, stride, transp, alignment); - - #define TEST_STORE_LEFT_COL(mat, k, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ - mat.Store(dest, destOffset, stride, alignment); - - #define TEST_STORE_RIGHT_ROW(mat, k, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ - mat.Store(dest, destOffset, stride, alignment); - - #define TEST_STORE_ACCUMULATOR(mata, k, stride, alignment, transp, dest, destOffset) \ - mata.Load(LOAD_SOURCE_ACCUM, 0, aStride, false); \ - mata.Store(dest, destOffset, stride, transp, alignment); - - #endif // GROUPSHARED if/else - - [WaveSize(NUM_LANES)] - #ifdef GROUPSHARED - [numthreads(NUM_LANES,1,1)] - #else - [numthreads(NUM_LANES * 2,1,1)] - #endif - void main(uint3 groupThreadID : SV_GroupThreadID) - { - uint rowColSize = 64 * 64 * sizeof(TYPE_ACC); - uint size = 2 * 64 * 64 * ELEMENTSIZE; - - // Calculate strides and offsets in bytes. - uint s = 16 * ELEMENTSIZE; // start - uint lStride = (DIM_K * ELEMENTSIZE); - uint rStride = (DIM_N * ELEMENTSIZE); - uint ltStride = (DIM_M * ELEMENTSIZE); - uint rtStride = (DIM_K * ELEMENTSIZE); - uint a = 4; // Alignment. For groupshared, tests store offset. 
- - // For accumulator - uint sizeAcc = 2 * 64 * 64 * sizeof(TYPE_ACC); - uint s2 = 16 * sizeof(TYPE_ACC); // start - uint aStride = (DIM_N * sizeof(TYPE_ACC)); - uint atStride = (DIM_M * sizeof(TYPE_ACC)); - uint accElemStride = sizeof(TYPE_ACC); - - uint groupOffset = (groupThreadID.x/NUM_LANES) * 22; - - uint LOAD_LEFT_START = 0 + groupOffset; - uint LOAD_RIGHT_START = 1 + groupOffset; - uint LOAD_LEFT_STRIDE_P4 = 2 + groupOffset; - uint LOAD_RIGHT_STRIDE_P4 = 3 + groupOffset; - uint LOAD_LEFT_STRIDE_X2 = 4 + groupOffset; - uint LOAD_RIGHT_STRIDE_X2 = 5 + groupOffset; - uint LOAD_LEFT_ALIGNMENT = 6 + groupOffset; - uint LOAD_RIGHT_ALIGNMENT = 7 + groupOffset; - uint LOAD_LEFT_TRANSPOSE = 8 + groupOffset; - uint LOAD_RIGHT_TRANSPOSE = 9 + groupOffset; - uint LOAD_LEFT_ALLPARAMS = 10 + groupOffset; - uint LOAD_RIGHT_ALLPARAMS = 11 + groupOffset; - uint STORE_LEFT_STRIDE_P4 = 12 + groupOffset; - uint STORE_RIGHT_STRIDE_P4 = 13 + groupOffset; - uint STORE_LEFT_STRIDE_X2 = 14 + groupOffset; - uint STORE_RIGHT_STRIDE_X2 = 15 + groupOffset; - uint STORE_LEFT_ALIGNMENT = 16 + groupOffset; - uint STORE_RIGHT_ALIGNMENT = 17 + groupOffset; - uint STORE_LEFT_TRANSPOSE = 18 + groupOffset; - uint STORE_RIGHT_TRANSPOSE = 19 + groupOffset; - uint STORE_LEFT_ALLPARAMS = 20 + groupOffset; - uint STORE_RIGHT_ALLPARAMS = 21 + groupOffset; - -#if TEST_LOAD_STORE_LR - WaveMatrixLeft matLeft; - WaveMatrixRight matRight; - - if (groupThreadID.x == 0) - { - g_bufOutMatrixDepth.Store(0, matLeft.MatrixDepth()); - g_bufOutMatrixDepth.Store(0 + sizeof(uint), matRight.MatrixDepth()); - } - - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - ///////////////////////// - // Left/Right Matrices // - ///////////////////////// - TEST_LOAD_LEFT(matLeft, DIM_K, s, lStride , 0, false, STORE_DEST, LOAD_LEFT_START * size); - TEST_LOAD_RIGHT(matRight, DIM_K, s, rStride , 0, false, STORE_DEST, LOAD_RIGHT_START * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride + 4, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_P4 * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride + 4, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_P4 * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride * 2, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_X2 * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride * 2, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_X2 * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride , a, false, STORE_DEST, LOAD_LEFT_ALIGNMENT * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride , a, false, STORE_DEST, LOAD_RIGHT_ALIGNMENT * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, ltStride , 0, true , STORE_DEST, LOAD_LEFT_TRANSPOSE * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rtStride , 0, true , STORE_DEST, LOAD_RIGHT_TRANSPOSE * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, s, ltStride + 4, a, true , STORE_DEST, LOAD_LEFT_ALLPARAMS * size); - TEST_LOAD_RIGHT(matRight, DIM_K, s, rtStride + 4, a, true , STORE_DEST, LOAD_RIGHT_ALLPARAMS * size); - - ClearGShared(groupThreadID.x); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride + 4, 0, false, STORE_DEST, STORE_LEFT_STRIDE_P4 * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride + 4, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_P4 * size); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride * 2, 0, false, STORE_DEST, STORE_LEFT_STRIDE_X2 * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride * 2, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_X2 * size); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride , a, false, STORE_DEST, STORE_LEFT_ALIGNMENT * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride , a, false, STORE_DEST, 
STORE_RIGHT_ALIGNMENT * size); - - TEST_STORE_LEFT(matLeft, DIM_K, ltStride , 0, true , STORE_DEST, STORE_LEFT_TRANSPOSE * size); - TEST_STORE_RIGHT(matRight, DIM_K, rtStride , 0, true , STORE_DEST, STORE_RIGHT_TRANSPOSE * size); - - TEST_STORE_LEFT(matLeft, DIM_K, ltStride + 4, a, true , STORE_DEST, STORE_LEFT_ALLPARAMS * size); - TEST_STORE_RIGHT(matRight, DIM_K, rtStride + 4, a, true , STORE_DEST, STORE_RIGHT_ALLPARAMS * size); - -#endif -#if TEST_LOAD_STORE_ACCUMULATOR - /////////////////////// - // Accumulator Types // - /////////////////////// - WaveMatrixLeftColAcc matLeftColAcc; - WaveMatrixRightRowAcc matRightRowAcc; - WaveMatrixAccumulator matAccum; - #if FRAGS_ENABLE - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_START * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_START * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_P4 * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_P4 * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_X2 * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_X2 * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_LEFT_ALIGNMENT * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALIGNMENT * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_TRANSPOSE * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_TRANSPOSE * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_LEFT_ALLPARAMS * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALLPARAMS * rowColSize); - - ClearGShared(groupThreadID.x); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_P4 * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_P4 * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_X2 * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_X2 * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_LEFT_ALIGNMENT * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_RIGHT_ALIGNMENT * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_LEFT_TRANSPOSE * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_RIGHT_TRANSPOSE * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_LEFT_ALLPARAMS * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_RIGHT_ALLPARAMS * rowColSize); - #endif // #if FRAGS_ENABLE - - groupOffset = (groupThreadID.x/NUM_LANES) * 11; - uint LOAD_START = 0 + groupOffset; - uint LOAD_STRIDE_P4 = 1 + groupOffset; - 
uint LOAD_STRIDE_X2 = 2 + groupOffset; - uint LOAD_ALIGNMENT = 3 + groupOffset; - uint LOAD_TRANSPOSE = 4 + groupOffset; - uint LOAD_ALLPARAMS = 5 + groupOffset; - uint STORE_STRIDE_P4 = 6 + groupOffset; - uint STORE_STRIDE_X2 = 7 + groupOffset; - uint STORE_ALIGNMENT = 8 + groupOffset; - uint STORE_TRANSPOSE = 9 + groupOffset; - uint STORE_ALLPARAMS = 10 + groupOffset; - - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, aStride , 0, false, g_bufOutAccumulator, LOAD_START * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride + 4, 0, false, g_bufOutAccumulator, LOAD_STRIDE_P4 * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride * 2, 0, false, g_bufOutAccumulator, LOAD_STRIDE_X2 * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride , a, false, g_bufOutAccumulator, LOAD_ALIGNMENT * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , atStride , 0, true , g_bufOutAccumulator, LOAD_TRANSPOSE * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, atStride + 4, a, true , g_bufOutAccumulator, LOAD_ALLPARAMS * sizeAcc); - - ClearGShared(groupThreadID.x); - - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride + 4, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_P4 * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride * 2, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_X2 * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride , a, false, STORE_DEST_ACCUM, STORE_ALIGNMENT * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride , 0, true , STORE_DEST_ACCUM, STORE_TRANSPOSE * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride + 4, a, true , STORE_DEST_ACCUM, STORE_ALLPARAMS * sizeAcc); -#endif // #if TEST_LOAD_STORE_ACCUMULATOR - }; - ]]> - - - leftCol; - WaveMatrixRightRowAcc rightRow; - WaveMatrixAccumulator accumulator; - - TYPE_ACC scalar = g_bufInScalar.Load(groupID.x * sizeof(TYPE_ACC)); - - const uint lStride = (uint)(DIM_K * sizeof(TYPE_ACC)); - const uint rStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - - /////////// - // Accumulator - /////////// - - accumulator.Load(g_bufInAccumulator, scalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarMultiply(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarDivide(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarAdd(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarSubtract(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.Fill(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - -#if FRAGS_ENABLE - - /////////// - // Left Col - /////////// - - // We load and store the left col transposed (as a row) to save space - leftCol.Load (g_bufInLeftColAcc, scalarMulOffset * 
DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarMultiply(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarDivide(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarAdd(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarSubtract(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.Fill(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - /////////// - // Right Row - /////////// - - rightRow.Load (g_bufInRightRowAcc, scalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarMultiply(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarDivide(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarAdd(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarSubtract(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.Fill(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); -#endif // #if FRAGS_ENABLE - }; - ]]> - - - leftMatrix; - WaveMatrixRight rightMatrix; - WaveMatrixLeftColAcc leftCol; - WaveMatrixRightRowAcc rightRow; - WaveMatrixAccumulator accumulator; - WaveMatrixAccumulator outAccumulator; - - const uint lStride = (uint)(DIM_K * ELEMENTSIZE); - const uint rStride = (uint)(DIM_N * ELEMENTSIZE); - const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - - leftMatrix.Load(g_bufInMatrices, 0, lStride, false); - rightMatrix.Load(g_bufInMatrices, MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, rStride, false); - accumulator.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, aStride, false); - - outAccumulator.Multiply(leftMatrix, rightMatrix); - outAccumulator.Store(g_bufOutMatrices, outMulMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(42); - outAccumulator.MultiplyAccumulate(leftMatrix, rightMatrix); - outAccumulator.Store(g_bufOutMatrices, outMulAccumulateMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(42); - outAccumulator.Add(accumulator); - outAccumulator.Store(g_bufOutMatrices, outAddMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); -#if FRAGS_ENABLE - leftCol.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); - rightRow.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); 
- - outAccumulator.Fill(0); - outAccumulator.Add(leftCol); - outAccumulator.Store(g_bufOutMatrices, outBroadcastAddColMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(0); - outAccumulator.Add(rightRow); - outAccumulator.Store(g_bufOutMatrices, outBroadcastAddRowMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - leftCol.SumAccumulate(leftMatrix); - rightRow.SumAccumulate(rightMatrix); - leftCol.Store(g_bufOutRowCols, outRowColOffset, (int)sizeof(TYPE_ACC)); - rightRow.Store(g_bufOutRowCols, outRowColOffset + 64 * sizeof(TYPE_ACC), (int)sizeof(TYPE_ACC)); -#endif //#if FRAGS_ENABLE - }; - ]]> - - - -100 - 20 - -50 - -0 - 0 - 42 - - -
String
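
For reference, the hardcoded Validation.Type of "epsilon" with a tolerance of 0.008 (and a tolerance of 0 for the load/store tests) implies a per-element closeness check between the expected and actual WaveMatrix outputs. The sketch below is illustrative only, assuming an absolute-difference reading of the tolerance; WithinEpsilon is a hypothetical name and is not the comparison helper actually used by ExecutionTest.cpp, which may instead apply a relative or ULP-based check for the float16 accumulator cases.

// Minimal sketch, assuming an absolute-difference interpretation of the
// "epsilon" validation type. Not the helper used by the test harness.
#include <cmath>
#include <cstdio>

// Hypothetical helper: true when |expected - actual| is within `tolerance`.
// NaN values only match other NaN values.
static bool WithinEpsilon(float expected, float actual, double tolerance) {
  if (std::isnan(expected) || std::isnan(actual))
    return std::isnan(expected) && std::isnan(actual);
  return std::fabs(static_cast<double>(expected) - static_cast<double>(actual)) <= tolerance;
}

int main() {
  const double tolerance = 0.008; // same value as the hardcoded math/scalar tolerance
  std::printf("%d\n", WithinEpsilon(1.000f, 1.004f, tolerance)); // 1: within tolerance
  std::printf("%d\n", WithinEpsilon(1.000f, 1.020f, tolerance)); // 0: outside tolerance
  return 0;
}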