From 29823abbcd18c3b441042d0bca8973bf4d67cf87 Mon Sep 17 00:00:00 2001 From: Tex Riddell Date: Fri, 21 Jul 2023 19:20:17 -0700 Subject: [PATCH] WaveMatrix test data refactor: remove ShaderOpArithTable.xml dependency (#5336) This test used the ShaderOpArithTable.xml in a weird way that breaks the way we map to the HLK test. Use of ShaderOpArithTable.xml in this test was unnecessary, since it wasn't using more than one row with different parameter sets to define data-driven test cases. This change simplifies things and gets rid of this dependency. The impact is that the shaders are now defined in ShaderOpArith.xml in place of the dummy shaders that used to be there, instead of ShaderOpArithTable.xml. The shader text and target are used as defined from ShaderOpArith.xml, instead of overriding those values with ones from ShaderOpArithTable.xml. The only changes to the shader content during the move is in whitespace: indentation changed to be consistent with target file, trailing whitespace removed. --- .../unittests/HLSLExec/ExecutionTest.cpp | 137 ++-- .../unittests/HLSLExec/ShaderOpArith.xml | 585 ++++++++++++++++- .../unittests/HLSLExec/ShaderOpArithTable.xml | 617 ------------------ 3 files changed, 631 insertions(+), 708 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 8efd26cb1..348253603 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -418,21 +418,6 @@ public: TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUint16OpTable") END_TEST_METHOD() - BEGIN_TEST_METHOD(WaveMatrixLoadStoreTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - - BEGIN_TEST_METHOD(WaveMatrixScalarTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - - BEGIN_TEST_METHOD(WaveMatrixMathTests) - TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveMatrixTable") - TEST_METHOD_PROPERTY(L"Priority", L"2") - END_TEST_METHOD() - BEGIN_TEST_METHOD(DotTest) TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable") END_TEST_METHOD() @@ -486,6 +471,18 @@ public: TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#PackUnpackOpTable") END_TEST_METHOD() + BEGIN_TEST_METHOD(WaveMatrixLoadStoreTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(WaveMatrixScalarTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + + BEGIN_TEST_METHOD(WaveMatrixMathTests) + TEST_METHOD_PROPERTY(L"Priority", L"2") + END_TEST_METHOD() + dxc::DxcDllSupport m_support; bool m_D3DInitCompleted = false; @@ -8654,9 +8651,8 @@ template void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, PCWSTR Validation_type, - double tolerance) { + dxc::DxcDllSupport &support, + PCWSTR Validation_type, double tolerance) { using namespace DirectX::PackedVector; using namespace WMMA; std::string dataTypeInShader = TypeIdToHlsl(); @@ -8850,9 +8846,6 @@ void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, std::fill(Data.begin(), Data.end(), (BYTE)0); } - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; - 
argsStream2.str(""); argsStream2 << initialArgsString; @@ -8994,8 +8987,8 @@ void WaveMatrixLoadStoreTest(int DIM_M, int DIM_N, int MEM_TYPE, template void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, PCWSTR Validation_type, double tolerance) { + dxc::DxcDllSupport &support, + PCWSTR Validation_type, double tolerance) { using namespace WMMA; using namespace DirectX::PackedVector; DXASSERT_NOMSG(sizeof(T) == sizeof(T2)); @@ -9237,9 +9230,8 @@ void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, expectedRowCols.size() * expectedRowCols[0].size() * sizeof(expectedRowCols[0][0])); } - // use shader from data table - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; + + // update compilation arguments pShaderOp->Shaders.at(0).Arguments = arguments.c_str(); }, ShaderOpSet); @@ -9317,8 +9309,8 @@ void WaveMatrixMathTest(int DIM_M, int DIM_N, CComPtr pDevice, template void WaveMatrixScalarTest(int DIM_M, int DIM_N, CComPtr pDevice, std::shared_ptr ShaderOpSet, - dxc::DxcDllSupport &support, std::string &Target, - CW2A &Text, std::string dataTypeInShader, + dxc::DxcDllSupport &support, + std::string dataTypeInShader, PCWSTR Validation_type, double tolerance, std::vector &floatScalars) { using namespace DirectX::PackedVector; @@ -9520,9 +9512,7 @@ void WaveMatrixScalarTest(int DIM_M, int DIM_N, CComPtr pDevice, std::fill(Data.begin(), Data.end(), (BYTE)0); } - // use shader from data table - pShaderOp->Shaders.at(0).Target = Target.c_str(); - pShaderOp->Shaders.at(0).Text = Text.m_psz; + // update compilation arguments pShaderOp->Shaders.at(0).Arguments = arguments.c_str(); }, ShaderOpSet); @@ -9629,19 +9619,13 @@ TEST_F(ExecutionTest, WaveMatrixLoadStoreTests) { std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; + PCWSTR validationType = L"epsilon"; double tolerance = 0; // 0 tolerance for load store std::vector memTypes = {BUFFER, GROUPSHARED}; @@ -9675,26 +9659,24 @@ TEST_F(ExecutionTest, WaveMatrixLoadStoreTests) { L"Wmma_DisableLoadStoreTests", disableLoadStoreTests); if (disableLoadStoreTests == 0) { - CW2A LoadStoreText( - handler.GetTableParamByName(L"LoadStoreShaderOp.Text")->m_str); for (int dimM : dimMs) { for (int dimN : dimNs) { for (int memType : memTypes) { WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + 
validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); WaveMatrixLoadStoreTest( - dimM, dimN, memType, pDevice, ShaderOpSet, m_support, Target, - LoadStoreText, validationType, tolerance); + dimM, dimN, memType, pDevice, ShaderOpSet, m_support, + validationType, tolerance); } } } @@ -9708,20 +9690,14 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { std::vector dimMs; std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; - double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double; + PCWSTR validationType = L"epsilon"; + double tolerance = 0.008; ////////// // SCALAR @@ -9732,20 +9708,13 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { L"Wmma_DisableScalarTests", disableScalarTests); if (disableScalarTests == 0) { - CW2A ScalarText{handler.GetTableParamByName(L"ScalarShaderOp.Text")->m_str}; - std::vector *Validation_Scalar = - &handler.GetTableParamByName(L"ScalarValidation.Scalar")->m_StringTable; - - std::vector scalars(Validation_Scalar->size()); - for (size_t i = 0; i < Validation_Scalar->size(); ++i) { - VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_Scalar)[i], scalars[i])); - } + std::vector scalars = { -100.0f, 20.0f, -50.0f, -0.0f, 0.0f, 42.0f }; for (uint32_t dimM : dimMs) { for (uint32_t dimN : dimNs) { std::string hlslType = "float32_t"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); // hlslType is used for the CheckFeatureSupport query. @@ -9753,20 +9722,20 @@ TEST_F(ExecutionTest, WaveMatrixScalarTests) { // accumulator precision returned by CheckFeatureSupport. 
hlslType = "float16_t"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, m_support, - Target, ScalarText, hlslType, + hlslType, validationType, tolerance, scalars); hlslType = "uint8_t4_packed"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, - m_support, Target, ScalarText, hlslType, + m_support, hlslType, validationType, tolerance, scalars); hlslType = "int8_t4_packed"; WaveMatrixScalarTest(dimM, dimN, pDevice, ShaderOpSet, - m_support, Target, ScalarText, hlslType, + m_support, hlslType, validationType, tolerance, scalars); } } @@ -9780,20 +9749,14 @@ TEST_F(ExecutionTest, WaveMatrixMathTests) { std::vector dimMs; std::vector dimNs; std::shared_ptr ShaderOpSet; - size_t tableSize = sizeof(WaveMatrixOpParameters) / sizeof(TableParameter); CComPtr pDevice = WaveMatrixTestCommon(dimMs, dimNs, ShaderOpSet); if (pDevice == nullptr) { return; } - TableParameterHandler handler(WaveMatrixOpParameters, tableSize); - std::wstring wTarget = handler.GetTableParamByName(L"ShaderOp.Target")->m_str; - std::string Target; - std::transform(wTarget.begin(), wTarget.end(), std::back_inserter(Target), - [](wchar_t c) { return char(c); }); - PCWSTR validationType = handler.GetTableParamByName(L"Validation.Type")->m_str; - double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double; + PCWSTR validationType = L"epsilon"; + double tolerance = 0.008; ////////// // MATH TEST @@ -9804,30 +9767,28 @@ TEST_F(ExecutionTest, WaveMatrixMathTests) { L"Wmma_DisableMathTests", disableMathTests); if (disableMathTests == 0) { - CW2A MathShaderText{ - handler.GetTableParamByName(L"MathShaderOp.Text")->m_str}; for (uint32_t dimM : dimMs) { for (uint32_t dimN : dimNs) { WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); WaveMatrixMathTest( - dimM, dimN, pDevice, ShaderOpSet, m_support, Target, MathShaderText, + dimM, dimN, pDevice, ShaderOpSet, m_support, validationType, tolerance); } } diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index afe5be9a1..0afa3532d 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -1298,7 +1298,410 @@ (j); + } + } + GroupMemoryBarrierWithGroupSync(); + } + + + void FillDest(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOut.Store(start + i * sizeof(DATATYPE), 
gsharedArr[i]); + // Also clear output so we don't write garbage if the whole buffer is not filled + gsharedArr[i] = 0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + #elif TEST_LOAD_STORE_ACCUMULATOR + groupshared TYPE_ACC gsharedArrAccumulator[MAX_NUM_ELEMENTS]; + + void ClearGShared(uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + gsharedArrAccumulator[i] = (TYPE_ACC)0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillSource(uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + uint j = 0; + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i, j += sizeof(TYPE_ACC)) + { + gsharedArrAccumulator[i] = g_bufInAccum.Load(j); + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillDest(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOutAccumulator.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); + gsharedArrAccumulator[i] = 0; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + void FillDestRowCol(uint start, uint threadX) + { + GroupMemoryBarrierWithGroupSync(); + if (threadX == 0) + { + for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) + { + g_bufOutRowCol.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); + } + } + ClearGShared(threadX); + } + #endif + + #define LOAD_SOURCE gsharedArr + #define LOAD_SOURCE_ACCUM gsharedArrAccumulator + #define STORE_DEST gsharedArr + #define STORE_DEST_ROWCOL gsharedArrAccumulator + #define STORE_DEST_ACCUM gsharedArrAccumulator + + // Start/Stride/Offset are all given in bytes, and converted to array elements in the macros. + + #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + mat.Store(g_bufOut, destOffset, lStride, false); + + #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + mat.Store(g_bufOut, destOffset, rStride, false); + + #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); + + #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); + + #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ + mata.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ + mata.Store(g_bufOutAccumulator, destOffset, aStride, false); + + #define TEST_STORE_LEFT(matl, k, stride, offset, transp, dest, destOffset) \ + matl.Load(g_bufIn, 0, lStride, false); \ + matl.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + FillDest(destOffset, groupThreadID.x); + + #define TEST_STORE_RIGHT(matr, k, stride, offset, transp, dest, destOffset) \ + matr.Load(g_bufIn, 0, rStride, false); \ + matr.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ + FillDest(destOffset, groupThreadID.x); + + #define TEST_STORE_LEFT_COL(mat, k, stride, offset, dest, destOffset) \ + mat.Load(g_bufInAccum, 0, 1 * 
sizeof(TYPE_ACC)); \ + mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + FillDestRowCol(destOffset, groupThreadID.x); + + #define TEST_STORE_RIGHT_ROW(mat, k, stride, offset, dest, destOffset) \ + mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ + mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ + FillDestRowCol(destOffset, groupThreadID.x); + + #define TEST_STORE_ACCUMULATOR(mata, k, stride, offset, transp, dest, destOffset) \ + mata.Load(g_bufInAccum, 0, aStride, false); \ + mata.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ + FillDest(destOffset, groupThreadID.x); + + #else + #define LOAD_SOURCE g_bufIn + #define LOAD_SOURCE_ACCUM g_bufInAccum + #define STORE_DEST g_bufOut + #define STORE_DEST_ROWCOL g_bufOutRowCol + #define STORE_DEST_ACCUM g_bufOutAccumulator + + void FillSource(uint threadX) {} // no-op + void FillDest(uint start, uint threadX) {} + void FillDestRowCol(uint start, uint threadX) {} + void ClearGShared(uint threadX) {} + + #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ + mat.Store(dest, destOffset, lStride, false); + + #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ + mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ + mat.Store(dest, destOffset, rStride, false); + + #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ + mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); + + #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ + mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); + + #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ + mata.Load(LOAD_SOURCE_ACCUM, start, stride, transp, alignment); \ + mata.Store(dest, destOffset, aStride, false); + + #define TEST_STORE_LEFT(matl, k, stride, alignment, transp, dest, destOffset) \ + matl.Load(LOAD_SOURCE, 0, lStride, false); \ + matl.Store(dest, destOffset, stride, transp, alignment); + + #define TEST_STORE_RIGHT(matr, k, stride, alignment, transp, dest, destOffset) \ + matr.Load(LOAD_SOURCE, 0, rStride, false); \ + matr.Store(dest, destOffset, stride, transp, alignment); + + #define TEST_STORE_LEFT_COL(mat, k, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ + mat.Store(dest, destOffset, stride, alignment); + + #define TEST_STORE_RIGHT_ROW(mat, k, stride, alignment, dest, destOffset) \ + mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ + mat.Store(dest, destOffset, stride, alignment); + + #define TEST_STORE_ACCUMULATOR(mata, k, stride, alignment, transp, dest, destOffset) \ + mata.Load(LOAD_SOURCE_ACCUM, 0, aStride, false); \ + mata.Store(dest, destOffset, stride, transp, alignment); + + #endif // GROUPSHARED if/else + + [WaveSize(NUM_LANES)] + #ifdef GROUPSHARED + [numthreads(NUM_LANES,1,1)] + #else + [numthreads(NUM_LANES * 2,1,1)] + #endif + void main(uint3 groupThreadID : SV_GroupThreadID) + { + uint rowColSize = 64 * 64 * sizeof(TYPE_ACC); + uint size = 2 * 64 * 64 * ELEMENTSIZE; + + // Calculate strides and offsets in bytes. 
+ uint s = 16 * ELEMENTSIZE; // start + uint lStride = (DIM_K * ELEMENTSIZE); + uint rStride = (DIM_N * ELEMENTSIZE); + uint ltStride = (DIM_M * ELEMENTSIZE); + uint rtStride = (DIM_K * ELEMENTSIZE); + uint a = 4; // Alignment. For groupshared, tests store offset. + + // For accumulator + uint sizeAcc = 2 * 64 * 64 * sizeof(TYPE_ACC); + uint s2 = 16 * sizeof(TYPE_ACC); // start + uint aStride = (DIM_N * sizeof(TYPE_ACC)); + uint atStride = (DIM_M * sizeof(TYPE_ACC)); + uint accElemStride = sizeof(TYPE_ACC); + + uint groupOffset = (groupThreadID.x/NUM_LANES) * 22; + + uint LOAD_LEFT_START = 0 + groupOffset; + uint LOAD_RIGHT_START = 1 + groupOffset; + uint LOAD_LEFT_STRIDE_P4 = 2 + groupOffset; + uint LOAD_RIGHT_STRIDE_P4 = 3 + groupOffset; + uint LOAD_LEFT_STRIDE_X2 = 4 + groupOffset; + uint LOAD_RIGHT_STRIDE_X2 = 5 + groupOffset; + uint LOAD_LEFT_ALIGNMENT = 6 + groupOffset; + uint LOAD_RIGHT_ALIGNMENT = 7 + groupOffset; + uint LOAD_LEFT_TRANSPOSE = 8 + groupOffset; + uint LOAD_RIGHT_TRANSPOSE = 9 + groupOffset; + uint LOAD_LEFT_ALLPARAMS = 10 + groupOffset; + uint LOAD_RIGHT_ALLPARAMS = 11 + groupOffset; + uint STORE_LEFT_STRIDE_P4 = 12 + groupOffset; + uint STORE_RIGHT_STRIDE_P4 = 13 + groupOffset; + uint STORE_LEFT_STRIDE_X2 = 14 + groupOffset; + uint STORE_RIGHT_STRIDE_X2 = 15 + groupOffset; + uint STORE_LEFT_ALIGNMENT = 16 + groupOffset; + uint STORE_RIGHT_ALIGNMENT = 17 + groupOffset; + uint STORE_LEFT_TRANSPOSE = 18 + groupOffset; + uint STORE_RIGHT_TRANSPOSE = 19 + groupOffset; + uint STORE_LEFT_ALLPARAMS = 20 + groupOffset; + uint STORE_RIGHT_ALLPARAMS = 21 + groupOffset; + +#if TEST_LOAD_STORE_LR + WaveMatrixLeft matLeft; + WaveMatrixRight matRight; + + if (groupThreadID.x == 0) + { + g_bufOutMatrixDepth.Store(0, matLeft.MatrixDepth()); + g_bufOutMatrixDepth.Store(0 + sizeof(uint), matRight.MatrixDepth()); + } + + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + ///////////////////////// + // Left/Right Matrices // + ///////////////////////// + TEST_LOAD_LEFT(matLeft, DIM_K, s, lStride , 0, false, STORE_DEST, LOAD_LEFT_START * size); + TEST_LOAD_RIGHT(matRight, DIM_K, s, rStride , 0, false, STORE_DEST, LOAD_RIGHT_START * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride + 4, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_P4 * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride + 4, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_P4 * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride * 2, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_X2 * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride * 2, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_X2 * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride , a, false, STORE_DEST, LOAD_LEFT_ALIGNMENT * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride , a, false, STORE_DEST, LOAD_RIGHT_ALIGNMENT * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, 0, ltStride , 0, true , STORE_DEST, LOAD_LEFT_TRANSPOSE * size); + TEST_LOAD_RIGHT(matRight, DIM_K, 0, rtStride , 0, true , STORE_DEST, LOAD_RIGHT_TRANSPOSE * size); + + TEST_LOAD_LEFT(matLeft, DIM_K, s, ltStride + 4, a, true , STORE_DEST, LOAD_LEFT_ALLPARAMS * size); + TEST_LOAD_RIGHT(matRight, DIM_K, s, rtStride + 4, a, true , STORE_DEST, LOAD_RIGHT_ALLPARAMS * size); + + ClearGShared(groupThreadID.x); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride + 4, 0, false, STORE_DEST, STORE_LEFT_STRIDE_P4 * size); + TEST_STORE_RIGHT(matRight, DIM_K, rStride + 4, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_P4 * size); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride * 2, 0, false, STORE_DEST, STORE_LEFT_STRIDE_X2 * size); + 
TEST_STORE_RIGHT(matRight, DIM_K, rStride * 2, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_X2 * size); + + TEST_STORE_LEFT(matLeft, DIM_K, lStride , a, false, STORE_DEST, STORE_LEFT_ALIGNMENT * size); + TEST_STORE_RIGHT(matRight, DIM_K, rStride , a, false, STORE_DEST, STORE_RIGHT_ALIGNMENT * size); + + TEST_STORE_LEFT(matLeft, DIM_K, ltStride , 0, true , STORE_DEST, STORE_LEFT_TRANSPOSE * size); + TEST_STORE_RIGHT(matRight, DIM_K, rtStride , 0, true , STORE_DEST, STORE_RIGHT_TRANSPOSE * size); + + TEST_STORE_LEFT(matLeft, DIM_K, ltStride + 4, a, true , STORE_DEST, STORE_LEFT_ALLPARAMS * size); + TEST_STORE_RIGHT(matRight, DIM_K, rtStride + 4, a, true , STORE_DEST, STORE_RIGHT_ALLPARAMS * size); + +#endif +#if TEST_LOAD_STORE_ACCUMULATOR + /////////////////////// + // Accumulator Types // + /////////////////////// + WaveMatrixLeftColAcc matLeftColAcc; + WaveMatrixRightRowAcc matRightRowAcc; + WaveMatrixAccumulator matAccum; + #if FRAGS_ENABLE + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_START * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_START * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_P4 * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_P4 * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_X2 * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_X2 * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_LEFT_ALIGNMENT * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALIGNMENT * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_TRANSPOSE * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_TRANSPOSE * rowColSize); + + TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_LEFT_ALLPARAMS * rowColSize); + TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALLPARAMS * rowColSize); + + ClearGShared(groupThreadID.x); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_P4 * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_P4 * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_X2 * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_X2 * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_LEFT_ALIGNMENT * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_RIGHT_ALIGNMENT * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_LEFT_TRANSPOSE * rowColSize); + TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_RIGHT_TRANSPOSE * rowColSize); + + TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_LEFT_ALLPARAMS * rowColSize); + 
TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_RIGHT_ALLPARAMS * rowColSize); + #endif // #if FRAGS_ENABLE + + groupOffset = (groupThreadID.x/NUM_LANES) * 11; + uint LOAD_START = 0 + groupOffset; + uint LOAD_STRIDE_P4 = 1 + groupOffset; + uint LOAD_STRIDE_X2 = 2 + groupOffset; + uint LOAD_ALIGNMENT = 3 + groupOffset; + uint LOAD_TRANSPOSE = 4 + groupOffset; + uint LOAD_ALLPARAMS = 5 + groupOffset; + uint STORE_STRIDE_P4 = 6 + groupOffset; + uint STORE_STRIDE_X2 = 7 + groupOffset; + uint STORE_ALIGNMENT = 8 + groupOffset; + uint STORE_TRANSPOSE = 9 + groupOffset; + uint STORE_ALLPARAMS = 10 + groupOffset; + + ClearGShared(groupThreadID.x); + FillSource(groupThreadID.x); + + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, aStride , 0, false, g_bufOutAccumulator, LOAD_START * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride + 4, 0, false, g_bufOutAccumulator, LOAD_STRIDE_P4 * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride * 2, 0, false, g_bufOutAccumulator, LOAD_STRIDE_X2 * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride , a, false, g_bufOutAccumulator, LOAD_ALIGNMENT * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , atStride , 0, true , g_bufOutAccumulator, LOAD_TRANSPOSE * sizeAcc); + TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, atStride + 4, a, true , g_bufOutAccumulator, LOAD_ALLPARAMS * sizeAcc); + + ClearGShared(groupThreadID.x); + + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride + 4, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_P4 * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride * 2, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_X2 * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride , a, false, STORE_DEST_ACCUM, STORE_ALIGNMENT * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride , 0, true , STORE_DEST_ACCUM, STORE_TRANSPOSE * sizeAcc); + TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride + 4, a, true , STORE_DEST_ACCUM, STORE_ALLPARAMS * sizeAcc); +#endif // #if TEST_LOAD_STORE_ACCUMULATOR + }; ]]> @@ -1323,7 +1726,124 @@ leftCol; + WaveMatrixRightRowAcc rightRow; + WaveMatrixAccumulator accumulator; + + TYPE_ACC scalar = g_bufInScalar.Load(groupID.x * sizeof(TYPE_ACC)); + + const uint lStride = (uint)(DIM_K * sizeof(TYPE_ACC)); + const uint rStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + + /////////// + // Accumulator + /////////// + + accumulator.Load(g_bufInAccumulator, scalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarMultiply(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarDivide(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarAdd(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.ScalarSubtract(scalar); + accumulator.Store(g_bufOutAccumulator, outScalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + + accumulator.Load(g_bufInAccumulator, scalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + accumulator.Fill(scalar); + 
accumulator.Store(g_bufOutAccumulator, outScalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); + +#if FRAGS_ENABLE + + /////////// + // Left Col + /////////// + + // We load and store the left col transposed (as a row) to save space + leftCol.Load (g_bufInLeftColAcc, scalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarMultiply(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarDivide(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarAdd(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.ScalarSubtract(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + leftCol.Load (g_bufInLeftColAcc, scalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); + leftCol.Fill(scalar); + leftCol.Store(g_bufOutLeftColAcc, outScalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); + + /////////// + // Right Row + /////////// + + rightRow.Load (g_bufInRightRowAcc, scalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarMultiply(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarDivide(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarAdd(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.ScalarSubtract(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); + + rightRow.Load (g_bufInRightRowAcc, scalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); + rightRow.Fill(scalar); + rightRow.Store(g_bufOutRightRowAcc, outScalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); +#endif // #if FRAGS_ENABLE + }; ]]> @@ -1340,7 +1860,66 @@ leftMatrix; + WaveMatrixRight rightMatrix; + WaveMatrixLeftColAcc leftCol; + WaveMatrixRightRowAcc rightRow; + WaveMatrixAccumulator accumulator; + WaveMatrixAccumulator outAccumulator; + + const uint lStride = (uint)(DIM_K * ELEMENTSIZE); + const uint rStride = (uint)(DIM_N * ELEMENTSIZE); + const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); + + leftMatrix.Load(g_bufInMatrices, 0, lStride, false); + rightMatrix.Load(g_bufInMatrices, MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, rStride, false); + accumulator.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, aStride, false); + + outAccumulator.Multiply(leftMatrix, rightMatrix); + outAccumulator.Store(g_bufOutMatrices, outMulMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(42); + outAccumulator.MultiplyAccumulate(leftMatrix, rightMatrix); + outAccumulator.Store(g_bufOutMatrices, outMulAccumulateMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(42); + outAccumulator.Add(accumulator); + outAccumulator.Store(g_bufOutMatrices, 
outAddMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); +#if FRAGS_ENABLE + leftCol.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); + rightRow.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); + + outAccumulator.Fill(0); + outAccumulator.Add(leftCol); + outAccumulator.Store(g_bufOutMatrices, outBroadcastAddColMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + outAccumulator.Fill(0); + outAccumulator.Add(rightRow); + outAccumulator.Store(g_bufOutMatrices, outBroadcastAddRowMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); + + leftCol.SumAccumulate(leftMatrix); + rightRow.SumAccumulate(rightMatrix); + leftCol.Store(g_bufOutRowCols, outRowColOffset, (int)sizeof(TYPE_ACC)); + rightRow.Store(g_bufOutRowCols, outRowColOffset + 64 * sizeof(TYPE_ACC), (int)sizeof(TYPE_ACC)); +#endif //#if FRAGS_ENABLE + }; ]]> diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml b/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml index 7de0a7b5b..380d28557 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArithTable.xml @@ -4750,623 +4750,6 @@ -enable-16bit-types - - - String - double - String - String - String - String - - - epsilon - 0.008 - cs_6_8 - - (j); - } - } - GroupMemoryBarrierWithGroupSync(); - } - - - void FillDest(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOut.Store(start + i * sizeof(DATATYPE), gsharedArr[i]); - // Also clear output so we don't write garbage if the whole buffer is not filled - gsharedArr[i] = 0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - #elif TEST_LOAD_STORE_ACCUMULATOR - groupshared TYPE_ACC gsharedArrAccumulator[MAX_NUM_ELEMENTS]; - - void ClearGShared(uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - gsharedArrAccumulator[i] = (TYPE_ACC)0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillSource(uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - uint j = 0; - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i, j += sizeof(TYPE_ACC)) - { - gsharedArrAccumulator[i] = g_bufInAccum.Load(j); - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillDest(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOutAccumulator.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); - gsharedArrAccumulator[i] = 0; - } - } - GroupMemoryBarrierWithGroupSync(); - } - - void FillDestRowCol(uint start, uint threadX) - { - GroupMemoryBarrierWithGroupSync(); - if (threadX == 0) - { - for (uint i = 0; i < MAX_NUM_ELEMENTS; ++i) - { - g_bufOutRowCol.Store(start + i * sizeof(TYPE_ACC), gsharedArrAccumulator[i]); - } - } - ClearGShared(threadX); - } - #endif - - #define LOAD_SOURCE gsharedArr - #define LOAD_SOURCE_ACCUM gsharedArrAccumulator - #define STORE_DEST gsharedArr - #define STORE_DEST_ROWCOL gsharedArrAccumulator - #define STORE_DEST_ACCUM gsharedArrAccumulator - - // Start/Stride/Offset are all given in bytes, and converted to array elements in the macros. 
- - #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - mat.Store(g_bufOut, destOffset, lStride, false); - - #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(gsharedArr, (start)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - mat.Store(g_bufOut, destOffset, rStride, false); - - #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); - - #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - mat.Store(g_bufOutRowCol, destOffset, 1 * sizeof(TYPE_ACC)); - - #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ - mata.Load(gsharedArrAccumulator, (start)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ - mata.Store(g_bufOutAccumulator, destOffset, aStride, false); - - #define TEST_STORE_LEFT(matl, k, stride, offset, transp, dest, destOffset) \ - matl.Load(g_bufIn, 0, lStride, false); \ - matl.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - FillDest(destOffset, groupThreadID.x); - - #define TEST_STORE_RIGHT(matr, k, stride, offset, transp, dest, destOffset) \ - matr.Load(g_bufIn, 0, rStride, false); \ - matr.Store(gsharedArr, (offset)/sizeof(DATATYPE), (stride)/sizeof(DATATYPE), transp); \ - FillDest(destOffset, groupThreadID.x); - - #define TEST_STORE_LEFT_COL(mat, k, stride, offset, dest, destOffset) \ - mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ - mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - FillDestRowCol(destOffset, groupThreadID.x); - - #define TEST_STORE_RIGHT_ROW(mat, k, stride, offset, dest, destOffset) \ - mat.Load(g_bufInAccum, 0, 1 * sizeof(TYPE_ACC)); \ - mat.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC)); \ - FillDestRowCol(destOffset, groupThreadID.x); - - #define TEST_STORE_ACCUMULATOR(mata, k, stride, offset, transp, dest, destOffset) \ - mata.Load(g_bufInAccum, 0, aStride, false); \ - mata.Store(gsharedArrAccumulator, (offset)/sizeof(TYPE_ACC), (stride)/sizeof(TYPE_ACC), transp); \ - FillDest(destOffset, groupThreadID.x); - - #else - #define LOAD_SOURCE g_bufIn - #define LOAD_SOURCE_ACCUM g_bufInAccum - #define STORE_DEST g_bufOut - #define STORE_DEST_ROWCOL g_bufOutRowCol - #define STORE_DEST_ACCUM g_bufOutAccumulator - - void FillSource(uint threadX) {} // no-op - void FillDest(uint start, uint threadX) {} - void FillDestRowCol(uint start, uint threadX) {} - void ClearGShared(uint threadX) {} - - #define TEST_LOAD_LEFT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ - mat.Store(dest, destOffset, lStride, false); - - #define TEST_LOAD_RIGHT(mat, k, start, stride, alignment, transp, dest, destOffset) \ - mat.Load(LOAD_SOURCE, start, stride, transp, alignment); \ - mat.Store(dest, destOffset, rStride, false); - - #define TEST_LOAD_LEFT_COL(mat, k, start, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ - mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); - - #define TEST_LOAD_RIGHT_ROW(mat, k, start, stride, 
alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, start, stride, alignment); \ - mat.Store(dest, destOffset, (int)sizeof(TYPE_ACC)); - - #define TEST_LOAD_ACCUMULATOR(mata, k, start, stride, alignment, transp, dest, destOffset) \ - mata.Load(LOAD_SOURCE_ACCUM, start, stride, transp, alignment); \ - mata.Store(dest, destOffset, aStride, false); - - #define TEST_STORE_LEFT(matl, k, stride, alignment, transp, dest, destOffset) \ - matl.Load(LOAD_SOURCE, 0, lStride, false); \ - matl.Store(dest, destOffset, stride, transp, alignment); - - #define TEST_STORE_RIGHT(matr, k, stride, alignment, transp, dest, destOffset) \ - matr.Load(LOAD_SOURCE, 0, rStride, false); \ - matr.Store(dest, destOffset, stride, transp, alignment); - - #define TEST_STORE_LEFT_COL(mat, k, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ - mat.Store(dest, destOffset, stride, alignment); - - #define TEST_STORE_RIGHT_ROW(mat, k, stride, alignment, dest, destOffset) \ - mat.Load(LOAD_SOURCE_ACCUM, 0, (int)sizeof(TYPE_ACC)); \ - mat.Store(dest, destOffset, stride, alignment); - - #define TEST_STORE_ACCUMULATOR(mata, k, stride, alignment, transp, dest, destOffset) \ - mata.Load(LOAD_SOURCE_ACCUM, 0, aStride, false); \ - mata.Store(dest, destOffset, stride, transp, alignment); - - #endif // GROUPSHARED if/else - - [WaveSize(NUM_LANES)] - #ifdef GROUPSHARED - [numthreads(NUM_LANES,1,1)] - #else - [numthreads(NUM_LANES * 2,1,1)] - #endif - void main(uint3 groupThreadID : SV_GroupThreadID) - { - uint rowColSize = 64 * 64 * sizeof(TYPE_ACC); - uint size = 2 * 64 * 64 * ELEMENTSIZE; - - // Calculate strides and offsets in bytes. - uint s = 16 * ELEMENTSIZE; // start - uint lStride = (DIM_K * ELEMENTSIZE); - uint rStride = (DIM_N * ELEMENTSIZE); - uint ltStride = (DIM_M * ELEMENTSIZE); - uint rtStride = (DIM_K * ELEMENTSIZE); - uint a = 4; // Alignment. For groupshared, tests store offset. 
- - // For accumulator - uint sizeAcc = 2 * 64 * 64 * sizeof(TYPE_ACC); - uint s2 = 16 * sizeof(TYPE_ACC); // start - uint aStride = (DIM_N * sizeof(TYPE_ACC)); - uint atStride = (DIM_M * sizeof(TYPE_ACC)); - uint accElemStride = sizeof(TYPE_ACC); - - uint groupOffset = (groupThreadID.x/NUM_LANES) * 22; - - uint LOAD_LEFT_START = 0 + groupOffset; - uint LOAD_RIGHT_START = 1 + groupOffset; - uint LOAD_LEFT_STRIDE_P4 = 2 + groupOffset; - uint LOAD_RIGHT_STRIDE_P4 = 3 + groupOffset; - uint LOAD_LEFT_STRIDE_X2 = 4 + groupOffset; - uint LOAD_RIGHT_STRIDE_X2 = 5 + groupOffset; - uint LOAD_LEFT_ALIGNMENT = 6 + groupOffset; - uint LOAD_RIGHT_ALIGNMENT = 7 + groupOffset; - uint LOAD_LEFT_TRANSPOSE = 8 + groupOffset; - uint LOAD_RIGHT_TRANSPOSE = 9 + groupOffset; - uint LOAD_LEFT_ALLPARAMS = 10 + groupOffset; - uint LOAD_RIGHT_ALLPARAMS = 11 + groupOffset; - uint STORE_LEFT_STRIDE_P4 = 12 + groupOffset; - uint STORE_RIGHT_STRIDE_P4 = 13 + groupOffset; - uint STORE_LEFT_STRIDE_X2 = 14 + groupOffset; - uint STORE_RIGHT_STRIDE_X2 = 15 + groupOffset; - uint STORE_LEFT_ALIGNMENT = 16 + groupOffset; - uint STORE_RIGHT_ALIGNMENT = 17 + groupOffset; - uint STORE_LEFT_TRANSPOSE = 18 + groupOffset; - uint STORE_RIGHT_TRANSPOSE = 19 + groupOffset; - uint STORE_LEFT_ALLPARAMS = 20 + groupOffset; - uint STORE_RIGHT_ALLPARAMS = 21 + groupOffset; - -#if TEST_LOAD_STORE_LR - WaveMatrixLeft matLeft; - WaveMatrixRight matRight; - - if (groupThreadID.x == 0) - { - g_bufOutMatrixDepth.Store(0, matLeft.MatrixDepth()); - g_bufOutMatrixDepth.Store(0 + sizeof(uint), matRight.MatrixDepth()); - } - - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - ///////////////////////// - // Left/Right Matrices // - ///////////////////////// - TEST_LOAD_LEFT(matLeft, DIM_K, s, lStride , 0, false, STORE_DEST, LOAD_LEFT_START * size); - TEST_LOAD_RIGHT(matRight, DIM_K, s, rStride , 0, false, STORE_DEST, LOAD_RIGHT_START * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride + 4, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_P4 * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride + 4, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_P4 * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride * 2, 0, false, STORE_DEST, LOAD_LEFT_STRIDE_X2 * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride * 2, 0, false, STORE_DEST, LOAD_RIGHT_STRIDE_X2 * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, lStride , a, false, STORE_DEST, LOAD_LEFT_ALIGNMENT * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rStride , a, false, STORE_DEST, LOAD_RIGHT_ALIGNMENT * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, 0, ltStride , 0, true , STORE_DEST, LOAD_LEFT_TRANSPOSE * size); - TEST_LOAD_RIGHT(matRight, DIM_K, 0, rtStride , 0, true , STORE_DEST, LOAD_RIGHT_TRANSPOSE * size); - - TEST_LOAD_LEFT(matLeft, DIM_K, s, ltStride + 4, a, true , STORE_DEST, LOAD_LEFT_ALLPARAMS * size); - TEST_LOAD_RIGHT(matRight, DIM_K, s, rtStride + 4, a, true , STORE_DEST, LOAD_RIGHT_ALLPARAMS * size); - - ClearGShared(groupThreadID.x); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride + 4, 0, false, STORE_DEST, STORE_LEFT_STRIDE_P4 * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride + 4, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_P4 * size); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride * 2, 0, false, STORE_DEST, STORE_LEFT_STRIDE_X2 * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride * 2, 0, false, STORE_DEST, STORE_RIGHT_STRIDE_X2 * size); - - TEST_STORE_LEFT(matLeft, DIM_K, lStride , a, false, STORE_DEST, STORE_LEFT_ALIGNMENT * size); - TEST_STORE_RIGHT(matRight, DIM_K, rStride , a, false, STORE_DEST, 
STORE_RIGHT_ALIGNMENT * size); - - TEST_STORE_LEFT(matLeft, DIM_K, ltStride , 0, true , STORE_DEST, STORE_LEFT_TRANSPOSE * size); - TEST_STORE_RIGHT(matRight, DIM_K, rtStride , 0, true , STORE_DEST, STORE_RIGHT_TRANSPOSE * size); - - TEST_STORE_LEFT(matLeft, DIM_K, ltStride + 4, a, true , STORE_DEST, STORE_LEFT_ALLPARAMS * size); - TEST_STORE_RIGHT(matRight, DIM_K, rtStride + 4, a, true , STORE_DEST, STORE_RIGHT_ALLPARAMS * size); - -#endif -#if TEST_LOAD_STORE_ACCUMULATOR - /////////////////////// - // Accumulator Types // - /////////////////////// - WaveMatrixLeftColAcc matLeftColAcc; - WaveMatrixRightRowAcc matRightRowAcc; - WaveMatrixAccumulator matAccum; - #if FRAGS_ENABLE - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_START * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_START * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_P4 * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride + 4, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_P4 * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_LEFT_STRIDE_X2 * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride * 2, 0, STORE_DEST_ROWCOL, LOAD_RIGHT_STRIDE_X2 * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_LEFT_ALIGNMENT * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALIGNMENT * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_LEFT_TRANSPOSE * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, 0, accElemStride , 0, STORE_DEST_ROWCOL, LOAD_RIGHT_TRANSPOSE * rowColSize); - - TEST_LOAD_LEFT_COL(matLeftColAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_LEFT_ALLPARAMS * rowColSize); - TEST_LOAD_RIGHT_ROW(matRightRowAcc, 1, s2, accElemStride + 4, a, STORE_DEST_ROWCOL, LOAD_RIGHT_ALLPARAMS * rowColSize); - - ClearGShared(groupThreadID.x); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_P4 * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_P4 * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_LEFT_STRIDE_X2 * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride * 2, 0, STORE_DEST_ROWCOL, STORE_RIGHT_STRIDE_X2 * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_LEFT_ALIGNMENT * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , a, STORE_DEST_ROWCOL, STORE_RIGHT_ALIGNMENT * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_LEFT_TRANSPOSE * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride , 0, STORE_DEST_ROWCOL, STORE_RIGHT_TRANSPOSE * rowColSize); - - TEST_STORE_LEFT_COL(matLeftColAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_LEFT_ALLPARAMS * rowColSize); - TEST_STORE_RIGHT_ROW(matRightRowAcc, 1, accElemStride + 4, a, STORE_DEST_ROWCOL, STORE_RIGHT_ALLPARAMS * rowColSize); - #endif // #if FRAGS_ENABLE - - groupOffset = (groupThreadID.x/NUM_LANES) * 11; - uint LOAD_START = 0 + groupOffset; - uint LOAD_STRIDE_P4 = 1 + groupOffset; - 
uint LOAD_STRIDE_X2 = 2 + groupOffset; - uint LOAD_ALIGNMENT = 3 + groupOffset; - uint LOAD_TRANSPOSE = 4 + groupOffset; - uint LOAD_ALLPARAMS = 5 + groupOffset; - uint STORE_STRIDE_P4 = 6 + groupOffset; - uint STORE_STRIDE_X2 = 7 + groupOffset; - uint STORE_ALIGNMENT = 8 + groupOffset; - uint STORE_TRANSPOSE = 9 + groupOffset; - uint STORE_ALLPARAMS = 10 + groupOffset; - - ClearGShared(groupThreadID.x); - FillSource(groupThreadID.x); - - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, aStride , 0, false, g_bufOutAccumulator, LOAD_START * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride + 4, 0, false, g_bufOutAccumulator, LOAD_STRIDE_P4 * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride * 2, 0, false, g_bufOutAccumulator, LOAD_STRIDE_X2 * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , aStride , a, false, g_bufOutAccumulator, LOAD_ALIGNMENT * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, 0 , atStride , 0, true , g_bufOutAccumulator, LOAD_TRANSPOSE * sizeAcc); - TEST_LOAD_ACCUMULATOR (matAccum, DIM_K, s2, atStride + 4, a, true , g_bufOutAccumulator, LOAD_ALLPARAMS * sizeAcc); - - ClearGShared(groupThreadID.x); - - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride + 4, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_P4 * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride * 2, 0, false, STORE_DEST_ACCUM, STORE_STRIDE_X2 * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, aStride , a, false, STORE_DEST_ACCUM, STORE_ALIGNMENT * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride , 0, true , STORE_DEST_ACCUM, STORE_TRANSPOSE * sizeAcc); - TEST_STORE_ACCUMULATOR(matAccum, DIM_K, atStride + 4, a, true , STORE_DEST_ACCUM, STORE_ALLPARAMS * sizeAcc); -#endif // #if TEST_LOAD_STORE_ACCUMULATOR - }; - ]]> - - - leftCol; - WaveMatrixRightRowAcc rightRow; - WaveMatrixAccumulator accumulator; - - TYPE_ACC scalar = g_bufInScalar.Load(groupID.x * sizeof(TYPE_ACC)); - - const uint lStride = (uint)(DIM_K * sizeof(TYPE_ACC)); - const uint rStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - - /////////// - // Accumulator - /////////// - - accumulator.Load(g_bufInAccumulator, scalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarMultiply(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarMulOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarDivide(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarDivOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarAdd(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarAddOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.ScalarSubtract(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarSubOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - - accumulator.Load(g_bufInAccumulator, scalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - accumulator.Fill(scalar); - accumulator.Store(g_bufOutAccumulator, outScalarFillOffset * NUM_ACCUMULATOR_ELEMENTS, aStride, false); - -#if FRAGS_ENABLE - - /////////// - // Left Col - /////////// - - // We load and store the left col transposed (as a row) to save space - leftCol.Load (g_bufInLeftColAcc, scalarMulOffset * 
DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarMultiply(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarMulOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarDivide(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarDivOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarAdd(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarAddOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.ScalarSubtract(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarSubOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - leftCol.Load (g_bufInLeftColAcc, scalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); - leftCol.Fill(scalar); - leftCol.Store(g_bufOutLeftColAcc, outScalarFillOffset * DIM_M, (int)sizeof(TYPE_ACC)); - - /////////// - // Right Row - /////////// - - rightRow.Load (g_bufInRightRowAcc, scalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarMultiply(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarMulOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarDivide(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarDivOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarAdd(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarAddOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.ScalarSubtract(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarSubOffset * DIM_N, (int)sizeof(TYPE_ACC)); - - rightRow.Load (g_bufInRightRowAcc, scalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); - rightRow.Fill(scalar); - rightRow.Store(g_bufOutRightRowAcc, outScalarFillOffset * DIM_N, (int)sizeof(TYPE_ACC)); -#endif // #if FRAGS_ENABLE - }; - ]]> - - - leftMatrix; - WaveMatrixRight rightMatrix; - WaveMatrixLeftColAcc leftCol; - WaveMatrixRightRowAcc rightRow; - WaveMatrixAccumulator accumulator; - WaveMatrixAccumulator outAccumulator; - - const uint lStride = (uint)(DIM_K * ELEMENTSIZE); - const uint rStride = (uint)(DIM_N * ELEMENTSIZE); - const uint aStride = (uint)(DIM_N * sizeof(TYPE_ACC)); - - leftMatrix.Load(g_bufInMatrices, 0, lStride, false); - rightMatrix.Load(g_bufInMatrices, MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, rStride, false); - accumulator.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, aStride, false); - - outAccumulator.Multiply(leftMatrix, rightMatrix); - outAccumulator.Store(g_bufOutMatrices, outMulMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(42); - outAccumulator.MultiplyAccumulate(leftMatrix, rightMatrix); - outAccumulator.Store(g_bufOutMatrices, outMulAccumulateMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(42); - outAccumulator.Add(accumulator); - outAccumulator.Store(g_bufOutMatrices, outAddMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); -#if FRAGS_ENABLE - leftCol.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); - rightRow.Load(g_bufInMatrices, 2 * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * ELEMENTSIZE, (int)sizeof(TYPE_ACC)); 
- - outAccumulator.Fill(0); - outAccumulator.Add(leftCol); - outAccumulator.Store(g_bufOutMatrices, outBroadcastAddColMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - outAccumulator.Fill(0); - outAccumulator.Add(rightRow); - outAccumulator.Store(g_bufOutMatrices, outBroadcastAddRowMatrix * MATRIX_BUFFER_STRIDE_IN_ELEMENTS * sizeof(TYPE_ACC), aStride, false); - - leftCol.SumAccumulate(leftMatrix); - rightRow.SumAccumulate(rightMatrix); - leftCol.Store(g_bufOutRowCols, outRowColOffset, (int)sizeof(TYPE_ACC)); - rightRow.Store(g_bufOutRowCols, outRowColOffset + 64 * sizeof(TYPE_ACC), (int)sizeof(TYPE_ACC)); -#endif //#if FRAGS_ENABLE - }; - ]]> - - - -100 - 20 - -50 - -0 - 0 - 42 - - -
String
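
Note on the parameter refactor (illustrative, not part of the patch): before this change, each WaveMatrix test pulled Validation.Type, Validation.Tolerance, the scalar list, and the shader Target/Text out of ShaderOpArithTable.xml through TableParameterHandler, then overrode pShaderOp->Shaders.at(0).Target/Text at runtime. After the change those values are fixed constants in the test body (L"epsilon"; tolerance 0.008 for the scalar and math tests, 0 for load/store) and the shader source and target are used exactly as defined in ShaderOpArith.xml; only the compilation Arguments are still set per test. A minimal standalone C++ sketch of the resulting shape follows; the struct and function names are hypothetical, chosen only to show the pattern, not taken from the repo.

    #include <string>
    #include <vector>

    // Parameters that used to come from rows of ShaderOpArithTable.xml.
    struct WaveMatrixTestParams {
      std::wstring validationType;  // was Validation.Type in the table
      double tolerance;             // was Validation.Tolerance in the table
      std::vector<float> scalars;   // was the ScalarValidation.Scalar row set
    };

    // Hard-coded replacement for the former table lookup (values match the
    // constants introduced by this patch). The shader text and target are no
    // longer overridden here; they stay as defined in ShaderOpArith.xml.
    WaveMatrixTestParams GetWaveMatrixScalarTestParams() {
      return {L"epsilon", 0.008,
              {-100.0f, 20.0f, -50.0f, -0.0f, 0.0f, 42.0f}};
    }

With this shape, the HLK mapping no longer depends on a data-table DataSource: the three WaveMatrix TEST_METHODs run as ordinary priority-2 tests and iterate their own dimension/type combinations in code.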