added tensor test(s) to MathPerformanceTests

2016-06-17 12:10:37 -07:00 · 2016-06-17 12:10:37 -07:00 · a172c89111
--- a/Source/Math/CntkBatchNormalization.cuh
+++ b/Source/Math/CntkBatchNormalization.cuh
@ -108,13 +108,12 @@ __device__ __forceinline__ void StoreValues<4, float>(const float src[4], float*
 template <typename T>
 __device__ __forceinline__ T Shuffle(T input, int srcLane)
 {
-#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ >= 300
    // shfl is supported only on Kepler+
-    static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer");
    return cub::ShuffleIndex(input, srcLane);
 #else
    assert(false);
-    return input;
+    return input; // fallback when __CUDA_ARCH__ is undefined or < 300: only reached if assert() is compiled out; the return keeps the compiler happy
 #endif
 }

--- a/Source/Math/MathCUDA.vcxproj
+++ b/Source/Math/MathCUDA.vcxproj
@ -52,7 +52,7 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="$(DebugBuild)">
    <CudaCodeGen>$(CNTK_CUDA_CODEGEN_DEBUG)</CudaCodeGen>
-    <CudaCodeGen Condition="'$(CudaCodeGen)'==''">compute_30,sm_30</CudaCodeGen>
+    <CudaCodeGen Condition="'$(CudaCodeGen)'==''">compute_20,sm_20;compute_30,sm_30</CudaCodeGen>
  </PropertyGroup>
  <PropertyGroup Condition="$(ReleaseBuild)">
    <CudaCodeGen>$(CNTK_CUDA_CODEGEN_RELEASE)</CudaCodeGen>
--- a/Source/Math/TensorView.h
+++ b/Source/Math/TensorView.h
@ -14,6 +14,8 @@
 #pragma warning(push)
 #pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.

+template<class ElemType> struct TensorTest;
+
 // This class is exported from the Math.dll.
 namespace Microsoft { namespace MSR { namespace CNTK {

@ -149,6 +151,7 @@ private:

    const Matrix<ElemType>& GetSOB() const { return *m_sob; }
    Matrix<ElemType>&       GetSOB()       { return *m_sob; }
+    friend struct ::TensorTest<ElemType>;

    // -------------------------------------------------------------------
    // sob members
--- a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp
+++ b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.cpp
@ -5,14 +5,17 @@
 // MathPerformanceTests.cpp : Defines the entry point for the console application.
 //
 #include "stdafx.h"
-#define NOMINMAX
-#include "Windows.h"
+//#define NOMINMAX
+//#include "Windows.h"
+#include "Matrix.h"
+#include "CPUMatrix.h"
+#include "TensorView.h"
+#include "Sequences.h"
 #include <chrono>
 #include <iostream>
 #include <vector>
-#include "Matrix.h"
-#include "CPUMatrix.h"
-#include "Sequences.h"
+#include <algorithm>
+
 using namespace Microsoft::MSR::CNTK;
 using namespace std;

@ -378,6 +381,63 @@ void SquareMultiplyAndAdd10TimesAvgTest(int n, int count)
    cout << "CPUMatrix/Matrix ratio is: " << cpu_avg / m_avg << " seconds" << endl;
 }

+// Simple GPU-vs-CPU consistency test suite for TensorView.
+//  - this is meant as a harness for performance optimization of tensor operations
+//  - correctness is defined as same result between GPU and CPU (compared in OneTensorTest)
+template <class ElemType>
+struct TensorTest
+{
+    // helper to create a randomly initialized tensor object of the given shape on deviceId; values come from an mt19937 seeded with randomSeed
+    static TensorView<ElemType> CreateTensor(TensorShape shape, int randomSeed, DEVICEID_TYPE deviceId)
+    {
+        let numElements = shape.GetNumElements();

+        // random init (deterministic for a given randomSeed, so calls with equal seeds produce equal values)
+        mt19937 rng(randomSeed);
+        uniform_real_distribution<float> nd(-1, 1); // uniform over [-1, 1); always sampled as float, whatever ElemType is
+        vector<ElemType> init(numElements);
+        generate(begin(init), end(init), [&] { return nd(rng); });

+        // create storage object (one-column matrix); presumably the constructor copies from init, which dies at function exit -- TODO confirm
+        let sob = make_shared<Matrix<ElemType>>(numElements/*rows*/, 1/*cols*/, init.data(), deviceId);

+        // create TensorView over the storage object, using the requested shape
+        return TensorView<ElemType>(sob, shape);
+    }

+    template<typename FN> // FN: callable invoked with a device id; its result must offer GetSOB()
+    static void OneTensorTest(const char* what, const FN& fn) // runs fn twice and prints the verdict for test 'what' to cout
+    {
+        cout << "Tensor test '" << what << "': ";

+        // run on GPU and CPU (fn receives device id 0 for the GPU run and -1 for the CPU run)
+        let resultGPU = fn(0);
+        let resultCPU = fn(-1);

+        // compare storage objects; GetSOB() is private in TensorView (reached via the friend declaration); 1e-3f is presumably the tolerance
+        let isSame = resultGPU.GetSOB().IsEqualTo(resultCPU.GetSOB(), 1e-3f);
+        cout << (isSame ? "succeeded." : "FAILED (GPU and CPU results differ).") << endl;
+    }

+    // main entry point (misusing the constructor): constructing a TensorTest<ElemType> runs every test below
+    /*void*/ TensorTest()
+    {
+        OneTensorTest("bias gradient", [](DEVICEID_TYPE deviceId) -> TensorView<ElemType>
+        {
+            let N = 2048u;
+            let T = 1024u;
+            int randomSeed = 1;
+            let  gradient = CreateTensor(TensorShape{ N, T }, randomSeed++, deviceId); // N x T tensor, seed 1
+            auto bias     = CreateTensor(TensorShape(N),      randomSeed++, deviceId); // N-element tensor, seed 2
+            //gradient.GetSOB().Print("incoming gradient", 0, 9, 0, 9);
+            //bias.GetSOB().Print("bias gradient", 0, 9, 0, 9);
+            bias.DoCopyOf(1, gradient, 1); // presumably bias = 1 * bias + 1 * gradient, reducing over the T axis -- confirm against TensorView::DoCopyOf
+            //bias.GetSOB().Print("updated bias gradient", 0, 9, 0, 9);
+            return bias;
+        });
+    }
+};
+
 template <class ElemType>
 void MandSTest(int count, int devId)
 {
@ -437,6 +497,8 @@ void MandSTest(int count, int devId)

 int wmain()
 {
+    TensorTest<float>();
+
    ColumnSliceMultAndAddTest<float>(2048, 2048, 256, 0);

    TestRnnForwardPropSRP<float>();
--- a/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.vcxproj
+++ b/Tests/UnitTests/MathPerformanceTests/MathPerformanceTests.vcxproj
@ -114,6 +114,7 @@
    <ClInclude Include="targetver.h" />
  </ItemGroup>
  <ItemGroup>
+    <ClCompile Include="..\..\..\Source\Common\ExceptionWithCallStack.cpp" />
    <ClCompile Include="MathPerformanceTests.cpp" />
    <ClCompile Include="stdafx.cpp">
      <PrecompiledHeader>Create</PrecompiledHeader>