diff --git a/Source/CNTK/CNTK.vcxproj b/Source/CNTK/CNTK.vcxproj
index bb6f9e8c1..9a69ba87a 100644
--- a/Source/CNTK/CNTK.vcxproj
+++ b/Source/CNTK/CNTK.vcxproj
@@ -158,7 +158,7 @@
     <ClInclude Include="..\Common\Include\Basics.h" />
     <ClInclude Include="..\Common\Include\BestGpu.h" />
     <ClInclude Include="..\Common\Include\DataReader.h" />
-    <ClInclude Include="..\Common\Include\DataTensor.h" />
+    <ClInclude Include="..\Common\Include\TensorShape.h" />
     <ClInclude Include="..\Common\Include\DataWriter.h" />
     <ClInclude Include="..\Common\Include\File.h" />
     <ClInclude Include="..\Common\Include\fileutil.h" />
diff --git a/Source/CNTK/CNTK.vcxproj.filters b/Source/CNTK/CNTK.vcxproj.filters
index 3d80e55c8..39b063809 100644
--- a/Source/CNTK/CNTK.vcxproj.filters
+++ b/Source/CNTK/CNTK.vcxproj.filters
@@ -133,7 +133,7 @@
     <ClInclude Include="..\Common\Include\Sequences.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
-    <ClInclude Include="..\Common\Include\DataTensor.h">
+    <ClInclude Include="..\Common\Include\TensorShape.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
     <ClInclude Include="..\Common\Include\ProgressTracing.h">
diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp
index 996619102..5cd5e845b 100644
--- a/Source/CNTK/SynchronousExecutionEngine.cpp
+++ b/Source/CNTK/SynchronousExecutionEngine.cpp
@@ -15,7 +15,7 @@
 #include "ConvolutionalNodes.h"
 #include "NonlinearityNodes.h"
 #include "ReshapingNodes.h"
-#include "DataTensor.h"
+#include "TensorShape.h"
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/TensorShape.h
similarity index 99%
rename from Source/Common/Include/DataTensor.h
rename to Source/Common/Include/TensorShape.h
index 6df7488b1..55124dd19 100644
--- a/Source/Common/Include/DataTensor.h
+++ b/Source/Common/Include/TensorShape.h
@@ -1,4 +1,4 @@
-// DataTensor.h -- tensor descriptor that describes the inner structure of data vectors
+// TensorShape.h -- tensor descriptor that describes the inner structure of data vectors
 //
 // <copyright file="Sequences.h" company="Microsoft">
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h
index 8c09e9b2e..ce0dc84cc 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h
+++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h
@@ -9,7 +9,7 @@
 #include "ComputationNetwork.h"
 #include "TrainingCriterionNodes.h" // for NCEEvalMode
 #include "ScriptableObjects.h"
-#include "DataTensor.h"
+#include "TensorShape.h"
 #include <string>
 
 namespace Microsoft { namespace MSR { namespace CNTK {
diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
index f1960d8ba..8669940d7 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
+++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
@@ -155,7 +155,7 @@
     <ClInclude Include="..\Common\Include\Basics.h" />
     <ClInclude Include="..\Common\Include\BestGpu.h" />
     <ClInclude Include="..\Common\Include\Config.h" />
-    <ClInclude Include="..\Common\Include\DataTensor.h" />
+    <ClInclude Include="..\Common\Include\TensorShape.h" />
     <ClInclude Include="..\Common\Include\File.h" />
     <ClInclude Include="..\Common\Include\fileutil.h" />
     <ClInclude Include="..\Common\Include\Platform.h" />
diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters
index ca51f6cd5..d65affb21 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters
+++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters
@@ -117,7 +117,7 @@
     <ClInclude Include="EsotericNodes.h">
       <Filter>Nodes</Filter>
     </ClInclude>
-    <ClInclude Include="..\Common\Include\DataTensor.h">
+    <ClInclude Include="..\Common\Include\TensorShape.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
     <ClInclude Include="..\Common\Include\Config.h">
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 70a63c806..c655903b1 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -9,7 +9,7 @@
 #include "ComputationNode.h"
 #include "InputAndParamNodes.h"
 #include "ComputationNetworkBuilder.h"  // TODO: We should only pull in NewComputationNodeFromConfig(). Nodes should not know about network at large.
-#include "DataTensor.h"
+#include "TensorShape.h"
 
 #ifndef let
 #define let const auto
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 6ae542997..e6d85dde8 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -10,7 +10,7 @@
 #include "TensorView.h"
 #include "ScriptableObjects.h"
 #include "Sequences.h"
-#include "DataTensor.h"
+#include "TensorShape.h"
 #include "MatrixPool.h"
 
 #include <unordered_set>
diff --git a/Source/Math/ConvolutionEngine.h b/Source/Math/ConvolutionEngine.h
index a5b7bd2db..2b3a77681 100644
--- a/Source/Math/ConvolutionEngine.h
+++ b/Source/Math/ConvolutionEngine.h
@@ -18,7 +18,7 @@
 #endif
 
 #include "Matrix.h"
-#include "DataTensor.h" // for ImageLayoutKind
+#include "TensorShape.h" // for ImageLayoutKind
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h
index 87c7c9d9a..5943b133f 100644
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@@ -9,7 +9,7 @@
 #include "File.h"
 #include "Helpers.h"
 #include "CommonMatrix.h"
-#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
+#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
 #include "DebugUtil.h"
 #include "BestGpu.h"    // for CPUONLY macro
 #include "ConcStack.h"
diff --git a/Source/Math/GPUTensor.cu b/Source/Math/GPUTensor.cu
index 4f1b92cea..86dd4164e 100644
--- a/Source/Math/GPUTensor.cu
+++ b/Source/Math/GPUTensor.cu
@@ -378,6 +378,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             {
                 if (tid < i && tid + i < tids) accumulators[tid] += accumulators[tid + i];
                 if (0 + i < tids) __syncthreads();    // sync if condition true for at least one thread
+                // TODO: access accumulators[] through a volatile pointer; then the __syncthreads() can be skipped for the last 32 values
             }
 
             // now set final value to output coordinate
diff --git a/Source/Math/GPUTensor.h b/Source/Math/GPUTensor.h
index 954be8aad..d24094ed2 100644
--- a/Source/Math/GPUTensor.h
+++ b/Source/Math/GPUTensor.h
@@ -6,7 +6,7 @@
 
 #pragma once
 #include "CommonMatrix.h"
-#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
+#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
 #include "GPUMatrixCUDAKernels.cuh"
 #include <array>
 
diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj
index eaba79f58..950fab341 100644
--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@@ -156,7 +156,7 @@
     </ProjectReference>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClInclude Include="..\Common\Include\DataTensor.h" />
+    <ClInclude Include="..\Common\Include\TensorShape.h" />
     <ClInclude Include="..\Common\Include\File.h" />
     <ClInclude Include="..\Common\Include\fileutil.h" />
     <ClInclude Include="..\Common\Include\DebugUtil.h" />
diff --git a/Source/Math/Math.vcxproj.filters b/Source/Math/Math.vcxproj.filters
index c828e33db..0282e8f60 100644
--- a/Source/Math/Math.vcxproj.filters
+++ b/Source/Math/Math.vcxproj.filters
@@ -76,7 +76,7 @@
     <ClInclude Include="TensorOps.h">
       <Filter>Tensors</Filter>
     </ClInclude>
-    <ClInclude Include="..\Common\Include\DataTensor.h">
+    <ClInclude Include="..\Common\Include\TensorShape.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
     <ClInclude Include="Helpers.h">
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 6eab0fa47..379169529 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -13,7 +13,7 @@
 #include "Basics.h"
 #include "File.h"
 #include "CommonMatrix.h"
-#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
+#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
 #include <limits.h>
 #include <memory>       // for shared_ptr
 #include <array>
diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp
index f728bb579..6951629b9 100644
--- a/Source/Math/NoGPU.cpp
+++ b/Source/Math/NoGPU.cpp
@@ -13,7 +13,7 @@
 #include "GPUSparseMatrix.h"
 #include "MatrixQuantizerGPU.h"
 #include "CuDnnConvolutionEngine.h"
-#include "DataTensor.h"
+#include "TensorShape.h"
 
 #pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
 #pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h
index 88f8dcdac..c768a7ed7 100644
--- a/Source/Math/TensorView.h
+++ b/Source/Math/TensorView.h
@@ -10,7 +10,7 @@
 
 #include "Basics.h"
 #include "Matrix.h"
-#include "DataTensor.h"
+#include "TensorShape.h"
 
 #pragma warning (push)
 #pragma warning (disable: 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
diff --git a/Source/SGDLib/SGDLib.vcxproj b/Source/SGDLib/SGDLib.vcxproj
index c157972e1..9985eff6c 100644
--- a/Source/SGDLib/SGDLib.vcxproj
+++ b/Source/SGDLib/SGDLib.vcxproj
@@ -156,7 +156,7 @@
     <ClInclude Include="..\Common\Include\BestGpu.h" />
     <ClInclude Include="..\Common\Include\Config.h" />
     <ClInclude Include="..\Common\Include\DataReader.h" />
-    <ClInclude Include="..\Common\Include\DataTensor.h" />
+    <ClInclude Include="..\Common\Include\TensorShape.h" />
     <ClInclude Include="..\Common\Include\DataWriter.h" />
     <ClInclude Include="..\Common\Include\File.h" />
     <ClInclude Include="..\Common\Include\fileutil.h" />
diff --git a/Source/SGDLib/SGDLib.vcxproj.filters b/Source/SGDLib/SGDLib.vcxproj.filters
index dd984702f..be0cb5acd 100644
--- a/Source/SGDLib/SGDLib.vcxproj.filters
+++ b/Source/SGDLib/SGDLib.vcxproj.filters
@@ -141,7 +141,7 @@
     <ClInclude Include="..\Common\Include\Sequences.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
-    <ClInclude Include="..\Common\Include\DataTensor.h">
+    <ClInclude Include="..\Common\Include\TensorShape.h">
       <Filter>Common\Include</Filter>
     </ClInclude>
     <ClInclude Include="..\Common\Include\Config.h">