diff --git a/Makefile b/Makefile
index d4d975bf4..ed1da399d 100644
--- a/Makefile
+++ b/Makefile
@@ -65,14 +65,8 @@ SRC:=
 # this early in the file, so let buildall do the work.
 all : buildall
 
-# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary)
-GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
-GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
-GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
-GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35)
-
-# Set up basic nvcc options and add CUDA targets from above
+# Set up basic nvcc options (CUDA target architectures are appended per build type further down)
-CUFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64 $(GENCODE_FLAGS)
+CUFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64
 
 ifdef CUDA_PATH
   ifndef GDK_PATH
@@ -126,14 +120,22 @@ ifdef KALDI_PATH
   KALDI_LIBS += -lkaldi-util -lkaldi-matrix -lkaldi-base -lkaldi-hmm -lkaldi-cudamatrix -lkaldi-nnet -lkaldi-lat
 endif
 
+# Set up nvcc target architectures. Release builds generate code for all of them (i.e. a fat binary).
+# Debug builds embed only compute_20 PTX and rely on JIT to create code "on the fly" for the underlying architecture.
+GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
+GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
+GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
+GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50)
+
 ifeq ("$(BUILDTYPE)","debug")
   CXXFLAGS += -g
-  CUFLAGS += -O0 -G -lineinfo
+  CUFLAGS += -O0 -G -lineinfo -gencode arch=compute_20,code=\"compute_20\"
 endif
 
 ifeq ("$(BUILDTYPE)","release")
   CXXFLAGS += -O4
-  CUFLAGS += -O3 -use_fast_math -lineinfo
+  CUFLAGS += -O3 -use_fast_math -lineinfo $(GENCODE_FLAGS)
 endif
 
 #######
@@ -394,7 +396,7 @@ $(OBJDIR)/%.o : %.cu Makefile
 	@echo $(SEPARATOR)
 	@echo creating $@ for $(ARCH) with build type $(BUILDTYPE) 
 	@mkdir -p $(dir $@)
-	$(NVCC) -c $< -o $@  $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler -fPIC
+	$(NVCC) -c $< -o $@  $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
 
 $(OBJDIR)/%.o : %.cpp Makefile
 	@echo $(SEPARATOR)
diff --git a/Math/Math/CNTKMathCUDA.vcxproj b/Math/Math/CNTKMathCUDA.vcxproj
index 4ca17bce7..82ea7daee 100644
--- a/Math/Math/CNTKMathCUDA.vcxproj
+++ b/Math/Math/CNTKMathCUDA.vcxproj
@@ -14,16 +14,19 @@
     <ProjectGuid>{B3DD765E-694E-4494-BAD7-37BBF2942517}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>Math</RootNamespace>
-    <SccProjectName></SccProjectName>
-    <SccAuxPath></SccAuxPath>
-    <SccLocalPath></SccLocalPath>
-    <SccProvider></SccProvider>
+    <SccProjectName>
+    </SccProjectName>
+    <SccAuxPath>
+    </SccAuxPath>
+    <SccLocalPath>
+    </SccLocalPath>
+    <SccProvider>
+    </SccProvider>
     <ProjectName>CNTKMathCUDA</ProjectName>
     <CudaPath>$(CUDA_PATH_V7_0)</CudaPath>
     <CudaToolkitCustomDir>$(CudaPath)</CudaToolkitCustomDir>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  
   <PropertyGroup>
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <PlatformToolset>v120</PlatformToolset>
@@ -43,20 +46,17 @@
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
   <PropertyGroup Label="UserMacros" />
-  
   <PropertyGroup>
     <IncludePath>..\..\common\include;$(ACML_PATH)\include;$(CudaPath)\include;$(IncludePath)</IncludePath>
     <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(ACML_PATH)\lib;$(CudaPath)\lib\$(Platform);$(LibraryPath)</LibraryPath>
     <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
   </PropertyGroup>
-
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
   </PropertyGroup>
-  
   <ItemDefinitionGroup>
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
@@ -78,14 +78,13 @@
     </Link>
     <CudaCompile>
       <TargetMachinePlatform>64</TargetMachinePlatform>
-      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;</CodeGeneration>
       <GenerateLineInfo>true</GenerateLineInfo>
+      <AdditionalCompilerOptions>/WX</AdditionalCompilerOptions>
     </CudaCompile>
     <PostBuildEvent>
       <Command>xcopy /D /I /Y "$(CudaPath)\bin\cudart64_*.dll" $(OutputPath)</Command>
     </PostBuildEvent>
   </ItemDefinitionGroup>
-  
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
       <PreprocessorDefinitions>_DEBUG; %(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -94,8 +93,10 @@
     </ClCompile>
     <Link>
     </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_20,compute_20;</CodeGeneration>
+    </CudaCompile>
   </ItemDefinitionGroup>
-  
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
@@ -111,12 +112,12 @@
       <OptimizeReferences>true</OptimizeReferences>
     </Link>
     <CudaCompile>
+      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;</CodeGeneration>
       <FastMath>true</FastMath>
       <GPUDebugInfo>false</GPUDebugInfo>
       <HostDebugInfo>false</HostDebugInfo>
     </CudaCompile>
   </ItemDefinitionGroup>
-  
   <ItemGroup>
     <ClInclude Include="..\..\Common\Include\basetypes.h" />
     <ClInclude Include="..\..\Common\Include\File.h" />