diff --git a/Makefile b/Makefile
index d4d975bf4..ed1da399d 100644
--- a/Makefile
+++ b/Makefile
@@ -65,14 +65,8 @@ SRC:=
# this early in the file, so let buildall do the work.
all : buildall
-# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary)
-GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
-GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
-GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
-GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35)
-
# Set up basic nvcc options and add CUDA targets from above
-CUFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64 $(GENCODE_FLAGS)
+CUFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64
ifdef CUDA_PATH
ifndef GDK_PATH
@@ -126,14 +120,22 @@ ifdef KALDI_PATH
KALDI_LIBS += -lkaldi-util -lkaldi-matrix -lkaldi-base -lkaldi-hmm -lkaldi-cudamatrix -lkaldi-nnet -lkaldi-lat
endif
+# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
+# In debug mode we will rely on JIT to create code "on the fly" for the underlying architecture
+GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
+GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
+GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
+GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50)
+
ifeq ("$(BUILDTYPE)","debug")
CXXFLAGS += -g
- CUFLAGS += -O0 -G -lineinfo
+ CUFLAGS += -O0 -G -lineinfo -gencode arch=compute_20,code=\"compute_20\"
endif
ifeq ("$(BUILDTYPE)","release")
CXXFLAGS += -O4
- CUFLAGS += -O3 -use_fast_math -lineinfo
+ CUFLAGS += -O3 -use_fast_math -lineinfo $(GENCODE_FLAGS)
endif
#######
@@ -394,7 +396,7 @@ $(OBJDIR)/%.o : %.cu Makefile
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
- $(NVCC) -c $< -o $@ $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler -fPIC
+ $(NVCC) -c $< -o $@ $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp Makefile
@echo $(SEPARATOR)
diff --git a/Math/Math/CNTKMathCUDA.vcxproj b/Math/Math/CNTKMathCUDA.vcxproj
index 4ca17bce7..82ea7daee 100644
--- a/Math/Math/CNTKMathCUDA.vcxproj
+++ b/Math/Math/CNTKMathCUDA.vcxproj
@@ -14,16 +14,19 @@
{B3DD765E-694E-4494-BAD7-37BBF2942517}
Win32Proj
Math
-
-
-
-
+
+
+
+
+
+
+
+
CNTKMathCUDA
$(CUDA_PATH_V7_0)
$(CudaPath)
-
StaticLibrary
v120
@@ -43,20 +46,17 @@
-
..\..\common\include;$(ACML_PATH)\include;$(CudaPath)\include;$(IncludePath)
$(SolutionDir)$(Platform)\$(Configuration);$(ACML_PATH)\lib;$(CudaPath)\lib\$(Platform);$(LibraryPath)
$(Platform)\$(Configuration)\$(ProjectName)\
-
true
false
-
NotUsing
@@ -78,14 +78,13 @@
64
- compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;
true
+ /WX
xcopy /D /I /Y "$(CudaPath)\bin\cudart64_*.dll" $(OutputPath)
-
_DEBUG; %(PreprocessorDefinitions)
@@ -94,8 +93,10 @@
+
+ compute_20,compute_20;
+
-
MaxSpeed
@@ -111,12 +112,12 @@
true
+ compute_20,sm_20;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;
true
false
false
-