Commit 785429e854

Merge branch 'master' into fseide/bs

Conflicts:
	Math/Math/NoGPU.cpp
@@ -60,7 +60,7 @@ class lattice
 size_t impliedspunitid : 31; // id of implied last unit (intended as /sp/); only used in V2
 size_t hasacscores : 1; // if 1 then ac scores are embedded

-header_v1_v2() : numnodes (0), numedges (0), lmf (1.0f), wp (0.0f), frameduration (0.01/*assumption*/), numframes (0), impliedspunitid (SIZE_MAX), hasacscores (1) { }
+header_v1_v2() : numnodes (0), numedges (0), lmf (1.0f), wp (0.0f), frameduration (0.01/*assumption*/), numframes (0), impliedspunitid (INT_MAX), hasacscores (1) { }
 };
 header_v1_v2 info; // information about the lattice
 static const unsigned int NOEDGE = 0xffffff; // 24 bits
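The SIZE_MAX to INT_MAX change matters because impliedspunitid is a 31-bit bitfield: on a 64-bit build SIZE_MAX cannot be represented in 31 bits, so the old initializer truncated and could draw a constant-conversion warning, fatal once -Werror is added to the Makefile below. INT_MAX (0x7FFFFFFF) is exactly the all-ones value of a 31-bit field, so the stored sentinel is unchanged. A minimal sketch of the truncation, assuming a 64-bit size_t:

    #include <cstddef>
    #include <cstdint>
    #include <climits>
    #include <cstdio>

    struct header_like
    {
        size_t impliedspunitid : 31;    // same 31-bit field as in the lattice header
    };

    int main()
    {
        header_like h;
        h.impliedspunitid = SIZE_MAX;   // truncates: only the low 31 bits survive
        printf ("%x\n", (unsigned) h.impliedspunitid);  // prints 7fffffff
        h.impliedspunitid = INT_MAX;    // fits exactly: same stored value, no warning
        printf ("%x\n", (unsigned) h.impliedspunitid);  // prints 7fffffff
        return 0;
    }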
@@ -507,7 +507,7 @@ public:
 }
 };

-typedef aligninfo aligninfo; // now we can access it as htkmlfwordsequence::aligninfo although it comes from some totally other corner of the system
+typedef msra::lattices::aligninfo aligninfo; // now we can access it as htkmlfwordsequence::aligninfo although it comes from some totally other corner of the system

 std::vector<word> words;
 std::vector<aligninfo> align;
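The self-referential typedef is the kind of construct MSVC tolerates but GCC rejects: inside the class, the aligninfo being declared hides the outer type it was meant to refer to, so GCC reports that the declaration "changes meaning of aligninfo". Qualifying the source type fixes the lookup. A minimal sketch:

    namespace msra { namespace lattices {
        struct aligninfo { int unit; };     // reduced stand-in for the real struct
    } }
    using namespace msra::lattices;

    struct htkmlfwordsequence
    {
        // typedef aligninfo aligninfo;                  // GCC: error, the name now refers to itself
        typedef msra::lattices::aligninfo aligninfo;     // OK: source type fully qualified
    };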
@@ -1983,10 +1983,11 @@ public:
 //// set prune value to 0 3 3
 //setMinObs (iMinObs);

-for (size_t i = 0; i < minObs.size(); i++)
-{
-    MESSAGE("minObs %d: %d.", i, minObs[i]);
-}
+// TODO: Re-enable when MESSAGE definition is provided (printf?)
+// for (size_t i = 0; i < minObs.size(); i++)
+// {
+//     MESSAGE("minObs %d: %d.", i, minObs[i]);
+// }

 estimate (startId, minObs, dropWord);

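Note the commented-out loop was also printing a size_t index with %d, which is suspect on 64-bit. If MESSAGE is ever re-enabled as a printf-style logger, something along these lines would be portable; the macro shown is a hypothetical stand-in, not the project's definition:

    #include <cstdio>
    // Hypothetical MESSAGE definition, assuming a printf-style logger:
    #define MESSAGE(...) do { fprintf (stderr, __VA_ARGS__); fputc ('\n', stderr); } while (0)

    // %zu for the size_t index; minObs elements assumed to be int
    for (size_t i = 0; i < minObs.size(); i++)
        MESSAGE("minObs %zu: %d.", i, minObs[i]);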
@@ -169,7 +169,7 @@ namespace msra { namespace dbn {
 // finish off last block
 flushlastblock();
 fflushOrDie (f);
-fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f));
+fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %lu bytes\n", (int) n, fgetpos (f));
 fclose (f);
 foreach_index (i, blocks) assert (!blocks[i]); // ensure we flushed
 assert (inmembegin == inmemend); // nothing in cache
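%ull is not a printf conversion: it parses as %u followed by the literal characters "ll", so the argument is consumed as an unsigned int and a stray "ll" lands in the output (undefined behavior when the value passed is wider than int). %lu is the correct specifier for an unsigned long. A small demonstration:

    #include <cstdio>

    int main()
    {
        unsigned long bytes = 123456789UL;
        // printf ("%ull bytes\n", bytes);   // parsed as %u + "ll": wrong output, UB on LP64
        printf ("%lu bytes\n", bytes);       // correct: "123456789 bytes"
        return 0;
    }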
@@ -278,7 +278,7 @@ public:
 bool addtoresult, const float thisscale, const float weight)
 {
 assert (a.size() == b.size());
-assert ((15 & (int) &a[0]) == 0); assert ((15 & (int) &b[0]) == 0); // enforce SSE alignment
+assert ((15 & reinterpret_cast<uintptr_t>(&a[0])) == 0); assert ((15 & reinterpret_cast<uintptr_t>(&b[0])) == 0); // enforce SSE alignment

 size_t nlong = (a.size() + 3) / 4; // number of SSE elements
 const msra::math::float4 * pa = (const msra::math::float4 *) &a[0];
@@ -313,9 +313,9 @@ public:
 // for (size_t k = 0; k < 4; k++)
 //     dotprod (row, const_array_ref<float> (&cols4[k * cols4stride], cols4stride), usij[k * usijstride]);

-assert ((15 & (int) &row[0]) == 0);
-assert ((15 & (int) &cols4[0]) == 0);
-assert ((15 & (int) &cols4[cols4stride]) == 0);
+assert ((15 & reinterpret_cast<uintptr_t>(&row[0])) == 0);
+assert ((15 & reinterpret_cast<uintptr_t>(&cols4[0])) == 0);
+assert ((15 & reinterpret_cast<uintptr_t>(&cols4[cols4stride])) == 0);
 //assert (cols4stride * 4 == cols4.size()); // (passed in one vector with 4 columns stacked on top of each other)
 //assert (row.size() * 4 == cols4.size()); // this assert is no longer appropriate because of further breaking into blocks

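Both hunks fix the same pattern: casting a pointer to int keeps only the low 32 bits on an LP64 platform, and GCC flags the cast, fatal under -Werror. uintptr_t is the integer type guaranteed wide enough to hold a pointer value, so the alignment test becomes exact and warning-free. The check in isolation:

    #include <cassert>
    #include <cstdint>

    inline void check_sse_alignment (const float * p)
    {
        // 16-byte alignment required for aligned SSE loads: low 4 address bits must be 0
        assert ((15 & reinterpret_cast<uintptr_t>(p)) == 0);
    }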
@@ -76,7 +76,7 @@ public:
 double logprob(int i) const { if (uniform_sampling) return uniform_log_prob; else return m_log_prob[i]; }

 template <typename Engine>
-int sample(Engine &eng) const
+int sample(Engine &eng)
 {
 int m = unif_int(eng);
 if (uniform_sampling)
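Dropping the const is forced by the first statement of the body: assuming unif_int is a standard distribution object such as std::uniform_int_distribution, its operator() is non-const because drawing a sample may update internal state, so it cannot be called through a const member function. A reduced sketch; the class and member names here are stand-ins:

    #include <random>

    class sampler_like
    {
        std::uniform_int_distribution<int> unif_int { 0, 99 };
    public:
        template <typename Engine>
        int sample (Engine &eng)        // non-const: unif_int(eng) mutates the distribution
        {
            return unif_int (eng);
        }
    };

    // usage:
    //   std::mt19937 eng;
    //   sampler_like s;
    //   int m = s.sample (eng);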
Makefile (2 changed lines)
@@ -52,7 +52,7 @@ CXX = mpic++

 INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK BrainScript
 CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
-CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC
+CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror
 LIBPATH:=
 LIBS:=
 LDFLAGS:=
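Adding -Werror makes GCC treat every warning as an error; that single flag is presumably what forces the pointer-cast, format-string, self-referential-typedef, and unused-function fixes seen elsewhere in this commit.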
@@ -1,6 +1,7 @@
 #ifndef __COLUMN_QUANTIZER_H__
 #define __COLUMN_QUANTIZER_H__
 #include "ValueQuantizer.h"
+#include <math.h>

 #pragma warning (disable: 4127) // conditional expression is constant

@@ -23,8 +23,14 @@
 #define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
 #define threadsPerBlock 512

+#ifdef __GNUC__
+#define UNUSED_FUNCTION_ATTRIBUTE __attribute__ ((unused))
+#else
+#define UNUSED_FUNCTION_ATTRIBUTE
+#endif
+
 // Predefine this for later.
-static __inline__ __device__ double atomicAdd(double* address, double val);
+static __inline__ __device__ double atomicAdd(double* address, double val) UNUSED_FUNCTION_ATTRIBUTE;
 //CUDA Kernels code
 template<class ElemType>
 __global__ void _elementWisePowerOnCuda(
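The macro addresses GCC's -Wunused-function, which fires for a function with internal linkage (static) that a translation unit declares but never calls; under -Werror that breaks the build. __attribute__ ((unused)) on the declaration tells GCC the function may legitimately go unused, and the macro expands to nothing for compilers without the attribute. The pattern in isolation:

    #ifdef __GNUC__
    #define UNUSED_FUNCTION_ATTRIBUTE __attribute__ ((unused))
    #else
    #define UNUSED_FUNCTION_ATTRIBUTE      // MSVC: no equivalent warning to silence
    #endif

    // Without the attribute, GCC warns if nothing in this translation unit
    // ever calls helper(); with it, the unused definition is accepted silently.
    static double helper (double x) UNUSED_FUNCTION_ATTRIBUTE;
    static double helper (double x) { return x * x; }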
@@ -81,7 +81,7 @@
 <GenerateDebugInformation>true</GenerateDebugInformation>
 <AdditionalDependencies>libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
 <AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
-<DelayLoadDLLs>cublas64_70.dll; cusparse64_70.dll; curand64_70.dll; cudart64_70.dll; libacml_dll.dll; libacml_mp_dll.dll; %(DelayLoadDLLs)</DelayLoadDLLs>
+<DelayLoadDLLs>cublas64_70.dll; cusparse64_70.dll; curand64_70.dll; cudart64_70.dll; libacml_mp_dll.dll; %(DelayLoadDLLs)</DelayLoadDLLs>
 <Profile>true</Profile>
 </Link>
 <PostBuildEvent>
@@ -24,7 +24,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 blockdim = (unsigned int) warpsize; // -> blockIdx.x
 }
 // get the array index for the current thread
-__device__ static size_t ParallelizeOverRangeIndex()
+__device__ __inline__ static size_t ParallelizeOverRangeIndex()
 {
 return threadIdx.x + (blockIdx.x * blockDim.x);
 }
@@ -69,7 +69,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // Start of new GPU Sparse Matrix code
 //-------------------------------------------------------------------------

-template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true, bool keepExistingValues = true) {}//matrix format will affect the size to allocate
+template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly, bool keepExistingValues) {}//matrix format will affect the size to allocate
 template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues) {}

 template<class ElemType> GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::CopyToDenseMatrix() const
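The Resize fix is a language rule: a default argument may be specified only once, on the declaration; repeating it on the out-of-class definition is ill-formed, and GCC rejects it even though MSVC has historically let it slide. A minimal sketch of the rule:

    struct S
    {
        void Resize (bool growOnly = true, bool keepExistingValues = true);   // defaults live here
    };

    // void S::Resize (bool growOnly = true, bool keepExistingValues = true) {}   // error:
    //                                                          // default arguments repeated
    void S::Resize (bool growOnly, bool keepExistingValues) {}  // OK: definition omits them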
@@ -351,7 +351,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const {}

 template<class ElemType> template <class OutType, class InType>
-static void GPUSparseMatrix<ElemType>::CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size){}
+void GPUSparseMatrix<ElemType>::CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size){}

 #pragma endregion Helper Functions

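The CopyBuffer fix is similar: static may appear only on the in-class declaration; on an out-of-class member definition it is ill-formed, because there static would mean internal linkage rather than "static member". A minimal sketch:

    #include <cstddef>

    struct S
    {
        static void CopyBuffer (int * dst, const int * src, std::size_t n);   // 'static' only here
    };

    // static void S::CopyBuffer (int * dst, const int * src, std::size_t n) {}   // error:
    //                                                  // 'static' not allowed on the definition
    void S::CopyBuffer (int * dst, const int * src, std::size_t n)
    {
        for (std::size_t i = 0; i < n; i++)
            dst[i] = src[i];
    }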
@@ -5,7 +5,7 @@
 namespace Microsoft { namespace MSR { namespace CNTK {

 template<class ElemType>
-QuantizedMatrix<ElemType>::QuantizedMatrix(const size_t numRows, const size_t numCols, const size_t nbits, short deviceId, MemAllocator* allocator /* = nullptr */)
+QuantizedMatrix<ElemType>::QuantizedMatrix(const size_t numRows, const size_t numCols, const size_t nbits, DEVICEID_TYPE deviceId, MemAllocator* allocator /* = nullptr */)
 : m_numRows(numRows), m_numCols(numCols), m_numBits(nbits), m_allocator(allocator)
 {
 m_qColSize = QuantizedColumn<ElemType>::QuantizedColumnSize(m_numBits, m_numRows);
@@ -56,7 +56,7 @@ class MATH_API QuantizedMatrix
 static const size_t QWordNumBits = ValueQuantizer<ElemType>::QWordNumBits;

 public:
-QuantizedMatrix(const size_t numRows, const size_t numCols, const size_t nbits, short deviceId, MemAllocator* allocator = nullptr);
+QuantizedMatrix(const size_t numRows, const size_t numCols, const size_t nbits, DEVICEID_TYPE deviceId, MemAllocator* allocator = nullptr);

 // Move constructor and assignment
 QuantizedMatrix(QuantizedMatrix<ElemType>&& moveFrom);
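These last two hunks are one fix split across declaration and definition: both must spell the same parameter type, and replacing the raw short with the project's DEVICEID_TYPE alias keeps every device-id parameter on a single, centrally defined type, so a later change of the underlying width touches one alias instead of every signature.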