Port the Sequence training code to Linux
This commit is contained in:
Parent: d1d6cca696
Commit: 47cb3ce1bd
@@ -96,6 +96,7 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#include <unistd.h>
#include <sys/stat.h>
#include <dlfcn.h>
#include <sys/time.h>
typedef unsigned char byte;
#endif

@@ -188,29 +189,43 @@ static inline void Sleep (size_t ms) { std::this_thread::sleep_for (std::chrono:
namespace msra { namespace basetypes {

//sequence training
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer(const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
if (!QueryPerformanceFrequency(&freq)) // count ticks per second
throw std::runtime_error("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter(&start);
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
QueryPerformanceCounter(&end);
return (end.QuadPart - start.QuadPart) / (double)freq.QuadPart;
}
void show(const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0);
}
};
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer (const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000);
#endif
}
void show (const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};

// class ARRAY -- std::vector with array-bounds checking
// VS 2008 and above do this, so there is no longer a need for this.
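This hunk keeps the Win32 QueryPerformanceCounter timer and adds a gettimeofday branch behind `#ifdef __unix__`. For reference only (not part of the commit; names are illustrative), the same elapsed-time pattern can be written on std::chrono with no platform split, which also sidesteps the integer division in the `tv_usec` term above that truncates the sub-second part:

```cpp
#include <chrono>
#include <cstdio>
#include <string>

// Minimal sketch of the same timer pattern on std::chrono (illustrative only).
// steady_clock is monotonic, so no Win32/POSIX branching is needed.
class chrono_timer
{
    std::chrono::steady_clock::time_point start;
public:
    chrono_timer() : start (std::chrono::steady_clock::now()) { }
    operator double() const    // elapsed seconds since construction
    {
        return std::chrono::duration<double> (std::chrono::steady_clock::now() - start).count();
    }
    void show (const std::string & msg) const
    {
        double elapsed = *this;
        fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0);
    }
};
```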
@@ -1128,6 +1143,7 @@ static inline bool comparator(const pair<int, F>& l, const pair<int, F>& r)
return l.second > r.second;
}

#ifdef _WIN32
//sequence training
// ----------------------------------------------------------------------------
// frequently missing Win32 functions

@@ -1144,4 +1160,5 @@ static inline std::wstring FormatWin32Error(DWORD error)
if (last != std::string::npos) res.erase(last + 1, res.length());
return res;
}
#endif // _WIN32
#endif // _BASETYPES_

@@ -25,7 +25,7 @@ static void checkoverflow (size_t fieldval, size_t targetval, const char * field
if (fieldval != targetval)
{
char buf[1000];
sprintf_s (buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
snprintf(buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
throw std::runtime_error (buf);
}
}

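Several hunks in this commit make the same substitution: the MSVC-specific sprintf_s becomes snprintf with an explicit buffer size before the message is thrown. As a standalone sketch (not part of the commit; the helper name is hypothetical), the recurring format-and-throw pattern looks like this:

```cpp
#include <cstdarg>
#include <cstdio>
#include <stdexcept>

// Illustrative helper only: format a message portably with vsnprintf and throw it.
static void throwformatted (const char * fmt, ...)
{
    char buf[1000];
    va_list args;
    va_start (args, fmt);
    std::vsnprintf (buf, sizeof (buf), fmt, args);   // truncates safely if the message is long
    va_end (args);
    throw std::runtime_error (buf);
}
```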
@@ -15,6 +15,7 @@
#include <unordered_map>
#include <algorithm> // for find()
#include "simple_checked_arrays.h"
#include <limits.h>

namespace msra { namespace asr {

@@ -491,9 +491,9 @@ template<class ElemType>
const bool useCVSetControlLRIfCVExists,
const bool useEvalCriterionControlLR,
const size_t minibatchSearchCriterionErrorMargin,
const ElemType hsmoothingWeight = 1.0,
const ElemType frameDropThresh = 1e-10,
const bool doreferencealign = false)
const ElemType hsmoothingWeight,
const ElemType frameDropThresh,
const bool doreferencealign)
{
m_numPrevLearnRates = numPrevLearnRates;
m_prevChosenMinibatchSize = 0;

@@ -111,36 +111,4 @@ public:
cudaArray * get() const { return a; }
};

// using a cudaarrayref
// Pattern:
// - do not declare the texture as an argument to the kernel, instead:
// - at file scope:
// texture<float, 2, cudaReadModeElementType> texref;
// - right before kernel launch:
// passtextureref texref (texref, cudaarrayref); // use the same name as that global texref one, so it will match the name inside the kernel
class passtextureref
{
textureReference & texref; // associated texture reference if any
public:
template<typename R,class T>
passtextureref (R texref, cudaarrayref<T> cudaarrayref) : texref (texref)
{
texref.addressMode[0] = cudaAddressModeWrap;
texref.addressMode[1] = cudaAddressModeWrap;
texref.filterMode = cudaFilterModePoint;
texref.normalized = false;
cudaError_t rc = cudaBindTextureToArray (texref, cudaarrayref.get());
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "passtextureref: %s (cuda error %d)", cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}
~passtextureref()
{
cudaUnbindTexture (&texref);
}
};

};};

@@ -8,8 +8,7 @@
#include <assert.h>
#include <math.h>
#include <vector>
#include <hash_set>
using namespace stdext;
#include <unordered_set>

namespace msra { namespace cuda {

@@ -265,7 +264,7 @@ public:
class Devices
{
DeviceInfo* deviceInfo[deviceMax];
hash_set<const float *> pinnedBuffers;
std::unordered_set<const float *> pinnedBuffers;
public:
Devices()
{

@@ -320,7 +319,7 @@ public:
// Is the buffer passed in pinned?
bool IsPinned(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// see if we found the pointer or not
return (found != pinnedBuffers.end());
}

@@ -340,7 +339,7 @@ public:
// WARNING: Unpin operations do a CPU sync
void UnpinBuffer(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// if we didn't find the buffer, exit
if (found == pinnedBuffers.end())
return;

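The stdext::hash_set lookups above become std::unordered_set, which is the standard C++11 container available on both MSVC and GCC. A minimal standalone sketch of the same membership test (illustrative, not CNTK code); with C++11, count() can also replace the explicit iterator comparison:

```cpp
#include <unordered_set>

std::unordered_set<const float *> pinnedBuffers;   // host buffers registered as pinned

bool IsPinned (const float * bufHost)
{
    return pinnedBuffers.count (bufHost) != 0;     // same test as find() != end()
}
```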
@@ -29,7 +29,7 @@ template<typename VECTORTYPE,typename OPSTYPE> class vectorbaseimpl :
{
typedef typename VECTORTYPE::elemtype elemtype; // (for convenience)
size_t capacity; // amount of allocated storage (like capacity() vs. vectorref::n = size())
void release() { ondevice no (deviceid); free (reset (NULL, 0)); }
void release() { ondevice no (deviceid); free (this->reset (NULL, 0)); }
public:
vectorbaseimpl() : capacity (0) { }
~vectorbaseimpl() { release(); }

@@ -49,29 +49,29 @@ public:
ondevice no (deviceid); // switch to desired CUDA card
cuda_ptr<elemtype> pnew = malloc<elemtype> (sz); // allocate memory inside CUDA device (or throw)
capacity = sz; // if succeeded then: remember
cuda_ptr<elemtype> p = reset (pnew, sz); // and swap the pointers and update n
cuda_ptr<elemtype> p = this->reset (pnew, sz); // and swap the pointers and update n
free (p); // then release the old one
}
else // not growing: keep same allocation
reset (get(), sz);
this->reset (this->get(), sz);
}
size_t size() const throw() { return vectorref::size(); }
size_t size() const throw() { return vectorref<elemtype>::size(); }
void assign (const elemtype * p, size_t nelem, bool synchronize)
{
allocate (nelem); // assign will resize the target appropriately
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (get(), 0, p, nelem);
memcpy (this->get(), 0, p, nelem);
if (synchronize)
join();
}
void fetch (typename elemtype * p, size_t nelem, bool synchronize) const
void fetch (elemtype * p, size_t nelem, bool synchronize) const
{
if (nelem != size()) // fetch() cannot resize the target; caller must do that
throw std::logic_error ("fetch: vector size mismatch");
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (p, get(), 0, nelem);
memcpy (p, this->get(), 0, nelem);
if (synchronize)
join();
};

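The `reset`/`get` calls above gain a `this->` qualifier because GCC and Clang enforce two-phase name lookup: members of a dependent base class are not found by unqualified lookup inside a class template, even though MSVC accepts it. A standalone illustration (not CNTK code; the names are simplified stand-ins):

```cpp
#include <cstddef>

template<typename T> struct basevec { void reset (T *, std::size_t) { } };

template<typename T> struct derivedvec : basevec<T>
{
    void release()
    {
        // reset (nullptr, 0);      // error on GCC/Clang: 'reset' was not declared in this scope
        this->reset (nullptr, 0);   // OK: lookup is deferred to instantiation time
    }
};

int main() { derivedvec<float>().release(); }
```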
@@ -176,12 +176,13 @@ class latticefunctionsimpl : public vectorbaseimpl<latticefunctions,latticefunct
{
ondevice no (deviceid);

matrixref<float> dengammasMatrixRef = tomatrixref(dengammas);
latticefunctionsops::mmierrorsignal (dynamic_cast<const vectorbaseimpl<ushortvector, vectorref<unsigned short>> &> (alignstateids),
dynamic_cast<const vectorbaseimpl<uintvector, vectorref<unsigned int>> &> (alignoffsets),
dynamic_cast<const vectorbaseimpl<edgeinfowithscoresvector, vectorref<msra::lattices::edgeinfowithscores>> &> (edges),
dynamic_cast<const vectorbaseimpl<nodeinfovector, vectorref<msra::lattices::nodeinfo>> &> (nodes),
dynamic_cast<const vectorbaseimpl<doublevector, vectorref<double>> &> (logpps),
tomatrixref (dengammas));
dengammasMatrixRef);
}

void stateposteriors (const ushortvector & alignstateids, const uintvector & alignoffsets,

@@ -14,7 +14,7 @@ namespace msra { namespace cuda {
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}

@@ -16,11 +16,21 @@
#include "math.h"
#include <assert.h>
#include <stdexcept>

#ifdef _WIN32
#include <windows.h> // for timer
#endif

#if __unix__
#include <sys/time.h>
#endif

namespace msra { namespace cuda {

// auto_timer timer; run(); double seconds = timer; // now can abandon the object
// auto_timer timer; run(); double seconds = timer; // now can abandon the object
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;

@@ -28,15 +38,26 @@ namespace msra { namespace cuda {
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000);
#endif
}
void show (const std::string & msg) const
{

@@ -44,7 +65,7 @@ namespace msra { namespace cuda {
fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};

// -----------------------------------------------------------------------
// edgealignment --do alignment on a per edge level, only support normal left to right hmms and ergodic silence hmm
// output alignresult

@@ -5,7 +5,6 @@

#define _CRT_SECURE_NO_WARNINGS 1 // so we can use getenv()...

#include <Windows.h> // for the Mutex
#include <cuda_runtime_api.h> // for CUDA API
#include <cuda.h> // for device API
#include "cudalib.h"

@@ -30,7 +29,7 @@ static void operator|| (cudaError_t rc, const char * msg)
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}

@@ -41,7 +40,7 @@ static void operator|| (CUresult rc, const char * msg)
if (rc != CUDA_SUCCESS)
{
char buf[1000];
sprintf_s (buf, "%s: cuda API error %d", msg, rc);
snprintf(buf, sizeof(buf), "%s: cuda API error %d", msg, rc);
throw std::runtime_error (buf);
}
}

@@ -71,8 +70,8 @@ public:
assert (cuContext == cuContextDummy);
// show some info to the user
char namebuf[1024] = { 0 };
cuDeviceGetName (&namebuf[0], _countof (namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", physicaldeviceid, namebuf);
cuDeviceGetName (&namebuf[0], sizeof(namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", (int)physicaldeviceid, namebuf);
#endif
}
// cast this to the CUcontext for use with CUDA functions

@@ -110,102 +109,9 @@ cudaStream_t GetCurrentStream() { return cudaStreamDefault; }
cudaEvent_t GetCurrentEvent() {return GetEvent(GetCurrentDevice());}
Devices g_devices; // one global device pool


// try to acquire a device exclusively; managed through this library's private lock mechanism (i.e. not through CUDA APIs)
static bool lockdevicebymutex (int physicaldeviceid)
{
wchar_t buffer[80];
wsprintf (buffer, L"Global\\DBN.exe GPGPU exclusive lock for device %d", physicaldeviceid);
// we actually use a Windows-wide named mutex
HANDLE h = ::CreateMutex (NULL/*security attr*/, TRUE/*bInitialOwner*/, buffer);
DWORD res = ::GetLastError();
if (h == NULL) // failure --this should not really happen
{
if (res == ERROR_ACCESS_DENIED) // no access: already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex access denied, assuming already locked '%S'\n", buffer);
return false;
}
fprintf (stderr, "lockdevicebymutex: failed to create '%S': %d\n", buffer, res);
throw std::runtime_error ("lockdevicebymutex: unexpected failure");
}
// got a handle
if (res == 0) // no error
{
fprintf (stderr, "lockdevicebymutex: created and acquired mutex '%S'\n", buffer);
return true;
}
// failure with handle --remember to release the handle
::CloseHandle (h);
if (res == ERROR_ALREADY_EXISTS) // already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex already locked '%S'\n", buffer);
return false;
}
else if (res != 0)
{
fprintf (stderr, "lockdevicebymutex: unexpected error from CreateMutex() when attempting to create and acquire mutex '%S': %d\n", buffer, res);
throw std::logic_error ("lockdevicebymutex: unexpected failure");
}
return false;
}
// initialize CUDA system
void lazyinit()
{
#if 0
if (devicesallocated >= 0) return;
int numphysicaldevices = 0;
cudaGetDeviceCount (&numphysicaldevices) || "cudaGetDeviceCount failed";
fprintf (stderr, "lazyinit: %d physical CUDA devices detected\n", numphysicaldevices);
#ifndef NOMULTIDEVICE
// we can emulate a larger number of GPUs than actually present, for dev purposes
int oversubscribe = 1;
const char * oversubscribevar = getenv ("DBNOVERSUBSCRIBEGPUS");
if (oversubscribevar)
oversubscribe = atoi (oversubscribevar);
const int numdevices = numphysicaldevices * oversubscribe;
// number of devices
// environment variable DBNMAXGPUS
// - 0: use all, exclusively
// - >0: limit to this number, exclusively --default is 1
// The number of available devices includes the emulated one by means of DBNOVERSUBSCRIBEGPUS
// - <0: use this number but bypassing the exclusive check, for debugging/quick stuff
int devlimit = 1;
bool exclusive = true;
const char * devlimitvar = getenv ("DBNMAXGPUS");
if (devlimitvar)
devlimit = atoi (devlimitvar);
if (devlimit < 0)
{
devlimit = -devlimit;
exclusive = false; // allow non-exclusive use
}
if (devlimit == 0)
devlimit = INT_MAX;
// initialize CUDA device API
cuInit (0) || "cuInit failed";
// initialize the system
devicesallocated = 0;
for (int deviceid = 0; deviceid < numdevices && devicesallocated < devlimit; deviceid++) // loop over all physical devices
{
// check if device is available by trying to lock it
bool available = !exclusive || lockdevicebymutex (deviceid); // try to acquire the lock

if (!available) // not available: don't use it
{
fprintf (stderr, "CUDA device %d already in use, skipping\n", deviceid);
continue;
}
// OK to allocate
const int physicaldeviceid = deviceid % numphysicaldevices; // not the same in case of DBNOVERSUBSCRIBEGPUS > 1
cudadevicecontexts[devicesallocated].init (physicaldeviceid);
devicesallocated++;
}
fprintf (stderr, "using %d on %d physically present CUDA devices%s\n", devicesallocated, numphysicaldevices, exclusive ? " exclusively" : "");
#else
devicesallocated = 1;
#endif
#endif
}

void initwithdeviceid(size_t deviceid)

@@ -214,7 +120,7 @@ void initwithdeviceid(size_t deviceid)
devicesallocated = 0;
cudadevicecontexts[devicesallocated].init(deviceid);
devicesallocated++;
fprintf(stderr, "using CUDA devices%d \n", deviceid);
fprintf(stderr, "using CUDA devices%d \n", (int)deviceid);
}

// get number of devices

@@ -244,7 +150,7 @@ void setdevicecontext (size_t deviceid)
#ifndef NOMULTIDEVICE
//if (currentcudadevicecontext != NULL)
// throw std::logic_error ("setdevicecontext: a device context has already been set --??");
if (deviceid >= _countof (cudadevicecontexts))
if (deviceid >= (sizeof(cudadevicecontexts) / sizeof(cudadevicecontext)))
throw std::logic_error ("setdevicecontext: device id exceeds size of fixed-size array cudadevicecontexts[]");
cudadevicecontext & c = cudadevicecontexts[deviceid];

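Because `_countof` is MSVC-only, the hunk above falls back to a sizeof division. An illustrative alternative (not in the commit) is a small template that works on both compilers and, unlike the division form, rejects a plain pointer at compile time:

```cpp
#include <cstddef>

// countof(): element count of a built-in array, deduced at compile time.
template <typename T, std::size_t N>
constexpr std::size_t countof (const T (&)[N]) { return N; }

// hypothetical usage: if (deviceid >= countof (cudadevicecontexts)) ...
```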
@@ -1,6 +1,6 @@
#pragma once

#include <hash_map>
#include <unordered_map>
#include "simplesenonehmm.h"
#include "latticearchive.h"
#include "latticesource.h"

@@ -2,21 +2,22 @@
//
// F. Seide, V-hansu

#pragma once

#include "basetypes.h"
#include "simple_checked_arrays.h"
#include "latticearchive.h"
#include "simplesenonehmm.h" // the model
#include "ssematrix.h" // the matrices
#include "latticestorage.h"
#include <hash_map>
#include <unordered_map>
#include <list>

#undef PRINT_TIME_MEASUREMENT // [v-hansu]
#define VIRGINLOGZERO (10 * LOGZERO) // used for printing statistics on unseen states
#undef CPU_VERIFICATION

#ifdef _WIN32
int msra::numa::node_override = -1; // for numahelpers.h
#endif

namespace msra { namespace lattices {

@@ -26,7 +27,7 @@ namespace msra { namespace lattices {

class littlematrixheap
{
static const size_t CHUNKSIZE = 256*1024; // 1 MB
static const size_t CHUNKSIZE;
typedef msra::math::ssematrixfrombuffer matrixfrombuffer;
std::list<std::vector<float>> heap;
size_t allocatedinlast; // in last heap element

@@ -64,6 +65,8 @@ public:
}
};

const size_t littlematrixheap::CHUNKSIZE = 256*1024; // 1 MB

// ---------------------------------------------------------------------------
// helpers for log-domain addition
// ---------------------------------------------------------------------------
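The two hunks above move CHUNKSIZE's value from an in-class initializer to an out-of-class definition. The reason is linkage: a static const integral member may be initialized in the class, but once it is ODR-used (its address taken or bound to a const reference), pre-C++17 GCC also requires a definition in exactly one translation unit. A standalone illustration (not CNTK code):

```cpp
#include <cstddef>

struct heap_demo
{
    static const std::size_t CHUNKSIZE;                // declaration only, as in the hunk above
};
const std::size_t heap_demo::CHUNKSIZE = 256 * 1024;  // definition in exactly one .cpp file
```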
@@ -183,7 +186,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
const float acLL = logLLs(s,t);
if (islogzero (acLL))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) ac score(%d,%d) is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int) s, (int) t,
(int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
const float betajt = logbetas(j,t); // sum over all all path exiting from (j,t) to end

@@ -211,7 +214,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
bwscore = exitscore;
if (islogzero (bwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) bw score is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());

te = ts;
je = js;

@@ -292,17 +295,17 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
// These cases must be handled separately. If the whole path is 0 (0 prob is on the only path at some point) then skip the lattice.
if (islogzero (totalbwscore) ^ islogzero (totalfwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw 0 score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
if (islogzero (totalbwscore))
{
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}

if (fabsf (totalfwscore - totalbwscore) / ts > 1e-4f)
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());

// we return the full path score
return totalfwscore;

@@ -414,7 +417,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
if (islogzero (fwscore))
{
fprintf (stderr, "alignedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}

@@ -746,7 +749,7 @@ double lattice::forwardbackwardlatticesMBR (const std::vector<float> & edgeacsco
// TODO: we will later have code that adds this path if needed
size_t oracleframeacc = maxcorrect.back();
if (oracleframeacc != info.numframes)
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (unsigned int) info.numframes);
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (int) info.numframes);

// backward pass and computation of state-conditioned frames-correct count
for (size_t j = edges.size() -1; j+1 > 0; j--)

@@ -897,7 +900,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
}
}
if (minlogpp > LOGZERO)
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", countskip, edges.size());
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", (int)countskip, (int)edges.size());
}
const double abcsallocatedur = timerabcsallocate;

@@ -1048,7 +1051,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
for (size_t t = ts; t < te; t++)
{
if (thisedgealignments[j][t-ts] != thisedgealignmentsgpu[j][t-ts])
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, t-ts, thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, (int)(t-ts), thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
}
}
}

@@ -1284,7 +1287,7 @@ void lattice::mmierrorsignal (parallelstate & parallelstate, double minlogpp, co
// TODO: count VIRGINLOGZERO, print per frame
}
if (fabs (logsum) / errorsignal.rows() > 1e-6)
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", t, logsum);
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", (int)t, logsum);
}
fprintf (stderr, "forwardbackward: %.3f%% non-zero state posteriors\n", 100.0f - nonzerostates * 100.0f / errorsignal.rows() / errorsignal.cols());

@@ -1441,7 +1444,7 @@ void sMBRdiagnostics (const msra::math::ssematrixbase & errorsignal, const_array
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRdiagnostics: %d frames correct out of %d, %.2f%%, neg better in %d, pos in %d\n",
(int) numcor, errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numcor, (int)errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numnegbetter, (int) numposbetter);
}

@@ -1471,7 +1474,7 @@ void sMBRsuppressweirdstuff (msra::math::ssematrixbase & errorsignal, const_arra
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRsuppressweirdstuff: %d weird frames out of %d, %.2f%% were flattened\n",
(int) numweird, errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
(int) numweird, (int) errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
}

@@ -14,6 +14,7 @@
#pragma push_macro ("atomicCAS")

#include "latticestorage.h"
#include <limits>

namespace msra { namespace cuda { class passtextureref; }}

@@ -481,21 +481,21 @@ namespace msra { namespace lattices {
template<class edgestype, class nodestype, class aligntype, class edgealignments, class backpointers>
void setutterancedata (const edgestype & edges, const nodestype & nodes, const aligntype & align,
const msra::math::ssematrixbase & logLLs, std::vector<float> & edgeacscores,
edgealignments & edgealignments, backpointers & backpointers)
edgealignments & edgeAlignments, backpointers & backPointers)
{
// lattice data
edgesgpu->assign (edges, false);
nodesgpu->assign (nodes, false);
aligngpu->assign (align, false);
alignoffsetsgpu->assign (edgealignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backpointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backpointers.getbackptroffsets(), false);
alignoffsetsgpu->assign (edgeAlignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backPointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backPointers.getbackptroffsets(), false);

#ifndef PARALLEL_SIL
alignresult->assign (edgealignments.getalignmentsbuffer(), false);
alignresult->assign (edgeAlignments.getalignmentsbuffer(), false);
edgeacscoresgpu->assign (edgeacscores, false);
#else
alignresult->allocate (edgealignments.getalignbuffersize());
alignresult->allocate (edgeAlignments.getalignbuffersize());
edgeacscoresgpu->allocate (edges.size());
edgeacscores; // reference to make compilor happy
#endif

@@ -516,9 +516,9 @@ namespace msra { namespace lattices {
loglls = *errorsignalgpu;
}
template<class edgealignments>
void copyalignments (edgealignments & edgealignments)
void copyalignments (edgealignments & edgeAlignments)
{
alignresult->fetch(edgealignments.getalignmentsbuffer(), true);
alignresult->fetch(edgeAlignments.getalignmentsbuffer(), true);
}

// [v-hansu] allocate memory for vectors relating to forward-backward

@@ -735,7 +735,7 @@ namespace msra { namespace lattices {
if (!parallelstate->emulation)
{

fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", batchsizeforward.size(), batchsizebackward.size());
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());

const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

@@ -799,7 +799,7 @@ namespace msra { namespace lattices {
if (rc != CUBLAS_STATUS_SUCCESS)
{
char buf[1000];
sprintf_s(buf, "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
sprintf_s(buf, sizeof(buf), "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
throw std::runtime_error(buf);
}
}

Makefile
@@ -50,7 +50,7 @@ endif
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
CXX = mpic++

INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib MachineLearning/SequenceTraining BrainScript
CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror -Wno-error=literal-suffix
LIBPATH:=

@@ -377,12 +377,27 @@ CNTK_SRC =\
MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
MachineLearning/CNTKSGDLib/Profiler.cpp \
MachineLearning/CNTKSGDLib/SGD.cpp \
MachineLearning/SequenceTraining/cudalattice.cpp \
MachineLearning/SequenceTraining/cudalib.cpp \
MachineLearning/SequenceTraining/latticeforwardbackward.cpp \
MachineLearning/SequenceTraining/parallelforwardbackward.cpp \
BrainScript/BrainScriptEvaluator.cpp \
BrainScript/BrainScriptParser.cpp \
BrainScript/BrainScriptTest.cpp \
MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp \

CNTK_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC))

ifdef CUDA_PATH
CNTK_SRC +=\
MachineLearning/SequenceTraining/cudalatticeops.cu \

else
CNTK_SRC +=\
MachineLearning/SequenceTraining/NoGPU.cpp \

endif

CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC)))

CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)

@@ -4481,7 +4481,7 @@ template<class ElemType>
__global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *error, const ElemType *label,
const ElemType *dnnoutput, const ElemType *gamma, ElemType alpha, const long N)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
error[id] -= alpha * (label[id] - (1.0 - hsmoothingWeight)*dnnoutput[id] - hsmoothingWeight * gamma[id]);