Port the Sequence training code to Linux

Amit 2015-09-25 17:22:58 -07:00
Parent d1d6cca696
Commit 47cb3ce1bd
17 changed files with 137 additions and 205 deletions

View file

@ -96,6 +96,7 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#include <unistd.h>
#include <sys/stat.h>
#include <dlfcn.h>
#include <sys/time.h>
typedef unsigned char byte;
#endif
@ -188,29 +189,43 @@ static inline void Sleep (size_t ms) { std::this_thread::sleep_for (std::chrono:
namespace msra { namespace basetypes {
//sequence training
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer(const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
if (!QueryPerformanceFrequency(&freq)) // count ticks per second
throw std::runtime_error("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter(&start);
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
QueryPerformanceCounter(&end);
return (end.QuadPart - start.QuadPart) / (double)freq.QuadPart;
}
void show(const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0);
}
};
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer (const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / (1000.0 * 1000.0); // floating-point division, otherwise sub-second intervals truncate to 0
#endif
}
void show (const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};
// class ARRAY -- std::vector with array-bounds checking
// VS 2008 and above do this, so there is no longer a need for this.
@ -1128,6 +1143,7 @@ static inline bool comparator(const pair<int, F>& l, const pair<int, F>& r)
return l.second > r.second;
}
#ifdef _WIN32
//sequence training
// ----------------------------------------------------------------------------
// frequently missing Win32 functions
@ -1144,4 +1160,5 @@ static inline std::wstring FormatWin32Error(DWORD error)
if (last != std::string::npos) res.erase(last + 1, res.length());
return res;
}
#endif // _WIN32
#endif // _BASETYPES_
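
The new auto_timer reads wall-clock time from QueryPerformanceCounter on Windows and gettimeofday on Linux. A minimal standalone sketch of the same technique follows (class and function names here are illustrative, not taken from this commit); on compilers with full C++11 support, std::chrono::steady_clock would avoid the platform #ifdefs entirely:

#include <cstdio>
#include <stdexcept>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif

class elapsed_timer                 // illustrative portable timer
{
#ifdef _WIN32
    LARGE_INTEGER freq, start;
#else
    timeval start;
#endif
public:
    elapsed_timer()
    {
#ifdef _WIN32
        if (!QueryPerformanceFrequency (&freq))   // ticks per second
            throw std::runtime_error ("elapsed_timer: QueryPerformanceFrequency failure");
        QueryPerformanceCounter (&start);
#else
        gettimeofday (&start, NULL);
#endif
    }
    double seconds() const          // elapsed wall-clock time since construction, in seconds
    {
#ifdef _WIN32
        LARGE_INTEGER end;
        QueryPerformanceCounter (&end);
        return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#else
        timeval end;
        gettimeofday (&end, NULL);
        // divide tv_usec in floating point, otherwise sub-second intervals truncate to 0
        return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1e6;
#endif
    }
};

// usage: elapsed_timer t; dowork(); fprintf (stderr, "%.6f ms\n", t.seconds() * 1000.0);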

View file

@ -25,7 +25,7 @@ static void checkoverflow (size_t fieldval, size_t targetval, const char * field
if (fieldval != targetval)
{
char buf[1000];
sprintf_s (buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
snprintf(buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
throw std::runtime_error (buf);
}
}
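
The sprintf_s call above becomes snprintf, the standard bounded formatter: it writes at most the given buffer size (always NUL-terminating) and returns the length the full string would have had, so truncation is detectable. A minimal illustrative check, not part of this commit:

#include <cstddef>
#include <cstdio>

static bool format_fits (const char * fieldname)    // hypothetical helper
{
    char buf[32];
    int n = snprintf (buf, sizeof(buf), "lattice: bit field %s too small", fieldname);
    // negative return: encoding error; n >= sizeof(buf): output was truncated
    return n >= 0 && (std::size_t) n < sizeof(buf);
}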

View file

@ -15,6 +15,7 @@
#include <unordered_map>
#include <algorithm> // for find()
#include "simple_checked_arrays.h"
#include <limits.h>
namespace msra { namespace asr {

View file

@ -491,9 +491,9 @@ template<class ElemType>
const bool useCVSetControlLRIfCVExists,
const bool useEvalCriterionControlLR,
const size_t minibatchSearchCriterionErrorMargin,
const ElemType hsmoothingWeight = 1.0,
const ElemType frameDropThresh = 1e-10,
const bool doreferencealign = false)
const ElemType hsmoothingWeight,
const ElemType frameDropThresh,
const bool doreferencealign)
{
m_numPrevLearnRates = numPrevLearnRates;
m_prevChosenMinibatchSize = 0;
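
The hunk above removes the default arguments (hsmoothingWeight = 1.0, frameDropThresh = 1e-10, doreferencealign = false) from this parameter list, presumably so each default is specified in exactly one declaration: C++ forbids repeating a default argument in a later declaration of the same function, and GCC enforces this. A minimal illustration with hypothetical names:

// declaration supplies the defaults (e.g. in a header)
void Train (double hsmoothingWeight = 1.0, double frameDropThresh = 1e-10, bool doreferencealign = false);

// any other declaration, including the definition, must not repeat them
void Train (double hsmoothingWeight, double frameDropThresh, bool doreferencealign)
{
    // ...
}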

View file

View file

@ -111,36 +111,4 @@ public:
cudaArray * get() const { return a; }
};
// using a cudaarrayref
// Pattern:
// - do not declare the texture as an argument to the kernel, instead:
// - at file scope:
// texture<float, 2, cudaReadModeElementType> texref;
// - right before kernel launch:
// passtextureref texref (texref, cudaarrayref); // use the same name as that global texref one, so it will match the name inside the kernel
class passtextureref
{
textureReference & texref; // associated texture reference if any
public:
template<typename R,class T>
passtextureref (R texref, cudaarrayref<T> cudaarrayref) : texref (texref)
{
texref.addressMode[0] = cudaAddressModeWrap;
texref.addressMode[1] = cudaAddressModeWrap;
texref.filterMode = cudaFilterModePoint;
texref.normalized = false;
cudaError_t rc = cudaBindTextureToArray (texref, cudaarrayref.get());
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "passtextureref: %s (cuda error %d)", cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}
~passtextureref()
{
cudaUnbindTexture (&texref);
}
};
};};

View file

@ -8,8 +8,7 @@
#include <assert.h>
#include <math.h>
#include <vector>
#include <hash_set>
using namespace stdext;
#include <unordered_set>
namespace msra { namespace cuda {
@ -265,7 +264,7 @@ public:
class Devices
{
DeviceInfo* deviceInfo[deviceMax];
hash_set<const float *> pinnedBuffers;
std::unordered_set<const float *> pinnedBuffers;
public:
Devices()
{
@ -320,7 +319,7 @@ public:
// Is the buffer passed in pinned?
bool IsPinned(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// see if we found the pointer or not
return (found != pinnedBuffers.end());
}
@ -340,7 +339,7 @@ public:
// WARNING: Unpin operations do a CPU sync
void UnpinBuffer(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// if we didn't find the buffer, exit
if (found == pinnedBuffers.end())
return;
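
stdext::hash_set is a pre-standard Microsoft container that GCC's libstdc++ does not provide; std::unordered_set from <unordered_set> is the standard replacement with the same average O(1) insert/find, which is why the pinnedBuffers lookups above change only in the type name. A small sketch of the same bookkeeping pattern (the helper class name is hypothetical):

#include <unordered_set>

class pinnedbufferset                  // hypothetical helper mirroring pinnedBuffers above
{
    std::unordered_set<const float *> pinned;
public:
    void pin (const float * p)            { pinned.insert (p); }
    bool ispinned (const float * p) const { return pinned.find (p) != pinned.end(); }
    void unpin (const float * p)          { pinned.erase (p); }
};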

View file

@ -29,7 +29,7 @@ template<typename VECTORTYPE,typename OPSTYPE> class vectorbaseimpl :
{
typedef typename VECTORTYPE::elemtype elemtype; // (for convenience)
size_t capacity; // amount of allocated storage (like capacity() vs. vectorref::n = size())
void release() { ondevice no (deviceid); free (reset (NULL, 0)); }
void release() { ondevice no (deviceid); free (this->reset (NULL, 0)); }
public:
vectorbaseimpl() : capacity (0) { }
~vectorbaseimpl() { release(); }
@ -49,29 +49,29 @@ public:
ondevice no (deviceid); // switch to desired CUDA card
cuda_ptr<elemtype> pnew = malloc<elemtype> (sz); // allocate memory inside CUDA device (or throw)
capacity = sz; // if succeeded then: remember
cuda_ptr<elemtype> p = reset (pnew, sz); // and swap the pointers and update n
cuda_ptr<elemtype> p = this->reset (pnew, sz); // and swap the pointers and update n
free (p); // then release the old one
}
else // not growing: keep same allocation
reset (get(), sz);
this->reset (this->get(), sz);
}
size_t size() const throw() { return vectorref::size(); }
size_t size() const throw() { return vectorref<elemtype>::size(); }
void assign (const elemtype * p, size_t nelem, bool synchronize)
{
allocate (nelem); // assign will resize the target appropriately
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (get(), 0, p, nelem);
memcpy (this->get(), 0, p, nelem);
if (synchronize)
join();
}
void fetch (typename elemtype * p, size_t nelem, bool synchronize) const
void fetch (elemtype * p, size_t nelem, bool synchronize) const
{
if (nelem != size()) // fetch() cannot resize the target; caller must do that
throw std::logic_error ("fetch: vector size mismatch");
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (p, get(), 0, nelem);
memcpy (p, this->get(), 0, nelem);
if (synchronize)
join();
};
@ -176,12 +176,13 @@ class latticefunctionsimpl : public vectorbaseimpl<latticefunctions,latticefunct
{
ondevice no (deviceid);
matrixref<float> dengammasMatrixRef = tomatrixref(dengammas);
latticefunctionsops::mmierrorsignal (dynamic_cast<const vectorbaseimpl<ushortvector, vectorref<unsigned short>> &> (alignstateids),
dynamic_cast<const vectorbaseimpl<uintvector, vectorref<unsigned int>> &> (alignoffsets),
dynamic_cast<const vectorbaseimpl<edgeinfowithscoresvector, vectorref<msra::lattices::edgeinfowithscores>> &> (edges),
dynamic_cast<const vectorbaseimpl<nodeinfovector, vectorref<msra::lattices::nodeinfo>> &> (nodes),
dynamic_cast<const vectorbaseimpl<doublevector, vectorref<double>> &> (logpps),
tomatrixref (dengammas));
dengammasMatrixRef);
}
void stateposteriors (const ushortvector & alignstateids, const uintvector & alignoffsets,
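
The added this-> qualifications (and the explicit vectorref<elemtype>::size() call) are required because reset(), get() and size() live in a base class that depends on the template parameters: under the standard two-phase name lookup that GCC implements, unqualified names are not looked up in dependent base classes, while the Visual C++ compiler of that era accepted them. A minimal illustration of the rule, with made-up names:

template<typename T>
struct base
{
    T * get() const { return nullptr; }   // stand-in for the vectorref members
};

template<typename T>
struct derived : base<T>
{
    T * first() const
    {
        // return get();            // rejected by GCC/Clang: 'get' is in a dependent base class
        return this->get();         // OK: lookup deferred until instantiation
        // return base<T>::get();   // also OK: explicit qualification
    }
};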

View file

@ -14,7 +14,7 @@ namespace msra { namespace cuda {
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}

View file

@ -16,11 +16,21 @@
#include "math.h"
#include <assert.h>
#include <stdexcept>
#ifdef _WIN32
#include <windows.h> // for timer
#endif
#if __unix__
#include <sys/time.h>
#endif
namespace msra { namespace cuda {
// auto_timer timer; run(); double seconds = timer; // now can abandon the object
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;
@ -28,15 +38,26 @@ namespace msra { namespace cuda {
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / (1000.0 * 1000.0); // floating-point division, otherwise sub-second intervals truncate to 0
#endif
}
void show (const std::string & msg) const
{
@ -44,7 +65,7 @@ namespace msra { namespace cuda {
fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};
// -----------------------------------------------------------------------
// edgealignment --do alignment on a per edge level, only support normal left to right hmms and ergodic silence hmm
// output alignresult

View file

@ -5,7 +5,6 @@
#define _CRT_SECURE_NO_WARNINGS 1 // so we can use getenv()...
#include <Windows.h> // for the Mutex
#include <cuda_runtime_api.h> // for CUDA API
#include <cuda.h> // for device API
#include "cudalib.h"
@ -30,7 +29,7 @@ static void operator|| (cudaError_t rc, const char * msg)
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}
@ -41,7 +40,7 @@ static void operator|| (CUresult rc, const char * msg)
if (rc != CUDA_SUCCESS)
{
char buf[1000];
sprintf_s (buf, "%s: cuda API error %d", msg, rc);
snprintf(buf, sizeof(buf), "%s: cuda API error %d", msg, rc);
throw std::runtime_error (buf);
}
}
@ -71,8 +70,8 @@ public:
assert (cuContext == cuContextDummy);
// show some info to the user
char namebuf[1024] = { 0 };
cuDeviceGetName (&namebuf[0], _countof (namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", physicaldeviceid, namebuf);
cuDeviceGetName (&namebuf[0], sizeof(namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", (int)physicaldeviceid, namebuf);
#endif
}
// cast this to the CUcontext for use with CUDA functions
@ -110,102 +109,9 @@ cudaStream_t GetCurrentStream() { return cudaStreamDefault; }
cudaEvent_t GetCurrentEvent() {return GetEvent(GetCurrentDevice());}
Devices g_devices; // one global device pool
// try to acquire a device exclusively; managed through this library's private lock mechanism (i.e. not through CUDA APIs)
static bool lockdevicebymutex (int physicaldeviceid)
{
wchar_t buffer[80];
wsprintf (buffer, L"Global\\DBN.exe GPGPU exclusive lock for device %d", physicaldeviceid);
// we actually use a Windows-wide named mutex
HANDLE h = ::CreateMutex (NULL/*security attr*/, TRUE/*bInitialOwner*/, buffer);
DWORD res = ::GetLastError();
if (h == NULL) // failure --this should not really happen
{
if (res == ERROR_ACCESS_DENIED) // no access: already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex access denied, assuming already locked '%S'\n", buffer);
return false;
}
fprintf (stderr, "lockdevicebymutex: failed to create '%S': %d\n", buffer, res);
throw std::runtime_error ("lockdevicebymutex: unexpected failure");
}
// got a handle
if (res == 0) // no error
{
fprintf (stderr, "lockdevicebymutex: created and acquired mutex '%S'\n", buffer);
return true;
}
// failure with handle --remember to release the handle
::CloseHandle (h);
if (res == ERROR_ALREADY_EXISTS) // already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex already locked '%S'\n", buffer);
return false;
}
else if (res != 0)
{
fprintf (stderr, "lockdevicebymutex: unexpected error from CreateMutex() when attempting to create and acquire mutex '%S': %d\n", buffer, res);
throw std::logic_error ("lockdevicebymutex: unexpected failure");
}
return false;
}
// initialize CUDA system
void lazyinit()
{
#if 0
if (devicesallocated >= 0) return;
int numphysicaldevices = 0;
cudaGetDeviceCount (&numphysicaldevices) || "cudaGetDeviceCount failed";
fprintf (stderr, "lazyinit: %d physical CUDA devices detected\n", numphysicaldevices);
#ifndef NOMULTIDEVICE
// we can emulate a larger number of GPUs than actually present, for dev purposes
int oversubscribe = 1;
const char * oversubscribevar = getenv ("DBNOVERSUBSCRIBEGPUS");
if (oversubscribevar)
oversubscribe = atoi (oversubscribevar);
const int numdevices = numphysicaldevices * oversubscribe;
// number of devices
// environment variable DBNMAXGPUS
// - 0: use all, exclusively
// - >0: limit to this number, exclusively --default is 1
// The number of available devices includes the emulated one by means of DBNOVERSUBSCRIBEGPUS
// - <0: use this number but bypassing the exclusive check, for debugging/quick stuff
int devlimit = 1;
bool exclusive = true;
const char * devlimitvar = getenv ("DBNMAXGPUS");
if (devlimitvar)
devlimit = atoi (devlimitvar);
if (devlimit < 0)
{
devlimit = -devlimit;
exclusive = false; // allow non-exclusive use
}
if (devlimit == 0)
devlimit = INT_MAX;
// initialize CUDA device API
cuInit (0) || "cuInit failed";
// initialize the system
devicesallocated = 0;
for (int deviceid = 0; deviceid < numdevices && devicesallocated < devlimit; deviceid++) // loop over all physical devices
{
// check if device is available by trying to lock it
bool available = !exclusive || lockdevicebymutex (deviceid); // try to acquire the lock
if (!available) // not available: don't use it
{
fprintf (stderr, "CUDA device %d already in use, skipping\n", deviceid);
continue;
}
// OK to allocate
const int physicaldeviceid = deviceid % numphysicaldevices; // not the same in case of DBNOVERSUBSCRIBEGPUS > 1
cudadevicecontexts[devicesallocated].init (physicaldeviceid);
devicesallocated++;
}
fprintf (stderr, "using %d on %d physically present CUDA devices%s\n", devicesallocated, numphysicaldevices, exclusive ? " exclusively" : "");
#else
devicesallocated = 1;
#endif
#endif
}
void initwithdeviceid(size_t deviceid)
@ -214,7 +120,7 @@ void initwithdeviceid(size_t deviceid)
devicesallocated = 0;
cudadevicecontexts[devicesallocated].init(deviceid);
devicesallocated++;
fprintf(stderr, "using CUDA devices%d \n", deviceid);
fprintf(stderr, "using CUDA devices%d \n", (int)deviceid);
}
// get number of devices
@ -244,7 +150,7 @@ void setdevicecontext (size_t deviceid)
#ifndef NOMULTIDEVICE
//if (currentcudadevicecontext != NULL)
// throw std::logic_error ("setdevicecontext: a device context has already been set --??");
if (deviceid >= _countof (cudadevicecontexts))
if (deviceid >= (sizeof(cudadevicecontexts) / sizeof(cudadevicecontext)))
throw std::logic_error ("setdevicecontext: device id exceeds size of fixed-size array cudadevicecontexts[]");
cudadevicecontext & c = cudadevicecontexts[deviceid];
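
_countof is a Microsoft-specific macro, so the bounds check above falls back to the plain sizeof ratio. A type-safe alternative that works on both compilers is a constexpr array-extent function, sketched here as an assumption rather than something this commit adds:

#include <cstddef>

// number of elements in a built-in array; does not compile if given a pointer
template<typename T, std::size_t N>
constexpr std::size_t countof (const T (&)[N]) { return N; }

// usage: if (deviceid >= countof (cudadevicecontexts)) throw std::logic_error (...);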

View file

@ -1,6 +1,6 @@
#pragma once
#include <hash_map>
#include <unordered_map>
#include "simplesenonehmm.h"
#include "latticearchive.h"
#include "latticesource.h"

View file

@ -2,21 +2,22 @@
//
// F. Seide, V-hansu
#pragma once
#include "basetypes.h"
#include "simple_checked_arrays.h"
#include "latticearchive.h"
#include "simplesenonehmm.h" // the model
#include "ssematrix.h" // the matrices
#include "latticestorage.h"
#include <hash_map>
#include <unordered_map>
#include <list>
#undef PRINT_TIME_MEASUREMENT // [v-hansu]
#define VIRGINLOGZERO (10 * LOGZERO) // used for printing statistics on unseen states
#undef CPU_VERIFICATION
#ifdef _WIN32
int msra::numa::node_override = -1; // for numahelpers.h
#endif
namespace msra { namespace lattices {
@ -26,7 +27,7 @@ namespace msra { namespace lattices {
class littlematrixheap
{
static const size_t CHUNKSIZE = 256*1024; // 1 MB
static const size_t CHUNKSIZE;
typedef msra::math::ssematrixfrombuffer matrixfrombuffer;
std::list<std::vector<float>> heap;
size_t allocatedinlast; // in last heap element
@ -64,6 +65,8 @@ public:
}
};
const size_t littlematrixheap::CHUNKSIZE = 256*1024; // 1 MB
// ---------------------------------------------------------------------------
// helpers for log-domain addition
// ---------------------------------------------------------------------------
@ -183,7 +186,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
const float acLL = logLLs(s,t);
if (islogzero (acLL))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) ac score(%d,%d) is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int) s, (int) t,
(int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
const float betajt = logbetas(j,t); // sum over all paths exiting from (j,t) to end
@ -211,7 +214,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
bwscore = exitscore;
if (islogzero (bwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) bw score is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
te = ts;
je = js;
@ -292,17 +295,17 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
// These cases must be handled separately. If the whole path is 0 (0 prob is on the only path at some point) then skip the lattice.
if (islogzero (totalbwscore) ^ islogzero (totalfwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw 0 score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
if (islogzero (totalbwscore))
{
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}
if (fabsf (totalfwscore - totalbwscore) / ts > 1e-4f)
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
// we return the full path score
return totalfwscore;
@ -414,7 +417,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
if (islogzero (fwscore))
{
fprintf (stderr, "alignedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}
@ -746,7 +749,7 @@ double lattice::forwardbackwardlatticesMBR (const std::vector<float> & edgeacsco
// TODO: we will later have code that adds this path if needed
size_t oracleframeacc = maxcorrect.back();
if (oracleframeacc != info.numframes)
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (unsigned int) info.numframes);
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (int) info.numframes);
// backward pass and computation of state-conditioned frames-correct count
for (size_t j = edges.size() -1; j+1 > 0; j--)
@ -897,7 +900,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
}
}
if (minlogpp > LOGZERO)
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", countskip, edges.size());
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", (int)countskip, (int)edges.size());
}
const double abcsallocatedur = timerabcsallocate;
@ -1048,7 +1051,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
for (size_t t = ts; t < te; t++)
{
if (thisedgealignments[j][t-ts] != thisedgealignmentsgpu[j][t-ts])
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, t-ts, thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, (int)(t-ts), thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
}
}
}
@ -1284,7 +1287,7 @@ void lattice::mmierrorsignal (parallelstate & parallelstate, double minlogpp, co
// TODO: count VIRGINLOGZERO, print per frame
}
if (fabs (logsum) / errorsignal.rows() > 1e-6)
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", t, logsum);
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", (int)t, logsum);
}
fprintf (stderr, "forwardbackward: %.3f%% non-zero state posteriors\n", 100.0f - nonzerostates * 100.0f / errorsignal.rows() / errorsignal.cols());
@ -1441,7 +1444,7 @@ void sMBRdiagnostics (const msra::math::ssematrixbase & errorsignal, const_array
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRdiagnostics: %d frames correct out of %d, %.2f%%, neg better in %d, pos in %d\n",
(int) numcor, errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numcor, (int)errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numnegbetter, (int) numposbetter);
}
@ -1471,7 +1474,7 @@ void sMBRsuppressweirdstuff (msra::math::ssematrixbase & errorsignal, const_arra
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRsuppressweirdstuff: %d weird frames out of %d, %.2f%% were flattened\n",
(int) numweird, errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
(int) numweird, (int) errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
}
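
Most changes in this file cast size_t values to int before printing them with %d: on 64-bit Linux size_t is unsigned long, so passing it to %d is undefined behaviour and trips -Werror format checking. The C99/C++11 %zu length modifier prints size_t directly and is an alternative where the runtime supports it (the Visual C++ runtimes of that era likely did not, which would explain the casts); a small illustration:

#include <cstddef>
#include <cstdio>

static void report (std::size_t countskip, std::size_t total)
{
    fprintf (stderr, "%d of %d edges pruned\n", (int) countskip, (int) total);  // cast to match %d
    fprintf (stderr, "%zu of %zu edges pruned\n", countskip, total);            // or print size_t directly
}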

View file

@ -14,6 +14,7 @@
#pragma push_macro ("atomicCAS")
#include "latticestorage.h"
#include <limits>
namespace msra { namespace cuda { class passtextureref; }}

View file

@ -481,21 +481,21 @@ namespace msra { namespace lattices {
template<class edgestype, class nodestype, class aligntype, class edgealignments, class backpointers>
void setutterancedata (const edgestype & edges, const nodestype & nodes, const aligntype & align,
const msra::math::ssematrixbase & logLLs, std::vector<float> & edgeacscores,
edgealignments & edgealignments, backpointers & backpointers)
edgealignments & edgeAlignments, backpointers & backPointers)
{
// lattice data
edgesgpu->assign (edges, false);
nodesgpu->assign (nodes, false);
aligngpu->assign (align, false);
alignoffsetsgpu->assign (edgealignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backpointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backpointers.getbackptroffsets(), false);
alignoffsetsgpu->assign (edgeAlignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backPointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backPointers.getbackptroffsets(), false);
#ifndef PARALLEL_SIL
alignresult->assign (edgealignments.getalignmentsbuffer(), false);
alignresult->assign (edgeAlignments.getalignmentsbuffer(), false);
edgeacscoresgpu->assign (edgeacscores, false);
#else
alignresult->allocate (edgealignments.getalignbuffersize());
alignresult->allocate (edgeAlignments.getalignbuffersize());
edgeacscoresgpu->allocate (edges.size());
edgeacscores; // reference to make compiler happy
#endif
@ -516,9 +516,9 @@ namespace msra { namespace lattices {
loglls = *errorsignalgpu;
}
template<class edgealignments>
void copyalignments (edgealignments & edgealignments)
void copyalignments (edgealignments & edgeAlignments)
{
alignresult->fetch(edgealignments.getalignmentsbuffer(), true);
alignresult->fetch(edgeAlignments.getalignmentsbuffer(), true);
}
// [v-hansu] allocate memory for vectors relating to forward-backward
@ -735,7 +735,7 @@ namespace msra { namespace lattices {
if (!parallelstate->emulation)
{
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", batchsizeforward.size(), batchsizebackward.size());
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);
@ -799,7 +799,7 @@ namespace msra { namespace lattices {
if (rc != CUBLAS_STATUS_SUCCESS)
{
char buf[1000];
sprintf_s(buf, "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
sprintf_s(buf, sizeof(buf), "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
throw std::runtime_error(buf);
}
}
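
The renames above (edgealignments to edgeAlignments, backpointers to backPointers) are needed because the parameters reused the names of the enclosing template's type parameters; the standard forbids redeclaring a template parameter's name within its scope, which GCC enforces and the Visual C++ compiler of that era did not. A minimal illustration with made-up names:

template<class edgealignments>
void setdata (edgealignments & edgealignments);   // ill-formed: parameter redeclares the template parameter name

template<class edgealignments>
void setdata (edgealignments & edgeAlignments);   // OK: parameter name is distinct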

View file

@ -50,7 +50,7 @@ endif
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
CXX = mpic++
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib MachineLearning/SequenceTraining BrainScript
CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror -Wno-error=literal-suffix
LIBPATH:=
@ -377,12 +377,27 @@ CNTK_SRC =\
MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
MachineLearning/CNTKSGDLib/Profiler.cpp \
MachineLearning/CNTKSGDLib/SGD.cpp \
MachineLearning/SequenceTraining/cudalattice.cpp \
MachineLearning/SequenceTraining/cudalib.cpp \
MachineLearning/SequenceTraining/latticeforwardbackward.cpp \
MachineLearning/SequenceTraining/parallelforwardbackward.cpp \
BrainScript/BrainScriptEvaluator.cpp \
BrainScript/BrainScriptParser.cpp \
BrainScript/BrainScriptTest.cpp \
MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp \
CNTK_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC))
ifdef CUDA_PATH
CNTK_SRC +=\
MachineLearning/SequenceTraining/cudalatticeops.cu \
else
CNTK_SRC +=\
MachineLearning/SequenceTraining/NoGPU.cpp \
endif
CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC)))
CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)

View file

@ -4481,7 +4481,7 @@ template<class ElemType>
__global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *error, const ElemType *label,
const ElemType *dnnoutput, const ElemType *gamma, ElemType alpha, const long N)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
error[id] -= alpha * (label[id] - (1.0 - hsmoothingWeight)*dnnoutput[id] - hsmoothingWeight * gamma[id]);