Port the Sequence training code to Linux
This commit is contained in:
Parent: d1d6cca696
Commit: 47cb3ce1bd
@@ -96,6 +96,7 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#include <unistd.h>
#include <sys/stat.h>
#include <dlfcn.h>
#include <sys/time.h>
typedef unsigned char byte;
#endif

@@ -188,29 +189,43 @@ static inline void Sleep (size_t ms) { std::this_thread::sleep_for (std::chrono:
namespace msra { namespace basetypes {

//sequence training
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer(const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
if (!QueryPerformanceFrequency(&freq)) // count ticks per second
throw std::runtime_error("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter(&start);
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
QueryPerformanceCounter(&end);
return (end.QuadPart - start.QuadPart) / (double)freq.QuadPart;
}
void show(const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0);
}
};
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;
auto_timer (const auto_timer &); void operator= (const auto_timer &);
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000);
#endif
}
void show (const std::string & msg) const
{
double elapsed = *this;
fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};

// class ARRAY -- std::vector with array-bounds checking
// VS 2008 and above do this, so there is no longer a need for this.
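This hunk keeps the Win32 QueryPerformanceCounter timer and adds a gettimeofday branch behind `#ifdef __unix__`. For reference only (not part of the commit; names are illustrative), the same elapsed-time pattern can be written on std::chrono with no platform split, which also sidesteps the integer division in the `tv_usec` term above that truncates the sub-second part:

```cpp
#include <chrono>
#include <cstdio>
#include <string>

// Minimal sketch of the same timer pattern on std::chrono (illustrative only).
// steady_clock is monotonic, so no Win32/POSIX branching is needed.
class chrono_timer
{
    std::chrono::steady_clock::time_point start;
public:
    chrono_timer() : start (std::chrono::steady_clock::now()) { }
    operator double() const    // elapsed seconds since construction
    {
        return std::chrono::duration<double> (std::chrono::steady_clock::now() - start).count();
    }
    void show (const std::string & msg) const
    {
        double elapsed = *this;
        fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0);
    }
};
```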
@@ -1128,6 +1143,7 @@ static inline bool comparator(const pair<int, F>& l, const pair<int, F>& r)
return l.second > r.second;
}

#ifdef _WIN32
//sequence training
// ----------------------------------------------------------------------------
// frequently missing Win32 functions

@@ -1144,4 +1160,5 @@ static inline std::wstring FormatWin32Error(DWORD error)
if (last != std::string::npos) res.erase(last + 1, res.length());
return res;
}
#endif // _WIN32
#endif // _BASETYPES_

@@ -25,7 +25,7 @@ static void checkoverflow (size_t fieldval, size_t targetval, const char * field
if (fieldval != targetval)
{
char buf[1000];
sprintf_s (buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
snprintf(buf, sizeof(buf), "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, (unsigned int)targetval, (unsigned int)fieldval);
throw std::runtime_error (buf);
}
}

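Several hunks in this commit make the same substitution: the MSVC-specific sprintf_s becomes snprintf with an explicit buffer size before the message is thrown. As a standalone sketch (not part of the commit; the helper name is hypothetical), the recurring format-and-throw pattern looks like this:

```cpp
#include <cstdarg>
#include <cstdio>
#include <stdexcept>

// Illustrative helper only: format a message portably with vsnprintf and throw it.
static void throwformatted (const char * fmt, ...)
{
    char buf[1000];
    va_list args;
    va_start (args, fmt);
    std::vsnprintf (buf, sizeof (buf), fmt, args);   // truncates safely if the message is long
    va_end (args);
    throw std::runtime_error (buf);
}
```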
@@ -15,6 +15,7 @@
#include <unordered_map>
#include <algorithm> // for find()
#include "simple_checked_arrays.h"
#include <limits.h>

namespace msra { namespace asr {

@@ -491,9 +491,9 @@ template<class ElemType>
const bool useCVSetControlLRIfCVExists,
const bool useEvalCriterionControlLR,
const size_t minibatchSearchCriterionErrorMargin,
const ElemType hsmoothingWeight = 1.0,
const ElemType frameDropThresh = 1e-10,
const bool doreferencealign = false)
const ElemType hsmoothingWeight,
const ElemType frameDropThresh,
const bool doreferencealign)
{
m_numPrevLearnRates = numPrevLearnRates;
m_prevChosenMinibatchSize = 0;

@@ -111,36 +111,4 @@ public:
cudaArray * get() const { return a; }
};

// using a cudaarrayref
// Pattern:
// - do not declare the texture as an argument to the kernel, instead:
// - at file scope:
// texture<float, 2, cudaReadModeElementType> texref;
// - right before kernel launch:
// passtextureref texref (texref, cudaarrayref); // use the same name as that global texref one, so it will match the name inside the kernel
class passtextureref
{
textureReference & texref; // associated texture reference if any
public:
template<typename R,class T>
passtextureref (R texref, cudaarrayref<T> cudaarrayref) : texref (texref)
{
texref.addressMode[0] = cudaAddressModeWrap;
texref.addressMode[1] = cudaAddressModeWrap;
texref.filterMode = cudaFilterModePoint;
texref.normalized = false;
cudaError_t rc = cudaBindTextureToArray (texref, cudaarrayref.get());
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "passtextureref: %s (cuda error %d)", cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}
~passtextureref()
{
cudaUnbindTexture (&texref);
}
};

};};

@@ -8,8 +8,7 @@
#include <assert.h>
#include <math.h>
#include <vector>
#include <hash_set>
using namespace stdext;
#include <unordered_set>

namespace msra { namespace cuda {

@@ -265,7 +264,7 @@ public:
class Devices
{
DeviceInfo* deviceInfo[deviceMax];
hash_set<const float *> pinnedBuffers;
std::unordered_set<const float *> pinnedBuffers;
public:
Devices()
{

@@ -320,7 +319,7 @@ public:
// Is the buffer passed in pinned?
bool IsPinned(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// see if we found the pointer or not
return (found != pinnedBuffers.end());
}

@@ -340,7 +339,7 @@ public:
// WARNING: Unpin operations do a CPU sync
void UnpinBuffer(const float *bufHost)
{
hash_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
std::unordered_set<const float *>::iterator found = pinnedBuffers.find(bufHost);
// if we didn't find the buffer, exit
if (found == pinnedBuffers.end())
return;

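The stdext::hash_set lookups above become std::unordered_set, which is the standard C++11 container available on both MSVC and GCC. A minimal standalone sketch of the same membership test (illustrative, not CNTK code); with C++11, count() can also replace the explicit iterator comparison:

```cpp
#include <unordered_set>

std::unordered_set<const float *> pinnedBuffers;   // host buffers registered as pinned

bool IsPinned (const float * bufHost)
{
    return pinnedBuffers.count (bufHost) != 0;     // same test as find() != end()
}
```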
@@ -29,7 +29,7 @@ template<typename VECTORTYPE,typename OPSTYPE> class vectorbaseimpl :
{
typedef typename VECTORTYPE::elemtype elemtype; // (for convenience)
size_t capacity; // amount of allocated storage (like capacity() vs. vectorref::n = size())
void release() { ondevice no (deviceid); free (reset (NULL, 0)); }
void release() { ondevice no (deviceid); free (this->reset (NULL, 0)); }
public:
vectorbaseimpl() : capacity (0) { }
~vectorbaseimpl() { release(); }

@@ -49,29 +49,29 @@ public:
ondevice no (deviceid); // switch to desired CUDA card
cuda_ptr<elemtype> pnew = malloc<elemtype> (sz); // allocate memory inside CUDA device (or throw)
capacity = sz; // if succeeded then: remember
cuda_ptr<elemtype> p = reset (pnew, sz); // and swap the pointers and update n
cuda_ptr<elemtype> p = this->reset (pnew, sz); // and swap the pointers and update n
free (p); // then release the old one
}
else // not growing: keep same allocation
reset (get(), sz);
this->reset (this->get(), sz);
}
size_t size() const throw() { return vectorref::size(); }
size_t size() const throw() { return vectorref<elemtype>::size(); }
void assign (const elemtype * p, size_t nelem, bool synchronize)
{
allocate (nelem); // assign will resize the target appropriately
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (get(), 0, p, nelem);
memcpy (this->get(), 0, p, nelem);
if (synchronize)
join();
}
void fetch (typename elemtype * p, size_t nelem, bool synchronize) const
void fetch (elemtype * p, size_t nelem, bool synchronize) const
{
if (nelem != size()) // fetch() cannot resize the target; caller must do that
throw std::logic_error ("fetch: vector size mismatch");
ondevice no (deviceid); // switch to desired CUDA card
if (nelem > 0)
memcpy (p, get(), 0, nelem);
memcpy (p, this->get(), 0, nelem);
if (synchronize)
join();
};

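The `reset`/`get` calls above gain a `this->` qualifier because GCC and Clang enforce two-phase name lookup: members of a dependent base class are not found by unqualified lookup inside a class template, even though MSVC accepts it. A standalone illustration (not CNTK code; the names are simplified stand-ins):

```cpp
#include <cstddef>

template<typename T> struct basevec { void reset (T *, std::size_t) { } };

template<typename T> struct derivedvec : basevec<T>
{
    void release()
    {
        // reset (nullptr, 0);      // error on GCC/Clang: 'reset' was not declared in this scope
        this->reset (nullptr, 0);   // OK: lookup is deferred to instantiation time
    }
};

int main() { derivedvec<float>().release(); }
```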
@@ -176,12 +176,13 @@ class latticefunctionsimpl : public vectorbaseimpl<latticefunctions,latticefunct
{
ondevice no (deviceid);

matrixref<float> dengammasMatrixRef = tomatrixref(dengammas);
latticefunctionsops::mmierrorsignal (dynamic_cast<const vectorbaseimpl<ushortvector, vectorref<unsigned short>> &> (alignstateids),
dynamic_cast<const vectorbaseimpl<uintvector, vectorref<unsigned int>> &> (alignoffsets),
dynamic_cast<const vectorbaseimpl<edgeinfowithscoresvector, vectorref<msra::lattices::edgeinfowithscores>> &> (edges),
dynamic_cast<const vectorbaseimpl<nodeinfovector, vectorref<msra::lattices::nodeinfo>> &> (nodes),
dynamic_cast<const vectorbaseimpl<doublevector, vectorref<double>> &> (logpps),
tomatrixref (dengammas));
dengammasMatrixRef);
}

void stateposteriors (const ushortvector & alignstateids, const uintvector & alignoffsets,

@@ -14,7 +14,7 @@ namespace msra { namespace cuda {
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: launch failure: %s (cuda error %d)", fn, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}

@@ -16,11 +16,21 @@
#include "math.h"
#include <assert.h>
#include <stdexcept>

#ifdef _WIN32
#include <windows.h> // for timer
#endif

#if __unix__
#include <sys/time.h>
#endif

namespace msra { namespace cuda {

// auto_timer timer; run(); double seconds = timer; // now can abandon the object
// auto_timer timer; run(); double seconds = timer; // now can abandon the object
#ifdef __unix__
typedef timeval LARGE_INTEGER;
#endif
class auto_timer
{
LARGE_INTEGER freq, start;

@@ -28,15 +38,26 @@ namespace msra { namespace cuda {
public:
auto_timer()
{
#ifdef _WIN32
if (!QueryPerformanceFrequency (&freq)) // count ticks per second
throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure");
QueryPerformanceCounter (&start);
#endif
#ifdef __unix__
gettimeofday (&start, NULL);
#endif
}
operator double() const // each read gives time elapsed since start, in seconds
{
LARGE_INTEGER end;
#ifdef _WIN32
QueryPerformanceCounter (&end);
return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart;
#endif
#ifdef __unix__
gettimeofday (&end,NULL);
return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000);
#endif
}
void show (const std::string & msg) const
{

@@ -44,7 +65,7 @@ namespace msra { namespace cuda {
fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/);
}
};

// -----------------------------------------------------------------------
// edgealignment --do alignment on a per edge level, only support normal left to right hmms and ergodic silence hmm
// output alignresult

@@ -5,7 +5,6 @@

#define _CRT_SECURE_NO_WARNINGS 1 // so we can use getenv()...

#include <Windows.h> // for the Mutex
#include <cuda_runtime_api.h> // for CUDA API
#include <cuda.h> // for device API
#include "cudalib.h"

@@ -30,7 +29,7 @@ static void operator|| (cudaError_t rc, const char * msg)
if (rc != cudaSuccess)
{
char buf[1000];
sprintf_s (buf, "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
snprintf(buf, sizeof(buf), "%s: %s (cuda error %d)", msg, cudaGetErrorString (rc), rc);
throw std::runtime_error (buf);
}
}

@@ -41,7 +40,7 @@ static void operator|| (CUresult rc, const char * msg)
if (rc != CUDA_SUCCESS)
{
char buf[1000];
sprintf_s (buf, "%s: cuda API error %d", msg, rc);
snprintf(buf, sizeof(buf), "%s: cuda API error %d", msg, rc);
throw std::runtime_error (buf);
}
}

@@ -71,8 +70,8 @@ public:
assert (cuContext == cuContextDummy);
// show some info to the user
char namebuf[1024] = { 0 };
cuDeviceGetName (&namebuf[0], _countof (namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", physicaldeviceid, namebuf);
cuDeviceGetName (&namebuf[0], sizeof(namebuf) -1, cuDevice) || "cuDeviceGetName failed";
fprintf (stderr, "using physical CUDA device %d: %s\n", (int)physicaldeviceid, namebuf);
#endif
}
// cast this to the CUcontext for use with CUDA functions

@@ -110,102 +109,9 @@ cudaStream_t GetCurrentStream() { return cudaStreamDefault; }
cudaEvent_t GetCurrentEvent() {return GetEvent(GetCurrentDevice());}
Devices g_devices; // one global device pool


// try to acquire a device exclusively; managed through this library's private lock mechanism (i.e. not through CUDA APIs)
static bool lockdevicebymutex (int physicaldeviceid)
{
wchar_t buffer[80];
wsprintf (buffer, L"Global\\DBN.exe GPGPU exclusive lock for device %d", physicaldeviceid);
// we actually use a Windows-wide named mutex
HANDLE h = ::CreateMutex (NULL/*security attr*/, TRUE/*bInitialOwner*/, buffer);
DWORD res = ::GetLastError();
if (h == NULL) // failure --this should not really happen
{
if (res == ERROR_ACCESS_DENIED) // no access: already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex access denied, assuming already locked '%S'\n", buffer);
return false;
}
fprintf (stderr, "lockdevicebymutex: failed to create '%S': %d\n", buffer, res);
throw std::runtime_error ("lockdevicebymutex: unexpected failure");
}
// got a handle
if (res == 0) // no error
{
fprintf (stderr, "lockdevicebymutex: created and acquired mutex '%S'\n", buffer);
return true;
}
// failure with handle --remember to release the handle
::CloseHandle (h);
if (res == ERROR_ALREADY_EXISTS) // already locked by another process
{
fprintf (stderr, "lockdevicebymutex: mutex already locked '%S'\n", buffer);
return false;
}
else if (res != 0)
{
fprintf (stderr, "lockdevicebymutex: unexpected error from CreateMutex() when attempting to create and acquire mutex '%S': %d\n", buffer, res);
throw std::logic_error ("lockdevicebymutex: unexpected failure");
}
return false;
}
// initialize CUDA system
void lazyinit()
{
#if 0
if (devicesallocated >= 0) return;
int numphysicaldevices = 0;
cudaGetDeviceCount (&numphysicaldevices) || "cudaGetDeviceCount failed";
fprintf (stderr, "lazyinit: %d physical CUDA devices detected\n", numphysicaldevices);
#ifndef NOMULTIDEVICE
// we can emulate a larger number of GPUs than actually present, for dev purposes
int oversubscribe = 1;
const char * oversubscribevar = getenv ("DBNOVERSUBSCRIBEGPUS");
if (oversubscribevar)
oversubscribe = atoi (oversubscribevar);
const int numdevices = numphysicaldevices * oversubscribe;
// number of devices
// environment variable DBNMAXGPUS
// - 0: use all, exclusively
// - >0: limit to this number, exclusively --default is 1
// The number of available devices includes the emulated one by means of DBNOVERSUBSCRIBEGPUS
// - <0: use this number but bypassing the exclusive check, for debugging/quick stuff
int devlimit = 1;
bool exclusive = true;
const char * devlimitvar = getenv ("DBNMAXGPUS");
if (devlimitvar)
devlimit = atoi (devlimitvar);
if (devlimit < 0)
{
devlimit = -devlimit;
exclusive = false; // allow non-exclusive use
}
if (devlimit == 0)
devlimit = INT_MAX;
// initialize CUDA device API
cuInit (0) || "cuInit failed";
// initialize the system
devicesallocated = 0;
for (int deviceid = 0; deviceid < numdevices && devicesallocated < devlimit; deviceid++) // loop over all physical devices
{
// check if device is available by trying to lock it
bool available = !exclusive || lockdevicebymutex (deviceid); // try to acquire the lock

if (!available) // not available: don't use it
{
fprintf (stderr, "CUDA device %d already in use, skipping\n", deviceid);
continue;
}
// OK to allocate
const int physicaldeviceid = deviceid % numphysicaldevices; // not the same in case of DBNOVERSUBSCRIBEGPUS > 1
cudadevicecontexts[devicesallocated].init (physicaldeviceid);
devicesallocated++;
}
fprintf (stderr, "using %d on %d physically present CUDA devices%s\n", devicesallocated, numphysicaldevices, exclusive ? " exclusively" : "");
#else
devicesallocated = 1;
#endif
#endif
}

void initwithdeviceid(size_t deviceid)

@@ -214,7 +120,7 @@ void initwithdeviceid(size_t deviceid)
devicesallocated = 0;
cudadevicecontexts[devicesallocated].init(deviceid);
devicesallocated++;
fprintf(stderr, "using CUDA devices%d \n", deviceid);
fprintf(stderr, "using CUDA devices%d \n", (int)deviceid);
}

// get number of devices

@@ -244,7 +150,7 @@ void setdevicecontext (size_t deviceid)
#ifndef NOMULTIDEVICE
//if (currentcudadevicecontext != NULL)
// throw std::logic_error ("setdevicecontext: a device context has already been set --??");
if (deviceid >= _countof (cudadevicecontexts))
if (deviceid >= (sizeof(cudadevicecontexts) / sizeof(cudadevicecontext)))
throw std::logic_error ("setdevicecontext: device id exceeds size of fixed-size array cudadevicecontexts[]");
cudadevicecontext & c = cudadevicecontexts[deviceid];

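Because `_countof` is MSVC-only, the hunk above falls back to a sizeof division. An illustrative alternative (not in the commit) is a small template that works on both compilers and, unlike the division form, rejects a plain pointer at compile time:

```cpp
#include <cstddef>

// countof(): element count of a built-in array, deduced at compile time.
template <typename T, std::size_t N>
constexpr std::size_t countof (const T (&)[N]) { return N; }

// hypothetical usage: if (deviceid >= countof (cudadevicecontexts)) ...
```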
@@ -1,6 +1,6 @@
#pragma once

#include <hash_map>
#include <unordered_map>
#include "simplesenonehmm.h"
#include "latticearchive.h"
#include "latticesource.h"

@@ -2,21 +2,22 @@
//
// F. Seide, V-hansu

#pragma once

#include "basetypes.h"
#include "simple_checked_arrays.h"
#include "latticearchive.h"
#include "simplesenonehmm.h" // the model
#include "ssematrix.h" // the matrices
#include "latticestorage.h"
#include <hash_map>
#include <unordered_map>
#include <list>

#undef PRINT_TIME_MEASUREMENT // [v-hansu]
#define VIRGINLOGZERO (10 * LOGZERO) // used for printing statistics on unseen states
#undef CPU_VERIFICATION

#ifdef _WIN32
int msra::numa::node_override = -1; // for numahelpers.h
#endif

namespace msra { namespace lattices {

@@ -26,7 +27,7 @@ namespace msra { namespace lattices {

class littlematrixheap
{
static const size_t CHUNKSIZE = 256*1024; // 1 MB
static const size_t CHUNKSIZE;
typedef msra::math::ssematrixfrombuffer matrixfrombuffer;
std::list<std::vector<float>> heap;
size_t allocatedinlast; // in last heap element

@@ -64,6 +65,8 @@ public:
}
};

const size_t littlematrixheap::CHUNKSIZE = 256*1024; // 1 MB

// ---------------------------------------------------------------------------
// helpers for log-domain addition
// ---------------------------------------------------------------------------
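The two hunks above move CHUNKSIZE's value from an in-class initializer to an out-of-class definition. The reason is linkage: a static const integral member may be initialized in the class, but once it is ODR-used (its address taken or bound to a const reference), pre-C++17 GCC also requires a definition in exactly one translation unit. A standalone illustration (not CNTK code):

```cpp
#include <cstddef>

struct heap_demo
{
    static const std::size_t CHUNKSIZE;                // declaration only, as in the hunk above
};
const std::size_t heap_demo::CHUNKSIZE = 256 * 1024;  // definition in exactly one .cpp file
```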
@@ -183,7 +186,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
const float acLL = logLLs(s,t);
if (islogzero (acLL))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) ac score(%d,%d) is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te,
(int) s, (int) t,
(int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
const float betajt = logbetas(j,t); // sum over all all path exiting from (j,t) to end

@@ -211,7 +214,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
bwscore = exitscore;
if (islogzero (bwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d unit %d (%s) frames [%d,%d) bw score is zero (%d st, %d fr: %s)\n",
edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());
(int)edgeindex, (int) k, hmm.getname(), (int) ts, (int) te, (int) logbetas.rows(), (int) logbetas.cols(), gettranscript (units, hset).c_str());

te = ts;
je = js;

@@ -292,17 +295,17 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
// These cases must be handled separately. If the whole path is 0 (0 prob is on the only path at some point) then skip the lattice.
if (islogzero (totalbwscore) ^ islogzero (totalfwscore))
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw 0 score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
if (islogzero (totalbwscore))
{
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}

if (fabsf (totalfwscore - totalbwscore) / ts > 1e-4f)
fprintf (stderr, "forwardbackwardedge: WARNING: edge J=%d fw and bw score %.10f vs. %.10f (%d st, %d fr: %s)\n",
edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (float) totalfwscore, (float) totalbwscore, (int) js, (int) ts, gettranscript (units, hset).c_str());

// we return the full path score
return totalfwscore;

@@ -414,7 +417,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
if (islogzero (fwscore))
{
fprintf (stderr, "alignedge: WARNING: edge J=%d has zero ac. score (%d st, %d fr: %s)\n",
edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
(int)edgeindex, (int) js, (int) ts, gettranscript (units, hset).c_str());
return LOGZERO;
}

@@ -746,7 +749,7 @@ double lattice::forwardbackwardlatticesMBR (const std::vector<float> & edgeacsco
// TODO: we will later have code that adds this path if needed
size_t oracleframeacc = maxcorrect.back();
if (oracleframeacc != info.numframes)
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (unsigned int) info.numframes);
fprintf (stderr, "forwardbackwardlatticesMBR: ground-truth path missing from lattice (most correct path: %d out of %d frames correct)\n", (unsigned int) oracleframeacc, (int) info.numframes);

// backward pass and computation of state-conditioned frames-correct count
for (size_t j = edges.size() -1; j+1 > 0; j--)

@@ -897,7 +900,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
}
}
if (minlogpp > LOGZERO)
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", countskip, edges.size());
fprintf(stderr, "forwardbackwardalign: %d of %d edges pruned\n", (int)countskip, (int)edges.size());
}
const double abcsallocatedur = timerabcsallocate;

@@ -1048,7 +1051,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
for (size_t t = ts; t < te; t++)
{
if (thisedgealignments[j][t-ts] != thisedgealignmentsgpu[j][t-ts])
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, t-ts, thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
fprintf (stderr, "edge %d, sil ? %d, time %d, alignment / alignmentgpu MISMATCH %d v.s. %d\n", j, edgehassil ? 1 : 0, (int)(t-ts), thisedgealignments[j][t-ts], thisedgealignmentsgpu[j][t-ts]);
}
}
}

@@ -1284,7 +1287,7 @@ void lattice::mmierrorsignal (parallelstate & parallelstate, double minlogpp, co
// TODO: count VIRGINLOGZERO, print per frame
}
if (fabs (logsum) / errorsignal.rows() > 1e-6)
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", t, logsum);
fprintf (stderr, "forwardbackward: WARNING: overall posterior column(%d) sum = exp (%.10f) != 1\n", (int)t, logsum);
}
fprintf (stderr, "forwardbackward: %.3f%% non-zero state posteriors\n", 100.0f - nonzerostates * 100.0f / errorsignal.rows() / errorsignal.cols());

@@ -1441,7 +1444,7 @@ void sMBRdiagnostics (const msra::math::ssematrixbase & errorsignal, const_array
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRdiagnostics: %d frames correct out of %d, %.2f%%, neg better in %d, pos in %d\n",
(int) numcor, errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numcor, (int)errorsignal.cols(), 100.0f * numcor / errorsignal.cols(),
(int) numnegbetter, (int) numposbetter);
}

@@ -1471,7 +1474,7 @@ void sMBRsuppressweirdstuff (msra::math::ssematrixbase & errorsignal, const_arra
}
// print this to validate our bestpath computation
fprintf (stderr, "sMBRsuppressweirdstuff: %d weird frames out of %d, %.2f%% were flattened\n",
(int) numweird, errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
(int) numweird, (int) errorsignal.cols(), 100.0f * numweird / errorsignal.cols());
}

@@ -14,6 +14,7 @@
#pragma push_macro ("atomicCAS")

#include "latticestorage.h"
#include <limits>

namespace msra { namespace cuda { class passtextureref; }}

@@ -481,21 +481,21 @@ namespace msra { namespace lattices {
template<class edgestype, class nodestype, class aligntype, class edgealignments, class backpointers>
void setutterancedata (const edgestype & edges, const nodestype & nodes, const aligntype & align,
const msra::math::ssematrixbase & logLLs, std::vector<float> & edgeacscores,
edgealignments & edgealignments, backpointers & backpointers)
edgealignments & edgeAlignments, backpointers & backPointers)
{
// lattice data
edgesgpu->assign (edges, false);
nodesgpu->assign (nodes, false);
aligngpu->assign (align, false);
alignoffsetsgpu->assign (edgealignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backpointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backpointers.getbackptroffsets(), false);
alignoffsetsgpu->assign (edgeAlignments.getalignoffsets(), false);
backptrstoragegpu->allocate (backPointers.getbackptrstoragesize());
backptroffsetsgpu->assign (backPointers.getbackptroffsets(), false);

#ifndef PARALLEL_SIL
alignresult->assign (edgealignments.getalignmentsbuffer(), false);
alignresult->assign (edgeAlignments.getalignmentsbuffer(), false);
edgeacscoresgpu->assign (edgeacscores, false);
#else
alignresult->allocate (edgealignments.getalignbuffersize());
alignresult->allocate (edgeAlignments.getalignbuffersize());
edgeacscoresgpu->allocate (edges.size());
edgeacscores; // reference to make compilor happy
#endif

@@ -516,9 +516,9 @@ namespace msra { namespace lattices {
loglls = *errorsignalgpu;
}
template<class edgealignments>
void copyalignments (edgealignments & edgealignments)
void copyalignments (edgealignments & edgeAlignments)
{
alignresult->fetch(edgealignments.getalignmentsbuffer(), true);
alignresult->fetch(edgeAlignments.getalignmentsbuffer(), true);
}

// [v-hansu] allocate memory for vectors relating to forward-backward

@@ -735,7 +735,7 @@ namespace msra { namespace lattices {
if (!parallelstate->emulation)
{

fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", batchsizeforward.size(), batchsizebackward.size());
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());

const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

@@ -799,7 +799,7 @@ namespace msra { namespace lattices {
if (rc != CUBLAS_STATUS_SUCCESS)
{
char buf[1000];
sprintf_s(buf, "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
sprintf_s(buf, sizeof(buf), "%s: cublas error code %d", msg.c_str(), rc); // ... TODO: add error message
throw std::runtime_error(buf);
}
}

Makefile
@@ -50,7 +50,7 @@ endif
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
CXX = mpic++

INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib MachineLearning/SequenceTraining BrainScript
CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror -Wno-error=literal-suffix
LIBPATH:=

@@ -377,12 +377,27 @@ CNTK_SRC =\
MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
MachineLearning/CNTKSGDLib/Profiler.cpp \
MachineLearning/CNTKSGDLib/SGD.cpp \
MachineLearning/SequenceTraining/cudalattice.cpp \
MachineLearning/SequenceTraining/cudalib.cpp \
MachineLearning/SequenceTraining/latticeforwardbackward.cpp \
MachineLearning/SequenceTraining/parallelforwardbackward.cpp \
BrainScript/BrainScriptEvaluator.cpp \
BrainScript/BrainScriptParser.cpp \
BrainScript/BrainScriptTest.cpp \
MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp \

CNTK_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC))

ifdef CUDA_PATH
CNTK_SRC +=\
MachineLearning/SequenceTraining/cudalatticeops.cu \

else
CNTK_SRC +=\
MachineLearning/SequenceTraining/NoGPU.cpp \

endif

CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC)))

CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)

@@ -4481,7 +4481,7 @@ template<class ElemType>
__global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *error, const ElemType *label,
const ElemType *dnnoutput, const ElemType *gamma, ElemType alpha, const long N)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
error[id] -= alpha * (label[id] - (1.0 - hsmoothingWeight)*dnnoutput[id] - hsmoothingWeight * gamma[id]);