Merge branch 'master' into qiwye/multiverso
This commit is contained in:
Commit
9664daccb0
|
@ -9154,7 +9154,7 @@ L
|
|||
\begin_layout Standard
|
||||
\begin_inset Formula
|
||||
\begin{eqnarray}
|
||||
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+logadd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
|
||||
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
|
||||
\mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
|
||||
\mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
|
||||
\frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
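For reference, LogAdd in the recursion above denotes the usual numerically stable log-sum-exp; stated as an identity with the same subscript convention:

\[
LogAdd_{k}\left(x_{k}\right)=\log\sum_{k}\exp\left(x_{k}\right)=x_{\max}+\log\sum_{k}\exp\left(x_{k}-x_{\max}\right),\qquad x_{\max}=\max_{k}x_{k}
\]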
|
||||
|
|
1
Makefile
|
@ -315,6 +315,7 @@ LMSEQUENCEREADER_SRC =\
|
|||
$(SOURCEDIR)/Readers/LMSequenceReader/Exports.cpp \
|
||||
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceParser.cpp \
|
||||
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceReader.cpp \
|
||||
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceWriter.cpp \
|
||||
|
||||
LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC))
|
||||
|
||||
|
|
|
@ -11,25 +11,8 @@
|
|||
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "Actions.h"
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include <algorithm>
|
||||
#if defined(_WIN32)
|
||||
#include "io.h"
|
||||
#endif
|
||||
#include "buildinfo.h"
|
||||
#include "hostname.h"
|
||||
#ifdef LEAKDETECT
|
||||
#include "vld.h" // for memory leak detection
|
||||
#endif
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
#include "Basics.h"
|
||||
#include "Actions.h"
|
||||
#include "ComputationNetwork.h"
|
||||
#include "ComputationNode.h"
|
||||
#include "DataReader.h"
|
||||
|
@ -54,6 +37,23 @@
|
|||
#include "BrainScriptEvaluator.h"
|
||||
#include "BrainScriptParser.h"
|
||||
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include <algorithm>
|
||||
#if defined(_WIN32)
|
||||
#include "io.h"
|
||||
#endif
|
||||
#include "buildinfo.h"
|
||||
#include "hostname.h"
|
||||
#ifdef LEAKDETECT
|
||||
#include "vld.h" // for memory leak detection
|
||||
#endif
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
#ifndef let
|
||||
#define let const auto
|
||||
#endif
|
||||
|
|
|
@ -107,24 +107,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable.
|
||||
|
||||
void Save(File& fstream) const
|
||||
// verify that this refers to a dense matrix (no strides)
|
||||
void VerifyIsDense() const
|
||||
{
|
||||
if (m_offset != 0)
|
||||
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
|
||||
LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset);
|
||||
for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here)
|
||||
{
|
||||
ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
|
||||
if (m_strides[k] != stride)
|
||||
LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k);
|
||||
}
|
||||
}
|
||||
|
||||
void Save(File& fstream) const
|
||||
{
|
||||
VerifyIsDense();
|
||||
// saving as 32-bit ints. This allows to continue to support the old format (size_t W, H, C)
|
||||
fstream << (uint32_t)m_dims.size();
|
||||
ptrdiff_t mul = 1;
|
||||
for (size_t k = 0; k < m_dims.size(); k++)
|
||||
for (auto dim : m_dims)
|
||||
{
|
||||
auto dim = m_dims[k];
|
||||
if (dim > UINT32_MAX)
|
||||
LogicError("TensorShape::Save(): Tensor dimensions %s out of bounds (> 4G).", string(*this).c_str());
|
||||
fstream << (uint32_t)dim;
|
||||
if (m_steps[k] != mul)
|
||||
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
|
||||
mul *= (ptrdiff_t)dim;
|
||||
}
|
||||
}
|
||||
|
||||
void Load(File& fstream)
|
||||
{
|
||||
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
|
||||
|
@ -154,8 +162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// accessors
|
||||
size_t GetDim(size_t k) const { return m_dims[k]; }
|
||||
size_t GetNumDims() const { return m_dims.size(); }
|
||||
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; }
|
||||
ptrdiff_t GetStep(size_t k) const { return m_steps[k]; }
|
||||
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } // in slice
|
||||
size_t GetOffset() const { return m_offset; }
|
||||
|
||||
// vector-like accessors
|
||||
|
@ -163,12 +170,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t size() const { return GetNumDims(); }
|
||||
|
||||
const std::vector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
|
||||
const std::vector<ptrdiff_t> & GetStrides() const { return m_strides; }
|
||||
|
||||
// interpretation as an image tensor
|
||||
size_t GetNumChannels() const { return m_dims[0]; }
|
||||
size_t GetWidth() const { return m_dims[1]; }
|
||||
size_t GetHeight() const { return m_dims[2]; }
|
||||
|
||||
// indexing
|
||||
// Determines the offset into the underlying element array for a given multi-dimensional index.
|
||||
// This function is for reference. Probably not often used.
|
||||
size_t Locate(const std::vector<size_t> & index) const
|
||||
{
|
||||
ptrdiff_t location = m_offset;
|
||||
for (size_t k = 0; k < index.size(); k++)
|
||||
{
|
||||
size_t dim = k < size() ? m_dims[k] : 1; // dimensions are bottomless
|
||||
if (index[k] >= dim)
|
||||
LogicError("Locate: Tensor index[%d]=%d exceeds bound %d.", (int)k, (int)index[k], (int)dim);
|
||||
location += (ptrdiff_t)index[k] * m_strides[k]; // strides may be negative
|
||||
}
|
||||
if (location < 0 || (size_t)location >= m_allocation)
|
||||
LogicError("Locate: Tensor index out of bounds.");
|
||||
return (size_t)location;
|
||||
}
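To make the stride arithmetic concrete, here is a minimal standalone sketch (plain vectors, hypothetical helper name, not part of the class) of the same offset computation that Locate() performs:

#include <cstddef>
#include <stdexcept>
#include <vector>

// Standalone illustration of the Locate() logic above: flat offset = offset + sum_k index[k] * stride[k].
static size_t LocateSketch(const std::vector<size_t>& dims, const std::vector<ptrdiff_t>& strides,
                           size_t offset, size_t allocation, const std::vector<size_t>& index)
{
    ptrdiff_t location = (ptrdiff_t)offset;
    for (size_t k = 0; k < index.size(); k++)
    {
        size_t dim = k < dims.size() ? dims[k] : 1;                 // dimensions are bottomless
        if (index[k] >= dim)
            throw std::logic_error("LocateSketch: index exceeds dimension");
        ptrdiff_t stride = k < strides.size() ? strides[k] : 0;     // extra (broadcast) dims do not move
        location += (ptrdiff_t)index[k] * stride;                   // strides may be negative
    }
    if (location < 0 || (size_t)location >= allocation)
        throw std::logic_error("LocateSketch: index out of bounds");
    return (size_t)location;
}
// E.g. for a dense 3 x 4 tensor (dims {3,4}, strides {1,3}, offset 0, allocation 12),
// LocateSketch(..., index {2,1}) yields 2*1 + 1*3 = 5.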
|
||||
|
||||
// helpers for tensor operations
|
||||
bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride)
|
||||
{
|
||||
|
@ -179,66 +205,145 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (m_dims[k] == 1 || m_dims[k - 1] == 1) // both are broadcasting or scalar--we don't care about stride in this case
|
||||
return true;
|
||||
else
|
||||
return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1];
|
||||
return m_strides[k] == m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1];
|
||||
}
|
||||
// editing functions
|
||||
// These all create new TensorShape objects.
|
||||
TensorShape Flatten(size_t k) const // flatten [k] with [k-1]
|
||||
{
|
||||
TensorShape result = *this;
|
||||
if (!CanFlatten(k))
|
||||
LogicError("Flatten() cannot flatten dimensions with gaps");
|
||||
// We reshape local (I x J) sub-matrices to (1 x I*J) sub-matrices.
|
||||
// We merge to right so that we can merge multiple by looping left-to-right.
|
||||
// m_dims = I J K L
|
||||
// m_strides = 1 I I*J I*J*K
|
||||
// flattening J and K
|
||||
// m_dims = I 1 J*K L
|
||||
// m_strides = 1 I I I*J*K
|
||||
// TODO: rethink whether this is correct for example of negative strides
|
||||
result.m_dims[k] *= result.m_dims[k - 1];
|
||||
result.m_dims[k - 1] = 1;
|
||||
result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1];
|
||||
return result;
|
||||
}
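A small sketch of the same transformation on plain dim/stride vectors, reproducing the worked example in the comment above (hypothetical helper, not CNTK API):

#include <cstddef>
#include <vector>

// Flatten dimension k into k-1:  dims    I J K   L      ->  I 1 J*K L
//                                strides 1 I I*J I*J*K  ->  1 I I   I*J*K
static void FlattenSketch(std::vector<size_t>& dims, std::vector<ptrdiff_t>& strides, size_t k)
{
    // precondition (CanFlatten): strides[k] == strides[k-1] * (ptrdiff_t)dims[k-1], or one of the two dims is 1
    dims[k] *= dims[k - 1];
    dims[k - 1] = 1;
    strides[k] = strides[k - 1];   // the merged dimension advances with the finer stride
}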
|
||||
TensorShape DropDims(const std::vector<bool> & toDrop) const // remove the dimensions marked in toDrop
|
||||
{
|
||||
// this deletes a dimension while retaining strides
|
||||
// This implies a slice to [0] for this dimension.
|
||||
TensorShape result = *this;
|
||||
size_t j = 0;
|
||||
for (size_t k = 0; k < size(); k++)
|
||||
{
|
||||
if (toDrop[k])
|
||||
continue;
|
||||
else
|
||||
{
|
||||
// example
|
||||
// m_dims = I 1 J K
|
||||
// m_strides = 1 I I I*J
|
||||
// dropping the second dimension
|
||||
// m_dims = I % J K
|
||||
// m_strides = 1 % I I*J
|
||||
result.m_dims[j] = result.m_dims[k];
|
||||
result.m_strides[j] = result.m_strides[k];
|
||||
j++;
|
||||
}
|
||||
}
|
||||
result.m_dims.resize(j);
|
||||
result.m_strides.resize(j);
|
||||
return result;
|
||||
}
|
||||
TensorShape WithBroadcastStrides() const // set strides of broadcasting (size-1) dimensions to 0
|
||||
{
|
||||
TensorShape result = *this;
|
||||
for (size_t k = 0; k < size(); k++)
|
||||
if (result.m_dims[k] == 1)
|
||||
result.m_strides[k] = 0;
|
||||
return result;
|
||||
}
|
||||
TensorShape Pad(size_t numDims) const // append singleton dimensions
|
||||
{
|
||||
VerifyIsDense();
|
||||
if (numDims < GetNumDims())
|
||||
LogicError("Pad() cannot drop a shorten the dimensions.");
|
||||
else if (numDims == GetNumDims())
|
||||
return *this;
|
||||
auto dims = GetDims();
|
||||
dims.resize(numDims, 1);
|
||||
return TensorShape(dims);
|
||||
}
|
||||
TensorShape Concat(const TensorShape & other) const // concatenate
|
||||
{
|
||||
auto dims = GetDims();
|
||||
auto otherDims = other.GetDims();
|
||||
dims.insert(dims.end(), otherDims.begin(), otherDims.end());
|
||||
return TensorShape(dims);
|
||||
}
|
||||
|
||||
// pretty-printing. Returns tensor dims in the form "I x J x K".
|
||||
operator std::string() const
|
||||
{
|
||||
std::string s;
|
||||
for (const auto & dim : m_dims)
|
||||
for (size_t k = 0; k < size(); k++)
|
||||
{
|
||||
if (!s.empty())
|
||||
s.append(" x ");
|
||||
s.append(std::to_string(dim));
|
||||
s.append(std::to_string(m_dims[k]));
|
||||
}
|
||||
#ifdef _DEBUG // also emit the strides, easier for debugging
|
||||
s.append(" {");
|
||||
for (size_t k = 0; k < size(); k++)
|
||||
{
|
||||
if (k > 0)
|
||||
s.append(",");
|
||||
s.append(std::to_string(m_strides[k]));
|
||||
}
|
||||
s.append("}");
|
||||
#endif
|
||||
return s;
|
||||
}
|
||||
|
||||
private:
|
||||
// reset m_steps and m_offset to represent a canonical no-strides tensor
|
||||
// reset m_strides and m_offset to represent a canonical no-strides tensor
|
||||
void InitAsNoSlice()
|
||||
{
|
||||
m_offset = 0;
|
||||
m_steps.resize(m_dims.size());
|
||||
ptrdiff_t mul = 1;
|
||||
m_strides.resize(m_dims.size());
|
||||
for (size_t k = 0; k < m_dims.size(); k++)
|
||||
{
|
||||
m_steps[k] = (ptrdiff_t)mul;
|
||||
mul *= m_dims[k];
|
||||
}
|
||||
m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
|
||||
m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<size_t> m_dims; // dimensions of tensor or tensor slice. The size of the box.
|
||||
std::vector<ptrdiff_t> m_steps; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
|
||||
std::vector<ptrdiff_t> m_strides; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
|
||||
size_t m_offset; // offset to element(0,0,...,0). May be non-0 in case of slicing.
|
||||
// For a regular tensor, there are no strides, m_steps[k] = m_steps[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
|
||||
size_t m_allocation; // allocation size of original dense tensor
|
||||
// For a regular tensor, there are no strides, m_strides[k] = m_strides[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
|
||||
// For views into existing tensors, we do stride shenanigans to implement broadcasting (plus magic tricks). Examples:
|
||||
// To traverse a 5 x 10 matrix with column order reversed:
|
||||
// - op.dims = (5 x 10)
|
||||
// - m_offset points to element (0,9)
|
||||
// - m_steps[0] = 1 // regular forward iteration within each column
|
||||
// - m_steps[1] = -5 // backward iteration over columns
|
||||
// - m_strides = (1, -5) // backward iteration over columns
|
||||
// To compute matrix C(13 x 42) = vector A(13 x 1) + matrix B(13 x 42):
|
||||
// - op = sum
|
||||
// - op.dims = (13 x 42)
|
||||
// - *.m_steps[0] = 1 // forward iteration through each column
|
||||
// - C.m_steps[1] = 13 // forward iteration over columns of B--defines the for loop
|
||||
// - B.m_steps[1] = 13 // forward iteration over columns of B--iterates in sync with C
|
||||
// - A.m_steps[1] = 0 // A, however, is stuck in column 0 forever
|
||||
// - C.m_strides = (1, 13) // forward iteration over columns of B--defines the for loop
|
||||
// - B.m_strides = (1, 13) // forward iteration over columns of B--iterates in sync with C
|
||||
// - A.m_strides = (1, 0) // A, however, is stuck in column 0 forever
|
||||
// Matrix product: C(I x K) = A(I x J) * B(J x K) --Note: Likely not RAM-bandwidth efficient!
|
||||
// - op = mul
|
||||
// - op.dims = (I x J x K) // iteration dimensions
|
||||
// - C.m_steps = (1, 0, I) // inverse broadcasting for inner dimension
|
||||
// - A.m_steps = (1, I, 0)
|
||||
// - B.m_steps = (0, 1, J)
|
||||
// - C.m_strides = (1, 0, I) // inverse broadcasting for inner dimension
|
||||
// - A.m_strides = (1, I, 0)
|
||||
// - B.m_strides = (0, 1, J)
|
||||
// Convolution of time signals (without padding): Y(T-N+1) = X(T) * H(N): --Note: Likely not RAM-bandwidth efficient!
|
||||
// - op = mul
|
||||
// - op.dims = (T-N+1 x N) // iteration dimensions
|
||||
// - Y.m_steps = (1, 0) // inverse broadcasting: this sums up the individual products
|
||||
// - X.m_steps = (1, 1) // shift window by 1 for each output sample
|
||||
// - H.m_steps = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
|
||||
// - Y.m_strides = (1, 0) // inverse broadcasting: this sums up the individual products
|
||||
// - X.m_strides = (1, 1) // shift window by 1 for each output sample
|
||||
// - H.m_strides = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
|
||||
// - H.m_offset = N - 1 // begin with last element (reverse order for convolution)
|
||||
// TODO: double-check all these
|
||||
// TODO: Does the same trick work for 2D images?
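As a concrete illustration of the stride tricks listed above, here is a small self-contained sketch (plain arrays and hand-written loops, smaller dimensions than the 13 x 42 example) that adds a column vector A to a matrix B by giving A a column stride of 0, i.e. A.m_strides = (1, 0):

#include <cstddef>
#include <cstdio>

// Hedged sketch of stride-based broadcasting (not CNTK code): C(3 x 4) = A(3 x 1) + B(3 x 4).
// A gets column stride 0, so it stays "stuck in column 0" while B and C advance, as described above.
int main()
{
    const size_t I = 3, J = 4;
    float A[I]     = { 10, 20, 30 };                              // column vector, dims (3 x 1)
    float B[I * J] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };    // column-major, dims (3 x 4)
    float C[I * J];

    const ptrdiff_t aStrides[2] = { 1, 0 };                       // broadcast over columns
    const ptrdiff_t bStrides[2] = { 1, (ptrdiff_t)I };
    const ptrdiff_t cStrides[2] = { 1, (ptrdiff_t)I };

    for (size_t j = 0; j < J; j++)                                // iterate over op.dims = (3 x 4)
        for (size_t i = 0; i < I; i++)
            C[i * cStrides[0] + j * cStrides[1]] =
                A[i * aStrides[0] + j * aStrides[1]] +            // j * 0 == 0: always column 0 of A
                B[i * bStrides[0] + j * bStrides[1]];

    for (size_t i = 0; i < I; i++, printf("\n"))
        for (size_t j = 0; j < J; j++)
            printf("%6.1f", C[i + j * I]);                        // row 0 prints 10 13 16 19, etc.
    return 0;
}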
|
||||
|
|
|
@ -108,12 +108,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_numParallelSequences = numParallelSequences;
|
||||
m_numTimeSteps = numTimeSteps;
|
||||
// allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time)
|
||||
// PTRDIFF_MAX indicates not initialized (also in the matrix, which is stored as float).
|
||||
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToStart.SetValue((float)PTRDIFF_MAX);
|
||||
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToEnd.SetValue((float)PTRDIFF_MAX);
|
||||
#if 1
|
||||
if ((m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) && m_numTimeSteps > 0) // sanity check for debugging a regression
|
||||
fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n",
|
||||
(int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them)
|
||||
#endif
|
||||
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps);
|
||||
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps);
|
||||
m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX);
|
||||
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
|
||||
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
|
||||
m_timeStepHasGap.assign(m_numTimeSteps, false);
|
||||
m_columnsValidityMask.Resize(0, 0); // invalidate
|
||||
// reset state
|
||||
m_numFramesDeclared = 0;
|
||||
m_numGapFrames = 0;
|
||||
|
@ -121,20 +126,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_writable = true;
|
||||
}
|
||||
|
||||
// short-hand to initialize an MBLayout for the common case of frame mode
|
||||
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
|
||||
void InitAsFrameMode(size_t numSamples)
|
||||
{
|
||||
Init(numSamples, 1);
|
||||
SequenceInfo seqInfo { 0, 0, 0, 1 };
|
||||
for (size_t s = 0; s < numSamples; s++)
|
||||
{
|
||||
seqInfo.seqId = seqInfo.s = s;
|
||||
AddSequence(seqInfo);
|
||||
}
|
||||
Lock();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
// accessors
|
||||
// -------------------------------------------------------------------
|
||||
|
@ -199,7 +190,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
|
||||
|
||||
// remember it
|
||||
#ifdef _DEBUG
|
||||
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
|
||||
m_sequences.push_back(seqDesc);
|
||||
if (cap != m_sequences.capacity())
|
||||
fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity());
|
||||
#else
|
||||
m_sequences.push_back(seqDesc);
|
||||
#endif
|
||||
|
||||
// create all the cached fast-lookup information
|
||||
const auto seqId = seqDesc.seqId;
|
||||
|
@ -212,7 +210,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_numGapFrames += (e - b);
|
||||
for (size_t t = b; t < e; t++)
|
||||
{
|
||||
//Set(s, t, MinibatchPackingFlags::NoInput);
|
||||
m_timeStepHasGap[t] = true;
|
||||
m_distanceToStart(s, t) = -1; // start flags also encode gaps
|
||||
}
|
||||
|
@ -220,22 +217,49 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
else for (size_t t = b; t < e; t++)
|
||||
{
|
||||
// update the nearest sentence boundaries, minimum over all parallel sequences
|
||||
// -1 in distanceToStart(,) stands for a gap
|
||||
assert(m_distanceToStart(s, t) != -1); // gaps not allowed to overlap
|
||||
// If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset.
|
||||
ptrdiff_t distanceToStart = t - beginTime;
|
||||
if (m_distanceToStart(s, t) > (float)distanceToStart)
|
||||
m_distanceToStart(s, t) = (float)distanceToStart;
|
||||
ptrdiff_t distanceToStart = (ptrdiff_t)t - beginTime;
|
||||
ptrdiff_t distanceToEnd = (ptrdiff_t)(endTime - 1 - t);
|
||||
m_distanceToStart(s, t) = (float)distanceToStart;
|
||||
m_distanceToEnd(s, t) = (float)distanceToEnd;
|
||||
// and the aggregate
|
||||
if (m_distanceToNearestStart[t] > distanceToStart)
|
||||
m_distanceToNearestStart[t] = distanceToStart;
|
||||
ptrdiff_t distanceToEnd = endTime - 1 - t;
|
||||
if (m_distanceToEnd(s, t) > (float) distanceToEnd)
|
||||
m_distanceToEnd(s, t) = (float) distanceToEnd;
|
||||
if (m_distanceToNearestEnd[t] > distanceToEnd)
|
||||
m_distanceToNearestEnd[t] = distanceToEnd;
|
||||
}
|
||||
}
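A tiny worked example of the distance bookkeeping above: for a sequence with beginTime = 1 and endTime = 4 (occupying time steps t = 1..3 of one parallel sequence), the formulas distanceToStart = t - beginTime and distanceToEnd = endTime - 1 - t give the values printed by this standalone sketch (not MBLayout itself):

#include <cstddef>
#include <cstdio>

int main()
{
    const ptrdiff_t beginTime = 1, endTime = 4;                   // sequence covers time steps 1, 2, 3
    for (ptrdiff_t t = beginTime; t < endTime; t++)
        printf("t=%d: distanceToStart=%d distanceToEnd=%d\n",
               (int)t, (int)(t - beginTime), (int)(endTime - 1 - t));   // 0/2, 1/1, 2/0
    return 0;
}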
|
||||
|
||||
// short-hand to initialize an MBLayout for the common case of frame mode
|
||||
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
|
||||
// This function provides an efficient short-cut implementation of AddSequence(t, t, 0, 1) for every sample t.
|
||||
void InitAsFrameMode(size_t numSamples)
|
||||
{
|
||||
Init(numSamples, 1);
|
||||
|
||||
// create sequences array
|
||||
SequenceInfo virginSeqInfo = { 0, 0, 0, 1 };
|
||||
m_sequences.resize(numSamples, virginSeqInfo); // pass it here since otherwise STL will initialize everything to 0 unnecessarily
|
||||
|
||||
// update sequence indices
|
||||
for (size_t s = 0; s < numSamples; s++)
|
||||
{
|
||||
// remember it
|
||||
auto & seqDesc = m_sequences[s];
|
||||
seqDesc.seqId = s;
|
||||
seqDesc.s = s;
|
||||
}
|
||||
m_numFramesDeclared = numSamples;
|
||||
|
||||
// create all the cached fast-lookup information
|
||||
m_distanceToStart.SetValue(0);
|
||||
m_distanceToEnd.SetValue(0);
|
||||
m_distanceToNearestStart[0] = 0;
|
||||
m_distanceToNearestEnd[0] = 0;
|
||||
|
||||
Lock();
|
||||
}
|
||||
|
||||
// mark a range of frames in a parallel sequence as invalid
|
||||
// I'd love to start with all-gaps, but that would require setting flags upfront and then clearing them.
|
||||
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); }
|
||||
|
@ -330,10 +354,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// 2 1 0 . . ] // (last two time steps undefined)
|
||||
// m_distanceToNearestStart = [ 0 1 2 3 4 ]
|
||||
// m_distanceToNearestEnd = [ 2 1 0 1 0 ]
|
||||
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap, PTRDIFF_MAX for 'not initialized'
|
||||
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (value<0 does NOT stand for gap; consult m_timeStepHasGap[] vector instead)
|
||||
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap
|
||||
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
|
||||
|
||||
vector<bool> m_timeStepHasGap; // [t]
|
||||
vector<bool> m_timeStepHasGap; // [t] true if at least one gap in time step t
|
||||
|
||||
// Cached mask indicating the validity of each column in the MBLayout
|
||||
// TODO: We actually just need a boolean matrix for this.
|
||||
|
@ -527,6 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (s == SIZE_MAX) // aggregate requested
|
||||
{
|
||||
// determine flags from aggregate vectors
|
||||
// Note: We allow that all parallel sequences contain gaps (m_distanceToNearestStart[t] == PTRDIFF_MAX)
|
||||
// because that makes implementation of the reader easier for truncated BPTT (it knows too late that there are not that many frames left).
|
||||
auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t];
|
||||
if (distanceToStart < -fr.m_timeOffset)
|
||||
return true;
|
||||
|
@ -557,7 +583,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// TODO: Remove this version (with sanity checks) after this has been tested. Then the function can be inlined above.
|
||||
inline size_t MBLayout::GetActualNumSamples() const
|
||||
{
|
||||
#if 1 // sanity check --TODO: delete this after a while
|
||||
#if 0 // sanity check --TODO: delete this after a while
|
||||
size_t n = GetNumCols();
|
||||
if (HasGaps())
|
||||
{
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
using namespace std;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// subroutines for Validate() implementations
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -41,13 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// all are consistent: install it
|
||||
LinkToMBLayout(pMBLayout);
|
||||
}
|
||||
|
||||
// single input that maps its input element-wise (e.g. Sigmoid)
|
||||
void ComputationNodeBase::ValidateUnaryMap(bool isFinalValidationPass)
|
||||
{
|
||||
assert(m_inputs.size() == 1);
|
||||
ComputationNodeBase::Validate(isFinalValidationPass);
|
||||
InferMBLayoutFromInputsForStandardCase();
|
||||
SetDims(m_inputs[0]->GetNumRows(), DetermineNumCols(m_inputs[0]));
|
||||
SetDims(m_inputs[0]);
|
||||
InferImageDimsFromInputs();
|
||||
}
|
||||
// binary zip operation, e.g. Plus
|
||||
|
@ -138,6 +141,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// tensor helpers
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
static TensorShape GetSampleShape(const ComputationNode<ElemType> * node)
|
||||
{
|
||||
// TODO: use actual ImageLayout. While those are not yet inferred properly, maybe use it if its dims match numRows?
|
||||
if (node->HasMBLayout()) // if we have a layout, that dimension is not part of the sample shape
|
||||
return TensorShape(node->GetNumRows());
|
||||
else
|
||||
return TensorShape(node->GetNumRows(), node->GetNumCols());
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
std::vector<TensorView<ElemType>> ComputationNode<ElemType>::GetTensorsForwardBinary(const FrameRange & fr)
|
||||
{
|
||||
const size_t N = 3; // 2 inputs and 1 output
|
||||
// BUGBUG: Currently does not interpret actual ImageLayouts or convolutional models.
|
||||
// TODO: move this into a helper function
|
||||
// get tensor shapes
|
||||
vector<ComputationNode<ElemType>*> nodes;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
nodes.push_back(i < N-1 ? Input(i).get() : this);
|
||||
vector<Matrix<ElemType>> values;
|
||||
vector<TensorShape> shapes;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
{
|
||||
values.push_back(nodes[i]->ValueFor(i < N-1 ? fr.AllowBroadcast() : fr)); // no broadcasting for now allowed for output
|
||||
shapes.push_back(GetSampleShape(nodes[i]));
|
||||
}
|
||||
// pad
|
||||
size_t dims = 0;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
if (dims < shapes[i].GetNumDims())
|
||||
dims = shapes[i].GetNumDims();
|
||||
for (size_t i = 0; i < N; i++)
|
||||
shapes[i] = shapes[i].Pad(dims);
|
||||
// concatenate MBLayout dims
|
||||
// TODO: Is it possible that the output has no layout, but inputs have? Then we lost dimensions. Tensor constructor will catch that, though.
|
||||
if (HasMBLayout())
|
||||
{
|
||||
for (size_t i = 0; i < N; i++)
|
||||
{
|
||||
auto sm = nodes[i]->HasMBLayout() ? TensorShape(GetNumParallelSequences(), GetNumTimeSteps()) : TensorShape(1, 1);
|
||||
shapes[i] = shapes[i].Concat(sm);
|
||||
}
|
||||
}
|
||||
// perform operation
|
||||
std::vector<TensorView<ElemType>> tensors;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
tensors.push_back(TensorView<ElemType>(values[i], shapes[i]));
|
||||
return tensors;
|
||||
}
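To illustrate the shape assembly above, here is a hedged sketch on plain dim vectors (standalone helpers mirroring TensorShape::Pad()/Concat(), hypothetical names) for a typical Plus of minibatch data with a bias parameter:

#include <cstddef>
#include <vector>

static std::vector<size_t> PadDims(std::vector<size_t> dims, size_t numDims)
{
    dims.resize(numDims, 1);                                      // append singleton dimensions, like Pad()
    return dims;
}
static std::vector<size_t> ConcatDims(std::vector<size_t> dims, const std::vector<size_t>& other)
{
    dims.insert(dims.end(), other.begin(), other.end());          // like Concat()
    return dims;
}

int main()
{
    // Input 0: minibatch data with MBLayout, sample shape [R]; input 1: bias without MBLayout, [R x 1].
    const size_t R = 512, S = 8, T = 20;                          // rows, parallel sequences, time steps
    const size_t rank = 2;                                        // max sample-shape rank of the operands
    auto dataShape = ConcatDims(PadDims({ R },    rank), { S, T });   // -> R x 1 x S x T
    auto biasShape = ConcatDims(PadDims({ R, 1 }, rank), { 1, 1 });   // -> R x 1 x 1 x 1 (broadcasts)
    auto outShape  = ConcatDims(PadDims({ R },    rank), { S, T });   // -> R x 1 x S x T
    (void)dataShape; (void)biasShape; (void)outShape;
    return 0;
}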
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// others
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -172,6 +230,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template<> std::map<size_t, std::map<size_t, FloatMatrix*>> ComputationNode<float>::s_constOnes{};
|
||||
template<> std::map<size_t, std::map<size_t, DoubleMatrix*>> ComputationNode<double>::s_constOnes{};
|
||||
|
||||
template class ComputationNode<float>;
|
||||
template class ComputationNode<double>;
|
||||
|
||||
template class LearnableParameter<float>;
|
||||
template class LearnableParameter<double>;
|
||||
}}}
|
||||
|
|
|
@ -340,18 +340,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
}
|
||||
// helper functions for common cases
|
||||
private:
|
||||
// determine number of columns from a child and/or layout
|
||||
size_t DetermineNumCols(const ComputationNodeBasePtr & child) const
|
||||
{
|
||||
size_t childCols = child->GetNumCols(); // this is what the child says
|
||||
if (!m_pMBLayout) // no layout: copy from child
|
||||
return childCols;
|
||||
size_t cols = m_pMBLayout->GetNumCols(); // layout: get it from there, but validate against child
|
||||
if (childCols != cols)
|
||||
RuntimeError("%ls %ls operation: Mismatch in number of columns", OperationName().c_str(), NodeName().c_str());
|
||||
return cols;
|
||||
}
|
||||
protected:
|
||||
void ValidateUnaryMap(bool isFinalValidationPass);
|
||||
void ValidateUnaryReduce(bool isFinalValidationPass);
|
||||
|
@ -779,7 +767,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
protected:
|
||||
//std containers such as list and map does not support class reference so we need to use pointer
|
||||
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
|
||||
ComputationNode() { }
|
||||
public:
|
||||
using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters
|
||||
using ComputationNodeBase::SetDims;
|
||||
|
@ -1085,6 +1072,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
|
||||
Matrix<ElemType>& Gradient() { return *m_gradient; }
|
||||
|
||||
std::vector<TensorView<ElemType>> GetTensorsForwardBinary(const FrameRange & fr);
|
||||
|
||||
// Function to return the number of columns for whole batch or single frame
|
||||
size_t GetNumColsFor(const FrameRange & fr/*select frame or entire batch*/)
|
||||
{
|
||||
|
@ -1519,7 +1508,7 @@ protected: \
|
|||
using Base::CreateUniqId; \
|
||||
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
|
||||
using Base::ConstOnes; \
|
||||
using Base::GetImageLayout; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
|
||||
using Base::GetImageLayout; using Base::GetTensorsForwardBinary; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
|
||||
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
|
||||
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
|
||||
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include "Basics.h"
|
||||
#include "Matrix.h"
|
||||
#include "TensorView.h"
|
||||
#include "ComputationNode.h"
|
||||
#include "ConvolutionalNodes.h"
|
||||
|
||||
|
@ -129,6 +130,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
|
||||
{
|
||||
#if 0 // TODO: use #if 0 until this is working
|
||||
auto args = GetTensorsForwardBinary(fr);
|
||||
args[2].DoSumOf(0.0f, args[0], args[1], 1.0f);
|
||||
#else
|
||||
Matrix<ElemType> functionValues = ValueForToDense(fr, false); // Switch to dense as a work-around because ColumnSlice doesn't support all the sparse formats
|
||||
Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
|
||||
Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
|
||||
|
@ -185,6 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
else
|
||||
LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
|
||||
#endif
|
||||
#if DUMPOUTPUT
|
||||
functionValues.Print("PlusNode");
|
||||
#endif
|
||||
|
|
|
@ -9,12 +9,13 @@
|
|||
#include "stdafx.h"
|
||||
#include "Basics.h"
|
||||
#include "File.h"
|
||||
|
||||
#include "CPUMatrix.h"
|
||||
#include "TensorOps.h"
|
||||
#include <assert.h>
|
||||
#include <stdexcept>
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
#include "CPUMatrix.h"
|
||||
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
|
@ -351,7 +352,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -384,7 +385,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
long n = (long)a.GetNumCols(); // note: OpenMP requires loop indices to be long, not size_t
|
||||
long k = (long)a.GetNumRows();
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//memory copy might be faster?
|
||||
|
@ -428,7 +429,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -469,7 +470,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -500,7 +501,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i < m_numRows; i++)
|
||||
{
|
||||
diag(0, (size_t)i) = us(i, i);
|
||||
|
@ -538,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<sliceNumCols; j++)
|
||||
{
|
||||
for (int i = 0; i < inputMatrices.size(); i++)
|
||||
|
@ -575,7 +576,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
long n = (long)a.GetNumCols(), m = (long)a.GetNumRows();
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long q = 0; q < numColRepeats; q++)
|
||||
{
|
||||
for (long p = 0; p < numRowRepeats; p++)
|
||||
|
@ -619,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -685,7 +686,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -719,7 +720,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
else
|
||||
{
|
||||
long m=(long)GetNumElements();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -777,7 +778,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
long m=(long)GetNumRows();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -802,7 +803,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
long m=(long)GetNumRows();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -827,7 +828,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
long m=(long)GetNumRows();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -935,7 +936,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
auto& us = *this;
|
||||
long m=(long)GetNumRows();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -974,7 +975,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
long m=(long)GetNumRows();
|
||||
if (vector.GetNumRows() == 1) //row vector
|
||||
{
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -991,7 +992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
else
|
||||
{
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -1164,7 +1165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
ElemType a0, a1, a2, a3;
|
||||
|
||||
// disable omp here because aveMultiplier needs to be added atomically. However, it seems the result is incorrect even if omp atomic and omp critical are used.
|
||||
//#pragma omp parallel for
|
||||
//#pragma omp parallel for
|
||||
for (long i = 0; i<(n & ~3); i += 4) //four-way unrolling
|
||||
{
|
||||
a[i] += d_v[i] * d_v[i];
|
||||
|
@ -1495,7 +1496,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1596,7 +1597,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1625,7 +1626,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1816,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1853,7 +1854,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
auto& us=*this;
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1921,7 +1922,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
auto& us=*this;
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -1956,7 +1957,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
auto& us=*this;
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
ElemType v = a(0,j);
|
||||
|
@ -1991,7 +1992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
auto& us = *this;
|
||||
|
||||
long m = (long)GetNumRows(), n = (long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<n; j++)
|
||||
{
|
||||
ElemType v = a(0, j);
|
||||
|
@ -2032,7 +2033,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
|
||||
ElemType smallValue = EPS_IN_INVERSE;
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
for (long i=0; i<m; i++)
|
||||
|
@ -2133,7 +2134,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2172,7 +2173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2220,7 +2221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2387,7 +2388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2427,7 +2428,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2467,7 +2468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2620,7 +2621,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
auto& us=*this;
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2660,7 +2661,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
ElemType locTHresholdNeg = -locThresholdPos;
|
||||
|
||||
long m=(long)GetNumRows(), n=(long)GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -2708,7 +2709,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
long m = (long)GetNumElements();
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
|
||||
{
|
||||
if (m_pArray[i] > threshold)
|
||||
|
@ -4304,7 +4305,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (sample_id == 0)
|
||||
sample_prob = -sample_prob;
|
||||
double score_noise = log_num_noise_samples + sample_prob;
|
||||
double z = logadd(score, score_noise);
|
||||
double z = LogAdd(score, score_noise);
|
||||
double logprob = score - z;
|
||||
double logprob_noise = score_noise - z;
|
||||
tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
|
||||
|
@ -4387,7 +4388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
ElemType v = alpha*a(0,0);
|
||||
long m=(long)c.GetNumRows(), n=(long)c.GetNumCols();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j=0; j<n; j++)
|
||||
{
|
||||
//four-way unrolling
|
||||
|
@ -4497,7 +4498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LogicError("AddScaledDifference: Input matrix a is empty.");
|
||||
|
||||
long m=(long)c.GetNumElements();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -4536,7 +4537,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
c.Resize(a.GetNumRows(), a.GetNumCols());
|
||||
|
||||
long m=(long)c.GetNumElements();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(m & ~3); i+=4)
|
||||
{
|
||||
|
@ -4634,7 +4635,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
c.Resize(m,n);
|
||||
|
||||
long size=(long)c.GetNumElements();
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
//four-way unrolling
|
||||
for (long i=0; i<(size & ~3); i+=4)
|
||||
{
|
||||
|
@ -4944,7 +4945,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
bool bHas = false;
|
||||
|
||||
bool isvFinite = std::isfinite(v);
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j < mat.GetNumElements(); j++)
|
||||
{
|
||||
#pragma omp flush(bHas)
|
||||
|
@ -4992,7 +4993,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
|
||||
//#pragma omp parallel for
|
||||
//#pragma omp parallel for
|
||||
|
||||
for (long j = 0; j < n; j++)
|
||||
{
|
||||
|
@ -5247,7 +5248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
//long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
|
||||
long n = (long)GetNumCols(); // a and b are of size (1,n)
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for (long j = 0; j<n; j++)
|
||||
{
|
||||
us(0, j) = a(0, j) * b(0, (j + shift) % n);
|
||||
|
@ -5256,34 +5257,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
return *this;
|
||||
}
|
||||
|
||||
|
||||
#pragma endregion Static BLAS Functions
|
||||
|
||||
double logadd(double x, double y)
|
||||
{
|
||||
double temp, diff, z;
|
||||
|
||||
if (x < y) {
|
||||
temp = x; x = y; y = temp;
|
||||
}
|
||||
diff = y - x;
|
||||
if (diff < MINLOGEXP)
|
||||
{
|
||||
return (x < LSMALL)?LZERO:x;
|
||||
}
|
||||
else
|
||||
{
|
||||
z = exp(diff);
|
||||
return x + log(1.0 + z);
|
||||
}
|
||||
}
|
||||
// 'double' version of LogAdd
|
||||
double LogAddD(double x, double y) { return LogAdd(x, y); }
|
||||
|
||||
template<class ElemType>
|
||||
ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
|
||||
{
|
||||
ElemType fAlpha = (ElemType)LZERO;
|
||||
for (int k = 0; k < GetNumElements(); k++)
|
||||
fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]);
|
||||
fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]);
|
||||
return fAlpha;
|
||||
}
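For reference, a minimal sketch of a numerically stable two-argument LogAdd, equivalent in spirit to the removed logadd() above; the LogAdd actually used now comes from TensorOps.h, so treat this as an illustration only (the MINLOGEXP/LZERO flooring of the old version is omitted):

#include <algorithm>
#include <cmath>

// log(exp(x) + exp(y)) computed without overflow by factoring out the larger argument.
template<class ElemType>
ElemType LogAddSketch(ElemType x, ElemType y)
{
    if (x < y)
        std::swap(x, y);                       // ensure x >= y, so exp(y - x) <= 1
    return x + std::log1p(std::exp(y - x));    // log1p for accuracy when exp(y - x) is tiny
}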
|
||||
|
||||
|
@ -5330,7 +5314,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
fSum = (ElemType)LZERO;
|
||||
for (int j = 0; j < iNumLab; j++)
|
||||
{
|
||||
fSum = (ElemType)logadd((double)fSum, alpha(j, t));
|
||||
fSum = (ElemType)LogAddD(fSum, alpha(j, t));
|
||||
}
|
||||
|
||||
fTmp = alpha(k, t) - fSum;
|
||||
|
@ -5343,10 +5327,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
fSum = (ElemType)LZERO;
|
||||
for (int m = 0; m < iNumLab; m++)
|
||||
{
|
||||
fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m));
|
||||
fSum = (ElemType)LogAddD(fSum, alpha(m, t) + pair_scores(j, m));
|
||||
}
|
||||
|
||||
fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
|
||||
fTmp = (ElemType)LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
|
||||
}
|
||||
beta(k, t) = fTmp;
|
||||
}
|
||||
|
@ -5455,7 +5439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
else{
|
||||
fTmp2 = a(k, 0);
|
||||
}
|
||||
fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k));
|
||||
fSum = (ElemType)LogAddD(fSum, fTmp2 + pair_scores(j, k));
|
||||
}
|
||||
|
||||
fTmp -= fSum;
|
||||
|
@ -5533,7 +5517,259 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
return numThreads;
|
||||
}
|
||||
|
||||
// The explicit instantiation part
|
||||
// -----------------------------------------------------------------------
|
||||
// TensorView support
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// To save time, this makes extensive use of templates and macros.
|
||||
|
||||
// perform loop over reduction index m
|
||||
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
|
||||
template<class ElemType, typename OPFN, size_t N, int m>
|
||||
struct TensorOpReduction
|
||||
{
|
||||
// reduction case (non-reduction case is specialized)
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
|
||||
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = reducingStrides[i][(size_t)m];
|
||||
ElemType aggregate = 0;
|
||||
for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;)
|
||||
{
|
||||
// need to descend into one loop deeper
|
||||
aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N - 1; i++)
|
||||
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
|
||||
}
|
||||
return aggregate;
|
||||
}
|
||||
};
|
||||
|
||||
// perform loop over reduction index m
|
||||
// This is the specialized version for m = -1, which terminates the recursion.
|
||||
template<class ElemType, typename OPFN, size_t N>
|
||||
struct TensorOpReduction<ElemType, OPFN, N, -1>
|
||||
{
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
|
||||
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &)
|
||||
{
|
||||
return opfn(pointers); // finally we are doing some work!!!
|
||||
}
|
||||
};
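To see what the recursion unrolls to, consider a reduction over two dimensions (m = 1) with N = 2 operands (one input plus the output). A hand-expanded, non-template sketch of the equivalent runtime logic:

#include <array>
#include <cstddef>
#include <vector>

template<class ElemType, typename OPFN>
ElemType ReduceTwoDimsSketch(std::array<ElemType*, 2> pointers, const OPFN& opfn,
                             const std::vector<size_t>& reducingOpDims,
                             const std::array<std::vector<ptrdiff_t>, 2>& reducingStrides)
{
    ElemType aggregate = 0;
    for (size_t d1 = 0; d1 < reducingOpDims[1]; d1++)        // outer reduction dimension (m = 1)
    {
        auto inner = pointers;                               // the recursion works on a copy of the pointers
        for (size_t d0 = 0; d0 < reducingOpDims[0]; d0++)    // inner reduction dimension (m = 0)
        {
            aggregate += opfn(inner);                        // m = -1 specialization: apply the op
            inner[0] += reducingStrides[0][0];               // advance the input; output pointer untouched
        }
        pointers[0] += reducingStrides[0][1];
    }
    return aggregate;
}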
|
||||
|
||||
// perform loop over regular index k and reducing index m for N operands (counting the output)
|
||||
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
|
||||
struct TensorOpIteration
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
// non-scalar case: still nested result loops left
|
||||
array<ptrdiff_t, N> strides;
|
||||
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = regularStrides[i][(size_t)k];
|
||||
for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;)
|
||||
{
|
||||
// need to descend into one loop deeper
|
||||
TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N; i++)
|
||||
pointers[i] += strides[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
|
||||
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
|
||||
template<class ElemType, typename OPFN>
|
||||
struct TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
|
||||
{
|
||||
ElemType* pa = pointers[0];
|
||||
ElemType* pb = pointers[1];
|
||||
ElemType* pc = pointers[2];
|
||||
size_t K = regularOpDims[0];
|
||||
// special-case beta and alpha to allow the compiler to short-circuit it
|
||||
if (beta != 0)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else if (alpha != 1)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
// TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend
|
||||
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
|
||||
}
|
||||
};
|
||||
// and unary
|
||||
template<class ElemType, typename OPFN>
|
||||
struct TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
|
||||
{
|
||||
ElemType* pa = pointers[0];
|
||||
ElemType* pb = pointers[1];
|
||||
size_t K = regularOpDims[0];
|
||||
// special-case beta and alpha to allow the compiler to short-circuit it
|
||||
if (beta != 0)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else if (alpha != 1)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int)K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
};
|
||||
|
||||
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
|
||||
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
// we are at element level for the result: perform the op (there may still be reduction)
|
||||
ElemType val = alpha * TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
|
||||
// combine with previous value in target matrix, then write it out
|
||||
auto * pout = pointers.back();
|
||||
if (beta != 0)
|
||||
val += beta * *pout;
|
||||
*pout = val;
|
||||
return;
|
||||
}
|
||||
};
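In scalar terms, each output element ends up as out = beta * out + alpha * (reduced op result), with the beta term skipped when beta == 0 so that uninitialized output memory is never read; a one-function restatement (illustrative helper, not CNTK API):

template<class ElemType>
inline void CombineAndWriteSketch(ElemType beta, ElemType alpha, ElemType reduced, ElemType* pout)
{
    ElemType val = alpha * reduced;            // alpha scales the (possibly reduced) op result
    if (beta != 0)                             // beta == 0: do not read the (possibly uninitialized) target
        val += beta * *pout;
    *pout = val;                               // out = beta * out + alpha * reduced
}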
|
||||
|
||||
// tensor operation with k+1 dimensions (-1 means scalar)
|
||||
template<class ElemType, typename OPFN, size_t N, int k>
|
||||
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
size_t dims = reducingOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 2: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0:
|
||||
{
|
||||
// if the leading stride of every operand is 1, the compiler can vectorize the innermost loop (see the special versions above)
|
||||
bool leadingAllOne = true;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
|
||||
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
|
||||
return TensorOpIteration<ElemType, OPFN, N, true/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor operation, generalized in number of arguments, operation already provided as a lambda
|
||||
// This function now expands into different k.
|
||||
template<class ElemType, typename OPFN, size_t N>
|
||||
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
|
||||
const std::array<size_t, N> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
pointers[i] += offsets[i];
|
||||
size_t dims = regularOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 4: return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 3: return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 2: return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0: return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
|
||||
}
|
||||
}
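// For a plain elementwise op over dense, identically-shaped matrices, the shape preparation in
// TensorView.cpp (PrepareTensorOperands, further down in this diff) flattens everything into a
// single regular dimension with no reduction, so the dispatch above takes case 1,
// TensorOpWithRegularLoop takes case 0, and the work ends up in a vectorizable TensorOpIteration
// specialization such as the OpenMP-parallelized one shown above for the unary, two-pointer case.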
|
||||
|
||||
// perform unary operation 'op' on 'a', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template<class ElemType>
|
||||
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 2> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
|
||||
{
|
||||
#define CaseUnaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op ## oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2> & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 2> pointers = { a.m_pArray, m_pArray };
|
||||
switch (op)
|
||||
{
|
||||
ForAllUnaryOps(CaseUnaryTensorOp);
|
||||
default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op);
|
||||
}
|
||||
}
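// For reference, ForAllUnaryOps(CaseUnaryTensorOp) expands into one case label per unary op;
// e.g. the Sqrt case is equivalent to:
//
//     case ElementWiseOperator::opSqrt:
//         return TensorOpWithFn(beta, pointers, alpha,
//                               [](const array<ElemType*, 2> & pp) { return OpSqrt((*(pp[0]))); },
//                               offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);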
|
||||
|
||||
// perform binary operation 'op' on 'a' and 'b', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template<class ElemType>
|
||||
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 3> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
|
||||
{
|
||||
#define CaseBinaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op ## oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 3> pointers = { a.m_pArray, b.m_pArray, m_pArray };
|
||||
switch (op)
|
||||
{
|
||||
ForAllBinaryOps(CaseBinaryTensorOp);
|
||||
default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op);
|
||||
}
|
||||
}
|
||||
|
||||
// perform ternary operation 'op' on 'a', 'b', and 'c', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template<class ElemType>
|
||||
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 4> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides)
|
||||
{
|
||||
#define CaseTernaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op ## oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 4> pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray };
|
||||
switch (op)
|
||||
{
|
||||
ForAllTernaryOps(CaseTernaryTensorOp);
|
||||
default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// explicit instantiations
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template class MATH_API CPUMatrix<float>;
|
||||
template class MATH_API CPUMatrix<double>;
|
||||
|
||||
|
@ -5551,5 +5787,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template void CPUMatrix<char>::SetValue(const char);
|
||||
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags);
|
||||
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
|
||||
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
|
||||
|
||||
}}}
|
||||
|
|
|
@ -334,6 +334,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
static bool AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
|
||||
|
||||
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
|
||||
|
||||
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 2> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
|
||||
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 3> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
|
||||
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 4> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
|
||||
|
||||
static CPUMatrix<ElemType> Ones(const size_t rows, const size_t cols);
|
||||
static CPUMatrix<ElemType> Zeros(const size_t rows, const size_t cols);
|
||||
|
|
|
@ -41,6 +41,51 @@ MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ElementWiseOperator -- This enum represents which function to apply.
|
||||
// This is shared between all matrix types and tensors.
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
enum ElementWiseOperator
|
||||
{
|
||||
// unary (or binary with constant parameter)
|
||||
opCopy,
|
||||
opNegate, opNot,
|
||||
opAbs,
|
||||
opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
|
||||
// these are not implemented yet:
|
||||
opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
|
||||
// binary
|
||||
opSum, opDifference, opElementWiseProduct, opElementWiseQuotient,
|
||||
opLogSum, opMax, opMin,
|
||||
opEQ, opNE, opGT, opLT, opGE, opLE,
|
||||
// ternary
|
||||
opCond
|
||||
// Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
|
||||
};
|
||||
|
||||
// helper to apply a C macro for all operations of each kind
|
||||
#define ForAllUnaryOps(Macro) \
|
||||
Macro(Copy); \
|
||||
Macro(Negate); Macro(Not); \
|
||||
Macro(Abs); \
|
||||
Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);
|
||||
|
||||
#define ForAllParameterizedUnaryOps(Macro) \
|
||||
Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
|
||||
|
||||
#define ForAllBinaryOps(Macro) \
|
||||
Macro(Sum); Macro(Difference); Macro(ElementWiseProduct); Macro(ElementWiseQuotient); \
|
||||
Macro(LogSum); Macro(Max); Macro(Min); \
|
||||
Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE);
|
||||
|
||||
#define ForAllTernaryOps(Macro) \
|
||||
Macro(Cond);
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// various enums to describe
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
enum MatrixFlagBitPosition
|
||||
{
|
||||
bitPosRowMajor = 0, // row major matrix
|
||||
|
@ -76,6 +121,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
matrixFlagSetValueOnDevice = 1<<bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// BaseMatrix -- base class for all matrix types (CPU, GPU) x (dense, sparse)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
class BaseMatrix
|
||||
|
|
|
@ -71,16 +71,6 @@ namespace Microsoft {
|
|||
};
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ElementWiseOperator -- This enum represents which function to apply. It needs to be outside of GPUMatrix, because it is also used in GPUSparseMatrix
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
enum ElementWiseOperator
|
||||
{
|
||||
opSigmoid = 0, opTanh, opSqrt, opExp, opLog, opAbs, opLinearRectifierDerivative, opCosine, opNegativeSine, opSigmoidDerivative
|
||||
};
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// GPUMatrix
|
||||
// -----------------------------------------------------------------------
|
||||
|
|
|
@ -162,6 +162,7 @@
|
|||
<ClInclude Include="CommonMatrix.h" />
|
||||
<ClInclude Include="ConvolutionEngine.h" />
|
||||
<ClInclude Include="CPUMatrix.h" />
|
||||
<ClInclude Include="TensorOps.h" />
|
||||
<ClInclude Include="TensorView.h" />
|
||||
<None Include="ClassDiagram.cd" />
|
||||
<None Include="GPUWatcher.cu" />
|
||||
|
|
|
@ -70,6 +70,9 @@
|
|||
<ClInclude Include="TensorView.h">
|
||||
<Filter>Tensors</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="TensorOps.h">
|
||||
<Filter>Tensors</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="GPUMatrix.h">
|
||||
|
|
Diff not shown because the file is too large.
|
@ -6,9 +6,8 @@
|
|||
|
||||
// TODO:
|
||||
// - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it
|
||||
// - Resize() must be cheap if it does nothing (I already did that for CPU, still to be done for GPU)
|
||||
// - an overload for Resize() to match another matrix
|
||||
// - need a way to grow a minibatch matrix without destroying its content, something like PushColumns()
|
||||
// - Resize() must be cheap if it does nothing (I already did that for CPU; already done for GPU?)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Basics.h"
|
||||
|
@ -16,11 +15,12 @@
|
|||
#include "CommonMatrix.h"
|
||||
#include <limits.h>
|
||||
#include <memory> // for shared_ptr
|
||||
#include <array>
|
||||
#include <initializer_list>
|
||||
|
||||
// This class is exported from the Math.dll
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
||||
enum CurrentDataLocation
|
||||
{
|
||||
NONE, CPU, GPU, BOTH
|
||||
|
@ -73,6 +73,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
|
||||
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
|
||||
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
|
||||
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& d);
|
||||
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
|
||||
|
||||
public:
|
||||
|
@ -168,6 +169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
|
||||
|
||||
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow
|
||||
void Resize(const Matrix<ElemType>& other) { Resize(other.GetNumRows(), other.GetNumCols()); }
|
||||
void VerifySize(size_t rows, size_t cols)
|
||||
{
|
||||
m_baseMatrix->VerifySize(rows, cols);
|
||||
|
@ -200,6 +202,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR);
|
||||
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType *pArray, const size_t matrixFlags = matrixFlagNormal);
|
||||
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
|
||||
void SetValue(const size_t numRows, const size_t numCols, std::initializer_list<ElemType> l) { std::vector<ElemType> vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); } // SetValue(2,3, {1,2,3, 4,5,6});
|
||||
static ElemType MakeNan(size_t payload);
|
||||
void Invalidate() { SetValue(MakeNan(__LINE__)); }
|
||||
void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE *h_CSCCol, const CPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
|
||||
|
@ -376,7 +379,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
|
||||
void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
|
||||
|
||||
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);
|
||||
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);
|
||||
|
||||
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first
|
||||
|
||||
|
@ -458,6 +461,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
static bool HasElement(const Matrix<ElemType>& a, const ElemType value = 0.0);
|
||||
|
||||
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);
|
||||
|
||||
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 2> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
|
||||
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 3> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
|
||||
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
|
||||
const std::array<size_t, 4> & offsets,
|
||||
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
|
||||
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
|
||||
public:
|
||||
void Read(File& stream);
|
||||
void Write(File& stream) const;
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
//
|
||||
// <copyright file="TensorView.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
|
||||
// This implements the elementwise tensor operations, including helper macros and some actual functions.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Basics.h"
|
||||
#include "CommonMatrix.h"
|
||||
|
||||
#pragma push_macro("TENSOR_OPS_DECL")
|
||||
#ifndef TENSOR_OPS_DECL // to make these accessible to CUDA kernels, say '#define TENSOR_OPS_DECL __device__ __host__'
|
||||
#define TENSOR_OPS_DECL
|
||||
#endif
|
||||
|
||||
#pragma push_macro("DECL")
|
||||
#define DECL static inline TENSOR_OPS_DECL
|
||||
|
||||
// This class is exported from the Math.dll.
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// unified overloads for float/double math functions
|
||||
//
|
||||
// Declare float and double versions of the functions f we need as f_(),
|
||||
// e.g. exp_ -> exp(double), expf(float).
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
#pragma push_macro("OverloadUnaryMathFns")
|
||||
#define OverloadUnaryMathFns(func) \
|
||||
DECL float func ## _(float arg) { return func ## f(arg); } \
|
||||
DECL double func ## _(double arg) { return func(arg); }
|
||||
|
||||
OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
|
||||
OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
|
||||
OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
|
||||
#pragma push_macro("OverloadUnaryMathFns")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// additional functions that are standard in our context
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
DECL ElemType Sigmoid(ElemType z)
|
||||
{
|
||||
if (z >= 0)
|
||||
return 1 / (1 + exp_(-z));
|
||||
else
|
||||
{
|
||||
ElemType v = exp_(z);
|
||||
return v / (1 + v);
|
||||
}
|
||||
}
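// Both branches compute the same mathematical function; splitting on the sign of z keeps the
// argument of exp_() non-positive, so the intermediate exponential cannot overflow for large |z|.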
|
||||
|
||||
template<class ElemType>
|
||||
DECL ElemType SigmoidDerivative(ElemType z)
|
||||
{
|
||||
ElemType v = Sigmoid(z);
|
||||
return v * (1 - v);
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
DECL ElemType LinearRectifierDerivative(ElemType z)
|
||||
{
|
||||
return z > 0 ? (ElemType)1 : 0;
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
DECL ElemType Sqrt(ElemType z)
|
||||
{
|
||||
// BUGBUG: Why clip to 0? An invalid sqrt() should show up as a NaN in the result, instead of hiding it.
|
||||
return sqrt_(z > 0 ? z : 0);
|
||||
}
|
||||
|
||||
// TODO: call this LogAdd() for consistency
|
||||
template<typename ElemType>
|
||||
DECL ElemType LogAdd(ElemType x, ElemType y)
|
||||
{
|
||||
if (x < y)
|
||||
{
|
||||
ElemType temp = x; x = y; y = temp;
|
||||
}
|
||||
ElemType diff = y - x;
|
||||
if (diff < (ElemType)MINLOGEXP)
|
||||
{
|
||||
return (x < (ElemType)LSMALL) ? (ElemType)LZERO : x;
|
||||
}
|
||||
else
|
||||
{
|
||||
ElemType z = exp_(diff);
|
||||
return x + log_((ElemType)1.0 + z);
|
||||
}
|
||||
}
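// e.g. LogAdd(1000, 999) computes diff = -1 and returns 1000 + log(1 + exp(-1)) ~= 1000.3133,
// whereas evaluating log(exp(1000) + exp(999)) directly would overflow in double precision.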
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ElementWiseOperator implementations
|
||||
//
|
||||
// Define a static function for every ElementWiseOperator (CommonMatrix.h).
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
#pragma push_macro("DefUnaryOp")
|
||||
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
|
||||
|
||||
DefUnaryOp(Copy, a);
|
||||
DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
|
||||
DefUnaryOp(Abs, fabs_(a));
|
||||
DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
|
||||
#pragma pop_macro("DefUnaryOp")
|
||||
|
||||
// parameterized unary ops
|
||||
//DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
|
||||
|
||||
#pragma push_macro("DefBinaryOp")
|
||||
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
|
||||
|
||||
DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b);
|
||||
DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
|
||||
DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
|
||||
#pragma pop_macro("DefBinaryOp")
|
||||
|
||||
#pragma push_macro("DefTernaryOp")
|
||||
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
|
||||
|
||||
DefTernaryOp(Cond, a ? b : c);
|
||||
#pragma pop_macro("DefTernaryOp")
|
||||
|
||||
}}}
|
||||
#pragma pop_macro("DECL")
|
||||
#pragma pop_macro("TENSOR_OPS_DECL")
|
|
@ -26,11 +26,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// construction
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
// cast a matrix as a tensor
|
||||
// cast a matrix as a TensorView
|
||||
template<class ElemType>
|
||||
TensorView<ElemType>::TensorView(Matrix<ElemType> & sob) :
|
||||
m_sob(sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
|
||||
m_sob(&sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
|
||||
{ }
|
||||
// reshape a TensorView
|
||||
template<class ElemType>
|
||||
TensorView<ElemType>::TensorView(const TensorView<ElemType> & other, const TensorShape & shape) :
|
||||
m_sob(other.m_sob), m_shape(shape)
|
||||
|
@ -40,14 +41,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// TODO: Use the multipliers instead?
|
||||
size_t i;
|
||||
size_t rowDim = 1;
|
||||
for (i = 0; i < m_shape.size() && rowDim < m_sob.GetNumRows(); i++)
|
||||
for (i = 0; i < m_shape.size() && rowDim < m_sob->GetNumRows(); i++)
|
||||
rowDim *= m_shape[i];
|
||||
// first i dimensions match matrix row dimension
|
||||
size_t colDim = 1;
|
||||
for (; i < m_shape.size(); i++)
|
||||
colDim *= m_shape[i];
|
||||
if (rowDim != m_sob.GetNumRows() || colDim != m_sob.GetNumCols())
|
||||
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob.GetNumRows(), (int)m_sob.GetNumCols());
|
||||
if (rowDim != m_sob->GetNumRows() || colDim != m_sob->GetNumCols())
|
||||
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob->GetNumRows(), (int)m_sob->GetNumCols());
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
|
@ -56,96 +57,168 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match?
|
||||
|
||||
template<class ElemType>
|
||||
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/)
|
||||
template<class ElemType, size_t N>
|
||||
static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N> & offsets,
|
||||
vector<size_t> & regularOpDims,
|
||||
array<vector<ptrdiff_t>, N> & regularStrides,
|
||||
vector<size_t> & reducingOpDims,
|
||||
array<vector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
TensorView & c = *this;
|
||||
|
||||
// TODO: Turn the inner meat here into a function template using a std::array<., N-nariness>. Nullary ops are generators, e.g. constants.
|
||||
|
||||
// massage TensorShapes
|
||||
// Note that TensorShapes here may be shapes are stored or shapes with stride magic applied.
|
||||
auto as = a.GetShape().GetDims();
|
||||
auto bs = b.GetShape().GetDims();
|
||||
auto cs = c.GetShape().GetDims();
|
||||
|
||||
// expand ones to make tensors compatible
|
||||
// Trailing dimensions broadcast.
|
||||
// E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns.
|
||||
// To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T).
|
||||
auto dims = max(max(as.size(), bs.size()), cs.size());
|
||||
as.resize(dims, 1);
|
||||
bs.resize(dims, 1);
|
||||
cs.resize(dims, 1);
|
||||
size_t dims = 0;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
if (dims < shapes[i].GetNumDims())
|
||||
dims = shapes[i].GetNumDims();
|
||||
for (size_t i = 0; i < N; i++)
|
||||
shapes[i] = shapes[i].Pad(dims);
|
||||
|
||||
// determine operation shape (max over all dimensions)
|
||||
decltype(as) os(dims);
|
||||
vector<size_t> opDims(dims, 0);
|
||||
for (size_t k = 0; k < dims; k++)
|
||||
os[k] = max(max(as[k], bs[k]), cs[k]);
|
||||
for (size_t i = 0; i < N; i++)
|
||||
opDims[k] = max(opDims[k], shapes[i][k]);
|
||||
|
||||
// dimension compatibility check
|
||||
// Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension.
|
||||
for (size_t k = 0; k < dims; k++)
|
||||
{
|
||||
if (!Matches(as[k], os[k]) || !Matches(bs[k], os[k]) || !Matches(cs[k], os[k]))
|
||||
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between the two inputs and output (%d vs. %d vs. %d)", (int)dims, (int)as[k], (int)bs[k], (int)cs[k]);
|
||||
}
|
||||
for (size_t i = 0; i < N; i++)
|
||||
if (!Matches(shapes[i][k], opDims[k]))
|
||||
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between input %d and output (%s vs. %s)", (int)k, (int)shapes[i][k], string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str());
|
||||
|
||||
// flatten consecutive dimensions
|
||||
// Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions.
|
||||
// After this, as, bs, and cs no longer match the TensorShape objects.
|
||||
//fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
|
||||
for (size_t k = 1; k < dims; k++)
|
||||
{
|
||||
// check if stored without gaps to skip
|
||||
if (!a.GetShape().CanFlatten(k) || !b.GetShape().CanFlatten(k) || !c.GetShape().CanFlatten(k))
|
||||
continue;
|
||||
// check if they are either all broadcasting or all not broadcasting
|
||||
if ((as[k] != os[k] || as[k - 1] != os[k - 1]) && (as[k] != 1 || as[k - 1] != 1))
|
||||
continue;
|
||||
if ((bs[k] != os[k] || bs[k - 1] != os[k - 1]) && (bs[k] != 1 || bs[k - 1] != 1))
|
||||
continue;
|
||||
if ((cs[k] != os[k] || cs[k - 1] != os[k - 1]) && (cs[k] != 1 || cs[k - 1] != 1))
|
||||
continue;
|
||||
// merge the dimensions
|
||||
as[k] *= as[k - 1]; as[k - 1] = 1;
|
||||
bs[k] *= bs[k - 1]; bs[k - 1] = 1;
|
||||
cs[k] *= cs[k - 1]; cs[k - 1] = 1;
|
||||
// BUGBUG: Must update multipliers as well
|
||||
for (size_t i = 0; i < N; i++)
|
||||
{
|
||||
// check if stored without gaps to skip
|
||||
if (!shapes[i].CanFlatten(k))
|
||||
goto nope;
|
||||
// check if they are either all broadcasting or all not broadcasting
|
||||
if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1))
|
||||
goto nope;
|
||||
}
|
||||
// these dimensions can be merged
|
||||
for (size_t i = 0; i < N; i++)
|
||||
shapes[i] = shapes[i].Flatten(k); // TODO: overdoing the immutable thingy much?
|
||||
opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh)
|
||||
nope:;
|
||||
}
|
||||
//fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
|
||||
|
||||
// remove singleton dimensions
|
||||
size_t j = 0;
|
||||
vector<bool> toDrop(dims, false);
|
||||
for (size_t k = 0; k < dims; k++)
|
||||
{
|
||||
if (as[k] == 1 && bs[k] == 1 && cs[k] == 1) // skip all-singleton dimensions
|
||||
continue;
|
||||
as[j] = as[k];
|
||||
bs[j] = bs[k];
|
||||
cs[j] = cs[k];
|
||||
os[j] = os[k];
|
||||
j++;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
if (shapes[i][k] != 1)
|
||||
goto neither;
|
||||
toDrop[k] = true; // found an all-singleton dimension
|
||||
neither:;
|
||||
}
|
||||
// note: if op is a scalar, then we end up with 0 dimensions here
|
||||
dims = j;
|
||||
as.resize(dims);
|
||||
bs.resize(dims);
|
||||
cs.resize(dims);
|
||||
os.resize(dims);
|
||||
let as1 = TensorShape(as); // BUGBUG: We just lost stride info.
|
||||
let bs1 = TensorShape(bs);
|
||||
let cs1 = TensorShape(cs);
|
||||
let os1 = TensorShape(os);
|
||||
for (size_t i = 0; i < N; i++)
|
||||
shapes[i] = shapes[i].DropDims(toDrop);
|
||||
opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh)
|
||||
dims = opDims.size(); // #dims has changed
|
||||
for (size_t i = 0; i < N; i++)
|
||||
assert(dims == shapes[i].size());
|
||||
// note: if op is a scalar, then we end up with 0 dimensions here, which is allowed
|
||||
//fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
|
||||
|
||||
// determine broadcasting; that is, set strides to 0 for 1-dimensions
|
||||
// To be more precise, we should only set actually broadcasting dimensions to 0.
|
||||
// But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting.
|
||||
// TODO: Do we need to allow other strides at this point in time? If not, broadcasting becomes a bit vector.
|
||||
for (size_t i = 0; i < N; i++)
|
||||
shapes[i] = shapes[i].WithBroadcastStrides();
|
||||
|
||||
//fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
|
||||
|
||||
// determine inverse broadcasting dimensions
|
||||
// TODO: describe the resulting for loop as a set of tensor dims and strides as well.
|
||||
vector<bool> cBroadcasts(dims);
|
||||
// Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index.
|
||||
// For regular input dims:
|
||||
// - determine number of steps (product over opDims[.])
|
||||
// - launch that many kernels
|
||||
// - pass in:
|
||||
// - total number of steps
|
||||
// - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions
|
||||
// - opDim (no stride magic allowed) for regular broadcasting dimensions
|
||||
// - reverse broadcasting dimensions
|
||||
// - opcodes for elementwise op and reduction op
|
||||
// - in each kernel:
|
||||
// - map thread index to dimensions (regular broadcasting ones)
|
||||
// - for-loop over inverse broadcasting dimensions
|
||||
// - map dimensions (including inverse broadcasting) for every input
|
||||
// - perform op on the input values
|
||||
// - accumulate
|
||||
// - map dimensions (regular) for output
|
||||
// - save result
|
||||
|
||||
// separate out the inverse-broadcasting dimensions
|
||||
// Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension
|
||||
// in one of the inputs, otherwise the entire dimension would have been optimized away above.
|
||||
vector<bool> isReducingDim(dims); // true for each inverse-broadcasting dimension
|
||||
for (size_t k = 0; k < dims; k++)
|
||||
cBroadcasts[k] = cs1[k] == 1 && (as1[k] != 1 || bs1[k] != 1);
|
||||
isReducingDim[k] = shapes.back()[k] == 1;
|
||||
|
||||
// form the regular (non-inverse-broadcasting) dims
|
||||
for (size_t i = 0; i < N; i++)
|
||||
regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides();
|
||||
regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh)
|
||||
|
||||
// form the inverse-broadcasting dims
|
||||
vector<bool> isRegularDim(dims); // true for each regular (non-reducing) dimension
|
||||
for (size_t k = 0; k < dims; k++)
|
||||
isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?)
|
||||
for (size_t i = 0; i < N; i++)
|
||||
reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides();
|
||||
reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh)
|
||||
|
||||
for (size_t i = 0; i < N; i++)
|
||||
offsets[i] = shapes[i].GetOffset();
|
||||
}
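// Worked example (values assume the dense column-major layout created by TensorView(Matrix&)):
// adding a broadcast column a: [13 x 1] to b: [13 x 21] with output c: [13 x 21] gives
// opDims = {13, 21}; nothing flattens (a broadcasts in the second dimension only) and nothing is
// dropped or reduced, so regularOpDims = {13, 21}, regularStrides = { {1, 0}, {1, 13}, {1, 13} },
// and reducingOpDims stays empty. Conversely, reducing b: [13 x 21] into c: [13 x 1] marks the
// second dimension as reducing, giving regularOpDims = {13} and reducingOpDims = {21}.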
|
||||
|
||||
template<class ElemType>
|
||||
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
|
||||
{
|
||||
// prepare all tensor descriptor information as needed for execution
|
||||
array<size_t, 2> offsets;
|
||||
array<vector<ptrdiff_t>, 2> regularStrides, reducingStrides;
|
||||
vector<size_t> regularOpDims, reducingOpDims;
|
||||
PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
|
||||
// now perform the operation
|
||||
fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(as1).c_str(), string(bs1).c_str(), string(cs1).c_str(), string(os1).c_str());
|
||||
// :)
|
||||
beta; alpha;
|
||||
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op)
|
||||
{
|
||||
array<size_t, 3> offsets;
|
||||
array<vector<ptrdiff_t>, 3> regularStrides, reducingStrides;
|
||||
vector<size_t> regularOpDims, reducingOpDims;
|
||||
PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
|
||||
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op)
|
||||
{
|
||||
array<size_t, 4> offsets;
|
||||
array<vector<ptrdiff_t>, 4> regularStrides, reducingStrides;
|
||||
vector<size_t> regularOpDims, reducingOpDims;
|
||||
PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
|
||||
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
|
||||
// simple test function for testing stuff
|
||||
|
@ -153,16 +226,67 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template<class ElemType>
|
||||
/*static*/ void TensorView<ElemType>::Test()
|
||||
{
|
||||
Matrix<ElemType> m1(0); m1.Resize(1, 42);
|
||||
Matrix<ElemType> m2(0); m2.Resize(13, 1);
|
||||
Matrix<ElemType> m3(0); m3.Resize(13, 21);
|
||||
TensorShape s1(1, 2, 21);
|
||||
TensorShape s2(13, 1);
|
||||
TensorShape s3(13, 1, 21);
|
||||
let t1 = TensorView<ElemType>(m1, s1); t1;
|
||||
let t2 = TensorView<ElemType>(m2, s2); t2;
|
||||
auto t3 = TensorView<ElemType>(m3, s3); t3;
|
||||
t3.DoSumOf(0, t1, t2, 1);
|
||||
Matrix<ElemType> m1(-1);
|
||||
Matrix<ElemType> m2(-1);
|
||||
Matrix<ElemType> m3(-1);
|
||||
{
|
||||
m1.SetValue(5, 3, { 1, 2, 3,
|
||||
14, 15, 6,
|
||||
4, 5, 16,
|
||||
41, 5, 1,
|
||||
1.8, 4.5, 7 });
|
||||
m2.SetValue(5, 1, { 42,
|
||||
13,
|
||||
1968,
|
||||
3.1415f,
|
||||
7 });
|
||||
|
||||
m3.Resize(m1);
|
||||
|
||||
// regular zip (just add m1 to itself)
|
||||
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1);
|
||||
m3.Print();
|
||||
|
||||
// unary op
|
||||
TensorView(m3).DoSqrtOf(0, TensorView(m1), 1);
|
||||
m3.Print();
|
||||
|
||||
// broadcasting of an input
|
||||
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
|
||||
TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
|
||||
TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
|
||||
// reduction over columns
|
||||
m3.Resize(5, 1);
|
||||
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
|
||||
// reduction over rows
|
||||
m3.Resize(1, 3);
|
||||
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
|
||||
TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1);
|
||||
m3.Print();
|
||||
}
|
||||
{
|
||||
m1.Resize(1, 42);
|
||||
m2.Resize(13, 1);
|
||||
m3.Resize(13, 21);
|
||||
TensorShape s1(1, 2, 21);
|
||||
TensorShape s2(13, 1);
|
||||
TensorShape s3(13, 1, 21);
|
||||
let t1 = TensorView<ElemType>(m1, s1); t1;
|
||||
let t2 = TensorView<ElemType>(m2, s2); t2;
|
||||
auto t3 = TensorView<ElemType>(m3, s3); t3;
|
||||
t3.DoSumOf(0, t1, t2, 1);
|
||||
m3.Print();
|
||||
}
|
||||
}
|
||||
|
||||
template class TensorView<float>;
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
// </copyright>
|
||||
//
|
||||
|
||||
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor.
|
||||
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. [fseide]
|
||||
|
||||
#pragma once
|
||||
|
||||
|
@ -36,17 +36,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{ }
|
||||
// copy constructor
|
||||
TensorView(const TensorView<ElemType> & other) :
|
||||
TensorView(other.m_sob, other.m_shape)
|
||||
TensorView(*other.m_sob, other.m_shape)
|
||||
{ }
|
||||
// assignment is forbidden since we contain a reference
|
||||
// If you ever need this, change the reference to a pointer.
|
||||
void operator=(const TensorView & other) = delete; // since we have a reference
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
// accessors
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
const Matrix<ElemType> & GetSOB() const { return m_sob; }
|
||||
Matrix<ElemType> & GetSOB() const { return *m_sob; }
|
||||
const TensorShape & GetShape() const { return m_shape; }
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
|
@ -59,20 +56,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, 0); }
|
||||
#pragma push_macro("DeclareUnaryTensorOp")
|
||||
#define DeclareUnaryTensorOp(oper) \
|
||||
void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); }
|
||||
|
||||
ForAllUnaryOps(DeclareUnaryTensorOp);
|
||||
ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
|
||||
//DeclareUnaryTensorOp(Copy);
|
||||
//DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not);
|
||||
//DeclareUnaryTensorOp(Abs);
|
||||
//DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine);
|
||||
//DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha);
|
||||
#pragma pop_macro("DeclareUnaryTensorOp")
|
||||
|
||||
#pragma push_macro("DeclareBinaryTensorOp")
|
||||
#define DeclareBinaryTensorOp(oper) \
|
||||
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); }
|
||||
|
||||
ForAllBinaryOps(DeclareBinaryTensorOp);
|
||||
//DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient);
|
||||
//DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min);
|
||||
//DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE);
|
||||
#pragma pop_macro("DeclareBinaryTensorOp")
|
||||
|
||||
#pragma push_macro("DeclareTernaryTensorOp")
|
||||
#define DeclareTernaryTensorOp(oper) \
|
||||
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); }
|
||||
|
||||
ForAllTernaryOps(DeclareTernaryTensorOp);
|
||||
#pragma pop_macro("DeclareTernaryTensorOp")
|
||||
|
||||
static void Test();
|
||||
|
||||
private:
|
||||
|
||||
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/);
|
||||
void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
|
||||
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
|
||||
void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
// sob members
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
Matrix<ElemType> & m_sob; // Storage OBject that holds the data that is being viewed with this TensorView
|
||||
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
|
||||
Matrix<ElemType> * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable.
|
||||
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
|
||||
// TODO: use a reference here or not? With a reference, we can hide more info in here such as cuDNN handles
|
||||
};
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <sstream> // TODO: this should go away once we update the parameter parsing
|
||||
#include <unordered_map>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <omp.h>
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -400,6 +401,10 @@ void ImageReader<ElemType>::InitFromConfig(const ConfigRecordType& config)
|
|||
|
||||
m_prefetch = config(L"prefetch", true);
|
||||
|
||||
int cthread = config(L"numCPUThreads", 0);
|
||||
if (cthread > 0)
|
||||
omp_set_num_threads(cthread);
|
||||
|
||||
m_epochStart = 0;
|
||||
m_mbStart = 0;
|
||||
}
|
||||
|
@ -412,11 +417,16 @@ void ImageReader<ElemType>::Destroy()
|
|||
}
|
||||
|
||||
template<class ElemType>
|
||||
void ImageReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
|
||||
void ImageReader<ElemType>::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples)
|
||||
{
|
||||
assert(mbSize > 0);
|
||||
assert(numSubsets > 0);
|
||||
assert(subsetNum < numSubsets);
|
||||
assert(requestedEpochSamples > 0);
|
||||
|
||||
m_subsetNum = subsetNum;
|
||||
m_numSubsets = numSubsets;
|
||||
|
||||
if (m_imgListRand)
|
||||
std::shuffle(m_files.begin(), m_files.end(), m_rng);
|
||||
|
||||
|
@ -457,7 +467,6 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>
|
|||
|
||||
m_pMBLayout->InitAsFrameMode(mbSize);
|
||||
|
||||
m_mbStart += mbSize;
|
||||
// It is safe to run prefetching with just one buffer as SetValue is synchronous so there will be no race.
|
||||
m_mbPrefetchFut = std::async(GetLaunchPolicy(m_prefetch), [this]() { return ReadImages(); });
|
||||
|
||||
|
@ -505,10 +514,15 @@ size_t ImageReader<ElemType>::ReadImages()
|
|||
|
||||
std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
|
||||
|
||||
size_t actualMBSize = mbLim - m_mbStart;
|
||||
size_t iStart = actualMBSize * m_subsetNum / m_numSubsets;
|
||||
size_t iLim = actualMBSize * (m_subsetNum + 1) / m_numSubsets;
|
||||
size_t subsetSize = iLim - iStart;
|
||||
|
||||
#pragma omp parallel for ordered schedule(dynamic)
|
||||
for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
|
||||
for (long long i = 0; i < static_cast<long long>(subsetSize); i++)
|
||||
{
|
||||
const auto& p = m_files[i + m_mbStart];
|
||||
const auto& p = m_files[m_mbStart + iStart + i];
|
||||
cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) };
|
||||
if (!img.data)
|
||||
RuntimeError("Cannot read image file %s", p.first.c_str());
|
||||
|
@ -522,7 +536,8 @@ size_t ImageReader<ElemType>::ReadImages()
|
|||
m_labBuf[m_labDim * i + p.second] = 1;
|
||||
}
|
||||
|
||||
return mbLim - m_mbStart;
|
||||
m_mbStart += actualMBSize;
|
||||
return subsetSize;
|
||||
}
|
||||
|
||||
template class ImageReader<double>;
|
||||
|
|
|
@ -39,7 +39,12 @@ public:
|
|||
virtual void Init(const ScriptableObjects::IConfigRecord & config) override { InitFromConfig(config); }
|
||||
#endif
|
||||
void Destroy() override;
|
||||
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override;
|
||||
bool SupportsDistributedMBRead() const { return true; }
|
||||
void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;
|
||||
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override
|
||||
{
|
||||
return StartDistributedMinibatchLoop(mbSize, epoch, 0, 1, requestedEpochSamples);
|
||||
}
|
||||
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
|
||||
bool DataEnd(EndDataType endDataType) override;
|
||||
|
||||
|
@ -73,6 +78,9 @@ private:
|
|||
size_t m_epochStart;
|
||||
size_t m_mbStart;
|
||||
|
||||
size_t m_subsetNum;
|
||||
size_t m_numSubsets;
|
||||
|
||||
bool m_prefetch;
|
||||
std::future<size_t> m_mbPrefetchFut;
|
||||
std::vector<ElemType> m_featBuf;
|
||||
|
|
|
@ -32,22 +32,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch);
|
||||
uttInfoInMinibatch->clear();
|
||||
uttInfoInMinibatch->resize(uttInfo.size());
|
||||
|
||||
for (size_t i = 0; i < uttInfo.size(); ++i)
|
||||
{
|
||||
size_t startFrameIndexInMinibatch = 0;
|
||||
size_t numFrames = 0;
|
||||
|
||||
for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j)
|
||||
{
|
||||
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
|
||||
/* if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature))
|
||||
}*/
|
||||
FrameRange fr(pMBLayout,j);
|
||||
|
||||
if (pMBLayout->IsGap(fr.Sequence(i)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
numFrames += 1;
|
||||
if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd)
|
||||
if (pMBLayout->IsBeyondStartOrEnd(fr.WithTimeOffset((ptrdiff_t) 1).Sequence(i))
|
||||
|| j == pMBLayout->GetNumTimeSteps() - 1)
|
||||
{
|
||||
size_t uttIndex = (*uttInfoInMinibatch)[i].size();
|
||||
|
|
|
@ -4,10 +4,10 @@
|
|||
// </copyright>
|
||||
//
|
||||
|
||||
//
|
||||
|
||||
#include "stdafx.h"
|
||||
#ifdef _WIN32
|
||||
#include <objbase.h>
|
||||
#endif
|
||||
#include "Basics.h"
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
|
|
|
@ -12,21 +12,6 @@
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template<class ElemType>
|
||||
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
|
||||
{
|
||||
*pwriter = new LMSequenceWriter<ElemType>();
|
||||
}
|
||||
|
||||
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
|
||||
{
|
||||
GetWriter(pwriter);
|
||||
}
|
||||
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
|
||||
{
|
||||
GetWriter(pwriter);
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
class LMSequenceWriter : public IDataWriter<ElemType>
|
||||
{
|
||||
|
@ -65,8 +50,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
public:
|
||||
using LabelType = typename IDataWriter<ElemType>::LabelType;
|
||||
using LabelIdType = typename IDataWriter<ElemType>::LabelIdType;
|
||||
void GetSections(std::map<std::wstring, SectionType, nocase_compare>& /*sections*/){}
|
||||
void SaveMapping(std::wstring saveId, const std::map<typename LabelIdType, typename LabelType>& /*labelMapping*/){}
|
||||
void SaveMapping(std::wstring saveId, const std::map<LabelIdType, LabelType>& /*labelMapping*/){}
|
||||
|
||||
public:
|
||||
template<class ConfigRecordType>
|
||||
|
@ -77,4 +64,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized);
|
||||
};
|
||||
|
||||
template<class ElemType>
|
||||
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
|
||||
{
|
||||
assert(pwriter != nullptr);
|
||||
*pwriter = new LMSequenceWriter<ElemType>();
|
||||
assert(*pwriter != nullptr);
|
||||
}
|
||||
|
||||
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
|
||||
{
|
||||
GetWriter(pwriter);
|
||||
}
|
||||
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
|
||||
{
|
||||
GetWriter(pwriter);
|
||||
}
|
||||
|
||||
}}}
|
||||
|
|
Diffs for 4 more files are not shown because they are too large.