Merge branch 'master' into qiwye/multiverso

Qiwei Ye 2015-12-19 12:43:46 +08:00
Parents: b99f3e2f15 ef80d86ded
Commit: 9664daccb0
28 changed files with 4764 additions and 8099 deletions

View file

@ -9154,7 +9154,7 @@ L
\begin_layout Standard
\begin_inset Formula
\begin{eqnarray}
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+logadd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
\mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
\mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
\frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
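The LogAdd referenced in the recursion above is the usual numerically stable two-argument log-sum-exp, log(exp(x) + exp(y)). A minimal sketch of the idea (illustrative only; the logadd being removed further down in this diff additionally clamps against LZERO/MINLOGEXP):

#include <algorithm>
#include <cmath>

// log(exp(x) + exp(y)) = x + log(1 + exp(y - x)) for x >= y, so the exp never overflows
static double LogAddSketch(double x, double y)
{
    if (x < y)
        std::swap(x, y);
    return x + std::log1p(std::exp(y - x));
}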

View file

@ -315,6 +315,7 @@ LMSEQUENCEREADER_SRC =\
$(SOURCEDIR)/Readers/LMSequenceReader/Exports.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceParser.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceReader.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceWriter.cpp \
LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC))

View file

@ -11,25 +11,8 @@
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "stdafx.h"
#include "Actions.h"
#include <string>
#include <chrono>
#include <algorithm>
#if defined(_WIN32)
#include "io.h"
#endif
#include "buildinfo.h"
#include "hostname.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
#include <vector>
#include <iostream>
#include <queue>
#include <set>
#include <memory>
#include "Basics.h"
#include "Actions.h"
#include "ComputationNetwork.h"
#include "ComputationNode.h"
#include "DataReader.h"
@ -54,6 +37,23 @@
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#include <string>
#include <chrono>
#include <algorithm>
#if defined(_WIN32)
#include "io.h"
#endif
#include "buildinfo.h"
#include "hostname.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
#include <vector>
#include <iostream>
#include <queue>
#include <set>
#include <memory>
#ifndef let
#define let const auto
#endif

View file

@ -107,24 +107,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable.
void Save(File& fstream) const
// verify that this refers to a dense matrix (no strides)
void VerifyIsDense() const
{
if (m_offset != 0)
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset);
for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here)
{
ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
if (m_strides[k] != stride)
LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k);
}
}
void Save(File& fstream) const
{
VerifyIsDense();
// saving as 32-bit ints. This allows to continue to support the old format (size_t W, H, C)
fstream << (uint32_t)m_dims.size();
ptrdiff_t mul = 1;
for (size_t k = 0; k < m_dims.size(); k++)
for (auto dim : m_dims)
{
auto dim = m_dims[k];
if (dim > UINT32_MAX)
LogicError("TensorShape::Save(): Tensor dimensions %s out of bounds (> 4G).", string(*this).c_str());
fstream << (uint32_t)dim;
if (m_steps[k] != mul)
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
mul *= (ptrdiff_t)dim;
}
}
void Load(File& fstream)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
@ -154,8 +162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// accessors
size_t GetDim(size_t k) const { return m_dims[k]; }
size_t GetNumDims() const { return m_dims.size(); }
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; }
ptrdiff_t GetStep(size_t k) const { return m_steps[k]; }
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } // in slice
size_t GetOffset() const { return m_offset; }
// vector-like accessors
@ -163,12 +170,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t size() const { return GetNumDims(); }
const std::vector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
const std::vector<ptrdiff_t> & GetStrides() const { return m_strides; }
// interpretation as an image tensor
size_t GetNumChannels() const { return m_dims[0]; }
size_t GetWidth() const { return m_dims[1]; }
size_t GetHeight() const { return m_dims[2]; }
// indexing
// Determines the offset into the underlying element array for a given multi-dimensional index.
// This function is for reference. Probably not often used.
size_t Locate(const std::vector<size_t> & index) const
{
ptrdiff_t location = m_offset;
for (size_t k = 0; k < index.size(); k++)
{
size_t dim = k < size() ? m_dims[k] : 1; // dimensions are bottomless
if (index[k] >= dim)
LogicError("Locate: Tensor index[%d]=%d exceeds bound %d.", (int)k, (int)index[k], (int)dim);
location += (ptrdiff_t)index[k] * m_strides[k]; // strides may be negative
}
if (location < 0 || (size_t)location >= m_allocation)
LogicError("Locate: Tensor index out of bounds.");
return (size_t)location;
}
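A quick usage sketch of Locate (illustrative, using the two-argument dense constructor that appears elsewhere in this diff): a dense 3 x 4 shape has m_strides = (1, 3), so element (2, 1) lands at offset 0 + 2*1 + 1*3 = 5, i.e. column-major addressing.

TensorShape shape(3, 4);                                  // dense: m_dims = (3, 4), m_strides = (1, 3)
size_t off = shape.Locate(std::vector<size_t>{ 2, 1 });   // == 5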
// helpers for tensor operations
bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride)
{
@ -179,66 +205,145 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_dims[k] == 1 || m_dims[k - 1] == 1) // both are broadcasting or scalar--we don't care about stride in this case
return true;
else
return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1];
return m_strides[k] == m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1];
}
// editing functions
// These all create new TensorShape objects.
TensorShape Flatten(size_t k) const // flatten [k] with [k-1]
{
TensorShape result = *this;
if (!CanFlatten(k))
LogicError("Flatten() cannot flatten dimensions with gaps");
// We reshape local (I x J) sub-matrices to (1 x I*J) sub-matrices.
// We merge to right so that we can merge multiple by looping left-to-right.
// m_dims = I J K L
// m_strides = 1 I I*J I*J*K
// flattening J and K
// m_dims = I 1 J*K L
// m_strides = 1 I I I*J*K
// TODO: rethink whether this is correct for example of negative strides
result.m_dims[k] *= result.m_dims[k - 1];
result.m_dims[k - 1] = 1;
result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1];
return result;
}
TensorShape DropDims(const std::vector<bool> & toDrop) const // remove dimension
{
// this deletes a dimension while retaining strides
// This implies a slice to [0] for this dimension.
TensorShape result = *this;
size_t j = 0;
for (size_t k = 0; k < size(); k++)
{
if (toDrop[k])
continue;
else
{
// example
// m_dims = I 1 J K
// m_strides = 1 I I I*J
// dropping the second dimension
// m_dims = I % J K
// m_strides = 1 % I I*J
result.m_dims[j] = result.m_dims[k];
result.m_strides[j] = result.m_strides[k];
j++;
}
}
result.m_dims.resize(j);
result.m_strides.resize(j);
return result;
}
TensorShape WithBroadcastStrides() const // set the stride of every broadcasting (dim-1) dimension to 0
{
TensorShape result = *this;
for (size_t k = 0; k < size(); k++)
if (result.m_dims[k] == 1)
result.m_strides[k] = 0;
return result;
}
TensorShape Pad(size_t numDims) const // append singleton dimensions
{
VerifyIsDense();
if (numDims < GetNumDims())
LogicError("Pad() cannot drop a shorten the dimensions.");
else if (numDims == GetNumDims())
return *this;
auto dims = GetDims();
dims.resize(numDims, 1);
return TensorShape(dims);
}
TensorShape Concat(const TensorShape & other) const // concatenate
{
auto dims = GetDims();
auto otherDims = other.GetDims();
dims.insert(dims.end(), otherDims.begin(), otherDims.end());
return TensorShape(dims);
}
// pretty-printing. Returns tensor dims in the form "I x J x K".
operator std::string() const
{
std::string s;
for (const auto & dim : m_dims)
for (size_t k = 0; k < size(); k++)
{
if (!s.empty())
s.append(" x ");
s.append(std::to_string(dim));
s.append(std::to_string(m_dims[k]));
}
#ifdef _DEBUG // also emit the strides, easier for debugging
s.append(" {");
for (size_t k = 0; k < size(); k++)
{
if (k > 0)
s.append(",");
s.append(std::to_string(m_strides[k]));
}
s.append("}");
#endif
return s;
}
private:
// reset m_steps and m_offset to represent a canonical no-strides tensor
// reset m_strides and m_offset to represent a canonical no-strides tensor
void InitAsNoSlice()
{
m_offset = 0;
m_steps.resize(m_dims.size());
ptrdiff_t mul = 1;
m_strides.resize(m_dims.size());
for (size_t k = 0; k < m_dims.size(); k++)
{
m_steps[k] = (ptrdiff_t)mul;
mul *= m_dims[k];
}
m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back();
}
private:
std::vector<size_t> m_dims; // dimensions of tensor or tensor slice. The size of the box.
std::vector<ptrdiff_t> m_steps; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
std::vector<ptrdiff_t> m_strides; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
size_t m_offset; // offset to element(0,0,...,0). May be non-0 in case of slicing.
// For a regular tensor, there are no strides, m_steps[k] = m_steps[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
size_t m_allocation; // allocation size of original dense tensor
// For a regular tensor, there are no strides, m_strides[k] = m_strides[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
// For views into existing tensors, we do stride shenanigans to implement broadcasting (plus magic tricks). Examples:
// To traverse a 5 x 10 matrix with column order reversed:
// - op.dims = (5 x 10)
// - m_offset points to element (0,9)
// - m_steps[0] = 1 // regular forward iteration within each column
// - m_steps[1] = -5 // backward iteration over columns
// - m_strides = (1, -5) // backward iteration over columns
// To compute matrix C(13 x 42) = vector A(13 x 1) + matrix B(13 x 42):
// - op = sum
// - op.dims = (13 x 42)
// - *.m_steps[0] = 1 // forward iteration through each column
// - C.m_steps[1] = 13 // forward iteration over columns of B--defines the for loop
// - B.m_steps[1] = 13 // forward iteration over columns of B--iterates in sync with C
// - A.m_steps[1] = 0 // A, however, is stuck in column 0 forever
// - C.m_strides = (1, 13) // forward iteration over columns of B--defines the for loop
// - B.m_strides = (1, 13) // forward iteration over columns of B--iterates in sync with C
// - A.m_strides = (1, 0) // A, however, is stuck in column 0 forever
// Matrix product: C(I x K) = A(I x J) * B(J x K) --Note: Likely not RAM-bandwidth efficient!
// - op = mul
// - op.dims = (I x J x K) // iteration dimensions
// - C.m_steps = (1, 0, I) // inverse broadcasting for inner dimension
// - A.m_steps = (1, I, 0)
// - B.m_steps = (0, 1, J)
// - C.m_strides = (1, 0, I) // inverse broadcasting for inner dimension
// - A.m_strides = (1, I, 0)
// - B.m_strides = (0, 1, J)
// Convolution of time signals (without padding): Y(T-N+1) = X(T) * H(N): --Note: Likely not RAM-bandwidth efficient!
// - op = mul
// - op.dims = (T-N+1 x N) // iteration dimensions
// - Y.m_steps = (1, 0) // inverse broadcasting: this sums up the individual products
// - X.m_steps = (1, 1) // shift window by 1 for each output sample
// - H.m_steps = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
// - Y.m_strides = (1, 0) // inverse broadcasting: this sums up the individual products
// - X.m_strides = (1, 1) // shift window by 1 for each output sample
// - H.m_strides = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
// - H.m_offset = N - 1 // begin with last element (reverse order for convolution)
// TODO: double-check all these
// TODO: Does the same trick work for 2D images?
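To make the broadcasting example above concrete: a kernel consuming such shapes only combines each operand's own strides with the shared iteration dims, and a stride of 0 simply pins that operand. A rough sketch (illustrative, not the library's TensorView code) of the C(13 x 42) = A(13 x 1) + B(13 x 42) case:

static void AddBroadcastColumnSketch(const float* pA, const float* pB, float* pC)
{
    const size_t I = 13, J = 42;          // op.dims from the example above
    for (size_t j = 0; j < J; j++)        // column loop: stride 13 for B and C, stride 0 for A
        for (size_t i = 0; i < I; i++)    // within-column loop: stride 1 for all operands
            pC[i * 1 + j * 13] = pA[i * 1 + j * 0] + pB[i * 1 + j * 13];
}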

View file

@ -108,12 +108,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numParallelSequences = numParallelSequences;
m_numTimeSteps = numTimeSteps;
// allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time)
// PTRDIFF_MAX indicates not initialized (also in the matrix, which is stored as float).
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToStart.SetValue((float)PTRDIFF_MAX);
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToEnd.SetValue((float)PTRDIFF_MAX);
#if 1
if ((m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) && m_numTimeSteps > 0) // sanity check for debugging a regression
fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n",
(int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them)
#endif
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps);
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps);
m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX);
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
m_timeStepHasGap.assign(m_numTimeSteps, false);
m_columnsValidityMask.Resize(0, 0); // invalidate
// reset state
m_numFramesDeclared = 0;
m_numGapFrames = 0;
@ -121,20 +126,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_writable = true;
}
// short-hand to initialize an MBLayout for the common case of frame mode
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
void InitAsFrameMode(size_t numSamples)
{
Init(numSamples, 1);
SequenceInfo seqInfo { 0, 0, 0, 1 };
for (size_t s = 0; s < numSamples; s++)
{
seqInfo.seqId = seqInfo.s = s;
AddSequence(seqInfo);
}
Lock();
}
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
@ -199,7 +190,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
// remember it
#ifdef _DEBUG
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
m_sequences.push_back(seqDesc);
if (cap != m_sequences.capacity())
fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity());
#else
m_sequences.push_back(seqDesc);
#endif
// create all the cached fast-lookup information
const auto seqId = seqDesc.seqId;
@ -212,7 +210,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numGapFrames += (e - b);
for (size_t t = b; t < e; t++)
{
//Set(s, t, MinibatchPackingFlags::NoInput);
m_timeStepHasGap[t] = true;
m_distanceToStart(s, t) = -1; // start flags also encode gaps
}
@ -220,22 +217,49 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else for (size_t t = b; t < e; t++)
{
// update the nearest sentence boundaries, minimum over all parallel sequences
// -1 in distanceToStart(,) stands for a gap
assert(m_distanceToStart(s, t) != -1); // gaps not allowed to overlap
// If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset.
ptrdiff_t distanceToStart = t - beginTime;
if (m_distanceToStart(s, t) > (float)distanceToStart)
m_distanceToStart(s, t) = (float)distanceToStart;
ptrdiff_t distanceToStart = (ptrdiff_t)t - beginTime;
ptrdiff_t distanceToEnd = (ptrdiff_t)(endTime - 1 - t);
m_distanceToStart(s, t) = (float)distanceToStart;
m_distanceToEnd(s, t) = (float)distanceToEnd;
// and the aggregate
if (m_distanceToNearestStart[t] > distanceToStart)
m_distanceToNearestStart[t] = distanceToStart;
ptrdiff_t distanceToEnd = endTime - 1 - t;
if (m_distanceToEnd(s, t) > (float) distanceToEnd)
m_distanceToEnd(s, t) = (float) distanceToEnd;
if (m_distanceToNearestEnd[t] > distanceToEnd)
m_distanceToNearestEnd[t] = distanceToEnd;
}
}
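Worked example (illustrative): adding a sequence with beginTime = 1 and endTime = 4 to parallel slot s touches t = 1..3; the loop sets m_distanceToStart(s, 1..3) to 0, 1, 2 and m_distanceToEnd(s, 1..3) to 2, 1, 0, and the aggregate m_distanceToNearestStart/End vectors keep the minimum of these values over all parallel sequences.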
// short-hand to initialize an MBLayout for the common case of frame mode
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
// This function provides an efficient short-cut implementation of AddSequence(t, t, 0, 1) for every sample t.
void InitAsFrameMode(size_t numSamples)
{
Init(numSamples, 1);
// create sequences array
SequenceInfo virginSeqInfo = { 0, 0, 0, 1 };
m_sequences.resize(numSamples, virginSeqInfo); // pass it here since otherwise STL will initialize everything to 0 unnecessarily
// update sequence indices
for (size_t s = 0; s < numSamples; s++)
{
// remember it
auto & seqDesc = m_sequences[s];
seqDesc.seqId = s;
seqDesc.s = s;
}
m_numFramesDeclared = numSamples;
// create all the cached fast-lookup information
m_distanceToStart.SetValue(0);
m_distanceToEnd.SetValue(0);
m_distanceToNearestStart[0] = 0;
m_distanceToNearestEnd[0] = 0;
Lock();
}
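A quick usage sketch (hypothetical call site, assuming the usual shared_ptr idiom for MBLayout): a reader delivering 256 independent samples in frame mode would set up its layout as

auto pMBLayout = std::make_shared<MBLayout>();
pMBLayout->InitAsFrameMode(256);   // 256 parallel length-1 sequences, no gaps, layout locked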
// mark a range of frames in a parallel sequence as invalid
// I'd love to start with all-gaps, but that would require setting flags upfront and then clearing them.
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); }
@ -330,10 +354,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// 2 1 0 . . ] // (last two time steps undefined)
// m_distanceToNearestStart = [ 0 1 2 3 4 ]
// m_distanceToNearestEnd = [ 2 1 0 1 0 ]
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap, PTRDIFF_MAX for 'not initialized'
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (value<0 does NOT stand for gap; consult m_timeStepHasGap[] vector instead)
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
vector<bool> m_timeStepHasGap; // [t]
vector<bool> m_timeStepHasGap; // [t] true if at least one gap in time step t
// Cached mask indicating the validity of each column in the MBLayout
// TODO: We actually just need a boolean matrix for this.
@ -527,6 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (s == SIZE_MAX) // aggregate requested
{
// determine flags from aggregate vectors
// Note: We allow that all parallel sequences contain gaps (m_distanceToNearestStart[t] == PTRDIFF_MAX)
// because that makes implementation of the reader easier for truncated BPTT (it knows too late that there are not that many frames left).
auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t];
if (distanceToStart < -fr.m_timeOffset)
return true;
@ -557,7 +583,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Remove this version (with sanity checks) after this has been tested. Then the function can be inlined above.
inline size_t MBLayout::GetActualNumSamples() const
{
#if 1 // sanity check --TODO: delete this after a while
#if 0 // sanity check --TODO: delete this after a while
size_t n = GetNumCols();
if (HasGaps())
{

View file

@ -13,6 +13,8 @@
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
// -----------------------------------------------------------------------
// subroutines for Validate() implementations
// -----------------------------------------------------------------------
@ -41,13 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// all are consistent: install it
LinkToMBLayout(pMBLayout);
}
// single input that maps its input element-wise (e.g. Sigmoid)
void ComputationNodeBase::ValidateUnaryMap(bool isFinalValidationPass)
{
assert(m_inputs.size() == 1);
ComputationNodeBase::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
SetDims(m_inputs[0]->GetNumRows(), DetermineNumCols(m_inputs[0]));
SetDims(m_inputs[0]);
InferImageDimsFromInputs();
}
// binary zip operation, e.g. Plus
@ -138,6 +141,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// -----------------------------------------------------------------------
// tensor helpers
// -----------------------------------------------------------------------
template<class ElemType>
static TensorShape GetSampleShape(const ComputationNode<ElemType> * node)
{
// TODO: use actual ImageLayout. While those are not yet inferred properly, maybe use it if its dims match numRows?
if (node->HasMBLayout()) // if we have a layout, that dimension is not part of the sample shape
return TensorShape(node->GetNumRows());
else
return TensorShape(node->GetNumRows(), node->GetNumCols());
}
template<class ElemType>
std::vector<TensorView<ElemType>> ComputationNode<ElemType>::GetTensorsForwardBinary(const FrameRange & fr)
{
const size_t N = 3; // 2 inputs and 1 output
// BUGBUG: Currently does not interpret actual ImageLayouts or convolutional models.
// TODO: move this into a helper function
// get tensor shapes
vector<ComputationNode<ElemType>*> nodes;
for (size_t i = 0; i < N; i++)
nodes.push_back(i < N-1 ? Input(i).get() : this);
vector<Matrix<ElemType>> values;
vector<TensorShape> shapes;
for (size_t i = 0; i < N; i++)
{
values.push_back(nodes[i]->ValueFor(i < N-1 ? fr.AllowBroadcast() : fr)); // no broadcasting for now allowed for output
shapes.push_back(GetSampleShape(nodes[i]));
}
// pad
size_t dims = 0;
for (size_t i = 0; i < N; i++)
if (dims < shapes[i].GetNumDims())
dims = shapes[i].GetNumDims();
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Pad(dims);
// concatenate MBLayout dims
// TODO: Is it possible that the output has no layout, but inputs have? Then we lost dimensions. Tensor constructor will catch that, though.
if (HasMBLayout())
{
for (size_t i = 0; i < N; i++)
{
auto sm = nodes[i]->HasMBLayout() ? TensorShape(GetNumParallelSequences(), GetNumTimeSteps()) : TensorShape(1, 1);
shapes[i] = shapes[i].Concat(sm);
}
}
// perform operation
std::vector<TensorView<ElemType>> tensors;
for (size_t i = 0; i < N; i++)
tensors.push_back(TensorView<ElemType>(values[i], shapes[i]));
return tensors;
}
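To illustrate the Pad/Concat steps above with made-up numbers: suppose Input(0) is a 13 x 1 parameter without an MBLayout, while Input(1) and the output carry a layout with 2 parallel sequences and 5 time steps. GetSampleShape yields 13 x 1 for Input(0) and 13 for the other two; after padding to a common rank and concatenating the layout dimensions, the shapes handed to TensorView are roughly

// Input(0): 13 x 1 -> pad -> 13 x 1 -> concat (1, 1) -> 13 x 1 x 1 x 1   (broadcasts over sequences and time)
// Input(1): 13     -> pad -> 13 x 1 -> concat (2, 5) -> 13 x 1 x 2 x 5
// output:   13     -> pad -> 13 x 1 -> concat (2, 5) -> 13 x 1 x 2 x 5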
// -----------------------------------------------------------------------
// others
// -----------------------------------------------------------------------
@ -172,6 +230,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<> std::map<size_t, std::map<size_t, FloatMatrix*>> ComputationNode<float>::s_constOnes{};
template<> std::map<size_t, std::map<size_t, DoubleMatrix*>> ComputationNode<double>::s_constOnes{};
template class ComputationNode<float>;
template class ComputationNode<double>;
template class LearnableParameter<float>;
template class LearnableParameter<double>;
}}}

View file

@ -340,18 +340,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// helper functions for common cases
private:
// determine number of columns from a child and/or layout
size_t DetermineNumCols(const ComputationNodeBasePtr & child) const
{
size_t childCols = child->GetNumCols(); // this is what the child says
if (!m_pMBLayout) // no layout: copy from child
return childCols;
size_t cols = m_pMBLayout->GetNumCols(); // layout: get it from there, but validate against child
if (childCols != cols)
RuntimeError("%ls %ls operation: Mismatch in number of columns", OperationName().c_str(), NodeName().c_str());
return cols;
}
protected:
void ValidateUnaryMap(bool isFinalValidationPass);
void ValidateUnaryReduce(bool isFinalValidationPass);
@ -779,7 +767,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
protected:
//std containers such as list and map does not support class reference so we need to use pointer
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
ComputationNode() { }
public:
using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters
using ComputationNodeBase::SetDims;
@ -1085,6 +1072,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
Matrix<ElemType>& Gradient() { return *m_gradient; }
std::vector<TensorView<ElemType>> GetTensorsForwardBinary(const FrameRange & fr);
// Function to return the number of columns for whole batch or single frame
size_t GetNumColsFor(const FrameRange & fr/*select frame or entire batch*/)
{
@ -1519,7 +1508,7 @@ protected: \
using Base::CreateUniqId; \
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
using Base::ConstOnes; \
using Base::GetImageLayout; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::GetImageLayout; using Base::GetTensorsForwardBinary; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \

View file

@ -20,6 +20,7 @@
#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
@ -129,6 +130,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#if 0 // TODO: use #if 0 until this is working
auto args = GetTensorsForwardBinary(fr);
args[2].DoSumOf(0.0f, args[0], args[1], 1.0f);
#else
Matrix<ElemType> functionValues = ValueForToDense(fr, false); // Switch to dense as a work-around because ColumnSlice doesn't support all the sparse formats
Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
@ -185,6 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
#endif
#if DUMPOUTPUT
functionValues.Print("PlusNode");
#endif

View file

@ -9,12 +9,13 @@
#include "stdafx.h"
#include "Basics.h"
#include "File.h"
#include "CPUMatrix.h"
#include "TensorOps.h"
#include <assert.h>
#include <stdexcept>
#include <omp.h>
#include <math.h>
#include "CPUMatrix.h"
#include <random>
#include <chrono>
#include <exception>
@ -351,7 +352,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -384,7 +385,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long n = (long)a.GetNumCols(); // note: OpenMP requires loop indices to be long, not size_t
long k = (long)a.GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//memory copy might be faster?
@ -428,7 +429,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -469,7 +470,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -500,7 +501,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long i = 0; i < m_numRows; i++)
{
diag(0, (size_t)i) = us(i, i);
@ -538,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<sliceNumCols; j++)
{
for (int i = 0; i < inputMatrices.size(); i++)
@ -575,7 +576,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long n = (long)a.GetNumCols(), m = (long)a.GetNumRows();
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long q = 0; q < numColRepeats; q++)
{
for (long p = 0; p < numRowRepeats; p++)
@ -619,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -685,7 +686,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -719,7 +720,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
long m=(long)GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -777,7 +778,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -802,7 +803,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -827,7 +828,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -935,7 +936,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -974,7 +975,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m=(long)GetNumRows();
if (vector.GetNumRows() == 1) //row vector
{
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -991,7 +992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -1164,7 +1165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType a0, a1, a2, a3;
//disable omp here because aveMultiplier needs to be added atomically. However, the result seems incorrect even if omp atomic and omp critical are used.
//#pragma omp parallel for
//#pragma omp parallel for
for (long i = 0; i<(n & ~3); i += 4) //four-way unrolling
{
a[i] += d_v[i] * d_v[i];
@ -1495,7 +1496,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1596,7 +1597,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1625,7 +1626,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1816,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1853,7 +1854,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1921,7 +1922,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1956,7 +1957,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
ElemType v = a(0,j);
@ -1991,7 +1992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m = (long)GetNumRows(), n = (long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
ElemType v = a(0, j);
@ -2032,7 +2033,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m=(long)GetNumRows(), n=(long)GetNumCols();
ElemType smallValue = EPS_IN_INVERSE;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
for (long i=0; i<m; i++)
@ -2133,7 +2134,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2172,7 +2173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2220,7 +2221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2387,7 +2388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2427,7 +2428,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2467,7 +2468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2620,7 +2621,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2660,7 +2661,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType locTHresholdNeg = -locThresholdPos;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2708,7 +2709,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m = (long)GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (m_pArray[i] > threshold)
@ -4304,7 +4305,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sample_id == 0)
sample_prob = -sample_prob;
double score_noise = log_num_noise_samples + sample_prob;
double z = logadd(score, score_noise);
double z = LogAdd(score, score_noise);
double logprob = score - z;
double logprob_noise = score_noise - z;
tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
@ -4387,7 +4388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
ElemType v = alpha*a(0,0);
long m=(long)c.GetNumRows(), n=(long)c.GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -4497,7 +4498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddScaledDifference: Input matrix a is empty.");
long m=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -4536,7 +4537,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
c.Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -4634,7 +4635,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
c.Resize(m,n);
long size=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(size & ~3); i+=4)
{
@ -4944,7 +4945,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool bHas = false;
bool isvFinite = std::isfinite(v);
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j < mat.GetNumElements(); j++)
{
#pragma omp flush(bHas)
@ -4992,7 +4993,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
//#pragma omp parallel for
//#pragma omp parallel for
for (long j = 0; j < n; j++)
{
@ -5247,7 +5248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
long n = (long)GetNumCols(); // a and b are of size (1,n)
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
us(0, j) = a(0, j) * b(0, (j + shift) % n);
@ -5256,34 +5257,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#pragma endregion Static BLAS Functions
double logadd(double x, double y)
{
double temp, diff, z;
if (x < y) {
temp = x; x = y; y = temp;
}
diff = y - x;
if (diff < MINLOGEXP)
{
return (x < LSMALL)?LZERO:x;
}
else
{
z = exp(diff);
return x + log(1.0 + z);
}
}
// 'double' version of LogAdd
double LogAddD(double x, double y) { return LogAdd(x, y); }
template<class ElemType>
ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
{
ElemType fAlpha = (ElemType)LZERO;
for (int k = 0; k < GetNumElements(); k++)
fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]);
fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]);
return fAlpha;
}
@ -5330,7 +5314,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fSum = (ElemType)LZERO;
for (int j = 0; j < iNumLab; j++)
{
fSum = (ElemType)logadd((double)fSum, alpha(j, t));
fSum = (ElemType)LogAddD(fSum, alpha(j, t));
}
fTmp = alpha(k, t) - fSum;
@ -5343,10 +5327,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fSum = (ElemType)LZERO;
for (int m = 0; m < iNumLab; m++)
{
fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m));
fSum = (ElemType)LogAddD(fSum, alpha(m, t) + pair_scores(j, m));
}
fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
fTmp = (ElemType)LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
}
beta(k, t) = fTmp;
}
@ -5455,7 +5439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else{
fTmp2 = a(k, 0);
}
fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k));
fSum = (ElemType)LogAddD(fSum, fTmp2 + pair_scores(j, k));
}
fTmp -= fSum;
@ -5533,7 +5517,259 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return numThreads;
}
// The explicit instantiation part
// -----------------------------------------------------------------------
// TensorView support
// -----------------------------------------------------------------------
// To save time, this makes extensive use of templates and macros.
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template<class ElemType, typename OPFN, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t)m];
ElemType aggregate = 0;
for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;)
{
// need to descend into one loop deeper
aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
}
return aggregate;
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template<class ElemType, typename OPFN, size_t N>
struct TensorOpReduction<ElemType, OPFN, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &)
{
return opfn(pointers); // finally we are doing some work!!!
}
};
// perform loop over regular index k and reducing index m for N operands (counting the output)
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t)k];
for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template<class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
ElemType* pc = pointers[2];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template<class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = alpha * TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// combine with previous value in target matrix, then write it out
auto * pout = pointers.back();
if (beta != 0)
val += beta * *pout;
*pout = val;
return;
}
};
// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, typename OPFN, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
bool leadingAllOne = true;
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, N, true/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template<class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::array<size_t, N> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
case 4: return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3: return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2: return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
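For example (illustrative numbers): adding a 13 x 1 bias to a 13 x 42 matrix, with the bias's second stride set to 0 for broadcasting and nothing to reduce, arrives here with regularOpDims = { 13, 42 } and an empty reducingOpDims; the switch above then dispatches to TensorOpWithRegularLoop with k = 1, whose own case 0 takes the no-reduction branch.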
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
{
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2> & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = { a.m_pArray, m_pArray };
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
{
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = { a.m_pArray, b.m_pArray, m_pArray };
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp);
default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op);
}
}
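For reference, ForAllBinaryOps(CaseBinaryTensorOp) expands into one case label per binary operation; after macro substitution, the Sum case is roughly

case ElementWiseOperator::opSum:
    return TensorOpWithFn(beta, pointers, alpha,
                          [](const array<ElemType*, 3> & pp) { return OpSum((*(pp[0])), (*(pp[1]))); },
                          offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);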
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides)
{
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray };
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op);
}
}
// -----------------------------------------------------------------------
// explicit instantiations
// -----------------------------------------------------------------------
template class MATH_API CPUMatrix<float>;
template class MATH_API CPUMatrix<double>;
@ -5551,5 +5787,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
}}}

View file

@ -334,6 +334,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
static CPUMatrix<ElemType> Ones(const size_t rows, const size_t cols);
static CPUMatrix<ElemType> Zeros(const size_t rows, const size_t cols);

View file

@ -41,6 +41,51 @@ MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ElementWiseOperator -- This enum represents which function to apply.
// This is shared between all matrix types and tensors.
// -----------------------------------------------------------------------
enum ElementWiseOperator
{
// unary (or binary with constant parameter)
opCopy,
opNegate, opNot,
opAbs,
opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
// these are not implemented yet:
opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
// binary
opSum, opDifference, opElementWiseProduct, opElementWiseQuotient,
opLogSum, opMax, opMin,
opEQ, opNE, opGT, opLT, opGE, opLE,
// ternary
opCond
// Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
};
// helper to apply a C macro for all operations of each kind
#define ForAllUnaryOps(Macro) \
Macro(Copy); \
Macro(Negate); Macro(Not); \
Macro(Abs); \
Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);
#define ForAllParameterizedUnaryOps(Macro) \
Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
#define ForAllBinaryOps(Macro) \
Macro(Sum); Macro(Difference); Macro(ElementWiseProduct); Macro(ElementWiseQuotient); \
Macro(LogSum); Macro(Max); Macro(Min); \
Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE);
#define ForAllTernaryOps(Macro) \
Macro(Cond);
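// Illustrative sketch only (hypothetical names CountOneOp/CountBinaryOps, not part of the CNTK API):
// a per-op macro is defined and then expanded once for every operation in the list. The same pattern
// is used further below by TensorView.h, e.g. ForAllUnaryOps(DeclareUnaryTensorOp).
#define CountOneOp(oper) numOps++;
static inline size_t CountBinaryOps() { size_t numOps = 0; ForAllBinaryOps(CountOneOp); return numOps; } // returns 13
#undef CountOneOp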
// -----------------------------------------------------------------------
// various enums to describe matrix flags and formats
// -----------------------------------------------------------------------
enum MatrixFlagBitPosition
{
bitPosRowMajor = 0, // row major matrix
@ -76,6 +121,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
matrixFlagSetValueOnDevice = 1<<bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};
// -----------------------------------------------------------------------
// BaseMatrix -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------
template<class ElemType>
class BaseMatrix

View file

@ -71,16 +71,6 @@ namespace Microsoft {
};
// -----------------------------------------------------------------------
// ElementWiseOperator -- This enum represents which function to apply. It needs to be outside of GPUMatrix, because it is also used in GPUSparseMatrix
// -----------------------------------------------------------------------
enum ElementWiseOperator
{
opSigmoid = 0, opTanh, opSqrt, opExp, opLog, opAbs, opLinearRectifierDerivative, opCosine, opNegativeSine, opSigmoidDerivative
};
// -----------------------------------------------------------------------
// GPUMatrix
// -----------------------------------------------------------------------

View file

@ -162,6 +162,7 @@
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<None Include="ClassDiagram.cd" />
<None Include="GPUWatcher.cu" />

View file

@ -70,6 +70,9 @@
<ClInclude Include="TensorView.h">
<Filter>Tensors</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>Tensors</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">

Diff not shown because the file is too large. Load diff

View file

@ -6,9 +6,8 @@
// TODO:
// - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it
// - Resize() must be cheap if it does nothing (I already did that for CPU, still to be done for GPU)
// - an overload for Resize() to match another matrix
// - need a way to grow a minibatch matrix without destroying its content, something like PushColumns()
// - Resize() must be cheap if it does nothing (I already did that for CPU; already done for GPU?)
#pragma once
#include "Basics.h"
@ -16,11 +15,12 @@
#include "CommonMatrix.h"
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>
#include <initializer_list>
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
enum CurrentDataLocation
{
NONE, CPU, GPU, BOTH
@ -73,6 +73,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& d);
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
public:
@ -168,6 +169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow
void Resize(const Matrix<ElemType>& other) { Resize(other.GetNumRows(), other.GetNumCols()); }
void VerifySize(size_t rows, size_t cols)
{
m_baseMatrix->VerifySize(rows, cols);
@ -200,6 +202,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR);
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType *pArray, const size_t matrixFlags = matrixFlagNormal);
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
void SetValue(const size_t numRows, const size_t numCols, std::initializer_list<ElemType> l) { std::vector<ElemType> vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); } // SetValue(2,3, {1,2,3, 4,5,6});
static ElemType MakeNan(size_t payload);
void Invalidate() { SetValue(MakeNan(__LINE__)); }
void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE *h_CSCCol, const CPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
@ -376,7 +379,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first
@ -458,6 +461,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool HasElement(const Matrix<ElemType>& a, const ElemType value = 0.0);
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
public:
void Read(File& stream);
void Write(File& stream) const;

Source/Math/TensorOps.h (new file, 132 lines)
View file

@ -0,0 +1,132 @@
//
// <copyright file="TensorOps.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// This implements the elementwise tensor operations, including helper macros and some actual functions.
#pragma once
#include "Basics.h"
#include "CommonMatrix.h"
#pragma push_macro("TENSOR_OPS_DECL")
#ifndef TENSOR_OPS_DECL // to make these accessible to CUDA kernels, say '#define TENSOR_OPS_DECL __device__ __host__'
#define TENSOR_OPS_DECL
#endif
#pragma push_macro("DECL")
#define DECL static inline TENSOR_OPS_DECL
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// unified overloads for float/double math functions
//
// Declare float and double versions of the functions f we need as f_(),
// e.g. exp_ -> exp(double), expf(float).
// -----------------------------------------------------------------------
#pragma push_macro("OverloadUnaryMathFns")
#define OverloadUnaryMathFns(func) \
DECL float func ## _(float arg) { return func ## f(arg); } \
DECL double func ## _(double arg) { return func(arg); }
OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
#pragma pop_macro("OverloadUnaryMathFns")
// -----------------------------------------------------------------------
// additional functions that are standard in our context
// -----------------------------------------------------------------------
template<class ElemType>
DECL ElemType Sigmoid(ElemType z)
{
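// pick the algebraically equivalent form whose exp_() argument is never positive, so the
// exponential cannot overflow: 1/(1+e^-z) for z >= 0, and e^z/(1+e^z) for z < 0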
if (z >= 0)
return 1 / (1 + exp_(-z));
else
{
ElemType v = exp_(z);
return v / (1 + v);
}
}
template<class ElemType>
DECL ElemType SigmoidDerivative(ElemType z)
{
ElemType v = Sigmoid(z);
return v * (1 - v);
}
template<class ElemType>
DECL ElemType LinearRectifierDerivative(ElemType z)
{
return z > 0 ? (ElemType)1 : 0;
}
template<class ElemType>
DECL ElemType Sqrt(ElemType z)
{
// BUGBUG: Why clip to 0? An invalid sqrt() should show up as a NaN in the result, instead of hiding it.
return sqrt_(z > 0 ? z : 0);
}
// TODO: call this LogAdd() for consistency
template<typename ElemType>
DECL ElemType LogAdd(ElemType x, ElemType y)
{
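// computes log(exp(x) + exp(y)) without overflow: after the swap below the result is
// max(x,y) + log(1 + exp(-|x - y|)), so the exp_() argument is never positive.
// E.g. LogAdd(log 2, log 3) = log 5; if y is negligibly small relative to x, the result is
// just x (or LZERO when even x falls below LSMALL).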
if (x < y)
{
ElemType temp = x; x = y; y = temp;
}
ElemType diff = y - x;
if (diff < (ElemType)MINLOGEXP)
{
return (x < (ElemType)LSMALL) ? (ElemType)LZERO : x;
}
else
{
ElemType z = exp_(diff);
return x + log_((ElemType)1.0 + z);
}
}
// -----------------------------------------------------------------------
// ElementWiseOperator implementations
//
// Define a static function for every ElementWiseOperator (CommonMatrix.h).
// -----------------------------------------------------------------------
#pragma push_macro("DefUnaryOp")
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
DefUnaryOp(Copy, a);
DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
DefUnaryOp(Abs, fabs_(a));
DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
#pragma pop_macro("DefUnaryOp")
// parameterized unary ops
//DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
#pragma push_macro("DefBinaryOp")
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b);
DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
#pragma pop_macro("DefBinaryOp")
#pragma push_macro("DefTernaryOp")
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
DefTernaryOp(Cond, a ? b : c);
#pragma pop_macro("DefTernaryOp")
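// A quick illustrative sanity check of a few generated functions (hypothetical helper with
// example values only; not part of the library):
template<class ElemType>
DECL bool CheckSomeOps()
{
    return OpSum((ElemType)2, (ElemType)3) == (ElemType)5               // opSum:  a + b
        && OpCond((ElemType)1, (ElemType)2, (ElemType)3) == (ElemType)2 // opCond: a ? b : c
        && OpLinearRectifierDerivative((ElemType)-1) == (ElemType)0;    // 0 for negative inputs
}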
}}}
#pragma pop_macro("DECL")
#pragma pop_macro("TENSOR_OPS_DECL")

View file

@ -26,11 +26,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// construction
// -------------------------------------------------------------------
// cast a matrix as a tensor
// cast a matrix as a TensorView
template<class ElemType>
TensorView<ElemType>::TensorView(Matrix<ElemType> & sob) :
m_sob(sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
m_sob(&sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
{ }
// reshape a TensorView
template<class ElemType>
TensorView<ElemType>::TensorView(const TensorView<ElemType> & other, const TensorShape & shape) :
m_sob(other.m_sob), m_shape(shape)
@ -40,14 +41,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Use the multipliers instead?
size_t i;
size_t rowDim = 1;
for (i = 0; i < m_shape.size() && rowDim < m_sob.GetNumRows(); i++)
for (i = 0; i < m_shape.size() && rowDim < m_sob->GetNumRows(); i++)
rowDim *= m_shape[i];
// first i dimensions match matrix row dimension
size_t colDim = 1;
for (; i < m_shape.size(); i++)
colDim *= m_shape[i];
if (rowDim != m_sob.GetNumRows() || colDim != m_sob.GetNumCols())
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob.GetNumRows(), (int)m_sob.GetNumCols());
if (rowDim != m_sob->GetNumRows() || colDim != m_sob->GetNumCols())
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob->GetNumRows(), (int)m_sob->GetNumCols());
}
// -------------------------------------------------------------------
@ -56,96 +57,168 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match?
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/)
template<class ElemType, size_t N>
static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N> & offsets,
vector<size_t> & regularOpDims,
array<vector<ptrdiff_t>, N> & regularStrides,
vector<size_t> & reducingOpDims,
array<vector<ptrdiff_t>, N> & reducingStrides)
{
TensorView & c = *this;
// TODO: Turn the inner meat here into a function template using a std::array<., N-nariness>. Nullary ops are generators, e.g. constants.
// massage TensorShapes
// Note that TensorShapes here may be shapes as stored or shapes with stride magic applied.
auto as = a.GetShape().GetDims();
auto bs = b.GetShape().GetDims();
auto cs = c.GetShape().GetDims();
// expand ones to make tensors compatible
// Trailing dimensions broadcast.
// E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns.
// To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T).
auto dims = max(max(as.size(), bs.size()), cs.size());
as.resize(dims, 1);
bs.resize(dims, 1);
cs.resize(dims, 1);
size_t dims = 0;
for (size_t i = 0; i < N; i++)
if (dims < shapes[i].GetNumDims())
dims = shapes[i].GetNumDims();
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Pad(dims);
// determine operation shape (max over all dimensions)
decltype(as) os(dims);
vector<size_t> opDims(dims, 0);
for (size_t k = 0; k < dims; k++)
os[k] = max(max(as[k], bs[k]), cs[k]);
for (size_t i = 0; i < N; i++)
opDims[k] = max(opDims[k], shapes[i][k]);
// dimension compatibility check
// Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension.
for (size_t k = 0; k < dims; k++)
{
if (!Matches(as[k], os[k]) || !Matches(bs[k], os[k]) || !Matches(cs[k], os[k]))
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between the two inputs and output (%d vs. %d vs. %d)", (int)dims, (int)as[k], (int)bs[k], (int)cs[k]);
}
for (size_t i = 0; i < N; i++)
if (!Matches(shapes[i][k], opDims[k]))
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between input %d and output (%s vs. %s)", (int)k, (int)shapes[i][k], string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str());
// flatten consecutive dimensions
// Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions.
// After this, as, bs, and cs no longer match the TensorShape objects.
//fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
for (size_t k = 1; k < dims; k++)
{
// check if stored without gaps to skip
if (!a.GetShape().CanFlatten(k) || !b.GetShape().CanFlatten(k) || !c.GetShape().CanFlatten(k))
continue;
// check if they are either all broadcasting or all not broadcasting
if ((as[k] != os[k] || as[k - 1] != os[k - 1]) && (as[k] != 1 || as[k - 1] != 1))
continue;
if ((bs[k] != os[k] || bs[k - 1] != os[k - 1]) && (bs[k] != 1 || bs[k - 1] != 1))
continue;
if ((cs[k] != os[k] || cs[k - 1] != os[k - 1]) && (cs[k] != 1 || cs[k - 1] != 1))
continue;
// merge the dimensions
as[k] *= as[k - 1]; as[k - 1] = 1;
bs[k] *= bs[k - 1]; bs[k - 1] = 1;
cs[k] *= cs[k - 1]; cs[k - 1] = 1;
// BUGBUG: Must update multipliers as well
for (size_t i = 0; i < N; i++)
{
// check if stored without gaps to skip
if (!shapes[i].CanFlatten(k))
goto nope;
// check if they are either all broadcasting or all not broadcasting
if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1))
goto nope;
}
// these dimensions can be merged
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Flatten(k); // TODO: overdoing the immutable thingy much?
opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh)
nope:;
}
//fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// remove singleton dimensions
size_t j = 0;
vector<bool> toDrop(dims, false);
for (size_t k = 0; k < dims; k++)
{
if (as[k] == 1 && bs[k] == 1 && cs[k] == 1) // skip all-singleton dimensions
continue;
as[j] = as[k];
bs[j] = bs[k];
cs[j] = cs[k];
os[j] = os[k];
j++;
for (size_t i = 0; i < N; i++)
if (shapes[i][k] != 1)
goto neither;
toDrop[k] = true; // found an all-singleton dimension
neither:;
}
// note: if op is a scalar, then we end up with 0 dimensions here
dims = j;
as.resize(dims);
bs.resize(dims);
cs.resize(dims);
os.resize(dims);
let as1 = TensorShape(as); // BUGBUG: We just lost stride info.
let bs1 = TensorShape(bs);
let cs1 = TensorShape(cs);
let os1 = TensorShape(os);
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].DropDims(toDrop);
opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh)
dims = opDims.size(); // #dims has changed
for (size_t i = 0; i < N; i++)
assert(dims == shapes[i].size());
// note: if op is a scalar, then we end up with 0 dimensions here, which is allowed
//fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine broadcasting; that is, set strides to 0 for 1-dimensions
// To be more precise, we should only set actually broadcasting dimensions to 0.
// But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting.
// TODO: Do we need to allow other strides at this point in time? If not, broadcasting becomes a bit vector.
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].WithBroadcastStrides();
//fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine inverse broadcasting dimensions
// TODO: describe the resulting for loop as a set of tensor dims and strides as well.
vector<bool> cBroadcasts(dims);
// Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index.
// For regular input dims:
// - determine number of steps (product over opDims[.])
// - launch that many kernels
// - pass in:
// - total number of steps
// - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions
// - opDim (no stride magic allowed) for regular broadcasting dimensions
// - reverse broadcasting dimensions
// - opcodes for elementwise op and reduction op
// - in each kernel:
// - map thread index to dimensions (regular broadcasting ones)
// - for-loop over inverse broadcasting dimensions
// - map dimensions (including inverse broadcasting) for every input
// - perform op on the input values
// - accumulate
// - map dimensions (regular) for output
// - save result
// separate out the inverse-broadcasting dimensions
// Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension
// in one of the inputs, otherwise the entire dimension would have been optimized away above.
vector<bool> isReducingDim(dims); // true for each inverse-broadcasting dimension
for (size_t k = 0; k < dims; k++)
cBroadcasts[k] = cs1[k] == 1 && (as1[k] != 1 || bs1[k] != 1);
isReducingDim[k] = shapes.back()[k] == 1;
// form the regular (non-inverse-broadcasting) dims
for (size_t i = 0; i < N; i++)
regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides();
regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh)
// form the inverse-broadcasting dims
vector<bool> isRegularDim(dims); // true for each regular (non-reducing) dimension
for (size_t k = 0; k < dims; k++)
isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?)
for (size_t i = 0; i < N; i++)
reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides();
reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh)
for (size_t i = 0; i < N; i++)
offsets[i] = shapes[i].GetOffset();
}
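// Illustrative reference only (hypothetical helper, not the CPUMatrix/GPUMatrix kernels): the loop
// structure that the plan above describes, specialized to two inputs, elementwise sum, reduction by
// summation, and beta = 0 / alpha = 1. It walks the regular (output) dimensions with an odometer
// index, and for each output element reduces over the inverse-broadcasting dimensions, applying the
// per-operand strides computed by PrepareTensorOperands(). Assumes non-empty dimensions.
template<class ElemType>
static void ReferenceBinarySumOp(const ElemType* pa, const ElemType* pb, ElemType* pc,
                                 const array<size_t, 3> & offsets,
                                 const vector<size_t> & regularOpDims,  const array<vector<ptrdiff_t>, 3> & regularStrides,
                                 const vector<size_t> & reducingOpDims, const array<vector<ptrdiff_t>, 3> & reducingStrides)
{
    vector<size_t> rIndex(regularOpDims.size(), 0);                 // multi-index over the regular (output) dimensions
    for (;;)
    {
        // map the regular multi-index to an element offset per operand (broadcasting inputs have stride 0)
        ptrdiff_t ia = (ptrdiff_t)offsets[0], ib = (ptrdiff_t)offsets[1], ic = (ptrdiff_t)offsets[2];
        for (size_t k = 0; k < rIndex.size(); k++)
        {
            ia += (ptrdiff_t)rIndex[k] * regularStrides[0][k];
            ib += (ptrdiff_t)rIndex[k] * regularStrides[1][k];
            ic += (ptrdiff_t)rIndex[k] * regularStrides[2][k];
        }
        // reduce over the inverse-broadcasting dimensions; the output offset ic stays fixed here
        ElemType sum = 0;
        vector<size_t> dIndex(reducingOpDims.size(), 0);
        for (;;)
        {
            ptrdiff_t ja = ia, jb = ib;
            for (size_t k = 0; k < dIndex.size(); k++)
            {
                ja += (ptrdiff_t)dIndex[k] * reducingStrides[0][k];
                jb += (ptrdiff_t)dIndex[k] * reducingStrides[1][k];
            }
            sum += pa[ja] + pb[jb];                                 // the elementwise op (opSum), accumulated
            size_t k = 0;                                           // odometer increment of the reduction index
            while (k < dIndex.size() && ++dIndex[k] == reducingOpDims[k]) dIndex[k++] = 0;
            if (k == dIndex.size())
                break;                                              // all reduction positions visited
        }
        pc[ic] = sum;                                               // beta = 0: overwrite; alpha = 1: no scaling
        size_t k = 0;                                               // odometer increment of the output index
        while (k < rIndex.size() && ++rIndex[k] == regularOpDims[k]) rIndex[k++] = 0;
        if (k == rIndex.size())
            break;                                                  // all output elements written
    }
}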
template<class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
{
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
array<vector<ptrdiff_t>, 2> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// now perform the operation
fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(as1).c_str(), string(bs1).c_str(), string(cs1).c_str(), string(os1).c_str());
// :)
beta; alpha;
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op)
{
array<size_t, 3> offsets;
array<vector<ptrdiff_t>, 3> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template<class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op)
{
array<size_t, 4> offsets;
array<vector<ptrdiff_t>, 4> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// simple test function for testing stuff
@ -153,16 +226,67 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
/*static*/ void TensorView<ElemType>::Test()
{
Matrix<ElemType> m1(0); m1.Resize(1, 42);
Matrix<ElemType> m2(0); m2.Resize(13, 1);
Matrix<ElemType> m3(0); m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1); t1;
let t2 = TensorView<ElemType>(m2, s2); t2;
auto t3 = TensorView<ElemType>(m3, s3); t3;
t3.DoSumOf(0, t1, t2, 1);
Matrix<ElemType> m1(-1);
Matrix<ElemType> m2(-1);
Matrix<ElemType> m3(-1);
{
m1.SetValue(5, 3, { 1, 2, 3,
14, 15, 6,
4, 5, 16,
41, 5, 1,
1.8, 4.5, 7 });
m2.SetValue(5, 1, { 42,
13,
1968,
3.1415f,
7 });
m3.Resize(m1);
// regular zip (just add m1 to itself)
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1);
m3.Print();
// unary op
TensorView(m3).DoSqrtOf(0, TensorView(m1), 1);
m3.Print();
// broadcasting of an input
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over columns
m3.Resize(5, 1);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over rows
m3.Resize(1, 3);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
}
{
m1.Resize(1, 42);
m2.Resize(13, 1);
m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1); t1;
let t2 = TensorView<ElemType>(m2, s2); t2;
auto t3 = TensorView<ElemType>(m3, s3); t3;
t3.DoSumOf(0, t1, t2, 1);
m3.Print();
}
}
template class TensorView<float>;

View file

@ -4,7 +4,7 @@
// </copyright>
//
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor.
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. [fseide]
#pragma once
@ -36,17 +36,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ }
// copy constructor
TensorView(const TensorView<ElemType> & other) :
TensorView(other.m_sob, other.m_shape)
TensorView(*other.m_sob, other.m_shape)
{ }
// assignment is forbidden since we contain a reference
// If you ever need this, change the reference to a pointer.
void operator=(const TensorView & other) = delete; // since we have a reference
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
const Matrix<ElemType> & GetSOB() const { return m_sob; }
Matrix<ElemType> & GetSOB() const { return *m_sob; }
const TensorShape & GetShape() const { return m_shape; }
// -------------------------------------------------------------------
@ -59,20 +56,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
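// For example (sketch of the convention, with a and b as TensorViews over c's storage-compatible shapes):
// DoSumOf(0, a, b, 1) overwrites c with a + b, while DoSumOf(1, a, b, 1) accumulates a + b into the existing c.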
// -------------------------------------------------------------------
void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, 0); }
#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); }
ForAllUnaryOps(DeclareUnaryTensorOp);
ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
//DeclareUnaryTensorOp(Copy);
//DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not);
//DeclareUnaryTensorOp(Abs);
//DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine);
//DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha);
#pragma pop_macro("DeclareUnaryTensorOp")
#pragma push_macro("DeclareBinaryTensorOp")
#define DeclareBinaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); }
ForAllBinaryOps(DeclareBinaryTensorOp);
//DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient);
//DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min);
//DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE);
#pragma pop_macro("DeclareBinaryTensorOp")
#pragma push_macro("DeclareTernaryTensorOp")
#define DeclareTernaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); }
ForAllTernaryOps(DeclareTernaryTensorOp);
#pragma pop_macro("DeclareTernaryTensorOp")
static void Test();
private:
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/);
void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);
// -------------------------------------------------------------------
// sob members
// -------------------------------------------------------------------
Matrix<ElemType> & m_sob; // Storage OBject that holds the data that is being viewed with this TensorView
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
Matrix<ElemType> * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable.
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
// TODO: use a reference here or not? With a reference, we can hide more info in here such as cuDNN handles
};

View file

@ -16,6 +16,7 @@
#include <sstream> // TODO: this should go away once we update the parameter parsing
#include <unordered_map>
#include <opencv2/opencv.hpp>
#include <omp.h>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -400,6 +401,10 @@ void ImageReader<ElemType>::InitFromConfig(const ConfigRecordType& config)
m_prefetch = config(L"prefetch", true);
int cthread = config(L"numCPUThreads", 0);
if (cthread > 0)
omp_set_num_threads(cthread);
m_epochStart = 0;
m_mbStart = 0;
}
@ -412,11 +417,16 @@ void ImageReader<ElemType>::Destroy()
}
template<class ElemType>
void ImageReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
void ImageReader<ElemType>::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples)
{
assert(mbSize > 0);
assert(numSubsets > 0);
assert(subsetNum < numSubsets);
assert(requestedEpochSamples > 0);
m_subsetNum = subsetNum;
m_numSubsets = numSubsets;
if (m_imgListRand)
std::shuffle(m_files.begin(), m_files.end(), m_rng);
@ -457,7 +467,6 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>
m_pMBLayout->InitAsFrameMode(mbSize);
m_mbStart += mbSize;
// It is safe to run prefetching with just one buffer as SetValue is synchronous so there will be no race.
m_mbPrefetchFut = std::async(GetLaunchPolicy(m_prefetch), [this]() { return ReadImages(); });
@ -505,10 +514,15 @@ size_t ImageReader<ElemType>::ReadImages()
std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
size_t actualMBSize = mbLim - m_mbStart;
size_t iStart = actualMBSize * m_subsetNum / m_numSubsets;
size_t iLim = actualMBSize * (m_subsetNum + 1) / m_numSubsets;
size_t subsetSize = iLim - iStart;
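// Each subset (worker) reads a contiguous, non-overlapping slice of the minibatch; the integer
// rounding above makes consecutive slices meet exactly, so together they cover [0, actualMBSize).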
#pragma omp parallel for ordered schedule(dynamic)
for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
for (long long i = 0; i < static_cast<long long>(subsetSize); i++)
{
const auto& p = m_files[i + m_mbStart];
const auto& p = m_files[m_mbStart + iStart + i];
cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) };
if (!img.data)
RuntimeError("Cannot read image file %s", p.first.c_str());
@ -522,7 +536,8 @@ size_t ImageReader<ElemType>::ReadImages()
m_labBuf[m_labDim * i + p.second] = 1;
}
return mbLim - m_mbStart;
m_mbStart += actualMBSize;
return subsetSize;
}
template class ImageReader<double>;

View file

@ -39,7 +39,12 @@ public:
virtual void Init(const ScriptableObjects::IConfigRecord & config) override { InitFromConfig(config); }
#endif
void Destroy() override;
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override;
bool SupportsDistributedMBRead() const { return true; }
void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override
{
return StartDistributedMinibatchLoop(mbSize, epoch, 0, 1, requestedEpochSamples);
}
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
bool DataEnd(EndDataType endDataType) override;
@ -73,6 +78,9 @@ private:
size_t m_epochStart;
size_t m_mbStart;
size_t m_subsetNum;
size_t m_numSubsets;
bool m_prefetch;
std::future<size_t> m_mbPrefetchFut;
std::vector<ElemType> m_featBuf;

View file

@ -32,22 +32,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch);
uttInfoInMinibatch->clear();
uttInfoInMinibatch->resize(uttInfo.size());
for (size_t i = 0; i < uttInfo.size(); ++i)
{
size_t startFrameIndexInMinibatch = 0;
size_t numFrames = 0;
for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j)
{
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
/* if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
{
continue;
}
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature))
}*/
FrameRange fr(pMBLayout,j);
if (pMBLayout->IsGap(fr.Sequence(i)))
{
continue;
}
numFrames += 1;
if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd)
if (pMBLayout->IsBeyondStartOrEnd(fr.WithTimeOffset((ptrdiff_t) 1).Sequence(i))
|| j == pMBLayout->GetNumTimeSteps() - 1)
{
size_t uttIndex = (*uttInfoInMinibatch)[i].size();

View file

@ -4,10 +4,10 @@
// </copyright>
//
//
#include "stdafx.h"
#ifdef _WIN32
#include <objbase.h>
#endif
#include "Basics.h"
#include <fstream>
#include <algorithm>

View file

@ -12,21 +12,6 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
{
*pwriter = new LMSequenceWriter<ElemType>();
}
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
{
GetWriter(pwriter);
}
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
{
GetWriter(pwriter);
}
template<class ElemType>
class LMSequenceWriter : public IDataWriter<ElemType>
{
@ -65,8 +50,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
public:
using LabelType = typename IDataWriter<ElemType>::LabelType;
using LabelIdType = typename IDataWriter<ElemType>::LabelIdType;
void GetSections(std::map<std::wstring, SectionType, nocase_compare>& /*sections*/){}
void SaveMapping(std::wstring saveId, const std::map<typename LabelIdType, typename LabelType>& /*labelMapping*/){}
void SaveMapping(std::wstring saveId, const std::map<LabelIdType, LabelType>& /*labelMapping*/){}
public:
template<class ConfigRecordType>
@ -77,4 +64,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized);
};
template<class ElemType>
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
{
assert(pwriter != nullptr);
*pwriter = new LMSequenceWriter<ElemType>();
assert(*pwriter != nullptr);
}
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
{
GetWriter(pwriter);
}
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
{
GetWriter(pwriter);
}
}}}

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff