Multithreaded GPU code check-in; no speedup yet, because kernel launches still go through the shared CUDA stream.
This commit is contained in:
Parent
25f4191186
Commit
1fc15ed105
File diff not shown because it is too large
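The commit message above attributes the missing speedup to the stream used for kernel launches: every launch in this diff (for example _setToZeroIfLessThan and the _launchTensorOp* kernels below) goes to the shared global t_stream, so work submitted from different host threads still serializes on the device. The stand-alone sketch below is illustrative only and not part of the commit; it shows the usual alternative of giving each host thread its own cudaStream_t so that independent launches can overlap.

// Illustrative only: per-thread CUDA streams let kernels launched from different
// host threads overlap, whereas launching everything on one shared stream
// (like the global t_stream used throughout this diff) serializes them.
#include <cuda_runtime.h>
#include <thread>
#include <vector>

__global__ void scaleKernel(float* data, float alpha, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= alpha;
}

int main()
{
    const int n = 1 << 20;
    const int numThreads = 4;
    std::vector<float*> buffers(numThreads);
    std::vector<cudaStream_t> streams(numThreads);
    for (int t = 0; t < numThreads; t++)
    {
        cudaMalloc(&buffers[t], n * sizeof(float));
        cudaStreamCreate(&streams[t]); // one stream per worker thread
    }
    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++)
        workers.emplace_back([&, t] {
            // each host thread launches onto its own stream, so launches can overlap
            scaleKernel<<<(n + 255) / 256, 256, 0, streams[t]>>>(buffers[t], 0.5f, n);
            cudaStreamSynchronize(streams[t]);
        });
    for (auto& w : workers)
        w.join();
    for (int t = 0; t < numThreads; t++)
    {
        cudaStreamDestroy(streams[t]);
        cudaFree(buffers[t]);
    }
    return 0;
}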
@@ -75,6 +75,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
LogicError("CopySubTree: you cannot copy a tree without copying the node values.");

ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
std::set<wstring> set_cur_tonode_names;

set_cur_tonode_names.clear();

if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);

@@ -84,20 +87,66 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;

// fprintf(stderr, "CopySubTree node = %ls\n", fromNodeName.c_str());

ComputationNodeBasePtr toNode = CopyNode(fromNet, fromNodeName,
toNodeName,
CopyNodeFlags::copyNodeValue);

set_cur_tonode_names.insert(toNodeName);
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);

/* fprintf(stderr, "CopySubTree node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols())
); */
if (flags & CopyNodeFlags::copyNodeInputLinks)
{
// copy the children structure but use the new nodes generated
for (int i = 0; i < fromNode->GetNumInputs(); i++)
toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
{
wstring inputNodeName;
inputNodeName = (toNamePrefix + fromNode->GetInputs()[i]->NodeName());
if (set_cur_tonode_names.find(inputNodeName) == set_cur_tonode_names.end() )
{
//somehow the evalorder goes wrong, and the input is not copied to the toNetwork yet, do copy here
// fprintf(stderr, "CopySubTree i = %d, missing node = %ls\n", i, inputNodeName.c_str());

CopyNode(fromNet, fromNode->GetInputs()[i]->NodeName(),
inputNodeName,
CopyNodeFlags::copyNodeValue);
set_cur_tonode_names.insert(inputNodeName);
}
//fprintf(stderr, "CopySubTree i = %d, node = %ls\n", i, toNodeName.c_str());

//toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
toNode->SetInput(i, GetNodeFromName(inputNodeName));
}
}
}
}

// you can only copy inputs from nodes in the same network
void ComputationNetwork::ShowNodeMemory(const ComputationNetwork& fromNet,
const std::wstring fromName)
{

ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);

if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);

for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);

fprintf(stderr, "ShowNodeMemory node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols()));
}
}
// you can only copy inputs from nodes in the same network
void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring toName)
{
CopyNode(*this, fromName, toName, CopyNodeFlags::copyNodeInputLinks);
@@ -13,7 +13,7 @@
#include "MatrixPool.h"
#include "ComputationEnvironment.h"
#include "Globals.h"

#include <cuda_runtime.h>
#include <unordered_set>
#include <map>
#include <string>

@@ -233,7 +233,6 @@ struct ComputationNetworkOwnedNodeState
return m_parentGradientOptimization == ParentGradientOptimization::Reuse;
}

virtual void MarkValueNonSharable()
{
m_valueSharable = false;

@@ -351,6 +350,8 @@ class ComputationNodeBase : public IComputationNode,
public:
typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;

// -----------------------------------------------------------------------
// constructors, copying, (de-)serialization
// -----------------------------------------------------------------------
@@ -26,7 +26,12 @@
#include <boost/algorithm/string/join.hpp>
#include <boost/range/adaptor/transformed.hpp>

namespace Microsoft { namespace MSR { namespace CNTK {
namespace Microsoft
{
namespace MSR
{
namespace CNTK
{

// -----------------------------------------------------------------------
// ReduceElements (op, axis=, input)

@@ -39,10 +44,10 @@ template <class ElemType>
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
node->m_scale = m_scale;
node->m_scale = m_scale;
node->m_keepDimensions = m_keepDimensions;
}
}

@@ -57,7 +62,7 @@ template <class ElemType>
fstream >> num_axes;
for (int i = 0; i < num_axes; ++i)
{
int axis;
int axis;
fstream >> axis;
m_axes.push_back(axis);
}

@@ -123,7 +128,7 @@ template <class ElemType>
if (ReduceSequenceAxis())
{
ElemType gapPadValue = NeutralValue(m_reductionOp);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/ true, &gapPadValue);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/true, &gapPadValue);
}
else
input = InputRef(0).ValueTensorFor(rank, frInput);

@@ -142,7 +147,24 @@ template <class ElemType>
default:
// the actual operation is a Copy with reduction, where the magic is in the reduction op
// For "Mean", m_scale is 1/#elements, and 1 otherwise.
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
{
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 1, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
if (multi_thread)
result.DoUnaryOpOfDebug(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
else
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 2, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
}
}
}

@@ -156,15 +178,15 @@ template <class ElemType>
{
// Broadcast along the sequence
auto result = ValueFor(fr);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/ accumulateGradient ? (ElemType)1 : (ElemType)0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/accumulateGradient ? (ElemType) 1 : (ElemType) 0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
}
else
{
const auto frInput = (ReduceAllAxes() || ReduceBatchAxis()) ? FrameRange(InputRef(0).GetMBLayout()) : fr; // can't use 'fr' for ReduceAllAxes() as it refers to the result (same as for training criteria)
// get the args
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = ReduceAllAxes() ? TensorView<ElemType>(GradientPtr(), GetSampleLayout()) : GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one

// gradients are not as simple as passing an op-code, unfortunately
switch (m_reductionOp)

@@ -220,8 +242,8 @@ template <class ElemType>
break;
case ElementWiseOperator::opElementwiseProduct:
{
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
if (accumulateGradient)
sliceInputGrad.AddElementwiseProductWithQuotientOf(sliceOutputGrad, output, input);
else

@@ -242,13 +264,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}

@@ -258,13 +287,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}

@@ -282,7 +318,7 @@ template <class ElemType>
// validate the opcode (in case we got instantiated empty and never updated)
ValidateOp();

m_scale = (ElemType)1;
m_scale = (ElemType) 1;
if (ReduceAllAxes())
Base::ValidateUnaryReduce(isFinalValidationPass, m_keepDimensions);
else if (ReduceSequenceAxis())

@@ -319,29 +355,27 @@ template <class ElemType>

let shape = Input(0)->GetSampleLayout();
auto dims = shape.GetDims();
size_t reducedDimProd = 1;
size_t reducedDimProd = 1;
if (ReduceAllStaticAxes())
{
reducedDimProd = shape.GetNumElements();
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({ 1 })); // entire sample is reduced to a scalar
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({1})); // entire sample is reduced to a scalar
}
else if (!m_axes.empty()
&& std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
else if (!m_axes.empty() && std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
{
//Accumulate the number of elements for reduce_mean
reducedDimProd = std::accumulate(m_axes.begin(),
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });

// axes reduced to a scalar
if (m_keepDimensions)
std::for_each(m_axes.begin(),
m_axes.end(),
[&dims](int axis) {dims[axis - 1] = 1; }
);
m_axes.end(),
[&dims](int axis) { dims[axis - 1] = 1; });
else
{
SmallVector<size_t> reducedDims(dims.size() - m_axes.size());

@@ -355,13 +389,12 @@ template <class ElemType>
dims = reducedDims;
}
}
else if (isFinalValidationPass)
else if (isFinalValidationPass)
{
InvalidArgument("The shape of %ls [%ls] can not be reduced along axes [%ls]",
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str()
);
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str());
}
// for "Mean", we must divide by #elements
if (isFinalValidationPass && IsMean())

@@ -384,10 +417,14 @@ struct SequenceLengthVector
{
typedef vector<vector<size_t>> SequenceVector;
typedef MBLayout::SequenceInfo SequenceInfo;
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { }
size_t size() const { return m_sequenceInfo.size(); }
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector)
: m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) {}
size_t size() const
{
return m_sequenceInfo.size();
}
MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
{
SequenceInfo seq;

@@ -427,12 +464,12 @@ template <class ElemType>
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
{
double delta = input(0, inMBLayout->GetColumnIndex(seq, t)); // how many frames the current time step should expand into
desiredCount += delta; // this is now how many frames we should have
desiredCount += delta; // this is now how many frames we should have
// use a margin against round-off errors, so that we get non-binary ratios like 1/3 and 1/5 right
// This really means generate a frame if too few, unless we are within machine accuracy of the target.
// The assumption is that the delta has this error, while accumulation (in double) has no error.
ElemType relativeMargin = 1 - std::numeric_limits<ElemType>::epsilon();
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
|| indexSequence.size() < desiredCount * relativeMargin)
indexSequence.push_back(t);
}

@@ -441,10 +478,10 @@ template <class ElemType>
input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/ m_placementBuffer, /*temp*/ m_rowAllocationsBuffer);
// copy to output
vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
for (size_t i = 0; i < size; i++)
{
let& seq = outMBLayout->GetAllSequences()[i];

@@ -452,7 +489,7 @@ template <class ElemType>
continue;
let& indexSequence = indexSequences[i];
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType) indexSequence[t];
}
// there may be dangling gaps at the end. Take the opportunity to verify this.
for (size_t i = size; i < sequences.size(); i++)

@@ -460,7 +497,7 @@ template <class ElemType>
for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++)
assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID);
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/true, /*emptyTransfer=*/true, /*updatePreferredDevice=*/true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
}

@@ -501,9 +538,9 @@ template <class ElemType>
/*virtual*/ void PackedIndexNode<ElemType>::ForwardPropNonLooping() /*override*/
{
let& sourceMBLayout = InputRef(SOURCEDATA).GetMBLayout(); // only used for index conversion
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
// loop over sourceSequences
// Input matrix contains time indices for each sequence that refer to frames inside that sequence.
// We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout.

@@ -516,10 +553,10 @@ template <class ElemType>
let& indexSeq = indexMBLayout->FindMatchingSequence(sourceSequences, i); // find corresponding entry in indexMBLayout
for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence
{
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence)
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t) index(0, jIndex); // the new time location (relative to source sequence)
let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check.
result(0, jIndex) = (ElemType)jSource;
result(0, jIndex) = (ElemType) jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.

@@ -563,8 +600,8 @@ template <class ElemType>
/*virtual*/ void GatherPackedNode<ElemType>::ForwardPropNonLooping() /*override*/
{
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();

@@ -587,9 +624,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = InputRef(SOURCEDATA).Gradient(); // source to propagate the gradient intpu
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1, true);
}
}

@@ -614,7 +651,7 @@ template <class ElemType>
SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
else
{
SmallVector<size_t> layout = { 1 }; // Scalar
SmallVector<size_t> layout = {1}; // Scalar
if (Input(SOURCEDATA)->GetSampleLayout().GetRank() > 1)
{
auto srcLayout = Input(SOURCEDATA)->GetSampleLayout().GetDims();

@@ -638,8 +675,8 @@ template <class ElemType>
if (*InputRef(INDEXDATA).GetMBLayout() != *InputRef(SOURCEDATA).GetMBLayout())
InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str());
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();

@@ -653,7 +690,7 @@ template <class ElemType>
source.GetMatrixType(),
source.GetFormat());

auto& output = Value(); // output goes here
auto& output = Value(); // output goes here
output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1, true);
}

@@ -662,9 +699,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1);
}
}

@@ -707,8 +744,8 @@ template <class ElemType>
CropNode<ElemType>::CropNode(size_t offsetX, size_t offsetY, DEVICEID_TYPE deviceId, const wstring& name)
: CropNode(deviceId, name)
{
m_xOffset = (double)(offsetX);
m_yOffset = (double)(offsetY);
m_xOffset = (double) (offsetX);
m_yOffset = (double) (offsetY);
}

template <class ElemType>

@@ -880,8 +917,7 @@ void CropNode<ElemType>::ComputeCropOffsets()
// nodeToTransformMap contains coordinate maps for all nodes traversed so far, and is updated by this function.
// Traversal stack contains all nodes traversed so far. Inputs of currNode are pushed to traversal stack so that their
// inputs can be processed later on.
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap)
{
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap) {
if (!currNode->Is<TransformerNode>())
RuntimeError("Node does not support affine transform for cropping.");

@@ -1033,4 +1069,6 @@ template class CropNode<float>;
template class CropNode<double>;
template class CropNode<half>;

}}}
} // namespace CNTK
} // namespace MSR
} // namespace Microsoft
File diff not shown because it is too large
@@ -311,6 +311,7 @@ public:
CPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

CPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);

ElemType SumOfAbsElements() const; // sum of all abs(elements)
ElemType SumOfElements() const; // sum of all elements

@@ -3215,6 +3215,24 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");

auto& us = *this;

#pragma omp parallel for
foreach_coord (i, j, us)
{
if ((us(i, j)) < threshold)
us(i, j) = 0;
}

return *this;
}

//sum of all abs(elements)
template <class ElemType>
ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
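For reference, the new SetToZeroIfLessThan zeroes every element strictly below the threshold; unlike the existing SetToZeroIfAbsLessThan it does not compare absolute values. A minimal usage sketch follows; it is not part of the commit and assumes CNTK's usual CPUMatrix constructor and element accessor.

// Sketch only (assumes CNTK's CPUMatrix<ElemType> API): zero entries below 0.5.
#include "CPUMatrix.h"
using namespace Microsoft::MSR::CNTK;

void SetToZeroIfLessThanExample()
{
    CPUMatrix<float> m(2, 2);
    m(0, 0) = 0.1f;  m(0, 1) = 0.7f;
    m(1, 0) = -2.0f; m(1, 1) = 0.5f;
    m.SetToZeroIfLessThan(0.5f);
    // m now holds [0, 0.7; 0, 0.5]: values < 0.5 were zeroed, 0.5 itself is kept.
}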
@@ -2486,6 +2486,18 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}

template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
PrepareDevice();
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(Data(), threshold, N);
return *this;
}
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{

@@ -5318,6 +5330,119 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template <class ElemType>
void GPUMatrix<ElemType>::TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax &&
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");

a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");

// special case: linear processing
// The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access).
// Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
{
// special case: for copy, use cudaMemcpy() instead, or cublas_axpy()
// TODO: We should observe if these actually make a speed difference, and if not, remove these special cases.
if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1)
{
//fprintf(stderr, "TensorOpDebug 1 \n");
return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice));
}
else if (op == ElementWiseOperator::opCopy && beta == 1)
{
//fprintf(stderr, "TensorOpDebug 2 \n");
return CUBLAS_CALL(cublasaxpyHelper(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1));
}
else
{
//fprintf(stderr, "TensorOpDebug 3 \n");

return LaunchUnaryTensorOp<ElemType>(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]);
}
}

// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap
{
assert(reducingStrides[1][0] == 0);
auto ARows = regularOpDims[0]; // vertical steps
auto ACols = reducingOpDims[0]; // horizontal steps (reduction)
auto ALd = reducingStrides[0][0]; // horizontal step width through matrix
cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
CUBLAS_CALL(cublasgemmHelper(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha,
/*A00=*/a.Data() + offsets[0], (int) ALd,
/*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta,
/*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows));
//fprintf(stderr, "TensorOpDebug 4 \n");

return;
}

// TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.

// regular case
else
{
/*
fprintf(stderr, "TensorOpDebug 5 \n");
for (size_t i = 0; i < 2; i++)
{
// fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
fprintf(stderr, "TensorOpDebug 5.1, i = %d, offsets = %d\n ", int(i), int(offsets[i]));
}
fprintf(stderr, "TensorOpDebug 5.1, regularOpDims.size() = %d \n", int(regularOpDims.size()));

for (size_t i = 0; i < regularOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));

fprintf(stderr, "TensorOpDebug 5.1, reducingOpDims.size() = %d \n", int(reducingOpDims.size()));
for (size_t i = 0; i < reducingOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));

for (size_t i = 0; i < 2; i++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularStrides.size() = %d \n", int(i), int(regularStrides[i].size()));
for (size_t j = 0; j < regularStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
}

fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingStrides.size() = %d \n", int(i), int(reducingStrides[i].size()));
for (size_t j = 0; j < reducingStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
}
}

fprintf(stderr, "TensorOpDebug 5.2, beta = %f, alpha = %f, a.data = %f, data = %f \n", double(beta), double(alpha), double(a.FrobeniusNorm()), double(FrobeniusNorm()));
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
*/
return TensorOpNDebug<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, *this);

}
}

// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
@@ -437,6 +437,7 @@ public:
GPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

GPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);

DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements() const; // sum of all abs(elements)

@@ -607,6 +608,11 @@ public:
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
@@ -1306,6 +1306,19 @@ __global__ void _setToZeroIfAbsLessThan(
a[id] = 0;
}

template <class ElemType>
__global__ void _setToZeroIfLessThan(
ElemType* a,
const ElemType threshold,
const CUDA_LONG N)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
if (((comp_t) a[id]) < (comp_t) threshold)
a[id] = 0;
}
template <class ElemType>
__global__ void _areEqual(
const ElemType* a,
@@ -2987,6 +2987,19 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
VerifyWritable(__FUNCTION__);

if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
return *this;
}
#pragma endregion

#pragma region Helper Functions
@@ -395,7 +395,7 @@ public:
GPUSparseMatrix<ElemType>& AssignTruncateTopOf(const GPUSparseMatrix<ElemType>& a, const ElemType threshold);

GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);

GPUSparseMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignOneHot(const GPUMatrix<ElemType>& a, vector<size_t>& shape, size_t axis);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(const GPUMatrix<ElemType>& vector);
@@ -875,6 +875,12 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)
return reductionBuffersCache[deviceId];
}

// this is safe for multithread calling in RNNT_EMBR
template <class ElemType>
static shared_ptr<ElemType> GetReductionBufferNoCache(size_t N)
{
return AllocateReductionBuffer<ElemType>(N);
}
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
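GetReductionBufferNoCache above bypasses the per-device cached reduction buffer, which is what makes concurrent calls safe: with the cached buffer, two host threads reducing at the same time would write their block-level partial sums into the same scratch memory. The sketch below is illustrative only (simplified names, not CNTK code) and contrasts the two allocation patterns.

// Illustrative sketch, not CNTK code: a shared cached scratch buffer vs. a
// per-call allocation in the spirit of GetReductionBufferNoCache.
#include <memory>
#include <mutex>
#include <vector>

static std::vector<float> g_cachedScratch; // shared across all callers
static std::mutex g_cacheMutex;

// Cached pattern: every caller receives the same pointer, so two threads that
// reduce concurrently overwrite each other's partial sums in this buffer.
float* GetCachedScratch(size_t n)
{
    std::lock_guard<std::mutex> lock(g_cacheMutex);
    if (g_cachedScratch.size() < n)
        g_cachedScratch.resize(n);
    return g_cachedScratch.data();
}

// No-cache pattern: each reduction owns its scratch memory for its whole
// lifetime, so concurrent calls cannot interfere (at the cost of an allocation).
std::unique_ptr<float[]> GetScratchNoCache(size_t n)
{
    return std::make_unique<float[]>(n);
}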
@ -1112,6 +1118,272 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <class ElemType, C_size_t N, C_int M, C_int K>
|
||||
static void LaunchTensorOpWithReductionDebug(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
|
||||
const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
|
||||
{
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
a;
|
||||
result;
|
||||
typedef typename TypeSelector<ElemType>::comp_t ReduceElemType;
|
||||
// copy all parameters to CUDA-compatible data structures
|
||||
FixedArray<ElemType*, N> pointers(pointerVector);
|
||||
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
|
||||
C_size_t numElements = 1;
|
||||
// input divisors
|
||||
SmallVector<fast_divmod> regularOpStrideDivmodVector;
|
||||
for (C_size_t k = 0; k < regularOpDims.size(); k++)
|
||||
{
|
||||
regularOpStrideVector.push_back(numElements); // stride for dense representation of our output elements (if they were flattened)
|
||||
regularOpStrideDivmodVector.push_back(fast_divmod((unsigned int) numElements));
|
||||
numElements *= (C_size_t) regularOpDims[k];
|
||||
}
|
||||
// output divisors
|
||||
SmallVector<fast_divmod> reducingOpDimDivmodVector;
|
||||
C_size_t stride = 1;
|
||||
for (C_size_t k = 0; k < reducingOpDimVector.size(); ++k)
|
||||
{
|
||||
reducingOpDimDivmodVector.push_back(fast_divmod(stride));
|
||||
stride *= (C_size_t) reducingOpDimVector[k];
|
||||
}
|
||||
|
||||
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
|
||||
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
|
||||
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
|
||||
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
|
||||
// reduced divisors
|
||||
FixedArray<fast_divmod, K> regularOpStrideDivmod(regularOpStrideDivmodVector);
|
||||
FixedArray<fast_divmod, M> reducingOpDimDivmod(reducingOpDimDivmodVector);
|
||||
|
||||
// launch the kernel
|
||||
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
|
||||
SyncGuard syncGuard;
|
||||
|
||||
// do some optimization for reductions
|
||||
// - example: 30 GPU procs, warp size 32 --> 960 GPU cores
|
||||
// - NN elements must be computed, each involving a reduction over reductionDim elements
|
||||
// Cases:
|
||||
// - #output elements NN >= GPU cores --> use one proc per element, do reduction in inner loop
|
||||
// E.g. if >=960 elements are computed, each gets its own GPU thread.
|
||||
// - reduction dimension would benefit from multiple blocks --> multiple blocks work on a single output element
|
||||
// E.g.
|
||||
// - gradient of adding a bias: reducing to a bias, e.g. 512-dim
|
||||
// - gradient of scalar multiplication: big elementwise product reduced to a scalar (big dot product, e.g. [1024 x 1024] = 1M elements)
|
||||
// - softmax in seq-2-seq attention model: reduce over length of attention window (e.g. 20)
|
||||
// - summation of criterion value: scalar reduction over a few hundred or thousand samples in the minibatch
|
||||
C_size_t reductionDim = 1; // number of elements to reduce over
|
||||
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
|
||||
reductionDim *= (C_size_t) reducingOpDimVector[k];
|
||||
GridDim grid(NN);
|
||||
let& props = GridDim::GetDeviceProps();
|
||||
bool disableParallelReduction = false; // (for debugging)
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
// === arg based reduction, one thread per output element
|
||||
if ((reductionOp == ElementWiseOperator::opArgmax) ||
|
||||
(reductionOp == ElementWiseOperator::opArgmin))
|
||||
{
|
||||
|
||||
_launchTensorArgOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
|
||||
pointers, reductionOp,
|
||||
regularOpStrides, regularStrides, grid.m_N,
|
||||
reducingOpDims, reducingStrides,
|
||||
regularOpStrideDivmod, reducingOpDimDivmod);
|
||||
}
|
||||
// === simple case: NN large, one thread per output element
|
||||
else if (reductionDim == 1 || // no reduction
|
||||
grid.m_blocksPerGrid >= props.multiProcessorCount || // enough output elements to fill all multiprocs
|
||||
reductionDim * numElements <= 2 * props.warpSize || // trivial operation not worth the trouble (2* because the more complex one also needs 2 kernel launches)
|
||||
disableParallelReduction || // (for debugging)
|
||||
reductionDim * numElements <= props.multiProcessorCount) // recursive call from reduction below
|
||||
{
|
||||
// we got enough elements to generate: do one element per thread, and reduction inside
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
|
||||
beta, pointers, alpha, op, reductionOp,
|
||||
regularOpStrides, regularStrides, grid.m_N,
|
||||
reducingOpDims, reducingStrides,
|
||||
regularOpStrideDivmod, reducingOpDimDivmod);
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
}
|
||||
// === optimization: simple case would not use all multiprocs
|
||||
else
|
||||
{
|
||||
// m_blocksPerGrid can be thought of NN / 512, with appropriate rounding
|
||||
|
||||
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
|
||||
// If we get here, then
|
||||
// - the total number of outputs to produce is < #multiprocs * warpSize, e.g. < 960
|
||||
// - each output has at least two inputs, but possibly millions
|
||||
// Examples:
|
||||
// (a1) NN=900
|
||||
// - each multiproc processes multiple elements concurrently, each reducing over its inputs inside
|
||||
// - use one block per output element
|
||||
// (a2) NN=30
|
||||
// - same as (a1) except 30 multiprocs run only a single block each
|
||||
// (a3) NN=16
|
||||
// - same as (a1) except only 16 multiproc run one block
|
||||
// (b1) NN=15
|
||||
// - 2 blocks work together on a single output element
|
||||
// (b2) NN=1 (NN < #multiprocs, e.g. NN < 30)
|
||||
// - multiple blocks work together on a single output element
|
||||
// - only this case requires memory, and only K * NN
|
||||
// where K = blocks that work together,
|
||||
// both K and NN < #multiprocs,
|
||||
// and K * NN = on the order of NN, but generally a bit larger due to rounding.
|
||||
|
||||
// By how much do we underutilize?
|
||||
// We increase #blocks by that factor by breaking reduction into that many chunks.
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
int numReductionChunks = std::max<int>(props.multiProcessorCount / NN, 1); // only >1 for NN < multiProcessorCount
|
||||
|
||||
// distribute NN over block X and Y
|
||||
int blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
|
||||
int numBlocksX = CeilDiv(NN, blockXOverBy);
|
||||
int numBlocksY = CeilDiv(NN, numBlocksX);
|
||||
// while block Z is for multiple blocks working together on a single output element
|
||||
int numBlocksZ = numReductionChunks;
|
||||
// Block dim is now:
|
||||
// - X, Y: such that X*Y covers NN
|
||||
// - Z: reduction chunks
|
||||
|
||||
// reduction goes into thread dim X
|
||||
int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
|
||||
int numThreadsX = std::min<int>(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
|
||||
|
||||
// --- cases (a1) and (a2)
|
||||
// This involves no reduction across blocks.
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.4, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
if (numReductionChunks == 1)
|
||||
{
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.5, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
|
||||
beta, pointers, alpha, op, reductionOp,
|
||||
regularOpStrides, regularStrides, NN,
|
||||
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
|
||||
regularOpStrideDivmod, reducingOpDimDivmod);
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.6, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
}
|
||||
// --- case (b)
|
||||
// Reduction across blocks. This is the difficult one.
|
||||
#ifndef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducability
|
||||
else
|
||||
{
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.7, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
// we get here if NN <= #multiprocs
|
||||
assert(NN <= props.multiProcessorCount && numBlocksX == NN && numBlocksY == 1);
|
||||
// dims are:
|
||||
// - numBlocksZ = numReductionChunks = how many multiprocs work together to produce one output element
|
||||
// - numBlocksX = NN = number of output elements
|
||||
// - numThreadsX = reductionChunkSize clipped to 512; reductionChunkSize > 512 is handled by an inner for loop inside of the kernel
|
||||
|
||||
// we need memory for block outputs of dimension [numBlocksX x numBlocksZ]
|
||||
// - total elements = NN * Floor(#multiprocs / NN) = <= #multiprocs
|
||||
let reductionBufferSize = props.multiProcessorCount;
|
||||
assert(reductionBufferSize >= NN * numBlocksZ);
|
||||
shared_ptr<ElemType> reductionBuffer = GetReductionBufferNoCache<ElemType>(reductionBufferSize);
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.8, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
// 'pointers', 'regularOpStrides', and 'regularStrides' are set up to point to the target memory.
|
||||
// We need to reroute them to point to our reductionBuffer.
|
||||
// - pointer[N-1] -> replace by reductionBuffer
|
||||
// - regularStrides -> replace [N-1] by regularOpStrides which already represent the NN elements for a dense memory layout
|
||||
// - beta -> 0 since we write into temp memory
|
||||
// - kernel must use block.z as second index into the output buffer; add (block.z * NN) to the pointer
|
||||
FixedArray<ElemType*, N> pointers1 = pointers;
|
||||
pointers1[N - 1] = reductionBuffer.get();
|
||||
auto regularStrideVectors1 = regularStrideVectors;
|
||||
for (size_t k = 0; k < regularOpStrides.size(); k++)
|
||||
regularStrideVectors1[N - 1][k] = (ptrdiff_t) regularOpStrideVector[k];
|
||||
FixedMatrix<C_int, N, K> regularStrides1(regularStrideVectors1);
|
||||
ElemType beta1 = 0;
|
||||
ElemType alpha1 = 1;
|
||||
// fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.9, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
|
||||
beta1, pointers1, alpha1, op, reductionOp,
|
||||
regularOpStrides, regularStrides1, NN,
|
||||
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
|
||||
regularOpStrideDivmod, reducingOpDimDivmod);
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
|
||||
#if 1
|
||||
// now reduce and redistribute
|
||||
// Create a new tensor task, and execute it recursively:
|
||||
// - input = reductionBuffer
|
||||
// - output = true output
|
||||
// - op dims/strides = output elements
|
||||
// - reduce dims/strides = numBlocksZ
|
||||
// - op = opCopy
|
||||
array<ElemType*, 2> pointerVector2{reductionBuffer.get(), pointerVector[N - 1]};
|
||||
const array<SmallVector<ptrdiff_t>, 2> regularStrideVectors2{regularStrideVectors1[N - 1], regularStrideVectors[N - 1]};
|
||||
const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{SmallVector<ptrdiff_t>{NN}, SmallVector<ptrdiff_t>{0}};
|
||||
const SmallVector<size_t> reducingOpDimVector2{(size_t) numReductionChunks};
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
|
||||
beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
|
||||
regularOpDims, regularStrideVectors2,
|
||||
reducingOpDimVector2, reducingStrideVectors2);
|
||||
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
|
||||
|
||||
// (note: ^^this will have a nested syncGuard, which is fine)
|
||||
|
||||
#else
|
||||
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
|
||||
beta, pointers, alpha, op, reductionOp,
|
||||
regularOpStrides, regularStrides, grid.m_N,
|
||||
reducingOpDims, reducingStrides);
|
||||
//for (size_t z = 0; z < numBlocksZ; z++)
|
||||
// _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(z == 0 ? beta : 1, pointers, alpha, op,
|
||||
// regularOpStrides, regularStrides, NN,
|
||||
// reducingOpDims, reducingStrides, reductionChunkSize * z, reductionChunkSize);
|
||||
vector<ElemType> peekPartial(NN * numBlocksZ, -42);
|
||||
vector<ElemType> peekFinal(NN, -42);
|
||||
CUDA_CALL(cudaMemcpy(peekPartial.data(), reductionBuffer, sizeof(ElemType) * peekPartial.size(), cudaMemcpyDeviceToHost));
|
||||
CUDA_CALL(cudaMemcpy(peekFinal.data(), pointers[pointers.size() - 1], sizeof(ElemType) * peekFinal.size(), cudaMemcpyDeviceToHost));
|
||||
double s1 = 0, s2 = 0;
|
||||
for (auto v : peekPartial)
|
||||
s1 += v;
|
||||
for (auto v : peekFinal)
|
||||
s2 += v;
|
||||
sin(1.0);
|
||||
#endif
|
||||
}
#else
    else if (beta == 1)
    {
        // no need to pre-scale; just add (common for gradients)
        _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides,
            regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
            regularOpStrideDivmod, reducingOpDimDivmod);
        return;
    }
    else
    {
        // We need more than one chunk, we will use atomicAdd().
        // First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
        _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
            regularOpStrideDivmod, reducingOpDimDivmod);
        // We will leave it like this for a while, but eventually need to revisit using temporary memory.
        _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize,
            regularOpStrideDivmod, reducingOpDimDivmod);
    }
#endif
}
}
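// Note on the multi-chunk reduction above (descriptive, based on the launches shown here): when the
// reduction is split across numBlocksZ chunks, the first launch covers chunk 0 with the caller's beta
// (resetting or pre-scaling the output), and the second launch covers the remaining numBlocksZ - 1
// chunks with beta = 1 so that their partial sums accumulate into the same output via atomicAdd().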
// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------

@ -1205,6 +1477,24 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
    }
}

template <class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoopDebug(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                         const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                                         const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
    size_t dims = reducingOpDims.size();
    switch (dims)
    {
    case 2:
        return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1:
        return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0:
        return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides);
    default:
        LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int) dims);
    }
}
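// TensorOpWithRegularLoopDebug mirrors the non-debug dispatch: the number of flattened reduction
// dimensions (0, 1, or 2) selects the compile-time M parameter of the launcher, and anything larger
// is rejected with a LogicError.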
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>

@ -1236,6 +1526,17 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
    }
}

template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                    const array<size_t, N>& offsets,
                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
{
    for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
        pointers[i] += offsets[i];
    return LaunchTensorOpWithReductionDebug<ElemType, N, 1, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, result);
}
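// TensorOpNDebug folds each operand's element offset into its raw pointer and then calls the debug
// launcher directly with M = 1, K = 0; the extra 'a'/'result' matrices appear to be threaded through
// only for the (commented-out) FrobeniusNorm printouts in the launcher.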
//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------

@ -1277,6 +1578,21 @@ template void TensorOpN<half, 4>(half beta, array<half*, 4> pointers, half alpha
                                 const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
                                 const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

template void TensorOpNDebug<half, 2>(half beta, array<half*, 2> pointers, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                      const array<size_t, 2>& offsets,
                                      const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                      const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<half>& a, GPUMatrix<half>& result);

template void TensorOpNDebug<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                        const array<size_t, 2>& offsets,
                                        const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                        const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<double>& a, GPUMatrix<double>& result);

template void TensorOpNDebug<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                       const array<size_t, 2>& offsets,
                                       const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                       const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<float>& a, GPUMatrix<float>& result);


template void LaunchUnaryTensorOp(float beta, const float* pa, float* pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double* pa, double* pb, double alpha, ElementWiseOperator op, size_t regularOpDim);
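// The explicit TensorOpNDebug instantiations (float, double, half; N = 2) are needed because the
// callers live in GPUMatrix.cu, a separate translation unit from this CUDA file.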

@ -23,6 +23,12 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
               const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
               const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);

template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                    const array<size_t, N>& offsets,
                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result);

template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);


@ -3669,6 +3669,21 @@ Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType thresh
    return *this;
}

template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("SetToZeroIfLessThan: Matrix is empty.");

    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetToZeroIfLessThan(threshold),
                            m_GPUMatrix->SetToZeroIfLessThan(threshold),
                            NOT_IMPLEMENTED,
                            m_GPUSparseMatrix->SetToZeroIfLessThan(threshold));

    return *this;
}
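// DISPATCH_MATRIX_ON_FLAG routes SetToZeroIfLessThan to the dense CPU, dense GPU, or sparse GPU
// member by storage type; the remaining sparse slot is left NOT_IMPLEMENTED here.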
//sum of all elements
template <class ElemType>
ElemType Matrix<ElemType>::SumOfElements() const

@ -6399,6 +6414,23 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemTy
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                     const array<size_t, 2>& offsets,
                                     const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                     const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
    VerifyIsDense(*this) && VerifyIsDense(a);

    DecideAndMoveToRightDevice(*this, a);

    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
                            m_GPUMatrix->TensorOpDebug(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
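// Only the dense GPU path has a dedicated debug variant; the dense CPU case falls back to the
// regular CPUMatrix::TensorOp, and the sparse cases remain NOT_IMPLEMENTED.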

template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,


@ -474,6 +474,8 @@ public:
    void InplaceTranspose();

    Matrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
    Matrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);


    DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
    ElemType SumOfAbsElements() const; // sum of all abs(elements)

@ -658,6 +660,11 @@ public:
                  const std::array<size_t, 2>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
    void TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                       const std::array<size_t, 2>& offsets,
                       const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                       const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

    void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 3>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,


@ -709,6 +709,11 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
    return *this;
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{

@ -1723,7 +1728,11 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
{
    return *this;
}

template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
    return *this;
}
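// As shown in this diff, the new GPUSparseMatrix/GPUMatrix SetToZeroIfLessThan overloads simply
// return *this without touching the data; they look like placeholders rather than real kernels.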
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{


@ -217,7 +217,7 @@ static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N>
    }

    for (size_t i = 0; i < N; i++)
        offsets[i] = shapes[i].GetOffset();
        offsets[i] = shapes[i].GetOffset();
}

// enforce that in case of broadcasting, the output must not be an input

@ -249,6 +249,47 @@ void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT
    GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
    // static int cc = 0; if (cc++ == 0)
    //     fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());

    // prepare all tensor descriptor information as needed for execution
    array<size_t, 2> offsets;
    array<SmallVector<ptrdiff_t>, 2> regularStrides, reducingStrides;
    SmallVector<size_t> regularOpDims, reducingOpDims;
    PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2>{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

    // output cannot be input when reducing
    if (reducingOpDims.size() > 0)
        CheckDifferentObject(a, *this);
    /*
    for (size_t i = 0; i < 2; i++)
    {
        // fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
        fprintf(stderr, "i = %d, offsets = %d\n ", int(i), int(offsets[i]));
    }
    for (size_t i = 0; i < regularOpDims.size(); i++)
        fprintf(stderr, "i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));
    for (size_t i = 0; i < reducingOpDims.size(); i++)
        fprintf(stderr, "i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));

    for (size_t i = 0; i < 2; i++)
    {
        for (size_t j = 0; j < regularStrides[i].size(); j++)
        {
            fprintf(stderr, "i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
        }
        for (size_t j = 0; j < reducingStrides[i].size(); j++)
        {
            fprintf(stderr, "i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
        }
    }
    */
    // now perform the operation
    GetSOB().TensorOpDebug(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
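// DoUnaryOpOfDebug prepares its operands exactly like DoUnaryOpOf and differs only in calling the
// TensorOpDebug entry point on the storage object (plus the large commented-out dump of offsets,
// dimensions, and strides).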
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{


@ -131,6 +131,7 @@ public:
#pragma pop_macro("DeclareTernaryTensorOp")

    void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
    void DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
    void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
    void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);


@ -1051,10 +1051,18 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
                                    string showWERMode,
                                    bool SVD)
{
    PROFILE_SCOPE(profilerEvtMainEpoch);

    PROFILE_SCOPE(profilerEvtMainEpoch);
    std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
    ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training);

    ComputationNetwork decode_cn_root;
    decode_cn_root.CopySubTree(*net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll);
    decode_cn_root.CompileNetwork();
    std::vector<ComputationNodeBasePtr> decodeOutputNodesTmp = decode_cn_root.OutputNodesByName(decodeOutputNodeNames);
    decode_cn_root.FormEvalOrder(decodeOutputNodesTmp[0]);
    decode_cn_root.FormNestedNetwork(decodeOutputNodesTmp[0]);
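    // The decode subnetwork rooted at decodeOutputNodeNames[0] is copied into decode_cn_root and
    // compiled up front, presumably so that the multithreaded MBR decoding further below can evaluate
    // its own copy of the decoder instead of sharing the main training network.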

    // bring our 'out' values into consistent state
    epochCriterion = EpochCriterion(0);
    epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0));

@ -1078,7 +1086,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    auto ContainsAccumulatedResult = [&evaluationNodesWhichAccumulateResult](ComputationNodeBasePtr node) {
        return evaluationNodesWhichAccumulateResult.find(node) != evaluationNodesWhichAccumulateResult.end();
    };

    // MA-related variables
    size_t nSamplesSinceLastModelSync = 0;
    size_t blockSizePerWorker = 0;

@ -1122,7 +1129,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    // this is non-trivial, we need a manager object to handle this
    if (numSubminibatchesNeeded > 1)
        smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);

    // The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training.
    // This attempts to compute the error signal for the whole utterance, which will
    // be fed to the neural network as features. Currently it is a workaround

@ -1241,6 +1247,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        numSamplesWithLabelOfNetworkMBR = 0;

        auto profGetMinibatch = ProfilerTimeBegin();

        bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork<ElemType>(*trainSetDataReader, net, criterionNodes[0],
                                                                                useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize, m_mpi);

@ -1321,12 +1328,10 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        //net->CompileNetwork();
        std::vector<std::wstring> encodeOutputNodeNames(outputNodeNamesVector.begin(), outputNodeNamesVector.begin() + 1);
        std::vector<ComputationNodeBasePtr> encodeOutputNodes = net->OutputNodesByName(encodeOutputNodeNames);

        //
        //net->CollectInputAndLearnableParameters(encodeOutputNodes[0]);
        std::list<ComputationNodeBasePtr> InputNodesList = net->InputNodes(criterionNodes[0]);
        std::vector<std::wstring> encodeInputNodeNames;

        if (SVD)
            encodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
        else

@ -1336,7 +1341,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        *encodeInputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(encodeInputNodes);

        //get decode input matrix
        std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
        std::vector<ComputationNodeBasePtr> decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames);

        //net->CollectInputAndLearnableParameters(decodeOutputNodes[0]);

@ -1346,6 +1350,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        else
            decodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
        std::vector<ComputationNodeBasePtr> decodeinputNodes = net->OutputNodesByName(decodeInputNodeNames);

        *decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes);

        if (!ordered)

@ -1359,7 +1364,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
            std::vector<ComputationNodeBasePtr> Plustransnodes = net->OutputNodesByName(plusTransNodeNames);
            net->FormEvalOrder(Plustransnodes[0]);
        }

        //form eval order for RELU

        auto reffeainput = (*encodeInputMatrices).begin();

@ -1367,9 +1371,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        auto reflminput = (*decodeinputMatrices).begin();
        auto decodeMBLayout = reflminput->second.pMBLayout;

        net->ForwardProp(encodeOutputNodes);
        //form eval order for RELU
        time_t my_time = time(NULL);
        fprintf(stderr, "SGD time 0 = %s", ctime(&my_time));

        Matrix<ElemType> encodeOutput(net->GetDeviceId());
        net->ForwardProp(encodeOutputNodes);
        size_t deviceid = net->GetDeviceId();
        Matrix<ElemType> encodeOutput(deviceid);
        encodeOutput.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(encodeOutputNodes[0])->Value()));
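        // encodeOutput takes a device-local copy of the encoder output node's Value() so that the
        // decoding code below can read it without going through the network's own buffers.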

        vector<vector<PathInfo>> uttPathsInfo;

@ -1379,12 +1387,50 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        vector<float> vt_onebest_wer;
        vt_onebest_wer.clear();

        // time_t my_time = time(NULL);
        // fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
        my_time = time(NULL);
        fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
        RNNTDecodeFunctions<ElemType> rnntdfs;
        rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
        //my_time = time(NULL);
        //fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
        //rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);

        //vt_printname.push_back(L"DecodeOutputLN");
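        // The branch below is the new multithreaded path: when enableMultiThreadDecodeMBR is set,
        // the output-projection parameters are pulled out of the network as plain matrices and the
        // n-best MBR decoding runs against the copied decode_cn_root network.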

        if (m_enableMultiThreadDecodeMBR)
        {
            ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode;
            Matrix<ElemType> Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid);
            if (SVD)
            {
                WmuNode = net->GetNodeFromName(outputNodeNamesVector[4]);
                WmvNode = net->GetNodeFromName(outputNodeNamesVector[5]);
                bmNode = net->GetNodeFromName(outputNodeNamesVector[6]);
                Wmu.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmuNode)->Value()));
                Wmv.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmvNode)->Value()));
                WmNode; // no-op statements: WmNode and Wm are unused in the SVD branch
                Wm;
            }
            else
            {
                WmNode = net->GetNodeFromName(outputNodeNamesVector[4]);
                bmNode = net->GetNodeFromName(outputNodeNamesVector[5]);
                Wm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmNode)->Value()));
                WmuNode; // no-op statements: the SVD factors are unused in the non-SVD branch
                WmvNode;
                Wmu;
                Wmv;
            }

            bm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(bmNode)->Value()));
            /*
            size_t num_utt = 7;
            size_t start_utt = 0;
            */
            rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeInputNodeNames, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, decode_cn_root, Wm, Wmu, Wmv, bm); /*, num_utt, start_utt); */
        }
        else
            rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, net);
        // rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net, decode_cn, decodeinputNodes_tmp);
        my_time = time(NULL);
        fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
        //fprintf(stderr, "decode SGD v0 .\n");

        //net->BumpEvalTimeStamp(decodeinputNodes);

@ -1400,21 +1446,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        refFeaMatBackup.SetValue(reffeainput->second.GetMatrix<ElemType>());
        size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences();

        //my_time = time(NULL);
        //fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
        my_time = time(NULL);
        fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));

        for (const auto& seq : encodeMBLayout->GetAllSequences())
        {
            /*
            if (seqId == 1)
            {
                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
                {
                    ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                    node->force_gradient_accumulate(true);
                }
            }
            */
            if (seq.seqId == GAP_SEQUENCE_ID)
            {
                continue;

@ -1423,15 +1459,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
            {
                continue;
            }

            //if (firstdebug)
            cNode->SetMWERInfo(uttPathsInfo[seqId], lengthNorm, wordPathPosteriorFromDecodeMBR, doMBR, vt_nws[seqId]);

            // get the feature MBLayout
            size_t numFrames = seq.GetNumTimeSteps();
            numSamplesWithLabelOfNetworkMBR += numFrames;

            // if (firstdebug)
            reffeainput->second.pMBLayout->Init(1, numFrames); // 1 channel, 1 utterance

            Matrix<ElemType> fea(deviceID);

@ -1447,11 +1480,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,

                fea.SetColumn(refFeaMatBackup.ColumnSlice(uID, 1), t);
            }
            //if (firstdebug)
            {
                reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
                reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: the first 0 is the utterance ID, the second 0 means the 0th channel, and the length runs from 0 to numFrames
            }

            reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
            reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: the first 0 is the utterance ID, the second 0 means the 0th channel, and the length runs from 0 to numFrames

            // guoye: the two calls below reset the state, to make sure ForwardProp always gets carried out
            ComputationNetwork::BumpEvalTimeStamp(encodeInputNodes); // guoye: update the time stamp before doing the forward prop

@ -1463,9 +1494,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
            size_t nBest = uttPathsInfo[seqId].size();
            if (nBest > (m_maxFrameNumPerMinibatchMBR / numFrames))
            {
                // reset nBest to keep the minibatch frame count within budget
                // reset nBest to cap the MB size
                nBest = (m_maxFrameNumPerMinibatchMBR / numFrames);
            }
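            // nBest is capped so that nBest * numFrames stays within the per-minibatch frame
            // budget m_maxFrameNumPerMinibatchMBR configured below in SGDParams.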
            }
            size_t maxPhoneSeqLen = uttPathsInfo[seqId][0].label_seq.size();

            for (size_t n = 1; n < nBest; n++)

@ -1583,8 +1614,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
            }
        }

        //my_time = time(NULL);
        //fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
        my_time = time(NULL);
        fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
    }
    // ===========================================================
    // forward prop for evaluate eval nodes

@ -1790,8 +1821,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    {
#ifdef _DEBUG
        if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
            LogicError("%ls %ls operation has NaNs in smoothedGradient.",
                       node->NodeName().c_str(), node->OperationName().c_str());
            LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
        double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
        double nodeDependentRegMultiplier = dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->GetRegMultiplier();

@ -3460,7 +3490,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
    m_lengthNorm = configSGD(L"LengthNorm", true);
    m_showWERMode = configSGD(L"showWERMode", "average");
    m_isSVD = configSGD(L"SVD", true);

    m_enableMultiThreadDecodeMBR = configSGD(L"enableMultiThreadDecodeMBR", true);
    m_maxFrameNumPerMinibatchMBR = configSGD(L"MaxFrameNumPerMinibatchMBR", (size_t) 2000);
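    // New SGD config knobs read here: enableMultiThreadDecodeMBR (default true) switches on the
    // multithreaded MBR decoding path, and MaxFrameNumPerMinibatchMBR (default 2000) is the frame
    // budget used above to cap nBest per utterance.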
    if (m_doGradientCheck && sizeofElemType != sizeof(double))
    {

@ -292,6 +292,7 @@ protected:
    string m_showWERMode;
    bool m_isSVD;
    size_t m_maxFrameNumPerMinibatchMBR;
    bool m_enableMultiThreadDecodeMBR;
    // Parallel training
    MPIWrapperPtr m_mpi;


@ -421,7 +421,7 @@ public:
        Matrix<ElemType> encodeOutput(deviceid);
        Matrix<ElemType> decodeOutput(deviceid);
        Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
        Matrix<ElemType> lmin(deviceid);
        MatrixPool m_matrixPool;
        m_matrixPool.OptimizedMemoryAllocation();

@ -472,35 +472,7 @@ public:
            CurSequences = nextSequences;

            vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
            //deal with the same prefix
            /*sort(CurSequences.begin(), CurSequences.end(),
                   [](const Sequence& a, const Sequence& b) -> bool {
                       return a.labelseq.size() > b.labelseq.size();
                   });
            for (size_t n = 0; n < CurSequences.size() - 1; n++)
            {
                for (size_t h = n + 1; h < CurSequences.size(); h++)
                {
                    if (isPrefix(CurSequences[h], CurSequences[n]))
                    {
                        //forward_prop the prefix
                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());

                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);

                        size_t idx = CurSequences[h].labelseq.size();
                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
                        {
                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);

                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
                        }
                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
                    }
                }
            }*/
            //nextSequences.clear();
            while (true)
            {

@ -515,8 +487,8 @@ public:
                rnntdfs.prepareSequence(tempSeq);
                rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
                if (isSVD)
                    rnntdfs.forwardmergedSVD(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, *m_net);
                else rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
                    rnntdfs.forwardmergedSVD(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, m_net);
                else rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

                //sumofENandDE.Print("sum");
                //sort log posterior and get best N labels

@ -715,7 +687,7 @@ public:
        Matrix<ElemType> encodeOutput(deviceid);
        Matrix<ElemType> decodeOutput(deviceid);
        Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
        Matrix<ElemType> lmin(deviceid);
        MatrixPool m_matrixPool;
        m_matrixPool.OptimizedMemoryAllocation();

@ -760,36 +732,6 @@ public:
            CurSequences = nextSequences;

            vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
            //deal with the same prefix
            /*sort(CurSequences.begin(), CurSequences.end(),
                   [](const Sequence& a, const Sequence& b) -> bool {
                       return a.labelseq.size() > b.labelseq.size();
                   });
            for (size_t n = 0; n < CurSequences.size() - 1; n++)
            {
                for (size_t h = n + 1; h < CurSequences.size(); h++)
                {
                    if (isPrefix(CurSequences[h], CurSequences[n]))
                    {
                        //forward_prop the prefix
                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());

                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);

                        size_t idx = CurSequences[h].labelseq.size();
                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
                        {
                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);

                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
                        }
                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
                    }
                }
            }*/
            //nextSequences.clear();
            while (true)
            {

@ -802,7 +744,7 @@ public:
            CurSequences.erase(maxSeq);
            rnntdfs.prepareSequence(tempSeq);
            rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
            rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
            rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

            //sumofENandDE.Print("sum");
            //sort log posterior and get best N labels