Multithreaded GPU code check-in; no speedup yet, though, because kernel launches still go through the shared stream.

Guoli Ye 2020-02-20 22:34:44 -08:00
Parent 25f4191186
Commit 1fc15ed105
22 changed files with 2376 additions and 748 deletions
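The "no speedup" in the commit message is what one would expect when every kernel launch goes through a single shared stream (t_stream in this code): work enqueued from different host threads is still serialized on the device. A minimal, hypothetical sketch in plain CUDA (made-up kernel and buffer names, not the CNTK launch path) contrasting a shared stream with per-thread streams:

#include <cuda_runtime.h>

__global__ void scaleKernel(float* p, int n) // hypothetical kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        p[i] *= 2.0f;
}

// Every host thread enqueues onto the same stream: the GPU executes the
// kernels back to back, so host-side multithreading buys no overlap.
void launchOnSharedStream(cudaStream_t sharedStream, float* d_buf, int n)
{
    scaleKernel<<<(n + 255) / 256, 256, 0, sharedStream>>>(d_buf, n);
}

// Each host thread owns its own non-blocking stream: independent kernels
// may overlap on the device, subject to occupancy.
void launchOnPrivateStream(float* d_buf, int n)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    scaleKernel<<<(n + 255) / 256, 256, 0, s>>>(d_buf, n);
    cudaStreamSynchronize(s);
    cudaStreamDestroy(s);
}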

The file diff is not shown because it is too large.

View file

@ -75,6 +75,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
std::set<wstring> set_cur_tonode_names;
set_cur_tonode_names.clear();
if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
@ -84,20 +87,66 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;
// fprintf(stderr, "CopySubTree node = %ls\n", fromNodeName.c_str());
ComputationNodeBasePtr toNode = CopyNode(fromNet, fromNodeName,
toNodeName,
CopyNodeFlags::copyNodeValue);
set_cur_tonode_names.insert(toNodeName);
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
/* fprintf(stderr, "CopySubTree node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols())
); */
if (flags & CopyNodeFlags::copyNodeInputLinks)
{
// copy the children structure but use the new nodes generated
for (int i = 0; i < fromNode->GetNumInputs(); i++)
toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
{
wstring inputNodeName;
inputNodeName = (toNamePrefix + fromNode->GetInputs()[i]->NodeName());
if (set_cur_tonode_names.find(inputNodeName) == set_cur_tonode_names.end() )
{
// the eval order can be wrong here, so the input may not have been copied into the target network yet; copy it now
// fprintf(stderr, "CopySubTree i = %d, missing node = %ls\n", i, inputNodeName.c_str());
CopyNode(fromNet, fromNode->GetInputs()[i]->NodeName(),
inputNodeName,
CopyNodeFlags::copyNodeValue);
set_cur_tonode_names.insert(inputNodeName);
}
//fprintf(stderr, "CopySubTree i = %d, node = %ls\n", i, toNodeName.c_str());
//toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
toNode->SetInput(i, GetNodeFromName(inputNodeName));
}
}
}
}
// print the dimensions of each node's value matrix, in the eval order rooted at fromName
void ComputationNetwork::ShowNodeMemory(const ComputationNetwork& fromNet,
const std::wstring fromName)
{
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
fprintf(stderr, "ShowNodeMemory node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols()));
}
}
// you can only copy inputs from nodes in the same network
void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring toName)
{
CopyNode(*this, fromName, toName, CopyNodeFlags::copyNodeInputLinks);

View file

@ -13,7 +13,7 @@
#include "MatrixPool.h"
#include "ComputationEnvironment.h"
#include "Globals.h"
#include <cuda_runtime.h>
#include <unordered_set>
#include <map>
#include <string>
@ -233,7 +233,6 @@ struct ComputationNetworkOwnedNodeState
return m_parentGradientOptimization == ParentGradientOptimization::Reuse;
}
virtual void MarkValueNonSharable()
{
m_valueSharable = false;
@ -351,6 +350,8 @@ class ComputationNodeBase : public IComputationNode,
public:
typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
// -----------------------------------------------------------------------
// constructors, copying, (de-)serialization
// -----------------------------------------------------------------------

View file

@ -26,7 +26,12 @@
#include <boost/algorithm/string/join.hpp>
#include <boost/range/adaptor/transformed.hpp>
namespace Microsoft { namespace MSR { namespace CNTK {
namespace Microsoft
{
namespace MSR
{
namespace CNTK
{
// -----------------------------------------------------------------------
// ReduceElements (op, axis=, input)
@ -39,10 +44,10 @@ template <class ElemType>
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
node->m_scale = m_scale;
node->m_scale = m_scale;
node->m_keepDimensions = m_keepDimensions;
}
}
@ -57,7 +62,7 @@ template <class ElemType>
fstream >> num_axes;
for (int i = 0; i < num_axes; ++i)
{
int axis;
int axis;
fstream >> axis;
m_axes.push_back(axis);
}
@ -123,7 +128,7 @@ template <class ElemType>
if (ReduceSequenceAxis())
{
ElemType gapPadValue = NeutralValue(m_reductionOp);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/ true, &gapPadValue);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/true, &gapPadValue);
}
else
input = InputRef(0).ValueTensorFor(rank, frInput);
@ -142,7 +147,24 @@ template <class ElemType>
default:
// the actual operation is a Copy with reduction, where the magic is in the reduction op
// For "Mean", m_scale is 1/#elements, and 1 otherwise.
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
{
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 1, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
if (multi_thread)
result.DoUnaryOpOfDebug(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
else
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 2, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
}
}
}
@ -156,15 +178,15 @@ template <class ElemType>
{
// Broadcast along the sequence
auto result = ValueFor(fr);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/ accumulateGradient ? (ElemType)1 : (ElemType)0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/accumulateGradient ? (ElemType) 1 : (ElemType) 0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
}
else
{
const auto frInput = (ReduceAllAxes() || ReduceBatchAxis()) ? FrameRange(InputRef(0).GetMBLayout()) : fr; // can't use 'fr' for ReduceAllAxes() as it refers to the result (same as for training criteria)
// get the args
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = ReduceAllAxes() ? TensorView<ElemType>(GradientPtr(), GetSampleLayout()) : GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one
// gradients are not as simple as passing an op-code, unfortunately
switch (m_reductionOp)
@ -220,8 +242,8 @@ template <class ElemType>
break;
case ElementWiseOperator::opElementwiseProduct:
{
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
if (accumulateGradient)
sliceInputGrad.AddElementwiseProductWithQuotientOf(sliceOutputGrad, output, input);
else
@ -242,13 +264,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}
@ -258,13 +287,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}
@ -282,7 +318,7 @@ template <class ElemType>
// validate the opcode (in case we got instantiated empty and never updated)
ValidateOp();
m_scale = (ElemType)1;
m_scale = (ElemType) 1;
if (ReduceAllAxes())
Base::ValidateUnaryReduce(isFinalValidationPass, m_keepDimensions);
else if (ReduceSequenceAxis())
@ -319,29 +355,27 @@ template <class ElemType>
let shape = Input(0)->GetSampleLayout();
auto dims = shape.GetDims();
size_t reducedDimProd = 1;
size_t reducedDimProd = 1;
if (ReduceAllStaticAxes())
{
reducedDimProd = shape.GetNumElements();
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({ 1 })); // entire sample is reduced to a scalar
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({1})); // entire sample is reduced to a scalar
}
else if (!m_axes.empty()
&& std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
else if (!m_axes.empty() && std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
{
//Accumulate the number of elements for reduce_mean
reducedDimProd = std::accumulate(m_axes.begin(),
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });
// axes reduced to a scalar
if (m_keepDimensions)
std::for_each(m_axes.begin(),
m_axes.end(),
[&dims](int axis) {dims[axis - 1] = 1; }
);
m_axes.end(),
[&dims](int axis) { dims[axis - 1] = 1; });
else
{
SmallVector<size_t> reducedDims(dims.size() - m_axes.size());
@ -355,13 +389,12 @@ template <class ElemType>
dims = reducedDims;
}
}
else if (isFinalValidationPass)
else if (isFinalValidationPass)
{
InvalidArgument("The shape of %ls [%ls] can not be reduced along axes [%ls]",
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str()
);
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str());
}
// for "Mean", we must divide by #elements
if (isFinalValidationPass && IsMean())
@ -384,10 +417,14 @@ struct SequenceLengthVector
{
typedef vector<vector<size_t>> SequenceVector;
typedef MBLayout::SequenceInfo SequenceInfo;
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { }
size_t size() const { return m_sequenceInfo.size(); }
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector)
: m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) {}
size_t size() const
{
return m_sequenceInfo.size();
}
MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
{
SequenceInfo seq;
@ -427,12 +464,12 @@ template <class ElemType>
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
{
double delta = input(0, inMBLayout->GetColumnIndex(seq, t)); // how many frames the current time step should expand into
desiredCount += delta; // this is now how many frames we should have
desiredCount += delta; // this is now how many frames we should have
// use a margin against round-off errors, so that we get non-binary ratios like 1/3 and 1/5 right
// This really means generate a frame if too few, unless we are within machine accuracy of the target.
// The assumption is that the delta has this error, while accumulation (in double) has no error.
ElemType relativeMargin = 1 - std::numeric_limits<ElemType>::epsilon();
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
|| indexSequence.size() < desiredCount * relativeMargin)
indexSequence.push_back(t);
}
@ -441,10 +478,10 @@ template <class ElemType>
input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/ m_placementBuffer, /*temp*/ m_rowAllocationsBuffer);
// copy to output
vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
for (size_t i = 0; i < size; i++)
{
let& seq = outMBLayout->GetAllSequences()[i];
@ -452,7 +489,7 @@ template <class ElemType>
continue;
let& indexSequence = indexSequences[i];
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType) indexSequence[t];
}
// there may be dangling gaps at the end. Take the opportunity to verify this.
for (size_t i = size; i < sequences.size(); i++)
@ -460,7 +497,7 @@ template <class ElemType>
for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++)
assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID);
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/true, /*emptyTransfer=*/true, /*updatePreferredDevice=*/true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
}
@ -501,9 +538,9 @@ template <class ElemType>
/*virtual*/ void PackedIndexNode<ElemType>::ForwardPropNonLooping() /*override*/
{
let& sourceMBLayout = InputRef(SOURCEDATA).GetMBLayout(); // only used for index conversion
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
// loop over sourceSequences
// Input matrix contains time indices for each sequence that refer to frames inside that sequence.
// We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout.
@ -516,10 +553,10 @@ template <class ElemType>
let& indexSeq = indexMBLayout->FindMatchingSequence(sourceSequences, i); // find corresponding entry in indexMBLayout
for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence
{
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence)
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t) index(0, jIndex); // the new time location (relative to source sequence)
let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check.
result(0, jIndex) = (ElemType)jSource;
result(0, jIndex) = (ElemType) jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
@ -563,8 +600,8 @@ template <class ElemType>
/*virtual*/ void GatherPackedNode<ElemType>::ForwardPropNonLooping() /*override*/
{
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();
@ -587,9 +624,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = InputRef(SOURCEDATA).Gradient(); // source to propagate the gradient input
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1, true);
}
}
@ -614,7 +651,7 @@ template <class ElemType>
SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
else
{
SmallVector<size_t> layout = { 1 }; // Scalar
SmallVector<size_t> layout = {1}; // Scalar
if (Input(SOURCEDATA)->GetSampleLayout().GetRank() > 1)
{
auto srcLayout = Input(SOURCEDATA)->GetSampleLayout().GetDims();
@ -638,8 +675,8 @@ template <class ElemType>
if (*InputRef(INDEXDATA).GetMBLayout() != *InputRef(SOURCEDATA).GetMBLayout())
InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str());
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();
@ -653,7 +690,7 @@ template <class ElemType>
source.GetMatrixType(),
source.GetFormat());
auto& output = Value(); // output goes here
auto& output = Value(); // output goes here
output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1, true);
}
@ -662,9 +699,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1);
}
}
@ -707,8 +744,8 @@ template <class ElemType>
CropNode<ElemType>::CropNode(size_t offsetX, size_t offsetY, DEVICEID_TYPE deviceId, const wstring& name)
: CropNode(deviceId, name)
{
m_xOffset = (double)(offsetX);
m_yOffset = (double)(offsetY);
m_xOffset = (double) (offsetX);
m_yOffset = (double) (offsetY);
}
template <class ElemType>
@ -880,8 +917,7 @@ void CropNode<ElemType>::ComputeCropOffsets()
// nodeToTransformMap contains coordinate maps for all nodes traversed so far, and is updated by this function.
// Traversal stack contains all nodes traversed so far. Inputs of currNode are pushed to traversal stack so that their
// inputs can be processed later on.
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap)
{
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap) {
if (!currNode->Is<TransformerNode>())
RuntimeError("Node does not support affine transform for cropping.");
@ -1033,4 +1069,6 @@ template class CropNode<float>;
template class CropNode<double>;
template class CropNode<half>;
}}}
} // namespace CNTK
} // namespace MSR
} // namespace Microsoft

The file diff is not shown because it is too large.

View file

@ -311,6 +311,7 @@ public:
CPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
ElemType SumOfAbsElements() const; // sum of all abs(elements)
ElemType SumOfElements() const; // sum of all elements

View file

@ -3215,6 +3215,24 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
auto& us = *this;
#pragma omp parallel for
foreach_coord (i, j, us)
{
if ((us(i, j)) < threshold)
us(i, j) = 0;
}
return *this;
}
//sum of all abs(elements)
template <class ElemType>
ElemType CPUMatrix<ElemType>::SumOfAbsElements() const

View file

@ -2486,6 +2486,18 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
PrepareDevice();
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(Data(), threshold, N);
return *this;
}
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{
@ -5318,6 +5330,119 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template <class ElemType>
void GPUMatrix<ElemType>::TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax &&
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
// special case: linear processing
// The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access).
// Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
{
// special case: for copy, use cudaMemcpy() instead, or cublas_axpy()
// TODO: We should observe if these actually make a speed difference, and if not, remove these special cases.
if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1)
{
//fprintf(stderr, "TensorOpDebug 1 \n");
return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice));
}
else if (op == ElementWiseOperator::opCopy && beta == 1)
{
//fprintf(stderr, "TensorOpDebug 2 \n");
return CUBLAS_CALL(cublasaxpyHelper(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1));
}
else
{
//fprintf(stderr, "TensorOpDebug 3 \n");
return LaunchUnaryTensorOp<ElemType>(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]);
}
}
// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap
{
assert(reducingStrides[1][0] == 0);
auto ARows = regularOpDims[0]; // vertical steps
auto ACols = reducingOpDims[0]; // horizontal steps (reduction)
auto ALd = reducingStrides[0][0]; // horizontal step width through matrix
cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
CUBLAS_CALL(cublasgemmHelper(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha,
/*A00=*/a.Data() + offsets[0], (int) ALd,
/*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta,
/*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows));
//fprintf(stderr, "TensorOpDebug 4 \n");
return;
}
// TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.
// regular case
else
{
/*
fprintf(stderr, "TensorOpDebug 5 \n");
for (size_t i = 0; i < 2; i++)
{
// fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
fprintf(stderr, "TensorOpDebug 5.1, i = %d, offsets = %d\n ", int(i), int(offsets[i]));
}
fprintf(stderr, "TensorOpDebug 5.1, regularOpDims.size() = %d \n", int(regularOpDims.size()));
for (size_t i = 0; i < regularOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));
fprintf(stderr, "TensorOpDebug 5.1, reducingOpDims.size() = %d \n", int(reducingOpDims.size()));
for (size_t i = 0; i < reducingOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));
for (size_t i = 0; i < 2; i++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularStrides.size() = %d \n", int(i), int(regularStrides[i].size()));
for (size_t j = 0; j < regularStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
}
fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingStrides.size() = %d \n", int(i), int(reducingStrides[i].size()));
for (size_t j = 0; j < reducingStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
}
}
fprintf(stderr, "TensorOpDebug 5.2, beta = %f, alpha = %f, a.data = %f, data = %f \n", double(beta), double(alpha), double(a.FrobeniusNorm()), double(FrobeniusNorm()));
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
*/
return TensorOpNDebug<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, *this);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,

View file

@ -437,6 +437,7 @@ public:
GPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements() const; // sum of all abs(elements)
@ -607,6 +608,11 @@ public:
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,

View file

@ -1306,6 +1306,19 @@ __global__ void _setToZeroIfAbsLessThan(
a[id] = 0;
}
template <class ElemType>
__global__ void _setToZeroIfLessThan(
ElemType* a,
const ElemType threshold,
const CUDA_LONG N)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
if (((comp_t) a[id]) < (comp_t) threshold)
a[id] = 0;
}
template <class ElemType>
__global__ void _areEqual(
const ElemType* a,

View file

@ -2987,6 +2987,19 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
return *this;
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
VerifyWritable(__FUNCTION__);
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
return *this;
}
#pragma endregion
#pragma region Helper Functions

View file

@ -395,7 +395,7 @@ public:
GPUSparseMatrix<ElemType>& AssignTruncateTopOf(const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignOneHot(const GPUMatrix<ElemType>& a, vector<size_t>& shape, size_t axis);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(const GPUMatrix<ElemType>& vector);

View file

@ -875,6 +875,12 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)
return reductionBuffersCache[deviceId];
}
// unlike GetReductionBuffer above, this allocates a fresh buffer on every call, which makes it safe for multithreaded calling in RNNT_EMBR
template <class ElemType>
static shared_ptr<ElemType> GetReductionBufferNoCache(size_t N)
{
return AllocateReductionBuffer<ElemType>(N);
}
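GetReductionBufferNoCache avoids the per-device cache used by GetReductionBuffer above: the cached buffer is shared state, so two host threads reducing concurrently on the same device would overwrite each other's partial results in it. A host-side analogy of the two policies, with hypothetical helper names (the real code allocates device memory through AllocateReductionBuffer):

#include <memory>
#include <mutex>

static std::shared_ptr<float> s_cachedBuffer; // one scratch buffer per process in this sketch
static std::mutex s_cacheMutex;

// Cached: cheap to reuse, but every caller receives the SAME scratch storage,
// so concurrent reductions would clobber each other's partial sums.
std::shared_ptr<float> getReductionBufferCached(size_t n)
{
    std::lock_guard<std::mutex> guard(s_cacheMutex);
    if (!s_cachedBuffer)
        s_cachedBuffer = std::shared_ptr<float>(new float[n], std::default_delete<float[]>());
    return s_cachedBuffer;
}

// Uncached: a fresh buffer per call, so each reduction owns its scratch memory;
// this is the property the multithreaded RNNT_EMBR path relies on.
std::shared_ptr<float> getReductionBufferNoCache(size_t n)
{
    return std::shared_ptr<float>(new float[n], std::default_delete<float[]>());
}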
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
@ -1112,6 +1118,272 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
}
}
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReductionDebug(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
a;      // these debug-only parameters are referenced here solely to
result; // silence unused-parameter warnings
typedef typename TypeSelector<ElemType>::comp_t ReduceElemType;
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
C_size_t numElements = 1;
// input divisors
SmallVector<fast_divmod> regularOpStrideDivmodVector;
for (C_size_t k = 0; k < regularOpDims.size(); k++)
{
regularOpStrideVector.push_back(numElements); // stride for dense representation of our output elements (if they were flattened)
regularOpStrideDivmodVector.push_back(fast_divmod((unsigned int) numElements));
numElements *= (C_size_t) regularOpDims[k];
}
// output divisors
SmallVector<fast_divmod> reducingOpDimDivmodVector;
C_size_t stride = 1;
for (C_size_t k = 0; k < reducingOpDimVector.size(); ++k)
{
reducingOpDimDivmodVector.push_back(fast_divmod(stride));
stride *= (C_size_t) reducingOpDimVector[k];
}
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
// reduced divisors
FixedArray<fast_divmod, K> regularOpStrideDivmod(regularOpStrideDivmodVector);
FixedArray<fast_divmod, M> reducingOpDimDivmod(reducingOpDimDivmodVector);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
SyncGuard syncGuard;
// do some optimization for reductions
// - example: 30 GPU procs, warp size 32 --> 960 GPU cores
// - NN elements must be computed, each involving a reduction over reductionDim elements
// Cases:
// - #output elements NN >= GPU cores --> use one proc per element, do reduction in inner loop
// E.g. if >=960 elements are computed, each gets its own GPU thread.
// - reduction dimension would benefit from multiple blocks --> multiple blocks work on a single output element
// E.g.
// - gradient of adding a bias: reducing to a bias, e.g. 512-dim
// - gradient of scalar multiplication: big elementwise product reduced to a scalar (big dot product, e.g. [1024 x 1024] = 1M elements)
// - softmax in seq-2-seq attention model: reduce over length of attention window (e.g. 20)
// - summation of criterion value: scalar reduction over a few hundred or thousand samples in the minibatch
C_size_t reductionDim = 1; // number of elements to reduce over
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t) reducingOpDimVector[k];
GridDim grid(NN);
let& props = GridDim::GetDeviceProps();
bool disableParallelReduction = false; // (for debugging)
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// === arg based reduction, one thread per output element
if ((reductionOp == ElementWiseOperator::opArgmax) ||
(reductionOp == ElementWiseOperator::opArgmin))
{
_launchTensorArgOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
pointers, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides,
regularOpStrideDivmod, reducingOpDimDivmod);
}
// === simple case: NN large, one thread per output element
else if (reductionDim == 1 || // no reduction
grid.m_blocksPerGrid >= props.multiProcessorCount || // enough output elements to fill all multiprocs
reductionDim * numElements <= 2 * props.warpSize || // trivial operation not worth the trouble (2* because the more complex one also needs 2 kernel launches)
disableParallelReduction || // (for debugging)
reductionDim * numElements <= props.multiProcessorCount) // recursive call from reduction below
{
// we got enough elements to generate: do one element per thread, and reduction inside
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
}
// === optimization: simple case would not use all multiprocs
else
{
// m_blocksPerGrid can be thought of NN / 512, with appropriate rounding
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// If we get here, then
// - the total number of outputs to produce is < #multiprocs * warpSize, e.g. < 960
// - each output has at least two inputs, but possibly millions
// Examples:
// (a1) NN=900
// - each multiproc processes multiple elements concurrently, each reducing over its inputs inside
// - use one block per output element
// (a2) NN=30
// - same as (a1) except 30 multiprocs run only a single block each
// (a3) NN=16
// - same as (a1) except only 16 multiproc run one block
// (b1) NN=15
// - 2 blocks work together on a single output element
// (b2) NN=1 (NN < #multiprocs, e.g. NN < 30)
// - multiple blocks work together on a single output element
// - only this case requires memory, and only K * NN
// where K = blocks that work together,
// both K and NN < #multiprocs,
// and K * NN = on the order of NN, but generally a bit larger due to rounding.
// By how much do we underutilize?
// We increase #blocks by that factor by breaking reduction into that many chunks.
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
int numReductionChunks = std::max<int>(props.multiProcessorCount / NN, 1); // only >1 for NN < multiProcessorCount
// distribute NN over block X and Y
int blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
int numBlocksX = CeilDiv(NN, blockXOverBy);
int numBlocksY = CeilDiv(NN, numBlocksX);
// while block Z is for multiple blocks working together on a single output element
int numBlocksZ = numReductionChunks;
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
int numThreadsX = std::min<int>(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
// --- cases (a1) and (a2)
// This involves no reduction across blocks.
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.4, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
if (numReductionChunks == 1)
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.5, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.6, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
}
// --- case (b)
// Reduction across blocks. This is the difficult one.
#ifndef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducability
else
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.7, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// we get here if NN <= #multiprocs
assert(NN <= props.multiProcessorCount && numBlocksX == NN && numBlocksY == 1);
// dims are:
// - numBlocksZ = numReductionChunks = how many multiprocs work together to produce one output element
// - numBlocksX = NN = number of output elements
// - numThreadsX = reductionChunkSize clipped to 512; reductionChunkSize > 512 is handled by an inner for loop inside of the kernel
// we need memory for block outputs of dimension [numBlocksX x numBlocksZ]
// - total elements = NN * Floor(#multiprocs / NN) = <= #multiprocs
let reductionBufferSize = props.multiProcessorCount;
assert(reductionBufferSize >= NN * numBlocksZ);
shared_ptr<ElemType> reductionBuffer = GetReductionBufferNoCache<ElemType>(reductionBufferSize);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.8, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// 'pointers', 'regularOpStrides', and 'regularStrides' are set up to point to the target memory.
// We need to reroute them to point to our reductionBuffer.
// - pointer[N-1] -> replace by reductionBuffer
// - regularStrides -> replace [N-1] by regularOpStrides which already represent the NN elements for a dense memory layout
// - beta -> 0 since we write into temp memory
// - kernel must use block.z as second index into the output buffer; add (block.z * NN) to the pointer
FixedArray<ElemType*, N> pointers1 = pointers;
pointers1[N - 1] = reductionBuffer.get();
auto regularStrideVectors1 = regularStrideVectors;
for (size_t k = 0; k < regularOpStrides.size(); k++)
regularStrideVectors1[N - 1][k] = (ptrdiff_t) regularOpStrideVector[k];
FixedMatrix<C_int, N, K> regularStrides1(regularStrideVectors1);
ElemType beta1 = 0;
ElemType alpha1 = 1;
// fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.9, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta1, pointers1, alpha1, op, reductionOp,
regularOpStrides, regularStrides1, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
#if 1
// now reduce and redistribute
// Create a new tensor task, and execute it recursively:
// - input = reductionBuffer
// - output = true output
// - op dims/strides = output elements
// - reduce dims/strides = numBlocksZ
// - op = opCopy
array<ElemType*, 2> pointerVector2{reductionBuffer.get(), pointerVector[N - 1]};
const array<SmallVector<ptrdiff_t>, 2> regularStrideVectors2{regularStrideVectors1[N - 1], regularStrideVectors[N - 1]};
const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{SmallVector<ptrdiff_t>{NN}, SmallVector<ptrdiff_t>{0}};
const SmallVector<size_t> reducingOpDimVector2{(size_t) numReductionChunks};
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
regularOpDims, regularStrideVectors2,
reducingOpDimVector2, reducingStrideVectors2);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// (note: ^^this will have a nested syncGuard, which is fine)
#else
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
//for (size_t z = 0; z < numBlocksZ; z++)
// _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(z == 0 ? beta : 1, pointers, alpha, op,
// regularOpStrides, regularStrides, NN,
// reducingOpDims, reducingStrides, reductionChunkSize * z, reductionChunkSize);
vector<ElemType> peekPartial(NN * numBlocksZ, -42);
vector<ElemType> peekFinal(NN, -42);
CUDA_CALL(cudaMemcpy(peekPartial.data(), reductionBuffer, sizeof(ElemType) * peekPartial.size(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(peekFinal.data(), pointers[pointers.size() - 1], sizeof(ElemType) * peekFinal.size(), cudaMemcpyDeviceToHost));
double s1 = 0, s2 = 0;
for (auto v : peekPartial)
s1 += v;
for (auto v : peekFinal)
s2 += v;
sin(1.0);
#endif
}
#else
else if (beta == 1)
{
// no need to pre-scale; just add (common for gradients)
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides,
regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
return;
}
else
{
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
// We will leave it like this for a while, but eventually need to revisit using temporary memory.
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
}
#endif
}
}
// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------
@ -1205,6 +1477,24 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
}
}
template <class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoopDebug(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2:
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides);
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int) dims);
}
}
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>
@ -1236,6 +1526,17 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
}
}
template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
{
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
return LaunchTensorOpWithReductionDebug<ElemType, N, 1, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, result);
}
//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------
@ -1277,6 +1578,21 @@ template void TensorOpN<half, 4>(half beta, array<half*, 4> pointers, half alpha
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template void TensorOpNDebug<half, 2>(half beta, array<half*, 2> pointers, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<half>& a, GPUMatrix<half>& result);
template void TensorOpNDebug<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<double>& a, GPUMatrix<double>& result);
template void TensorOpNDebug<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<float>& a, GPUMatrix<float>& result);
template void LaunchUnaryTensorOp(float beta, const float* pa, float* pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double* pa, double* pb, double alpha, ElementWiseOperator op, size_t regularOpDim);

View file

@ -23,6 +23,12 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);
template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result);
template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);

View file

@ -3669,6 +3669,21 @@ Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType thresh
return *this;
}
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->SetToZeroIfLessThan(threshold),
m_GPUMatrix->SetToZeroIfLessThan(threshold),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->SetToZeroIfLessThan(threshold));
return *this;
}
//sum of all elements
template <class ElemType>
ElemType Matrix<ElemType>::SumOfElements() const
@ -6399,6 +6414,23 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemTy
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
VerifyIsDense(*this) && VerifyIsDense(a);
DecideAndMoveToRightDevice(*this, a);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOpDebug(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
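TensorOpDebug keeps the usual TensorOp contract, which is roughly result = beta * result + alpha * reductionOp(op(a)) over the described offsets/strides; only the GPU dispatch is routed to the debug launcher. A small sketch of the beta/alpha contract for the plain element-wise case (no reduction), using raw arrays instead of the stride machinery:

#include <cstddef>

// result[i] = beta * result[i] + alpha * op(a[i])
template <class ElemType, class Op>
void UnaryTensorOpDense(ElemType beta, const ElemType* a, ElemType* result, std::size_t n,
                        ElemType alpha, Op op)
{
    for (std::size_t i = 0; i < n; i++)
        result[i] = beta * result[i] + alpha * op(a[i]);
}
// beta = 0 overwrites the output, beta = 1 accumulates into it, e.g.:
// UnaryTensorOpDense<float>(1.0f, src, dst, n, 0.5f, [](float x) { return x * x; });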
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,

View file

@@ -474,6 +474,8 @@ public:
void InplaceTranspose();
Matrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
Matrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements() const; // sum of all abs(elements)
@@ -658,6 +660,11 @@ public:
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,

View file

@@ -709,6 +709,11 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
return *this;
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
return *this; // no-op stub: no thresholding is performed here, the sparse matrix is returned unchanged
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
@@ -1723,7 +1728,11 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
{
return *this;
}
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
return *this; // no-op stub: no thresholding is performed here, the matrix is returned unchanged
}
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{

View file

@@ -217,7 +217,7 @@ static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N>
}
for (size_t i = 0; i < N; i++)
offsets[i] = shapes[i].GetOffset();
offsets[i] = shapes[i].GetOffset();
}
// enforce that in case of broadcasting, the output must not be an input
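For reference, the offsets and strides prepared here are, in effect, turned into flat element addresses as offset + dot(coordinates, strides); a zero stride in a dimension is what realizes broadcasting for that operand. A self-contained sketch of that address computation:

#include <cstddef>
#include <vector>

inline std::ptrdiff_t FlatIndex(std::size_t offset,
                                const std::vector<std::size_t>& coords,
                                const std::vector<std::ptrdiff_t>& strides)
{
    std::ptrdiff_t idx = static_cast<std::ptrdiff_t>(offset);
    for (std::size_t k = 0; k < coords.size() && k < strides.size(); k++)
        idx += static_cast<std::ptrdiff_t>(coords[k]) * strides[k]; // stride 0 => broadcast
    return idx;
}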
@@ -249,6 +249,47 @@ void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
array<SmallVector<ptrdiff_t>, 2> regularStrides, reducingStrides;
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2>{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this);
/*
for (size_t i = 0; i < 2; i++)
{
// fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
fprintf(stderr, "i = %d, offsets = %d\n ", int(i), int(offsets[i]));
}
for (size_t i = 0; i < regularOpDims.size(); i++)
fprintf(stderr, "i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));
for (size_t i = 0; i < reducingOpDims.size(); i++)
fprintf(stderr, "i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));
for (size_t i = 0; i < 2; i++)
{
for (size_t j = 0; j < regularStrides[i].size(); j++)
{
fprintf(stderr, "i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
}
for (size_t j = 0; j < reducingStrides[i].size(); j++)
{
fprintf(stderr, "i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
}
}
*/
// now perform the operation
GetSOB().TensorOpDebug(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
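The CheckDifferentObject guard above matters because a reducing op reads many input elements per output element, so the output must not alias the input. A minimal illustration of the hazard with a plain column-sum reduction:

#include <cstddef>

// Sums each column of a row-major r x c matrix into out[0..c).
// If out aliased the first row of in, that row would be zeroed and overwritten
// before it is read, corrupting the reduction -- hence the aliasing check.
void ColumnSums(const float* in, std::size_t r, std::size_t c, float* out)
{
    for (std::size_t j = 0; j < c; j++)
        out[j] = 0.0f;
    for (std::size_t i = 0; i < r; i++)
        for (std::size_t j = 0; j < c; j++)
            out[j] += in[i * c + j];
}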
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{

View file

@@ -131,6 +131,7 @@ public:
#pragma pop_macro("DeclareTernaryTensorOp")
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);

View file

@@ -1051,10 +1051,18 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
string showWERMode,
bool SVD)
{
PROFILE_SCOPE(profilerEvtMainEpoch);
PROFILE_SCOPE(profilerEvtMainEpoch);
std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training);
ComputationNetwork decode_cn_root;
decode_cn_root.CopySubTree(*net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll);
decode_cn_root.CompileNetwork();
std::vector<ComputationNodeBasePtr> decodeOutputNodesTmp = decode_cn_root.OutputNodesByName(decodeOutputNodeNames);
decode_cn_root.FormEvalOrder(decodeOutputNodesTmp[0]);
decode_cn_root.FormNestedNetwork(decodeOutputNodesTmp[0]);
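decode_cn_root is a private, compiled copy of the decoder subnetwork, which lets the multithreaded n-best decode later in this function read a model that the training network never mutates mid-decode. A generic sketch of that pattern (plain C++ and std::thread, not the CNTK classes used above): one immutable model object is shared read-only across worker threads, each writing to its own result slot.

#include <cstdio>
#include <thread>
#include <vector>

struct FrozenModel // stand-in for a compiled, read-only decode network
{
    int DecodeUtterance(int utt) const { return utt * utt; } // placeholder work
};

int main()
{
    FrozenModel model;                        // built once, never mutated afterwards
    const int numUtterances = 8;
    std::vector<int> results(numUtterances);
    std::vector<std::thread> workers;
    for (int u = 0; u < numUtterances; u++)
        workers.emplace_back([&, u] { results[u] = model.DecodeUtterance(u); });
    for (auto& w : workers)
        w.join();
    for (int u = 0; u < numUtterances; u++)
        std::printf("utt %d -> %d\n", u, results[u]);
    return 0;
}

Concurrent const reads like this are only safe if the shared object really is immutable for the whole decode.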
// bring our 'out' values into consistent state
epochCriterion = EpochCriterion(0);
epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0));
@@ -1078,7 +1086,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
auto ContainsAccumulatedResult = [&evaluationNodesWhichAccumulateResult](ComputationNodeBasePtr node) {
return evaluationNodesWhichAccumulateResult.find(node) != evaluationNodesWhichAccumulateResult.end();
};
// MA-related variables
size_t nSamplesSinceLastModelSync = 0;
size_t blockSizePerWorker = 0;
@@ -1122,7 +1129,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
// The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training.
// This attempts to compute the error signal for the whole utterance, which will
// be fed to the neural network as features. Currently it is a workaround
@@ -1241,6 +1247,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
numSamplesWithLabelOfNetworkMBR = 0;
auto profGetMinibatch = ProfilerTimeBegin();
bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork<ElemType>(*trainSetDataReader, net, criterionNodes[0],
useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize, m_mpi);
@@ -1321,12 +1328,10 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
//net->CompileNetwork();
std::vector<std::wstring> encodeOutputNodeNames(outputNodeNamesVector.begin(), outputNodeNamesVector.begin() + 1);
std::vector<ComputationNodeBasePtr> encodeOutputNodes = net->OutputNodesByName(encodeOutputNodeNames);
//
//net->CollectInputAndLearnableParameters(encodeOutputNodes[0]);
std::list<ComputationNodeBasePtr> InputNodesList = net->InputNodes(criterionNodes[0]);
std::vector<std::wstring> encodeInputNodeNames;
if (SVD)
encodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
else
@@ -1336,7 +1341,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
*encodeInputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(encodeInputNodes);
//get decode input matrix
std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
std::vector<ComputationNodeBasePtr> decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames);
//net->CollectInputAndLearnableParameters(decodeOutputNodes[0]);
@@ -1346,6 +1350,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
else
decodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
std::vector<ComputationNodeBasePtr> decodeinputNodes = net->OutputNodesByName(decodeInputNodeNames);
*decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes);
if (!ordered)
@@ -1359,7 +1364,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
std::vector<ComputationNodeBasePtr> Plustransnodes = net->OutputNodesByName(plusTransNodeNames);
net->FormEvalOrder(Plustransnodes[0]);
}
//form eval order for RELU
auto reffeainput = (*encodeInputMatrices).begin();
@@ -1367,9 +1371,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
auto reflminput = (*decodeinputMatrices).begin();
auto decodeMBLayout = reflminput->second.pMBLayout;
net->ForwardProp(encodeOutputNodes);
//form eval order for RELU
time_t my_time = time(NULL);
fprintf(stderr, "SGD time 0 = %s", ctime(&my_time));
Matrix<ElemType> encodeOutput(net->GetDeviceId());
net->ForwardProp(encodeOutputNodes);
size_t deviceid = net->GetDeviceId();
Matrix<ElemType> encodeOutput(deviceid);
encodeOutput.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(encodeOutputNodes[0])->Value()));
vector<vector<PathInfo>> uttPathsInfo;
@@ -1379,12 +1387,50 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
vector<float> vt_onebest_wer;
vt_onebest_wer.clear();
// time_t my_time = time(NULL);
// fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
RNNTDecodeFunctions<ElemType> rnntdfs;
rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
//my_time = time(NULL);
//fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
//rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
//vt_printname.push_back(L"DecodeOutputLN");
if (m_enableMultiThreadDecodeMBR)
{
ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode;
Matrix<ElemType> Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid);
if (SVD)
{
WmuNode = net->GetNodeFromName(outputNodeNamesVector[4]);
WmvNode = net->GetNodeFromName(outputNodeNamesVector[5]);
bmNode = net->GetNodeFromName(outputNodeNamesVector[6]);
Wmu.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmuNode)->Value()));
Wmv.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmvNode)->Value()));
WmNode; // no-op references: the full-rank WmNode/Wm are not used in the SVD branch
Wm;
}
else
{
WmNode = net->GetNodeFromName(outputNodeNamesVector[4]);
bmNode = net->GetNodeFromName(outputNodeNamesVector[5]);
Wm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmNode)->Value()));
WmuNode; // no-op references: the SVD factor nodes/matrices are not used in the non-SVD branch
WmvNode;
Wmu;
Wmv;
}
bm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(bmNode)->Value()));
/*
size_t num_utt = 7;
size_t start_utt = 0;
*/
rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeInputNodeNames, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, decode_cn_root, Wm, Wmu, Wmv, bm); /*, num_utt, start_utt); */
}
else
rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, net);
// rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net, decode_cn, decodeinputNodes_tmp);
my_time = time(NULL);
fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
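The commented-out num_utt / start_utt arguments above suggest each worker decodes a contiguous range of utterances. A hedged sketch of that kind of static partitioning (hypothetical DecodeInChunks helper, plain C++):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Split `total` utterances into contiguous chunks, one per worker, and run
// decodeRange(start_utt, num_utt) on each chunk in its own thread.
void DecodeInChunks(std::size_t total, std::size_t numThreads,
                    const std::function<void(std::size_t, std::size_t)>& decodeRange)
{
    if (numThreads == 0)
        numThreads = 1;
    std::size_t chunk = (total + numThreads - 1) / numThreads;
    std::vector<std::thread> workers;
    for (std::size_t start = 0; start < total; start += chunk)
    {
        std::size_t count = std::min(chunk, total - start);
        workers.emplace_back([=] { decodeRange(start, count); });
    }
    for (auto& w : workers)
        w.join();
}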
//fprintf(stderr, "decode SGD v0 .\n");
//net->BumpEvalTimeStamp(decodeinputNodes);
@@ -1400,21 +1446,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
refFeaMatBackup.SetValue(reffeainput->second.GetMatrix<ElemType>());
size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences();
//my_time = time(NULL);
//fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
for (const auto& seq : encodeMBLayout->GetAllSequences())
{
/*
if (seqId == 1)
{
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
node->force_gradient_accumulate(true);
}
}
*/
if (seq.seqId == GAP_SEQUENCE_ID)
{
continue;
@@ -1423,15 +1459,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
continue;
}
//if (firstdebug)
cNode->SetMWERInfo(uttPathsInfo[seqId], lengthNorm, wordPathPosteriorFromDecodeMBR, doMBR, vt_nws[seqId]);
// get the feature MBLayout
size_t numFrames = seq.GetNumTimeSteps();
numSamplesWithLabelOfNetworkMBR += numFrames;
// if (firstdebug)
reffeainput->second.pMBLayout->Init(1, numFrames); // 1 channel, 1 utterance
Matrix<ElemType> fea(deviceID);
@@ -1447,11 +1480,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
fea.SetColumn(refFeaMatBackup.ColumnSlice(uID, 1), t);
}
//if (firstdebug)
{
reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 is the channel; the sequence spans frames 0 to numFrames
}
reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 is the channel; the sequence spans frames 0 to numFrames
// guoye: the two calls below reset the evaluation state, to make sure ForwardProp is always carried out
ComputationNetwork::BumpEvalTimeStamp(encodeInputNodes); // guoye: update the time stamp before doing forward prop
@@ -1463,9 +1494,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
size_t nBest = uttPathsInfo[seqId].size();
if (nBest > (m_maxFrameNumPerMinibatchMBR / numFrames))
{
// reset nBest to make the MB size framenum with budget
// reset nBest so that the minibatch frame count stays within the budget
nBest = (m_maxFrameNumPerMinibatchMBR / numFrames);
}
}
size_t maxPhoneSeqLen = uttPathsInfo[seqId][0].label_seq.size();
for (size_t n = 1; n < nBest; n++)
@@ -1583,8 +1614,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
}
//my_time = time(NULL);
//fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
}
// ===========================================================
// forward prop for evaluate eval nodes
@@ -1790,8 +1821,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
#ifdef _DEBUG
if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
LogicError("%ls %ls operation has NaNs in smoothedGradient.",
node->NodeName().c_str(), node->OperationName().c_str());
LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
double nodeDependentRegMultiplier = dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->GetRegMultiplier();
@@ -3460,7 +3490,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_lengthNorm = configSGD(L"LengthNorm", true);
m_showWERMode = configSGD(L"showWERMode", "average");
m_isSVD = configSGD(L"SVD", true);
m_enableMultiThreadDecodeMBR = configSGD(L"enableMultiThreadDecodeMBR", true);
m_maxFrameNumPerMinibatchMBR = configSGD(L"MaxFrameNumPerMinibatchMBR", (size_t) 2000);
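configSGD(key, default) reads the setting when present and falls back to the default otherwise, so the new multithreaded MBR decode path is enabled unless the config switches it off. A tiny sketch of that lookup-with-default idiom over a std::map (not the CNTK ConfigRecord type):

#include <map>
#include <string>

template <class T>
T GetOrDefault(const std::map<std::wstring, T>& config, const std::wstring& key, T fallback)
{
    auto it = config.find(key);
    return it == config.end() ? fallback : it->second;
}
// e.g. bool enableMultiThreadDecodeMBR = GetOrDefault<bool>(cfg, L"enableMultiThreadDecodeMBR", true);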
if (m_doGradientCheck && sizeofElemType != sizeof(double))
{

View file

@@ -292,6 +292,7 @@ protected:
string m_showWERMode;
bool m_isSVD;
size_t m_maxFrameNumPerMinibatchMBR;
bool m_enableMultiThreadDecodeMBR;
// Parallel training
MPIWrapperPtr m_mpi;

View file

@@ -421,7 +421,7 @@ public:
Matrix<ElemType> encodeOutput(deviceid);
Matrix<ElemType> decodeOutput(deviceid);
Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> lmin(deviceid);
MatrixPool m_matrixPool;
m_matrixPool.OptimizedMemoryAllocation();
@@ -472,35 +472,7 @@ public:
CurSequences = nextSequences;
vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
//deal with the same prefix
/*sort(CurSequences.begin(), CurSequences.end(),
[](const Sequence& a, const Sequence& b) -> bool {
return a.labelseq.size() > b.labelseq.size();
});
for (size_t n = 0; n < CurSequences.size() - 1; n++)
{
for (size_t h = n + 1; h < CurSequences.size(); h++)
{
if (isPrefix(CurSequences[h], CurSequences[n]))
{
//forward_prop the prefix
forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
size_t idx = CurSequences[h].labelseq.size();
ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
{
forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
}
CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
}
}
}*/
//nextSequences.clear();
while (true)
{
@@ -515,8 +487,8 @@ public:
rnntdfs.prepareSequence(tempSeq);
rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
if (isSVD)
rnntdfs.forwardmergedSVD(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, *m_net);
else rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
rnntdfs.forwardmergedSVD(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, m_net);
else rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);
//sumofENandDE.Print("sum");
//sort log posterior and get best N labels
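forwardmerged / forwardmergedSVD appear to implement the RNN-T joint step: the encoder and prediction-network outputs are summed (the Plus nodes above), passed through the joint affine layer Wm/bm, and normalized; in the SVD variant the full Wm is replaced by the low-rank factors Wmu * Wmv, applied in two stages. A standalone sketch of the non-SVD computation on plain vectors (hypothetical JointLogSoftmax helper, not the CNTK code path):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// z = W * (hEnc + hDec) + b, followed by log-softmax over the vocabulary.
std::vector<float> JointLogSoftmax(const std::vector<std::vector<float>>& W,
                                   const std::vector<float>& b,
                                   const std::vector<float>& hEnc,
                                   const std::vector<float>& hDec)
{
    std::vector<float> h(hEnc.size());
    for (std::size_t i = 0; i < h.size(); i++)
        h[i] = hEnc[i] + hDec[i];                 // sum of encoder and decoder outputs
    std::vector<float> z(W.size());
    for (std::size_t r = 0; r < W.size(); r++)
    {
        float s = b[r];
        for (std::size_t c = 0; c < h.size(); c++)
            s += W[r][c] * h[c];                  // with SVD this would be Wu * (Wv * h)
        z[r] = s;
    }
    if (z.empty())
        return z;
    float m = *std::max_element(z.begin(), z.end());
    float lse = 0.0f;
    for (float v : z)
        lse += std::exp(v - m);
    lse = m + std::log(lse);
    for (float& v : z)
        v -= lse;                                 // log posteriors, ready for n-best sorting
    return z;
}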
@@ -715,7 +687,7 @@ public:
Matrix<ElemType> encodeOutput(deviceid);
Matrix<ElemType> decodeOutput(deviceid);
Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> lmin(deviceid);
MatrixPool m_matrixPool;
m_matrixPool.OptimizedMemoryAllocation();
@@ -760,36 +732,6 @@ public:
CurSequences = nextSequences;
vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
//deal with the same prefix
/*sort(CurSequences.begin(), CurSequences.end(),
[](const Sequence& a, const Sequence& b) -> bool {
return a.labelseq.size() > b.labelseq.size();
});
for (size_t n = 0; n < CurSequences.size() - 1; n++)
{
for (size_t h = n + 1; h < CurSequences.size(); h++)
{
if (isPrefix(CurSequences[h], CurSequences[n]))
{
//forward_prop the prefix
forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
size_t idx = CurSequences[h].labelseq.size();
ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
{
forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
}
CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
}
}
}*/
//nextSequences.clear();
while (true)
{
@@ -802,7 +744,7 @@ public:
CurSequences.erase(maxSeq);
rnntdfs.prepareSequence(tempSeq);
rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);
//sumofENandDE.Print("sum");
//sort log posterior and get best N labels