Merge branch 'master' of https://git01.codeplex.com/cntk
Conflicts: DataReader/HTKMLFReader/HTKMLFReader.cpp DataReader/HTKMLFReader/HTKMLFReader.h
This commit is contained in:
Коммит
b4f465b8c4
|
@ -160,3 +160,9 @@ $RECYCLE.BIN/
|
|||
*.lyx~
|
||||
*.bak
|
||||
*.lyx#
|
||||
|
||||
# =========================
|
||||
# prebuild file
|
||||
# =========================
|
||||
MachineLearning/cn/buildinfo.h
|
||||
|
||||
|
|
|
@ -5,8 +5,9 @@
|
|||
//
|
||||
// ConfigFile.cpp : Defines the configuration file loader.
|
||||
//
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
#endif
|
||||
|
||||
#include "File.h"
|
||||
#include "commandArgUtil.h"
|
||||
|
|
|
@ -4,7 +4,9 @@
|
|||
// </copyright>
|
||||
//
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
#endif
|
||||
|
||||
#include "basetypes.h"
|
||||
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
#pragma once
|
||||
|
||||
#define MS_PER_SEC 1000
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {

// Stateless timing helper. The class carries no data members; its only
// service is the static MilliSecondElapsed() query, which is declared here
// and defined out of line.
class Timer
{
public:
    // Defaulted rather than user-provided empty bodies: keeps the type
    // trivial and avoids the stray ';' after an inline function body.
    Timer() = default;
    ~Timer() = default;

    // Returns a timestamp expressed in milliseconds. Callers take the
    // difference of two readings to measure an interval.
    static unsigned long long MilliSecondElapsed();
};

}}}
|
|
@ -0,0 +1,39 @@
|
|||
#include "TimerUtility.h"
|
||||
|
||||
#ifdef WIN32
|
||||
#include <Windows.h>
|
||||
#else
|
||||
#include <time.h>
|
||||
#endif
|
||||
namespace Microsoft{
|
||||
namespace MSR {
|
||||
namespace CNTK {
|
||||
|
||||
//Returns the amount of milliseconds elapsed
|
||||
unsigned long long Timer::MilliSecondElapsed()
|
||||
{
|
||||
#ifdef WIN32
|
||||
FILETIME ft;
|
||||
LARGE_INTEGER li;
|
||||
|
||||
GetSystemTimeAsFileTime(&ft); //ideally we should use GetSystemTimePreciseAsFileTime. But it's only avaiable with Win8+ and Win Server 2012+
|
||||
li.LowPart = ft.dwLowDateTime;
|
||||
li.HighPart = ft.dwHighDateTime;
|
||||
|
||||
unsigned long long ret = li.QuadPart;
|
||||
ret -= 116444736000000000LL; // Make the values consistent with Linux.
|
||||
ret /= 10000; // From 100 nano seconds (10^-7) to 1 millisecond (10^-3)
|
||||
|
||||
return ret;
|
||||
#else
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
|
||||
|
||||
UINT64 ret = ts.tv_sec * 1000 + ts.tv_nsec/1000000;
|
||||
|
||||
return ret;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -4,7 +4,10 @@
|
|||
// </copyright>
|
||||
//
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
#endif
|
||||
|
||||
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
|
||||
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
|
||||
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux
|
||||
|
|
|
@ -49,8 +49,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_truncated = readerConfig("Truncated", "false");
|
||||
m_convertLabelsToTargets = false;
|
||||
|
||||
m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
|
||||
ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
|
||||
m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
|
||||
|
||||
for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
|
||||
{
|
||||
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
|
||||
if (m_numberOfuttsPerMinibatch < 1)
|
||||
{
|
||||
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
|
||||
|
@ -60,6 +64,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
|
||||
}
|
||||
}
|
||||
|
||||
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
|
||||
|
||||
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
|
||||
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
|
||||
|
@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// get the read method, defaults to "blockRandomize" other option is "rollingWindow"
|
||||
std::string readMethod(readerConfig("readMethod","blockRandomize"));
|
||||
|
||||
if (readMethod == "blockRandomize" && randomize == randomizeNone)
|
||||
{
|
||||
fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
|
||||
randomize = randomizeAuto;
|
||||
}
|
||||
|
||||
// see if they want to use readAhead
|
||||
m_readAhead = readerConfig("readAhead", "false");
|
||||
|
||||
|
@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
// now get the frame source. This has better randomization and doesn't create temp files
|
||||
m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
|
||||
m_frameSource->setverbosity(verbosity);
|
||||
//m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
|
||||
|
||||
}
|
||||
|
@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
m_mbSize = mbSize;
|
||||
|
||||
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
|
||||
|
||||
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
|
||||
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
|
||||
m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
|
||||
m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
|
||||
m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
|
||||
|
||||
if (m_trainOrTest)
|
||||
{
|
||||
StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
<<<<<<< HEAD
|
||||
//
|
||||
// <copyright file="HTKMLFReader.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
|
@ -111,4 +112,117 @@ public:
|
|||
void SetSentenceEnd(int /*actualMbSize*/){};
|
||||
};
|
||||
|
||||
=======
|
||||
//
|
||||
// <copyright file="HTKMLFReader.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
|
||||
#pragma once
|
||||
#include "DataReader.h"
|
||||
#include "commandArgUtil.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// Reader for HTK/MLF-format features and samples (per the file header);
// derives from IDataReader<ElemType>. Private members hold the reader state,
// the public section overrides the virtual reader entry points.
template<class ElemType>
|
||||
class HTKMLFReader : public IDataReader<ElemType>
|
||||
{
|
||||
private:
|
||||
msra::dbn::minibatchiterator* m_mbiter;
|
||||
msra::dbn::minibatchsource* m_frameSource;
|
||||
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
|
||||
msra::dbn::FileEvalSource* m_fileEvalSource;
|
||||
msra::dbn::latticesource* m_lattices;
|
||||
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
|
||||
|
||||
vector<bool> m_sentenceEnd;
|
||||
bool m_readAhead;
|
||||
|
||||
bool m_truncated;
|
||||
vector<size_t> m_processedFrame;
|
||||
// per-epoch values of the "nbruttsineachrecurrentiter" reader setting
intargvector m_numberOfuttsPerMinibatchForAllEpochs;
|
||||
// value in effect for the current epoch; reported by NumberSlicesInEachRecurrentIter()
size_t m_numberOfuttsPerMinibatch;
|
||||
size_t m_actualnumberOfuttsPerMinibatch;
|
||||
size_t m_mbSize;
|
||||
vector<size_t> m_toProcess;
|
||||
vector<size_t> m_switchFrame;
|
||||
bool m_noData;
|
||||
|
||||
bool m_trainOrTest; // if false, in file writing mode
|
||||
|
||||
std::map<LabelIdType, LabelType> m_idToLabelMap;
|
||||
|
||||
bool m_partialMinibatch; // allow partial minibatches?
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiUtt;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
|
||||
std::vector<ElemType*> m_labelsBufferMultiUtt;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
|
||||
std::vector<size_t> m_featuresStartIndexMultiUtt;
|
||||
std::vector<size_t> m_labelsStartIndexMultiUtt;
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiIO;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
|
||||
std::vector<ElemType*> m_labelsBufferMultiIO;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
|
||||
|
||||
std::map<std::wstring,size_t> m_featureNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_nameToTypeMap;
|
||||
std::map<std::wstring,size_t> m_featureNameToDimMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToDimMap;
|
||||
// for writing outputs to files (standard single input/output network) - deprecate eventually
|
||||
bool m_checkDictionaryKeys;
|
||||
bool m_convertLabelsToTargets;
|
||||
std::vector <bool> m_convertLabelsToTargetsMultiIO;
|
||||
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
|
||||
|
||||
size_t m_inputFileIndex;
|
||||
std::vector<size_t> m_featDims;
|
||||
std::vector<size_t> m_labelDims;
|
||||
|
||||
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
|
||||
|
||||
// Setup helpers, one per mode (training/testing vs. file writing; see m_trainOrTest).
void PrepareForTrainingOrTesting(const ConfigParameters& config);
|
||||
void PrepareForWriting(const ConfigParameters& config);
|
||||
|
||||
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
|
||||
// requestedEpochSamples defaults to requestDataSize when the caller omits it.
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
|
||||
bool ReNewBufferForMultiIO(size_t i);
|
||||
|
||||
// returns the current m_numberOfuttsPerMinibatch
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
|
||||
// no-op: the argument is ignored
void SetNbrSlicesEachRecurrentIter(const size_t) { };
|
||||
|
||||
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
|
||||
|
||||
|
||||
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
|
||||
// NOTE(review): presumably the type codes stored in m_nameToTypeMap - confirm against the .cpp
enum InputOutputTypes
|
||||
{
|
||||
real,
|
||||
category,
|
||||
};
|
||||
|
||||
|
||||
|
||||
public:
|
||||
virtual void Init(const ConfigParameters& config);
|
||||
// deletes this object; the reader must not be touched after the call
virtual void Destroy() {delete this;}
|
||||
virtual ~HTKMLFReader();
|
||||
// requestedEpochSamples defaults to requestDataSize when the caller omits it.
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
|
||||
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
|
||||
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
|
||||
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
|
||||
|
||||
virtual bool DataEnd(EndDataType endDataType);
|
||||
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
|
||||
// no-op: the argument is ignored
void SetSentenceEnd(int /*actualMbSize*/){};
|
||||
};
|
||||
|
||||
>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d
|
||||
}}}
|
|
@ -768,6 +768,7 @@ private:
|
|||
if (chunkdata.isinram())
|
||||
return false;
|
||||
|
||||
if (verbosity)
|
||||
fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
|
||||
msra::util::attempt (5, [&]() // (reading from network)
|
||||
{
|
||||
|
@ -858,6 +859,7 @@ public:
|
|||
transcripts.clear();
|
||||
|
||||
// return these utterances
|
||||
if (verbosity > 0)
|
||||
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
|
||||
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
|
||||
for (size_t pos = spos; pos < epos; pos++)
|
||||
|
@ -922,6 +924,7 @@ public:
|
|||
const size_t lastchunk = chunkforframepos (globalte-1);
|
||||
const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
|
||||
const size_t windowend = randomizedchunks[lastchunk].windowend;
|
||||
if (verbosity > 0)
|
||||
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
|
||||
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
|
||||
// release all data outside, and page in all data inside
|
||||
|
|
|
@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource
|
|||
bool isinram() const { return !frames.empty(); }
|
||||
// page in data for this chunk
|
||||
// We pass in the feature info variables by ref which will be filled lazily upon first read
|
||||
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const
|
||||
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
|
||||
{
|
||||
if (numutterances() == 0)
|
||||
throw std::logic_error ("requiredata: cannot page in virgin block");
|
||||
|
@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource
|
|||
latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
|
||||
}
|
||||
//fprintf (stderr, "\n");
|
||||
if (verbosity)
|
||||
fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
|
||||
}
|
||||
catch (...)
|
||||
|
@ -568,6 +569,7 @@ private:
|
|||
return sweep;
|
||||
|
||||
currentsweep = sweep;
|
||||
if (verbosity>0)
|
||||
fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
|
||||
|
||||
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
|
||||
|
@ -919,10 +921,11 @@ private:
|
|||
{
|
||||
auto & chunk = randomizedchunks[m][chunkindex];
|
||||
auto & chunkdata = chunk.getchunkdata();
|
||||
if (verbosity)
|
||||
fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
|
||||
msra::util::attempt (5, [&]() // (reading from network)
|
||||
{
|
||||
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
|
||||
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
|
||||
});
|
||||
}
|
||||
chunksinram++;
|
||||
|
@ -1029,6 +1032,7 @@ public:
|
|||
}
|
||||
}
|
||||
// return these utterances
|
||||
if (verbosity > 0)
|
||||
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
|
||||
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
|
||||
for (size_t pos = spos; pos < epos; pos++)
|
||||
|
@ -1107,6 +1111,7 @@ public:
|
|||
const size_t lastchunk = chunkforframepos (globalte-1);
|
||||
const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
|
||||
const size_t windowend = randomizedchunks[0][lastchunk].windowend;
|
||||
if (verbosity)
|
||||
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
|
||||
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
|
||||
// release all data outside, and page in all data inside
|
||||
|
@ -1230,3 +1235,4 @@ public:
|
|||
};
|
||||
|
||||
};};
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
#define MAXSTRING 2048
|
||||
#define MAXSTRING 500000
|
||||
// UCI label location types
|
||||
enum LabelMode
|
||||
{
|
||||
|
|
|
@ -62,10 +62,7 @@ template<class ElemType>
|
|||
// not yet found, add to the map
|
||||
if (found == labelInfo.mapLabelToId.end())
|
||||
{
|
||||
labelInfo.mapLabelToId[labelValue] = labelInfo.idMax;
|
||||
labelInfo.mapIdToLabel[labelInfo.idMax] = labelValue;
|
||||
found = labelInfo.mapLabelToId.find(labelValue);
|
||||
labelInfo.idMax++;
|
||||
RuntimeError("%s not in vocabulary", labelValue.c_str());
|
||||
}
|
||||
return found->second;
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@
|
|||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>Use</PrecompiledHeader>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
|
@ -79,7 +79,7 @@
|
|||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<PrecompiledHeader>Use</PrecompiledHeader>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
|
@ -107,6 +107,7 @@
|
|||
<ClInclude Include="..\..\Common\Include\Eval.h" />
|
||||
<ClInclude Include="..\..\Common\Include\File.h" />
|
||||
<ClInclude Include="..\..\Common\Include\fileutil.h" />
|
||||
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
|
||||
<ClInclude Include="EvalReader.h" />
|
||||
<ClInclude Include="EvalWriter.h" />
|
||||
<ClInclude Include="stdafx.h" />
|
||||
|
@ -127,6 +128,7 @@
|
|||
<ClCompile Include="..\..\Common\fileutil.cpp">
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
|
||||
<ClCompile Include="..\cn\ComputationNode.cpp">
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
<ClCompile Include="..\..\Common\File.cpp">
|
||||
<Filter>Common</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\TimerUtility.cpp">
|
||||
<Filter>Common</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="EvalReader.h" />
|
||||
|
@ -38,6 +41,9 @@
|
|||
<ClInclude Include="..\..\Common\Include\fileutil.h">
|
||||
<Filter>Common\Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
|
||||
<Filter>Common\Include</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Filter Include="Common">
|
||||
|
|
|
@ -3111,6 +3111,9 @@ protected: \
|
|||
inputGradientValues.Print("child Gradient-in/out");
|
||||
inputFunctionValues.Print("child Function values");
|
||||
#endif
|
||||
//currently we only support one combination when the input is sparse.
|
||||
if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
|
||||
inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol);
|
||||
|
||||
Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
|
||||
#if DUMPOUTPUT
|
||||
|
|
|
@ -257,7 +257,12 @@ public:
|
|||
bool optParam = param->GetType() == ndlTypeOptionalParameter;
|
||||
if (optParam && !_stricmp(param->GetName().c_str(), name.c_str()))
|
||||
{
|
||||
return param->GetValue();
|
||||
auto paramValue = param->GetValue();
|
||||
auto resolveParamNode = m_parent->ParseVariable(paramValue, false);
|
||||
if (resolveParamNode != nullptr)
|
||||
return resolveParamNode->GetScalar();
|
||||
else
|
||||
return paramValue;
|
||||
}
|
||||
}
|
||||
return ConfigValue(deflt);
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "commandArgUtil.h"
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
#include "TimerUtility.h"
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
#include "mpi.h"
|
||||
|
@ -121,6 +122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
SGD(const ConfigParameters& configSGD)
|
||||
{
|
||||
ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
|
||||
m_needToNormalizeLRByParallUtterance = false;
|
||||
floatargvector learningRatesPerMB = learningRatesPerMBStr;
|
||||
|
||||
ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
|
||||
|
@ -206,13 +208,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true");
|
||||
|
||||
bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
|
||||
|
||||
Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation,
|
||||
clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor,
|
||||
reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates,
|
||||
loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult,
|
||||
maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight,
|
||||
trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading,
|
||||
rpi, learnRateAdjustInterval);
|
||||
rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode);
|
||||
}
|
||||
|
||||
void setMomentum(float momentum)
|
||||
|
@ -234,7 +238,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None,
|
||||
const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"",
|
||||
const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true,
|
||||
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1)
|
||||
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed=true)
|
||||
{
|
||||
numPrevLearnRates;
|
||||
m_mbSize=mbSize;
|
||||
|
@ -272,6 +276,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
m_trainCriterionNodeName = trainCriterionNodeName;
|
||||
m_evalCriterionNodeName = evalCriterionNodeName;
|
||||
m_useAllDataForPreComputedNode = UsingAllDataForPreComputed;
|
||||
|
||||
for (size_t i=0; i<m_mbSize.size(); i++)
|
||||
if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
|
||||
|
@ -298,6 +303,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
m_learningRatesPerSample[i] = learningRatesPerMB[i]/m_mbSize[i];
|
||||
}
|
||||
m_needToNormalizeLRByParallUtterance = true;
|
||||
}
|
||||
m_momentumPerMB = 0.9f;
|
||||
if (momentumPerMB.size() >0)
|
||||
|
@ -521,6 +527,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (0 == myRank) // only needs to be done by one process
|
||||
net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
|
||||
|
||||
// first, we need to normalize the effect of nbruttsineachrecurrentiter
|
||||
if (trainSetDataReader->NumberSlicesInEachRecurrentIter()>1 && m_needToNormalizeLRByParallUtterance)
|
||||
{
|
||||
for (auto & x : m_learningRatesPerSample)
|
||||
{
|
||||
x /= trainSetDataReader->NumberSlicesInEachRecurrentIter();
|
||||
}
|
||||
}
|
||||
bool learnRateInitialized = false;
|
||||
if (startEpoch > 0)
|
||||
{
|
||||
|
@ -557,7 +571,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
|
||||
{
|
||||
auto t_start_epoch = clock();
|
||||
auto t_start_epoch = Timer::MilliSecondElapsed();
|
||||
|
||||
// set other information to inputMatrices that can contain information
|
||||
// used for class-based LM for clustering information
|
||||
|
@ -600,12 +614,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
#ifdef MPI_SUPPORT
|
||||
INT32 mySamples = (INT32)
|
||||
#endif
|
||||
fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", (int)startEpoch, learnRatePerSample, m_momentumPerMB);
|
||||
TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes,
|
||||
criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients,
|
||||
epochCriterion, epochEvalErrors, totalSamplesSeen);
|
||||
|
||||
auto t_end_epoch = clock();
|
||||
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
|
||||
auto t_end_epoch = Timer::MilliSecondElapsed();
|
||||
ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC);
|
||||
|
||||
fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
|
||||
if (epochEvalErrors.size() == 1)
|
||||
|
@ -800,7 +815,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
//compute
|
||||
//trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize);
|
||||
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
|
||||
// trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
|
||||
// [1/12/2015 erw] to support large datasets, we usually partition the whole dataset into several epochs, so we need to use all the data to do precomputing
|
||||
if (m_useAllDataForPreComputedNode)
|
||||
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); // using all the data
|
||||
else
|
||||
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
|
||||
|
||||
while (trainSetDataReader->GetMinibatch(inputMatrices))
|
||||
{
|
||||
|
@ -983,8 +1003,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
std::vector<ElemType> epochEvalErrorsLastMBs(epochEvalErrors.size(),0);
|
||||
PTaskGraphBuilder<ElemType>* ptaskGraphBuilder = NULL;
|
||||
|
||||
clock_t startReadMBTime = 0, startComputeMBTime=0;
|
||||
clock_t endReadMBTime=0, endComputeMBTime=0;
|
||||
unsigned long long startReadMBTime = 0, startComputeMBTime=0;
|
||||
unsigned long long endReadMBTime = 0, endComputeMBTime = 0;
|
||||
|
||||
//initialize statistics
|
||||
size_t totalEpochSamples = 0;
|
||||
|
@ -1027,14 +1047,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
}
|
||||
|
||||
startReadMBTime=clock();
|
||||
startReadMBTime=Timer::MilliSecondElapsed();
|
||||
while (trainSetDataReader->GetMinibatch(inputMatrices))
|
||||
{
|
||||
#ifdef MPI_SUPPORT
|
||||
DecimateMinibatch(inputMatrices);
|
||||
#endif
|
||||
endReadMBTime=clock();
|
||||
startComputeMBTime=clock();
|
||||
endReadMBTime=Timer::MilliSecondElapsed();
|
||||
startComputeMBTime=Timer::MilliSecondElapsed();
|
||||
|
||||
UpdateEvalTimeStamps(FeatureNodes);
|
||||
UpdateEvalTimeStamps(labelNodes);
|
||||
|
@ -1113,12 +1133,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
|
||||
endComputeMBTime=clock();
|
||||
endComputeMBTime=Timer::MilliSecondElapsed();
|
||||
numMBsRun ++;
|
||||
if (m_traceLevel > 0)
|
||||
{
|
||||
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(CLOCKS_PER_SEC);
|
||||
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/CLOCKS_PER_SEC;
|
||||
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(MS_PER_SEC);
|
||||
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/MS_PER_SEC;
|
||||
|
||||
readTimeInMBs += MBReadTime;
|
||||
ComputeTimeInMBs += MBComputeTime;
|
||||
|
@ -1149,7 +1169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
|
||||
}
|
||||
}
|
||||
startReadMBTime=clock();
|
||||
startReadMBTime=Timer::MilliSecondElapsed();
|
||||
totalEpochSamples += actualMBSize;
|
||||
totalSamplesSeen += actualMBSize;
|
||||
|
||||
|
@ -1506,6 +1526,7 @@ protected:
|
|||
protected:
|
||||
|
||||
floatargvector m_learningRatesPerSample; /// learning rate per sample provided outside
|
||||
bool m_needToNormalizeLRByParallUtterance; // only true when the user specify LearningRatePerMB and the number of parallel utterances in Reader > 1
|
||||
intargvector m_mbSize;
|
||||
size_t m_epochSize;
|
||||
size_t m_maxEpochs;
|
||||
|
@ -1557,6 +1578,8 @@ protected:
|
|||
ElemType m_gradientCheckSigDigit;
|
||||
|
||||
bool m_validateAfterModelReloading;
|
||||
|
||||
bool m_useAllDataForPreComputedNode;
|
||||
};
|
||||
template class SGD<float>;
|
||||
template class SGD<double>;
|
||||
|
|
|
@ -102,7 +102,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
evalResultsLastMBs.push_back((ElemType)0);
|
||||
|
||||
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
|
||||
dataReader.SetNbrSlicesEachRecurrentIter(1);
|
||||
|
||||
for (int i=0; i<evalNodes.size(); i++)
|
||||
{
|
||||
|
|
|
@ -228,7 +228,7 @@ public:
|
|||
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
|
||||
nodePtr->NeedGradient() = false;
|
||||
}
|
||||
else if (pass == ndlPassFinal)
|
||||
else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0)
|
||||
{
|
||||
ElemType val = parameter[0]->GetScalar();
|
||||
nodePtr->FunctionValues().SetValue(val);
|
||||
|
|
|
@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (m_children.size() != 3)
|
||||
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
|
||||
|
||||
if (Inputs(0)->OperationName() != L"SparseInputValue"
|
||||
&& Inputs(0)->OperationName() != L"InputValue")
|
||||
if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
|
||||
&& Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
|
||||
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
|
||||
|
||||
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "io.h"
|
||||
#endif
|
||||
#include "hostname.h"
|
||||
#include "buildinfo.h"
|
||||
#ifdef LEAKDETECT
|
||||
#include "vld.h" // for memory leak detection
|
||||
#endif
|
||||
|
@ -619,8 +620,26 @@ int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv)
|
|||
}
|
||||
#endif
|
||||
|
||||
void PrintBuiltInfo()
|
||||
{
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
fprintf(stderr, "Build info: \n\n");
|
||||
fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
|
||||
fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
|
||||
fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
|
||||
fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
|
||||
#ifdef _GIT_EXIST
|
||||
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
|
||||
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
|
||||
#endif
|
||||
fprintf(stderr, "-------------------------------------------------------------------\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
int wmain(int argc, wchar_t* argv[])
|
||||
{
|
||||
|
||||
try
|
||||
{
|
||||
#ifdef MPI_SUPPORT
|
||||
|
@ -647,6 +666,8 @@ int wmain(int argc, wchar_t* argv[])
|
|||
|
||||
// get the command param set they want
|
||||
wstring logpath = config("stderr", L"");
|
||||
// [1/26/2015 erw, add done file so that it can be used on HPC]
|
||||
wstring DoneFile = config("DoneFile", L"");
|
||||
ConfigArray command = config("command", "train");
|
||||
|
||||
if (logpath != L"")
|
||||
|
@ -663,9 +684,14 @@ int wmain(int argc, wchar_t* argv[])
|
|||
oss << myRank;
|
||||
logpath += L"rank" + oss.str();
|
||||
}
|
||||
|
||||
RedirectStdErr(logpath);
|
||||
}
|
||||
|
||||
|
||||
PrintBuiltInfo();
|
||||
|
||||
|
||||
std::string timestamp = TimeDateStamp();
|
||||
|
||||
if (myRank == 0) // main process
|
||||
|
@ -714,8 +740,15 @@ int wmain(int argc, wchar_t* argv[])
|
|||
DoCommand<double>(config);
|
||||
else
|
||||
RuntimeError("invalid precision specified: %s", type.c_str());
|
||||
|
||||
// still here , write a DoneFile if necessary
|
||||
if (!DoneFile.empty()){
|
||||
FILE* fp = fopenOrDie(DoneFile.c_str(), L"w");
|
||||
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(),GetHostName().c_str());
|
||||
fcloseOrDie(fp);
|
||||
}
|
||||
catch(const std::exception &err)
|
||||
}
|
||||
catch (const std::exception &err)
|
||||
{
|
||||
fprintf(stderr, "EXCEPTION occurred: %s", err.what());
|
||||
#ifdef _DEBUG
|
||||
|
|
|
@ -139,6 +139,9 @@
|
|||
<TreatOutputAsContent>true</TreatOutputAsContent>
|
||||
<Message>Copy content files to target directory</Message>
|
||||
</CustomBuildStep>
|
||||
<PreBuildEvent>
|
||||
<Command>prebuild.bat</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
|
@ -199,6 +202,9 @@
|
|||
<Message>
|
||||
</Message>
|
||||
</CustomBuildStep>
|
||||
<PreBuildEvent>
|
||||
<Command>prebuild.bat</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<Text Include="DefaultMacros.txt" />
|
||||
|
@ -216,6 +222,7 @@
|
|||
<ClInclude Include="..\..\Common\Include\fileutil.h" />
|
||||
<ClInclude Include="..\..\Common\Include\hostname.h" />
|
||||
<ClInclude Include="..\..\Common\Include\nvml.h" />
|
||||
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
|
||||
<ClInclude Include="CompositeComputationNode.h" />
|
||||
<ClInclude Include="ComputationNetwork.h" />
|
||||
<ClInclude Include="ComputationNetworkHelper.h" />
|
||||
|
@ -249,6 +256,7 @@
|
|||
<ClCompile Include="..\..\Common\fileutil.cpp">
|
||||
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
|
||||
<ClCompile Include="cn.cpp" />
|
||||
<ClCompile Include="ComputationNode.cpp" />
|
||||
<ClCompile Include="ModelEditLanguage.cpp" />
|
||||
|
|
|
@ -43,6 +43,9 @@
|
|||
<ClCompile Include="NetworkDescriptionLanguage.cpp">
|
||||
<Filter>Network</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\TimerUtility.cpp">
|
||||
<Filter>Common</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\..\Common\Include\basetypes.h">
|
||||
|
@ -138,6 +141,9 @@
|
|||
<ClInclude Include="..\..\Common\Include\hostname.h">
|
||||
<Filter>Common\Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
|
||||
<Filter>Common\Include</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Text Include="modelEditor.txt">
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
@echo off
:: Pre-build step: regenerates buildinfo.h in the current directory with
:: macros describing who built the binary, on which machine, from which path
:: and (when git is usable) from which branch and commit.

echo #ifndef _BUILDINFO_H > buildinfo.h
echo #define _BUILDINFO_H >> buildinfo.h

:: assuming hostname always exists
FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i

:: Clear any values inherited from the environment so that a failed git query
:: cannot leak a stale branch or commit into the header.
SET BRANCH=
SET COMMIT=

:: git may not be on the PATH: probe it with stdout and stderr both discarded.
:: The redirection must be written "2>nul" (no space), otherwise "2" is passed
:: to git as an argument and stderr is not silenced.
git --version >nul 2>nul
:: Any non-zero exit code (9009 = command not found) means git is unusable.
if errorlevel 1 goto :nogit

echo #define _GIT_EXIST >> buildinfo.h
FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO SET BRANCH=%%i
FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO SET COMMIT=%%i
:: These lines are deliberately outside a parenthesised block: inside ( ... )
:: %%BRANCH%% and %%COMMIT%% would be expanded before the FOR loops above run.
echo #define _BUILDBRANCH_ "%BRANCH%" >> buildinfo.h
echo #define _BUILDSHA1_ "%COMMIT%" >> buildinfo.h

:nogit
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h

:: Directory of this script, with backslashes doubled so the value is a valid
:: C string literal.
set a=%~dp0
set buildpath="%a:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h

echo #endif >> buildinfo.h

:: Do not let a failed git probe surface as a failed pre-build event.
exit /b 0
|
|
@ -78,6 +78,53 @@ namespace CNTKMathTest
|
|||
Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision
|
||||
}
|
||||
|
||||
// Multiplies a dense matrix by the same right-hand operand stored once as a
// dense matrix and once as a sparse CSC matrix, and checks that both products
// agree within a tolerance.
TEST_METHOD(CPUMatrixDenseTimesSparse)
{
    // Right-hand operand in dense form.
    // NOTE(review): the truncation presumably clamps values below 0 so that
    // the matrix contains many zeros; confirm against AssignTruncateBottomOf.
    Matrix<float> rhsDense(CPUDEVICE);
    rhsDense.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024, 2048, -3, 0.1, 0), 0);

    // Same values, switched to sparse CSC storage.
    Matrix<float> rhsSparse(rhsDense);
    rhsSparse.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);

    Matrix<float> lhs = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
    Matrix<float> resultFromDense = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
    // Both result matrices start from identical contents.
    Matrix<float> resultFromSparse(resultFromDense);

    const bool transposeLhs = false;
    const bool transposeRhs = false;

    // First pass: beta == 0.
    float alpha = 0.3;
    float beta = 0;
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsDense, transposeRhs, beta, resultFromDense);
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsSparse, transposeRhs, beta, resultFromSparse);
    Assert::IsTrue(resultFromSparse.IsEqualTo(resultFromDense, 0.0001));

    // Second pass: non-zero beta, accumulating into the previous results.
    alpha = 3.3;
    beta = 1.3;
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsDense, transposeRhs, beta, resultFromDense);
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsSparse, transposeRhs, beta, resultFromSparse);

    // TODO IsEqualTo NYI
    // Assert::IsTrue(resultFromSparse.IsEqualTo(resultFromDense, 0.00005));
}
|
||||
|
||||
// Multiplies a dense matrix by the transpose of a sparse CSC matrix, writing
// the product once into a sparse block-column matrix and once into a second
// result matrix. The comparison of the two results is still disabled.
TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse)
{
    // Source values for the sparse operand.
    Matrix<float> rhsDense(CPUDEVICE);
    rhsDense.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048, 1024, -3, 0.1, 0), 0);

    // Copy of the operand, switched to sparse CSC storage.
    Matrix<float> rhsSparse(rhsDense);
    rhsSparse.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);

    Matrix<float> lhs = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);

    // Reference result; note that it is created by RandomGaussian and is not
    // switched to a sparse type in this test.
    Matrix<float> referenceResult = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);

    // Result stored as sparse block-column.
    Matrix<float> blockColResult(CPUDEVICE);
    blockColResult.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol);

    float alpha = 0.3;
    float beta = 0;
    const bool transposeLhs = false;
    const bool transposeRhs = true;
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsSparse, transposeRhs, beta, blockColResult);
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, transposeLhs, rhsSparse, transposeRhs, beta, referenceResult);

    // TODO IsEqualTo NYI
    // Assert::IsTrue(blockColResult.IsEqualTo(referenceResult, 0.0001));
}
|
||||
|
||||
TEST_METHOD(MatrixSparseTimesSparse)
|
||||
{
|
||||
Matrix<float> Ad;
|
||||
|
|
|
@ -111,7 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
//else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
|
||||
{
|
||||
m_blockSize = 0;
|
||||
m_blockVal = NULL;
|
||||
m_pArray = NULL;
|
||||
m_blockIds = NULL;
|
||||
}
|
||||
}
|
||||
|
@ -125,6 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format");
|
||||
}
|
||||
m_format = format;
|
||||
m_default = defaultElem();
|
||||
ZeroInit();
|
||||
}
|
||||
|
||||
|
@ -160,8 +161,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
|
||||
{
|
||||
if(m_blockVal != NULL)
|
||||
delete[] m_blockVal;
|
||||
if (m_pArray != NULL)
|
||||
delete[] m_pArray;
|
||||
if(m_blockIds != NULL)
|
||||
delete[] m_blockIds;
|
||||
}
|
||||
|
@ -219,16 +220,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
template<class ElemType>
|
||||
ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
|
||||
{
|
||||
if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
|
||||
{
|
||||
return m_pArray;
|
||||
}
|
||||
else
|
||||
{
|
||||
return m_blockVal;
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
|
||||
|
@ -280,16 +274,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (keepExistingValues && m_elemSizeAllocated > 0)
|
||||
{
|
||||
assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve);
|
||||
memcpy(blockVal, m_blockVal, NzSize());
|
||||
memcpy(blockVal, m_pArray, NzSize());
|
||||
memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
|
||||
}
|
||||
|
||||
if (m_blockVal != NULL)
|
||||
delete[] m_blockVal;
|
||||
if (m_pArray != NULL)
|
||||
delete[] m_pArray;
|
||||
if(m_blockIds != NULL)
|
||||
delete[] m_blockIds;
|
||||
|
||||
m_blockVal = blockVal;
|
||||
m_pArray = blockVal;
|
||||
m_blockIds = blockIds;
|
||||
}
|
||||
|
||||
|
@ -457,10 +451,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{ // h range over hidden layer
|
||||
if(first == true)
|
||||
{
|
||||
c.m_blockVal[pos] = alpha*lhs(h, j)*val;
|
||||
c.m_pArray[pos] = alpha*lhs(h, j)*val;
|
||||
} else
|
||||
{
|
||||
c.m_blockVal[pos] += alpha*lhs(h, j)*val;
|
||||
c.m_pArray[pos] += alpha*lhs(h, j)*val;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
@ -522,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t start = j * len;
|
||||
for(size_t p = start; p < start+len; p++)
|
||||
{
|
||||
ElemType val = lhs.m_blockVal[p];
|
||||
ElemType val = lhs.m_pArray[p];
|
||||
|
||||
size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
|
||||
size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
|
||||
|
@ -537,6 +531,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
|
||||
template<class ElemType>
|
||||
bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
|
||||
{
|
||||
if (a.IsEmpty() || b.IsEmpty())
|
||||
throw std::logic_error("AreEqual: one of the input matrices is empty.");
|
||||
|
||||
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
|
||||
return false;
|
||||
|
||||
bool result = true;
|
||||
|
||||
#pragma omp parallel for
|
||||
foreach_coord(i, j, a)
|
||||
{
|
||||
if (abs(a(i, j) - b(i, j)) > threshold)
|
||||
{
|
||||
result = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// a: H x No: H is hidden layer size and No is mini-batch size
|
||||
// weight: V x H, V is vocab size
|
||||
// label: V x No
|
||||
|
@ -698,11 +716,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{ // h range over hidden layer
|
||||
if(first == true)
|
||||
{
|
||||
grd.m_blockVal[pos] = input(h, j)*error.m_pArray[p];
|
||||
grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
|
||||
}
|
||||
else
|
||||
{
|
||||
grd.m_blockVal[pos] += input(h, j)*error.m_pArray[p];
|
||||
grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
@ -735,11 +753,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t start = j* len;
|
||||
for(size_t p = start; p < start+len; p++)
|
||||
{
|
||||
ElemType val = m_blockVal[p];
|
||||
ElemType val = m_pArray[p];
|
||||
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
|
||||
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
|
||||
c(row, col) = (1-momentum)*val + momentum*c(row, col);
|
||||
m_blockVal[p] = c(row, col);
|
||||
m_pArray[p] = c(row, col);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -790,14 +808,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t start = j* len;
|
||||
for(size_t p = start; p < start+len; p++)
|
||||
{
|
||||
ElemType val = m_blockVal[p];
|
||||
ElemType val = m_pArray[p];
|
||||
|
||||
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
|
||||
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
|
||||
ElemType adenorm = c(row, col);
|
||||
adenorm += val * val;
|
||||
val = val / (floor + sqrt(adenorm));
|
||||
m_blockVal[p] = val;
|
||||
m_pArray[p] = val;
|
||||
c(row, col) = adenorm;
|
||||
}
|
||||
}
|
||||
|
@ -818,13 +836,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t start = j* len;
|
||||
for (size_t p = start; p < start+len; p++)
|
||||
{
|
||||
if (m_blockVal[p] > locThresholdPos)
|
||||
if (m_pArray[p] > locThresholdPos)
|
||||
{
|
||||
m_blockVal[p] = locThresholdPos;
|
||||
m_pArray[p] = locThresholdPos;
|
||||
}
|
||||
else if (m_blockVal[p] < locTHresholdNeg)
|
||||
else if (m_pArray[p] < locTHresholdNeg)
|
||||
{
|
||||
m_blockVal[p] = locTHresholdNeg;
|
||||
m_pArray[p] = locTHresholdNeg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
CPUSparseMatrix(const MatrixFormat format);
|
||||
CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
|
||||
|
||||
|
||||
~CPUSparseMatrix();
|
||||
|
||||
public:
|
||||
|
@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
|
||||
|
||||
static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
|
||||
|
||||
/// sum(vec(a).*vec(b))
|
||||
static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
|
||||
|
||||
|
@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
|
||||
void Reset();
|
||||
|
||||
// Returns an ElemType value whose storage is entirely zero bytes.
inline ElemType defaultElem()
{
    // The local must not be called 'default': that is a reserved C++ keyword
    // and is rejected by conforming compilers.
    ElemType zeroElem;
    memset(&zeroElem, 0, sizeof(ElemType));
    return zeroElem;
}
|
||||
|
||||
// Read-only access to the element at (row, col).
//
// Throws std::runtime_error when the position lies outside the matrix.
// Only the CSC format is handled; every other format goes to NOT_IMPLEMENTED.
// For CSC, the entries of the requested column are scanned for a matching
// row index; when none is stored, a reference to m_default is returned.
const ElemType& operator() (const size_t row, const size_t col) const
{
    if (row >= m_numRows || col >= m_numCols)
    {
        throw std::runtime_error("Position outside matrix dimensions");
    }

    if (m_format != MatrixFormat::matrixFormatSparseCSC)
    {
        NOT_IMPLEMENTED;
    }
    else
    {
        // Entries of column 'col' occupy [m_compIndex[col], m_compIndex[col + 1]).
        const size_t colBegin = m_compIndex[col];
        const size_t colEnd = m_compIndex[col + 1];

        for (size_t nz = colBegin; nz < colEnd; nz++)
        {
            const size_t storedRow = m_unCompIndex[nz];
            if (storedRow == row)
            {
                return m_pArray[nz];
            }
        }

        // No entry stored for this position.
        return m_default;
    }
}
|
||||
|
||||
public:
|
||||
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
|
||||
void Adagrad(CPUMatrix<ElemType>& c);
|
||||
|
@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
public:
|
||||
const ElemType* NzValues() const { return m_pArray; }
|
||||
ElemType* NzValues() { return m_pArray; }
|
||||
inline ElemType* NzValues() { return m_pArray; }
|
||||
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
|
||||
|
||||
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
|
||||
|
@ -140,8 +178,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format
|
||||
|
||||
size_t m_blockSize; //block size
|
||||
ElemType *m_blockVal; //block values
|
||||
size_t *m_blockIds; //block ids
|
||||
|
||||
ElemType m_default;
|
||||
};
|
||||
|
||||
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
|
||||
|
|
|
@ -3356,45 +3356,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
#pragma endregion Static BLAS Functions
|
||||
|
||||
|
||||
//#pragma region File << and >> operators
|
||||
// template<class ElemType>
|
||||
// File& operator>>(File& stream, GPUMatrix<ElemType> &us)
|
||||
// {
|
||||
// //auto& us = *this;
|
||||
//
|
||||
// stream.GetMarker(fileMarkerBeginSection, std::string("BMAT"));
|
||||
// size_t elsize;
|
||||
// stream>>elsize;
|
||||
// if (sizeof(ElemType)!=elsize)
|
||||
// throw std::runtime_error("Template argument size doesn't match those in file");
|
||||
// std::wstring matrixName;
|
||||
// size_t numRows, numCols;
|
||||
// stream>>matrixName>>numRows>>numCols;
|
||||
// ElemType* d_array = new ElemType[numRows*numCols];
|
||||
// for (long i=0;i<numRows*numCols;++i)
|
||||
// stream>>d_array[i];
|
||||
// stream.GetMarker(fileMarkerEndSection, std::string("EMAT"));
|
||||
// us.SetValue(numRows,numCols,d_array, matrixFlagNormal);
|
||||
// us.m_matrixName = matrixName;
|
||||
// return stream;
|
||||
// }
|
||||
//
|
||||
// template<class ElemType>
|
||||
// File& operator<<(File& stream, GPUMatrix<ElemType> &us)
|
||||
// {
|
||||
// //auto& us = *this;
|
||||
//
|
||||
// stream.PutMarker(fileMarkerBeginSection, std::string("BMAT"));
|
||||
// stream<<sizeof(ElemType)<<us.m_matrixName<<us.m_numRows<<us.m_numCols;
|
||||
// ElemType *d_array = us.CopyToArray();
|
||||
// for (long i=0;i<us.GetNumElements();++i)
|
||||
// stream<<d_array[i];
|
||||
// stream.PutMarker(fileMarkerEndSection, std::string("EMAT"));
|
||||
// return stream;
|
||||
// }
|
||||
//
|
||||
//#pragma endregion File << and >> operators
|
||||
|
||||
template class GPUMatrix<float>;
|
||||
template class GPUMatrix<double>;
|
||||
template class DeviceBoundNumber<float>;
|
||||
|
|
|
@ -2250,37 +2250,12 @@ __global__ void _sparseCSRElemMulDense(
|
|||
}
|
||||
}
|
||||
|
||||
// forward pass from feature to hidden layer
|
||||
/*template<class ElemType>
|
||||
__global__ void _denseMulSparseCSCToDense(
|
||||
ElemType alpha,
|
||||
const ElemType* lhs,
|
||||
int numrows,
|
||||
int numcols,
|
||||
const GPUSPARSE_INDEX_TYPE* row,
|
||||
ElemType* c)
|
||||
{
|
||||
int loadPerThread = (numrows+blockDim.x-1)/blockDim.x;
|
||||
int tStart = loadPerThread * threadIdx.x;
|
||||
int tEnd = min(numrows, loadPerThread + tStart);
|
||||
|
||||
int p = blockIdx.x;
|
||||
int i = row[p];
|
||||
int j = blockIdx.x;
|
||||
|
||||
for (int h = tStart; h < tEnd; h++)
|
||||
{
|
||||
ElemType res = alpha * lhs[IDX2C(h, i, numrows)];
|
||||
atomicAdd(&c[IDX2C(h,j,numrows)], res);
|
||||
}
|
||||
}*/
|
||||
|
||||
//c = alpha * op(a) * op(b) + beta*c
|
||||
//this function can be further improved by using shared memory
|
||||
template<class ElemType>
|
||||
__global__ void _denseMultSparseCSCAndWeightedAddToDense(
|
||||
int m, //rowDense
|
||||
int k, //colDense = rowSparse
|
||||
int n, //colSparse
|
||||
ElemType alpha,
|
||||
const ElemType* a, //dense
|
||||
|
@ -2311,62 +2286,77 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
|
|||
}
|
||||
|
||||
// backward pass from hidden layer to feature weight
|
||||
//result (sparse BlockCol) = alpha * (lhs (dense) X rhs^T (sparse CSC))
|
||||
//assume resultValues are 0-initialized
|
||||
template<class ElemType>
|
||||
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
|
||||
ElemType alpha,
|
||||
ElemType* lhs,
|
||||
size_t nrs,
|
||||
ElemType* rhsNZValues,
|
||||
const GPUSPARSE_INDEX_TYPE* row,
|
||||
const size_t* rowIdx,
|
||||
ElemType* blockVal,
|
||||
size_t* blockIds)
|
||||
const ElemType alpha,
|
||||
const ElemType* lhsValues,
|
||||
const size_t numRowsLhs,
|
||||
const size_t numColsRhs,
|
||||
const ElemType* rhsNZValues,
|
||||
const GPUSPARSE_INDEX_TYPE* rhsRows,
|
||||
const GPUSPARSE_INDEX_TYPE* rhsCols,
|
||||
const size_t* rhsRowIdx,
|
||||
ElemType* resultValues,
|
||||
size_t* resultBlockIds)
|
||||
{
|
||||
int p = blockIdx.x;
|
||||
int i = row[p];
|
||||
int ii = rowIdx[p];
|
||||
int j = blockIdx.x;
|
||||
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const LONG64 lhsCol = index / numRowsLhs; //rhsCol == lhsCol
|
||||
if (lhsCol >= numColsRhs)
|
||||
return;
|
||||
const LONG64 lhsRow = index - numRowsLhs*lhsCol; //resultRow == lhsRow
|
||||
|
||||
int load = (nrs+blockDim.x-1)/blockDim.x;
|
||||
int pStart = load * threadIdx.x;
|
||||
int pEnd = min((int)nrs, load + pStart);
|
||||
//each thread handles one [row, col] combination
|
||||
ElemType lhsValue = alpha*lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];
|
||||
|
||||
for(int h = pStart; h < pEnd; h++)
|
||||
LONG64 start = rhsCols[lhsCol]; //rhsCol == lhsCol
|
||||
LONG64 end = rhsCols[lhsCol + 1];
|
||||
|
||||
for (LONG64 p = start; p < end; p++)
|
||||
{
|
||||
ElemType temp = alpha*lhs[IDX2C(h, j, nrs)]*rhsNZValues[p];
|
||||
atomicAdd(&blockVal[ii*nrs+h], temp);
|
||||
blockIds[ii] = i;
|
||||
LONG64 rhsRow = rhsRows[p];
|
||||
ElemType rhsVal = rhsNZValues[p];
|
||||
LONG64 resultCol = rhsRowIdx[p]; //resultCol == rhsRow maps to columnid
|
||||
resultBlockIds[resultCol] = rhsRow; //indicate which colmn it actually points to
|
||||
|
||||
//assume resultValues are 0-initialized
|
||||
atomicAdd(&resultValues[IDX2C(lhsRow, resultCol, numRowsLhs)], lhsValue * rhsVal);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// gradients update
|
||||
template<class ElemType>
|
||||
__global__ void _scaleSparseAndAddToDense(
|
||||
ElemType alpha,
|
||||
bool blockCol,
|
||||
ElemType* blockVal,
|
||||
size_t* blockIds,
|
||||
size_t len,
|
||||
ElemType* rhs,
|
||||
size_t numrows)
|
||||
__global__ void _scaleSparseBlockAndAddToDense(
|
||||
const ElemType alpha,
|
||||
const bool blockCol, //true if blockRow
|
||||
const size_t numRows,
|
||||
const size_t numCols,
|
||||
const size_t numBlocks,
|
||||
const ElemType* lhsValues, //lhs is blockCol or blockRow
|
||||
const size_t* blockIds,
|
||||
ElemType* rhs)
|
||||
{
|
||||
int ii = blockIdx.x;
|
||||
int i = blockIds[ii];
|
||||
int load = (len+blockDim.x-1)/blockDim.x;
|
||||
int pStart = load * threadIdx.x;
|
||||
int pEnd = min((int)len, load + pStart);
|
||||
|
||||
for(int h = pStart; h < pEnd; h++)
|
||||
{ ElemType temp = alpha*blockVal[ii*len + h];
|
||||
if(blockCol)
|
||||
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
LONG64 row, col;
|
||||
if (blockCol)
|
||||
{
|
||||
atomicAdd(&rhs[IDX2C(h, i, numrows)], temp);
|
||||
const LONG64 blockId = index / numRows;
|
||||
if (blockId >= numBlocks)
|
||||
return;
|
||||
row = index - numRows* blockId;
|
||||
col = blockIds[blockId];
|
||||
}
|
||||
else
|
||||
{
|
||||
atomicAdd(&rhs[IDX2C(i, h, numrows)], temp);
|
||||
}
|
||||
const LONG64 blockId = index / numCols;
|
||||
if (blockId >= numBlocks)
|
||||
return;
|
||||
col = index - numCols* blockId;
|
||||
row = blockIds[blockId];
|
||||
}
|
||||
rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
|
||||
}
|
||||
|
||||
// compute predictions in cross entropy node
|
||||
|
@ -2638,30 +2628,36 @@ __global__ void _inplaceTruncate(
|
|||
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _normalGrad(
|
||||
bool isBlockCol,
|
||||
size_t len,
|
||||
__global__ void _normalGradForSparseBlock(
|
||||
const ElemType momentum,
|
||||
size_t* blockIds,
|
||||
ElemType* blockVal,
|
||||
ElemType* c,
|
||||
size_t numrows)
|
||||
const bool blockCol, //true if blockRow
|
||||
const size_t numRows,
|
||||
const size_t numCols,
|
||||
const size_t numBlocks,
|
||||
ElemType* lhsValues, //lhs is blockCol or blockRow
|
||||
const size_t* blockIds,
|
||||
ElemType* rhs)
|
||||
{
|
||||
int j = blockIdx.x;
|
||||
int i = blockIds[j];
|
||||
int start = j * len;
|
||||
|
||||
int load = (len+blockDim.x-1)/blockDim.x;
|
||||
int pStart = load * threadIdx.x;
|
||||
int pLen = min((int)len, load + pStart);
|
||||
|
||||
for(int p = start+pStart; p < start+pLen; p++)
|
||||
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
LONG64 row, col;
|
||||
if (blockCol)
|
||||
{
|
||||
int row = isBlockCol ? (p - start) : i;
|
||||
int col = isBlockCol ? i: (p - start);
|
||||
c[IDX2C(row, col, numrows)] = (1-momentum)*blockVal[p] + momentum*c[IDX2C(row, col, numrows)];
|
||||
blockVal[p] = c[IDX2C(row, col, numrows)];
|
||||
const LONG64 blockId = index / numRows;
|
||||
if (blockId >= numBlocks)
|
||||
return;
|
||||
row = index - numRows* blockId;
|
||||
col = blockIds[blockId];
|
||||
}
|
||||
else
|
||||
{
|
||||
const LONG64 blockId = index / numCols;
|
||||
if (blockId >= numBlocks)
|
||||
return;
|
||||
col = index - numCols* blockId;
|
||||
row = blockIds[blockId];
|
||||
}
|
||||
rhs[IDX2C(row, col, numRows)] = (1 - momentum)*lhsValues[index] + momentum*rhs[IDX2C(row, col, numRows)];
|
||||
lhsValues[index] = rhs[IDX2C(row, col, numRows)];
|
||||
}
|
||||
|
||||
static __inline__ __device__ double atomicAdd(double* address, double val)
|
||||
|
@ -3263,4 +3259,147 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
|
|||
}
|
||||
*/
|
||||
|
||||
|
||||
// Fills 'us' (addressed through IDX2C with leading dimension NTPlusOne) with
// products of elements of a and b. The thread at (x, y) in the launch writes
// the element at row x, column y:
//   row 0     : a[col] * b[col]
//   row r > 0 : a[col] * b[(col + shift + r - 1) % BS]
// Threads with row >= NTPlusOne or col >= BS do nothing.
template<class ElemType>
__global__ void _assignElementProductOfWithShiftNeg(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const int NTPlusOne,
    const int BS)
{
    const LONG64 row = blockDim.x * blockIdx.x + threadIdx.x;
    const LONG64 col = blockDim.y * blockIdx.y + threadIdx.y;

    if (row >= NTPlusOne || col >= BS)
        return;

    // Row 0 is not shifted; every later row shifts b by (shift + row - 1).
    LONG64 bCol = col;
    if (row != 0)
    {
        const int colShift = shift + row - 1;
        const int shiftedCol = (col + colShift) % BS;
        bCol = shiftedCol;
    }

    us[IDX2C(row, col, NTPlusOne)] = a[col] * b[bCol];
}
|
||||
|
||||
// For each (row, col) of c (addressed through IDX2C with leading dimension
// NTPlusOne) computes the dot product of column 'col' of a with a column of b:
//   row 0     : column 'col' of b
//   row r > 0 : column (col + shift + r - 1) % M of b
// a and b are addressed through IDX2C with leading dimension N.
// Threads with row >= NTPlusOne or col >= M do nothing.
template<class ElemType>
__global__ void _innerProductWithShiftNeg(
    ElemType* c,
    const ElemType* a,
    const ElemType* b,
    const long N, //a.GetNumRows();
    const long M, //a.GetNumCols();
    const long shift,
    const long NTPlusOne
    )
{
    const LONG64 row = blockDim.x * blockIdx.x + threadIdx.x;
    const LONG64 col = blockDim.y * blockIdx.y + threadIdx.y;

    if (row >= NTPlusOne || col >= M)
        return;

    // Column of a is always 'col'; column of b is shifted for every row but 0.
    const long colA = col;
    long colB = colA;
    if (row != 0)
    {
        const int colShift = shift + row - 1;
        colB = (col + colShift) % M;
    }

    ElemType dot = 0;
    for (long i = 0; i < N; ++i)
    {
        const long posA = IDX2C(i, colA, N);
        const long posB = IDX2C(i, colB, N);
        dot += a[posA] * b[posB];
    }

    c[IDX2C(row, col, NTPlusOne)] = dot;
}
|
||||
|
||||
// Copies row m of a (addressed through IDX2C with leading dimension O) into
// the first P entries of us; one thread per column. Threads with an index of
// P or more do nothing.
template<class ElemType>
__global__ void _getARowByIndex(
    ElemType* us,
    const ElemType* a,
    const int O, // a's rows
    const int P, // a's cols
    const int m // the m-th row of a
    )
{
    const LONG64 col = blockDim.x * blockIdx.x + threadIdx.x;
    if (col < P)
    {
        us[col] = a[IDX2C(m, col, O)];
    }
}
|
||||
|
||||
|
||||
// Writes into us (addressed through IDX2C with leading dimension O) the
// product of an entry of the vector a and an entry of the matrix b, with a
// circular column shift applied to one of them:
//   isafixed == true  : us(row, col) = a[col]         * b(row, shiftedCol)
//   isafixed == false : us(row, col) = a[shiftedCol]  * b(row, col)
// where shiftedCol = (col + shift) % P.
// Threads with row >= O or col >= P do nothing.
template<class ElemType>
__global__ void _conductRowElementMultiplyWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int O, // b's rows
    const int P, // b's cols
    const int shift,
    const bool isafixed)
{
    const LONG64 row = blockDim.x * blockIdx.x + threadIdx.x;
    const LONG64 col = blockDim.y * blockIdx.y + threadIdx.y;

    if (row >= O || col >= P)
        return;

    const int shiftedCol = (col + shift) % P;
    ElemType* const dst = &us[IDX2C(row, col, O)];

    if (isafixed)
    {
        // a stays in place, b is read from the shifted column
        *dst = a[col] * b[IDX2C(row, shiftedCol, O)];
    }
    else
    {
        // b stays in place, a is read at the shifted position
        *dst = a[shiftedCol] * b[IDX2C(row, col, O)];
    }
}
|
||||
|
||||
// Element-wise product with a circular shift applied to b:
//   us[id] = a[id] * b[(id + shift) % N]   for id in [0, N)
// One thread per element; threads with id >= N do nothing.
template<class ElemType>
__global__ void _assignElementProductOfWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const LONG64 N)
{
    const LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    // Keep the shifted index in LONG64: id and N are LONG64, so storing the
    // result in an int would truncate it for element counts beyond int range.
    const LONG64 shiftedId = (id + shift) % N;
    us[id] = a[id] * b[shiftedId];
}
|
||||
|
||||
|
||||
|
||||
#endif // !CPUONLY
|
||||
|
|
|
@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_matrixName=nullptr;
|
||||
|
||||
m_blockSize = 0;
|
||||
m_blockVal = nullptr;
|
||||
m_blockIds = nullptr;
|
||||
|
||||
m_expandedSize = 0;
|
||||
|
@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
|
||||
}
|
||||
|
||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||
|
||||
}
|
||||
else if (this->GetFormat() == matrixFormatSparseCSC)
|
||||
|
@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
|
||||
}
|
||||
|
||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
else
|
||||
NOT_IMPLEMENTED;
|
||||
|
@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_matrixName=moveFrom.m_matrixName;
|
||||
|
||||
m_blockSize = moveFrom.m_blockSize;
|
||||
m_blockVal = moveFrom.m_blockVal;
|
||||
m_blockIds = moveFrom.m_blockIds;
|
||||
|
||||
m_expandedSize = moveFrom.m_expandedSize;
|
||||
|
@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_matrixName=moveFrom.m_matrixName;
|
||||
|
||||
m_blockSize = moveFrom.m_blockSize;
|
||||
m_blockVal = moveFrom.m_blockVal;
|
||||
m_blockIds = moveFrom.m_blockIds;
|
||||
|
||||
m_expandedSize = moveFrom.m_expandedSize;
|
||||
|
@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if(m_pArray != nullptr)
|
||||
CUDACALL(cudaFree(m_pArray));
|
||||
|
||||
if(m_blockVal != nullptr)
|
||||
CUDACALL(cudaFree(m_blockVal));
|
||||
if(m_blockIds != nullptr)
|
||||
CUDACALL(cudaFree(m_blockIds));
|
||||
if (m_rowToId != nullptr)
|
||||
|
@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
//-------------------------------------------------------------------------
|
||||
// Start of new GPU Sparse Matrix code
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
// Returns the raw value buffer that matches the matrix's current storage format:
//   - matrixFormatSparseCSC / matrixFormatSparseCSR           -> m_pArray
//   - matrixFormatSparseBlockCol / matrixFormatSparseBlockRow -> m_blockVal
// Any other format falls through to NOT_IMPLEMENTED.
// NOTE(review): in this diff view this out-of-line definition is the version being
// replaced by an inline accessor in the header that always returns m_pArray - confirm
// against the final file before relying on the per-format dispatch below.
template<class ElemType>
|
||||
ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
|
||||
{
|
||||
if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
|
||||
{
|
||||
return m_pArray; // compressed (CSC/CSR) formats keep their values in m_pArray
|
||||
}
|
||||
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
|
||||
{
|
||||
return m_blockVal; // block formats keep their values in the separate m_blockVal buffer
|
||||
}
|
||||
else
|
||||
NOT_IMPLEMENTED; // no value buffer is defined here for any other format
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
|
||||
{
|
||||
|
@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_totalBufferSizeAllocated = bufferSizeNeeded;
|
||||
m_elemSizeAllocated = numNZElemToReserve;
|
||||
}
|
||||
else
|
||||
{
|
||||
m_elemSizeAllocated = ElemCountFromBufferSize();
|
||||
}
|
||||
}
|
||||
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
|
||||
{
|
||||
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
|
||||
{
|
||||
if (m_blockVal != nullptr)
|
||||
CUDACALL(cudaFree(m_blockVal));
|
||||
if (m_pArray != nullptr)
|
||||
CUDACALL(cudaFree(m_pArray));
|
||||
if (m_blockIds != nullptr)
|
||||
CUDACALL(cudaFree(m_blockIds));
|
||||
if (m_block2UniqId != nullptr)
|
||||
|
@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
PrepareDevice();
|
||||
size_t newCompIndexSize = max(numRows, numCols) + 1;
|
||||
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
|
||||
CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
|
||||
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
|
||||
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
|
||||
|
||||
|
@ -828,7 +811,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
rowToId[i] = indexer[row];
|
||||
}
|
||||
|
||||
m_blockSize = indexer.size();
|
||||
CUDACALL(cudaMemcpy(m_rowToId, rowToId, sizeof(size_t)*nz, cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
|
@ -928,7 +911,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||
_denseMultSparseCSCAndWeightedAddToDense<ElemType> <<< blocksPerGrid, threadsPerBlock >>> (
|
||||
m, //rowDense
|
||||
k, //colDense = rowSparse
|
||||
n, //colSparse
|
||||
alpha,
|
||||
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
|
||||
|
@ -991,25 +973,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (rhs.GetFormat() != matrixFormatSparseCSC)
|
||||
NOT_IMPLEMENTED;
|
||||
|
||||
lhs.PrepareDevice();
|
||||
|
||||
c.SetFormat(matrixFormatSparseBlockCol);
|
||||
c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
|
||||
//c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
|
||||
c.m_blockSize = rhs.m_blockSize;
|
||||
c.m_nz = m*c.m_blockSize;
|
||||
c.Resize(m, n, c.m_nz);
|
||||
CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
|
||||
CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
|
||||
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
|
||||
|
||||
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
|
||||
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||
int blocksPerGrid = rhs.GetNumNZElements();
|
||||
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
|
||||
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
|
||||
alpha,
|
||||
lhs.BufferPointer(),
|
||||
m,
|
||||
rhs.BufferPointer(),
|
||||
l,
|
||||
rhs.NzValues(),
|
||||
rhs.RowLocation(),
|
||||
rhs.ColLocation(),
|
||||
rhs.m_rowToId,
|
||||
c.m_blockVal,
|
||||
c.NzValues(),
|
||||
c.m_blockIds);
|
||||
|
||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDACALL(cudaEventDestroy(done));
|
||||
|
@ -1028,25 +1017,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template<class ElemType>
|
||||
void GPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& rhs)
|
||||
{
|
||||
if (lhs.GetComputeDeviceId()!=rhs.GetComputeDeviceId())
|
||||
if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols())
|
||||
throw std::logic_error("ScaleAndAdd: dimension mismatch");
|
||||
|
||||
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
|
||||
throw std::runtime_error("GPUSparseMatrix::ScaleAndAdd: All matrices must be on the same GPU");
|
||||
|
||||
if (lhs.m_format == matrixFormatSparseBlockCol || lhs.m_format == matrixFormatSparseBlockRow)
|
||||
{
|
||||
size_t len = (lhs.m_format == matrixFormatSparseBlockCol) ? lhs.GetNumRows(): lhs.GetNumCols();
|
||||
bool blockCol = (lhs.m_format == matrixFormatSparseBlockCol);
|
||||
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||
size_t blocksPerGrid = lhs.m_blockSize;
|
||||
_scaleSparseAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
|
||||
LONG64 N = (LONG64)lhs.GetNumNZElements();
|
||||
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
|
||||
_scaleSparseBlockAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
|
||||
alpha,
|
||||
blockCol,
|
||||
lhs.m_blockVal,
|
||||
lhs.GetNumRows(),
|
||||
lhs.GetNumCols(),
|
||||
lhs.m_blockSize,
|
||||
lhs.NzValues(),
|
||||
lhs.m_blockIds,
|
||||
len,
|
||||
rhs.BufferPointer(),
|
||||
rhs.GetNumRows());
|
||||
rhs.BufferPointer());
|
||||
|
||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDACALL(cudaEventDestroy(done));
|
||||
|
@ -1104,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
label.m_block2Id,
|
||||
cls.BufferPointer(),
|
||||
idx2cls.BufferPointer(),
|
||||
etp.m_pArray,
|
||||
etp.NzValues(),
|
||||
etp.MajorIndexLocation(),
|
||||
etp.SecondaryIndexLocation());
|
||||
|
||||
|
@ -1184,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
grd.m_blockSize = label.m_blockSize;
|
||||
grd.m_nz = nz;
|
||||
CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
|
||||
CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
|
||||
CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
|
||||
|
||||
cudaEvent_t done = nullptr;
|
||||
|
@ -1203,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
idx2cls.BufferPointer(),
|
||||
input.BufferPointer(),
|
||||
input.GetNumRows(),
|
||||
grd.m_blockVal,
|
||||
grd.BufferPointer(),
|
||||
grd.m_blockIds);
|
||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||
|
@ -1221,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||
ElemType * values = NzValues();
|
||||
if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
|
||||
values = m_blockVal;
|
||||
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
|
||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||
|
@ -1247,19 +1239,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
|
||||
{
|
||||
size_t blocksPerGrid = m_blockSize;
|
||||
bool isBlockCol = (m_format == MatrixFormat::matrixFormatSparseBlockCol);
|
||||
size_t len = isBlockCol ? GetNumRows(): GetNumCols();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||
_normalGrad<ElemType><<<blocksPerGrid,threadsPerBlock>>>(
|
||||
isBlockCol,
|
||||
len,
|
||||
LONG64 N = (LONG64)GetNumNZElements();
|
||||
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
|
||||
|
||||
_normalGradForSparseBlock<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
|
||||
momentum,
|
||||
isBlockCol,
|
||||
GetNumRows(),
|
||||
GetNumCols(),
|
||||
m_blockSize,
|
||||
NzValues(),
|
||||
m_blockIds,
|
||||
m_blockVal,
|
||||
c.BufferPointer(),
|
||||
c.GetNumRows());
|
||||
c.BufferPointer());
|
||||
|
||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDACALL(cudaEventDestroy(done));
|
||||
|
|
|
@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// in memory format is always in the following order:
|
||||
// Non-zero data elements, Full index locations, compressed index locations
|
||||
// In CSR row data is compressed, in CSC col data is compressed
|
||||
const ElemType* NzValues() const {return m_pArray;}
|
||||
ElemType* NzValues() {return m_pArray;}
|
||||
size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
|
||||
inline const ElemType* NzValues() const {return m_pArray;}
|
||||
inline ElemType* NzValues() {return m_pArray;}
|
||||
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
|
||||
|
||||
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
|
||||
size_t MajorIndexCount() const { return m_nz; }
|
||||
|
@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// Number of bytes needed to store numNZ non-zero entries:
// numNZ values of ElemType, followed by GPUSPARSE_INDEX_TYPE entries for
// numNZ index positions plus SecondaryIndexCount(numNZ) further index positions.
// NOTE(review): presumably these correspond to the "full" and "compressed" index
// arrays described in the layout comment for this class - confirm.
size_t BufferSizeNeeded(const size_t numNZ) const
|
||||
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
|
||||
|
||||
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
|
||||
ElemType* BufferPointer() const;
|
||||
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
|
||||
// Returns m_pArray, the same pointer that NzValues() returns; no per-format dispatch is done here.
inline ElemType* BufferPointer() const { return m_pArray; }
|
||||
|
||||
// the column and row locations will swap based on what format we are in. Full index always follows the data array
|
||||
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
|
||||
|
@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
|
||||
public:
|
||||
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
|
||||
size_t GetNumNZElements() const {return m_nz;}
|
||||
inline size_t GetNumNZElements() const {return m_nz;}
|
||||
|
||||
//Sets sparse matrix in CSR format. this acts as deep copy
|
||||
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
|
||||
|
@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t m_totalBufferSizeAllocated;
|
||||
|
||||
size_t m_blockSize; //block size
|
||||
ElemType *m_blockVal; //block values
|
||||
size_t *m_blockIds; //block ids
|
||||
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
|
||||
|
||||
|
|
|
@ -733,6 +733,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (m_CPUSparseMatrix == nullptr)
|
||||
{
|
||||
m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat);
|
||||
|
||||
if (GetMatrixType() == MatrixType::DENSE && m_CPUMatrix != nullptr)
|
||||
{
|
||||
m_CPUSparseMatrix->Resize(GetNumRows(), GetNumCols());
|
||||
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
|
||||
}
|
||||
else
|
||||
{
|
||||
// TODO: Assign Sparse from Sparse!
|
||||
}
|
||||
|
||||
delete m_CPUMatrix;
|
||||
m_CPUMatrix = nullptr;
|
||||
}
|
||||
|
@ -801,6 +812,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
}
|
||||
|
||||
// Copies the contents of a dense CPU matrix into a sparse CPU matrix, one element at a time.
//   from - dense source matrix; read through operator()(row, col).
//   dest - sparse destination; written through SetValue(row, col, val).
// This function does not resize dest, so the caller is expected to have sized it already.
// NOTE(review): every coordinate visited by foreach_coord is passed to SetValue with no
// zero check; whether SetValue stores explicit zeros is not visible here - confirm,
// since that would affect the sparse matrix's non-zero count.
template<class ElemType>
|
||||
void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest)
|
||||
{
|
||||
foreach_coord(row, col, from)
|
||||
{
|
||||
auto val = from(row, col);
|
||||
dest.SetValue(row, col, val);
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
ElemType Matrix<ElemType>::Get00Element() const
|
||||
|
@ -3992,7 +4012,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
nullptr,
|
||||
return CPUMatrix<ElemType>::AreEqual(*a.m_CPUMatrix, *b.m_CPUMatrix, threshold),
|
||||
return GPUMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUMatrix, threshold),
|
||||
NOT_IMPLEMENTED; return false ,
|
||||
return CPUSparseMatrix<ElemType>::AreEqual(*a.m_CPUSparseMatrix, *b.m_CPUSparseMatrix, threshold),
|
||||
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUSparseMatrix, threshold)
|
||||
);
|
||||
}
|
||||
|
|
|
@ -45,6 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
|
||||
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
|
||||
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
|
||||
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
|
||||
|
||||
public:
|
||||
//Constructors, destructors and other static matrix builders
|
||||
|
|
Загрузка…
Ссылка в новой задаче