Conflicts:
	DataReader/HTKMLFReader/HTKMLFReader.cpp
	DataReader/HTKMLFReader/HTKMLFReader.h
This commit is contained in:
Mike Seltzer 2015-02-06 16:14:47 -08:00
Родитель 26d9e66b87 f3dfe81034
Коммит b4f465b8c4
34 изменённых файлов: 21951 добавлений и 21411 удалений

6
.gitignore поставляемый
Просмотреть файл

@ -160,3 +160,9 @@ $RECYCLE.BIN/
*.lyx~
*.bak
*.lyx#
# =========================
# prebuild file
# =========================
MachineLearning/cn/buildinfo.h

Просмотреть файл

@ -5,8 +5,9 @@
//
// ConfigFile.cpp : Defines the configuration file loader.
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "File.h"
#include "commandArgUtil.h"

Просмотреть файл

@ -4,7 +4,9 @@
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "basetypes.h"
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines

Просмотреть файл

@ -0,0 +1,13 @@
#pragma once
// Milliseconds per second -- used by callers to convert MilliSecondElapsed() deltas into seconds.
#define MS_PER_SEC 1000
namespace Microsoft{namespace MSR {namespace CNTK {
// Minimal wall-clock timer utility.
// Usage: take two MilliSecondElapsed() samples and subtract to measure elapsed wall time.
class Timer
{
public:
Timer(){};
~Timer(){};
// Milliseconds elapsed since the Unix epoch (wall clock via system time, not monotonic --
// see TimerUtility.cpp: GetSystemTimeAsFileTime on Windows, CLOCK_REALTIME on Linux).
static unsigned long long MilliSecondElapsed();
};
}}}

39
Common/TimerUtility.cpp Normal file
Просмотреть файл

@ -0,0 +1,39 @@
#include "TimerUtility.h"
#ifdef WIN32
#include <Windows.h>
#else
#include <time.h>
#endif
namespace Microsoft{
namespace MSR {
namespace CNTK {
// Returns the number of milliseconds of wall-clock time since the Unix epoch (1970-01-01).
// Windows: converts FILETIME (100 ns ticks since 1601) to the Unix epoch so values agree
// with the Linux implementation. Not monotonic -- subject to system clock adjustments;
// intended for coarse elapsed-time measurement (pair with MS_PER_SEC to get seconds).
unsigned long long Timer::MilliSecondElapsed()
{
#ifdef WIN32
    FILETIME ft;
    LARGE_INTEGER li;
    GetSystemTimeAsFileTime(&ft); // ideally GetSystemTimePreciseAsFileTime, but that is only available on Win8+ and Win Server 2012+
    li.LowPart = ft.dwLowDateTime;
    li.HighPart = ft.dwHighDateTime;
    unsigned long long ret = li.QuadPart;
    ret -= 116444736000000000LL; // epoch difference 1601 -> 1970 in 100 ns units, to match Linux
    ret /= 10000;                // from 100 nanoseconds (10^-7) to 1 millisecond (10^-3)
    return ret;
#else
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
    // BUGFIX: use a portable 64-bit type -- UINT64 is a Windows typedef and is undefined in
    // this (non-WIN32) branch. Also widen tv_sec BEFORE multiplying so the arithmetic cannot
    // overflow on platforms where time_t is 32 bits.
    unsigned long long ret = static_cast<unsigned long long>(ts.tv_sec) * 1000ULL + ts.tv_nsec / 1000000;
    return ret;
#endif
}
}
}
}

Просмотреть файл

@ -4,7 +4,10 @@
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux

Просмотреть файл

@ -49,8 +49,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_truncated = readerConfig("Truncated", "false");
m_convertLabelsToTargets = false;
m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
{
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
if (m_numberOfuttsPerMinibatch < 1)
{
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
@ -60,6 +64,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
}
}
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@ -264,6 +271,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get the read method, defaults to "blockRandomize" other option is "rollingWindow"
std::string readMethod(readerConfig("readMethod","blockRandomize"));
if (readMethod == "blockRandomize" && randomize == randomizeNone)
{
fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
randomize = randomizeAuto;
}
// see if they want to use readAhead
m_readAhead = readerConfig("readAhead", "false");
@ -352,6 +365,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
m_frameSource->setverbosity(verbosity);
//m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);
}
@ -562,6 +576,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_mbSize = mbSize;
m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);
if (m_trainOrTest)
{
StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples);

Просмотреть файл

@ -1,3 +1,4 @@
<<<<<<< HEAD
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
@ -111,4 +112,117 @@ public:
void SetSentenceEnd(int /*actualMbSize*/){};
};
=======
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
#include "DataReader.h"
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// HTKMLFReader: IDataReader implementation for HTK feature files and MLF label files.
// Supports multiple parallel utterances per minibatch and multiple input/output
// streams ("MultiIO"), for both training/testing and output-writing modes.
template<class ElemType>
class HTKMLFReader : public IDataReader<ElemType>
{
private:
// Minibatch iteration state and underlying data/lattice sources.
msra::dbn::minibatchiterator* m_mbiter;
msra::dbn::minibatchsource* m_frameSource;
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
msra::dbn::FileEvalSource* m_fileEvalSource;
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
vector<size_t> m_processedFrame;
// Parallel-utterance counts; may be specified per epoch in the config
// (re-applied each StartMinibatchLoop from m_numberOfuttsPerMinibatchForAllEpochs[epoch]).
intargvector m_numberOfuttsPerMinibatchForAllEpochs;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
bool m_noData;
bool m_trainOrTest; // if false, in file writing mode
std::map<LabelIdType, LabelType> m_idToLabelMap;
bool m_partialMinibatch; // allow partial minibatches?
// Per-utterance feature/label staging buffers (one entry per parallel utterance).
std::vector<ElemType*> m_featuresBufferMultiUtt;
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
std::vector<ElemType*> m_labelsBufferMultiUtt;
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
std::vector<size_t> m_featuresStartIndexMultiUtt;
std::vector<size_t> m_labelsStartIndexMultiUtt;
// Per-stream buffers for networks with multiple inputs/outputs.
std::vector<ElemType*> m_featuresBufferMultiIO;
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
std::vector<ElemType*> m_labelsBufferMultiIO;
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
// Stream-name lookup tables (name -> id / type / dimension).
std::map<std::wstring,size_t> m_featureNameToIdMap;
std::map<std::wstring,size_t> m_labelNameToIdMap;
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
std::vector <bool> m_convertLabelsToTargetsMultiIO;
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
size_t m_inputFileIndex;
std::vector<size_t> m_featDims;
std::vector<size_t> m_labelDims;
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
// Mode-specific initialization helpers, selected by Init() from the config.
void PrepareForTrainingOrTesting(const ConfigParameters& config);
void PrepareForWriting(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool ReNewBufferForMultiIO(size_t i);
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
// Stream value type: dense real-valued features vs. categorical (one-hot) labels.
enum InputOutputTypes
{
real,
category,
};
public:
// IDataReader interface implementation.
virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
};
>>>>>>> bd4866bec82772b2e984f7e897b1e64cd0855d7d
}}}

Просмотреть файл

@ -768,6 +768,7 @@ private:
if (chunkdata.isinram())
return false;
if (verbosity)
fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
@ -858,6 +859,7 @@ public:
transcripts.clear();
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
@ -922,6 +924,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
const size_t windowend = randomizedchunks[lastchunk].windowend;
if (verbosity > 0)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside

Просмотреть файл

@ -102,7 +102,7 @@ class minibatchutterancesourcemulti : public minibatchsource
bool isinram() const { return !frames.empty(); }
// page in data for this chunk
// We pass in the feature info variables by ref which will be filled lazily upon first read
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
{
if (numutterances() == 0)
throw std::logic_error ("requiredata: cannot page in virgin block");
@ -132,6 +132,7 @@ class minibatchutterancesourcemulti : public minibatchsource
latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
}
//fprintf (stderr, "\n");
if (verbosity)
fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
}
catch (...)
@ -568,6 +569,7 @@ private:
return sweep;
currentsweep = sweep;
if (verbosity>0)
fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
@ -919,10 +921,11 @@ private:
{
auto & chunk = randomizedchunks[m][chunkindex];
auto & chunkdata = chunk.getchunkdata();
if (verbosity)
fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
});
}
chunksinram++;
@ -1029,6 +1032,7 @@ public:
}
}
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
@ -1107,6 +1111,7 @@ public:
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
const size_t windowend = randomizedchunks[0][lastchunk].windowend;
if (verbosity)
fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside
@ -1230,3 +1235,4 @@ public:
};
};};

Просмотреть файл

@ -15,7 +15,7 @@
using namespace std;
#define MAXSTRING 2048
#define MAXSTRING 500000
// UCI label location types
enum LabelMode
{

Просмотреть файл

@ -62,10 +62,7 @@ template<class ElemType>
// not yet found, add to the map
if (found == labelInfo.mapLabelToId.end())
{
labelInfo.mapLabelToId[labelValue] = labelInfo.idMax;
labelInfo.mapIdToLabel[labelInfo.idMax] = labelValue;
found = labelInfo.mapLabelToId.find(labelValue);
labelInfo.idMax++;
RuntimeError("%s not in vocabulary", labelValue.c_str());
}
return found->second;
}

Просмотреть файл

@ -60,7 +60,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>EVALDLL;WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@ -79,7 +79,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>Use</PrecompiledHeader>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
@ -107,6 +107,7 @@
<ClInclude Include="..\..\Common\Include\Eval.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
@ -127,6 +128,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="..\cn\ComputationNode.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>

Просмотреть файл

@ -19,6 +19,9 @@
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />
@ -38,6 +41,9 @@
<ClInclude Include="..\..\Common\Include\fileutil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

Просмотреть файл

@ -3111,6 +3111,9 @@ protected: \
inputGradientValues.Print("child Gradient-in/out");
inputFunctionValues.Print("child Function values");
#endif
//currently we only support one combination when the input is sparse.
if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol);
Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
#if DUMPOUTPUT

Просмотреть файл

@ -257,7 +257,12 @@ public:
bool optParam = param->GetType() == ndlTypeOptionalParameter;
if (optParam && !_stricmp(param->GetName().c_str(), name.c_str()))
{
return param->GetValue();
auto paramValue = param->GetValue();
auto resolveParamNode = m_parent->ParseVariable(paramValue, false);
if (resolveParamNode != nullptr)
return resolveParamNode->GetScalar();
else
return paramValue;
}
}
return ConfigValue(deflt);

Просмотреть файл

@ -17,6 +17,7 @@
#include "commandArgUtil.h"
#include <chrono>
#include <random>
#include "TimerUtility.h"
#ifdef MPI_SUPPORT
#include "mpi.h"
@ -121,6 +122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SGD(const ConfigParameters& configSGD)
{
ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
m_needToNormalizeLRByParallUtterance = false;
floatargvector learningRatesPerMB = learningRatesPerMBStr;
ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
@ -206,13 +208,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true");
bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize, maxEpochs, modelPath, momentumPerMB, gradientClippingWithTruncation,
clippingThresholdPerSample,autoAdjustLRType, increaseLearnRateIfImproveMoreThan, learnRateIncreaseFactor,
reduceLearnRateIfImproveLessThan, continueReduce, learnRateDecreaseFactor, dropoutRates,
loadBestModel, numMiniBatch4LRSearch, numPrevLearnRates, numBestSearchEpoch, traceLevel, numMBsToShowResult,
maxTempMemSizeInSamplesForCNN, gUpdateInfo, usePtask, keepCheckPointFiles, adaptationRegType, adaptationRegWeight,
trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading,
rpi, learnRateAdjustInterval);
rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode);
}
void setMomentum(float momentum)
@ -234,7 +238,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const GradientUpdateInfo gradUpdateType = GradientUpdateInfo(), const bool usePtask = false, const bool keepCheckPointFiles=false, const AdaptationRegType adaptationRegType = AdaptationRegType::None,
const ElemType adaptationRegWeight = 0.0f, const wstring trainCriterionNodeName= L"", const wstring evalCriterionNodeName=L"",
const bool doGradientCheck = false, const ElemType gradientCheckSigDigit = 6, const bool validateAfterModelReloading = true,
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1)
RMSPropInfo rpi = RMSPropInfo(), size_t learnRateAdjustInterval = 1, const bool UsingAllDataForPreComputed=true)
{
numPrevLearnRates;
m_mbSize=mbSize;
@ -272,6 +276,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_trainCriterionNodeName = trainCriterionNodeName;
m_evalCriterionNodeName = evalCriterionNodeName;
m_useAllDataForPreComputedNode = UsingAllDataForPreComputed;
for (size_t i=0; i<m_mbSize.size(); i++)
if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
@ -298,6 +303,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_learningRatesPerSample[i] = learningRatesPerMB[i]/m_mbSize[i];
}
m_needToNormalizeLRByParallUtterance = true;
}
m_momentumPerMB = 0.9f;
if (momentumPerMB.size() >0)
@ -521,6 +527,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (0 == myRank) // only needs to be done by one process
net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
// first, we need to normalize the effect of nbruttsineachrecurrentiter
if (trainSetDataReader->NumberSlicesInEachRecurrentIter()>1 && m_needToNormalizeLRByParallUtterance)
{
for (auto & x : m_learningRatesPerSample)
{
x /= trainSetDataReader->NumberSlicesInEachRecurrentIter();
}
}
bool learnRateInitialized = false;
if (startEpoch > 0)
{
@ -557,7 +571,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
{
auto t_start_epoch = clock();
auto t_start_epoch = Timer::MilliSecondElapsed();
// set other information to inputMatrices that can contrain information
// used for class-based LM for clustring information
@ -600,12 +614,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef MPI_SUPPORT
INT32 mySamples = (INT32)
#endif
fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", (int)startEpoch, learnRatePerSample, m_momentumPerMB);
TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes,
criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients,
epochCriterion, epochEvalErrors, totalSamplesSeen);
auto t_end_epoch = clock();
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
auto t_end_epoch = Timer::MilliSecondElapsed();
ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC);
fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
if (epochEvalErrors.size() == 1)
@ -800,7 +815,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//compute
//trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize);
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
// trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch
// [1/12/2015 erw] to support large datasets, we usually partition the whole dataset into several epochs, so we need to use all the data to do precomputing
if (m_useAllDataForPreComputedNode)
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); // using all the data
else
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // using only one epoch of data
while (trainSetDataReader->GetMinibatch(inputMatrices))
{
@ -983,8 +1003,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<ElemType> epochEvalErrorsLastMBs(epochEvalErrors.size(),0);
PTaskGraphBuilder<ElemType>* ptaskGraphBuilder = NULL;
clock_t startReadMBTime = 0, startComputeMBTime=0;
clock_t endReadMBTime=0, endComputeMBTime=0;
unsigned long long startReadMBTime = 0, startComputeMBTime=0;
unsigned long long endReadMBTime = 0, endComputeMBTime = 0;
//initialize statistics
size_t totalEpochSamples = 0;
@ -1027,14 +1047,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
startReadMBTime=clock();
startReadMBTime=Timer::MilliSecondElapsed();
while (trainSetDataReader->GetMinibatch(inputMatrices))
{
#ifdef MPI_SUPPORT
DecimateMinibatch(inputMatrices);
#endif
endReadMBTime=clock();
startComputeMBTime=clock();
endReadMBTime=Timer::MilliSecondElapsed();
startComputeMBTime=Timer::MilliSecondElapsed();
UpdateEvalTimeStamps(FeatureNodes);
UpdateEvalTimeStamps(labelNodes);
@ -1113,12 +1133,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
endComputeMBTime=clock();
endComputeMBTime=Timer::MilliSecondElapsed();
numMBsRun ++;
if (m_traceLevel > 0)
{
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(CLOCKS_PER_SEC);
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/CLOCKS_PER_SEC;
ElemType MBReadTime = (ElemType)(endReadMBTime-startReadMBTime)/(MS_PER_SEC);
ElemType MBComputeTime = (ElemType)(endComputeMBTime-startComputeMBTime)/MS_PER_SEC;
readTimeInMBs += MBReadTime;
ComputeTimeInMBs += MBComputeTime;
@ -1149,7 +1169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
}
}
startReadMBTime=clock();
startReadMBTime=Timer::MilliSecondElapsed();
totalEpochSamples += actualMBSize;
totalSamplesSeen += actualMBSize;
@ -1506,6 +1526,7 @@ protected:
protected:
floatargvector m_learningRatesPerSample; /// learning rate per sample provided outside
bool m_needToNormalizeLRByParallUtterance; // only true when the user specify LearningRatePerMB and the number of parallel utterances in Reader > 1
intargvector m_mbSize;
size_t m_epochSize;
size_t m_maxEpochs;
@ -1557,6 +1578,8 @@ protected:
ElemType m_gradientCheckSigDigit;
bool m_validateAfterModelReloading;
bool m_useAllDataForPreComputedNode;
};
template class SGD<float>;
template class SGD<double>;

Просмотреть файл

@ -102,7 +102,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
evalResultsLastMBs.push_back((ElemType)0);
dataReader.StartMinibatchLoop(mbSize, 0, testSize);
dataReader.SetNbrSlicesEachRecurrentIter(1);
for (int i=0; i<evalNodes.size(); i++)
{

Просмотреть файл

@ -228,7 +228,7 @@ public:
nodePtr = m_net.CreateLearnableParameter(name, rows, cols);
nodePtr->NeedGradient() = false;
}
else if (pass == ndlPassFinal)
else if (pass == ndlPassFinal || nodePtr->FunctionValues().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->FunctionValues().SetValue(val);

Просмотреть файл

@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_children.size() != 3)
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
if (Inputs(0)->OperationName() != L"SparseInputValue"
&& Inputs(0)->OperationName() != L"InputValue")
if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
&& Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed

Просмотреть файл

@ -28,6 +28,7 @@
#include "io.h"
#endif
#include "hostname.h"
#include "buildinfo.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
@ -619,8 +620,26 @@ int MPIAPI MPI_Init(_In_opt_ int *argc, _Inout_count_(*argc) wchar_t*** argv)
}
#endif
// Emit a banner to stderr describing how this binary was built.
// The _BUILD* macros come from buildinfo.h, generated by prebuild.bat.
void PrintBuiltInfo()
{
    static const char separator[] = "-------------------------------------------------------------------\n";
    fputs(separator, stderr);
    fputs("Build info: \n\n", stderr);
    fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
    fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
    fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
    fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#ifdef _GIT_EXIST
    // Only emitted when prebuild.bat found a git checkout.
    fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
    fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
    fputs(separator, stderr);
}
int wmain(int argc, wchar_t* argv[])
{
try
{
#ifdef MPI_SUPPORT
@ -647,6 +666,8 @@ int wmain(int argc, wchar_t* argv[])
// get the command param set they want
wstring logpath = config("stderr", L"");
// [1/26/2015 erw, add done file so that it can be used on HPC]
wstring DoneFile = config("DoneFile", L"");
ConfigArray command = config("command", "train");
if (logpath != L"")
@ -663,9 +684,14 @@ int wmain(int argc, wchar_t* argv[])
oss << myRank;
logpath += L"rank" + oss.str();
}
RedirectStdErr(logpath);
}
PrintBuiltInfo();
std::string timestamp = TimeDateStamp();
if (myRank == 0) // main process
@ -714,8 +740,15 @@ int wmain(int argc, wchar_t* argv[])
DoCommand<double>(config);
else
RuntimeError("invalid precision specified: %s", type.c_str());
// still here , write a DoneFile if necessary
if (!DoneFile.empty()){
FILE* fp = fopenOrDie(DoneFile.c_str(), L"w");
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(),GetHostName().c_str());
fcloseOrDie(fp);
}
catch(const std::exception &err)
}
catch (const std::exception &err)
{
fprintf(stderr, "EXCEPTION occurred: %s", err.what());
#ifdef _DEBUG

Просмотреть файл

@ -139,6 +139,9 @@
<TreatOutputAsContent>true</TreatOutputAsContent>
<Message>Copy content files to target directory</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
@ -199,6 +202,9 @@
<Message>
</Message>
</CustomBuildStep>
<PreBuildEvent>
<Command>prebuild.bat</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<Text Include="DefaultMacros.txt" />
@ -216,6 +222,7 @@
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\hostname.h" />
<ClInclude Include="..\..\Common\Include\nvml.h" />
<ClInclude Include="..\..\Common\Include\TimerUtility.h" />
<ClInclude Include="CompositeComputationNode.h" />
<ClInclude Include="ComputationNetwork.h" />
<ClInclude Include="ComputationNetworkHelper.h" />
@ -249,6 +256,7 @@
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="cn.cpp" />
<ClCompile Include="ComputationNode.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />

Просмотреть файл

@ -43,6 +43,9 @@
<ClCompile Include="NetworkDescriptionLanguage.cpp">
<Filter>Network</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h">
@ -138,6 +141,9 @@
<ClInclude Include="..\..\Common\Include\hostname.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\TimerUtility.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">

Просмотреть файл

@ -0,0 +1,30 @@
@echo off
:: Generates buildinfo.h (build machine, builder, path, and -- if git is available --
:: branch and commit SHA1) for inclusion into the binary's PrintBuiltInfo banner.
setlocal enabledelayedexpansion
echo #ifndef _BUILDINFO_H > buildinfo.h
echo #define _BUILDINFO_H >> buildinfo.h
FOR /F "usebackq" %%i IN (`hostname`) DO SET HOST=%%i
:: assuming hostname always exists
:: Probe for git, suppressing all output.
:: BUGFIX: was "git --version 2 > nul", which passes a literal "2" to git and
:: redirects stdout only; ">nul 2>&1" silences both streams.
git --version >nul 2>&1
:: BUGFIX: cmd reports "command not found" with errorlevel 9009 (was mistyped as
:: 9909, so _GIT_EXIST was emitted even when git was missing).
if not %ERRORLEVEL% == 9009 (
echo #define _GIT_EXIST >> buildinfo.h
FOR /F "usebackq" %%i IN (`git rev-parse --abbrev-ref HEAD`) DO SET BRANCH=%%i
FOR /F "usebackq" %%i IN (`git rev-parse HEAD`) DO SET COMMIT=%%i
:: BUGFIX: use delayed expansion -- %BRANCH%/%COMMIT% are expanded when this
:: parenthesized block is parsed, i.e. BEFORE the SETs above have run.
echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h
echo #define _BUILDSHA1_ "!COMMIT!" >> buildinfo.h
)
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h
set a=%~dp0
set buildpath="%a:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h
echo #endif >> buildinfo.h
endlocal

Просмотреть файл

@ -78,6 +78,53 @@ namespace CNTKMathTest
Assert::IsTrue(C1.IsEqualTo(C,0.00005)); //Seems like bad precision
}
// Verifies that dense * sparse(CSC) multiplication matches the dense * dense reference result.
TEST_METHOD(CPUMatrixDenseTimesSparse)
{
Matrix<float> Ad(CPUDEVICE);
// Truncating uniform noise at 0 zeroes out most entries, making Ad effectively sparse.
// (RandomUniform args presumably (rows, cols, low, high, seed) -- TODO confirm.)
Ad.AssignTruncateBottomOf(Matrix<float>::RandomUniform(1024, 2048, -3, 0.1, 0), 0);
Matrix<float> As(Ad); // same values, converted to sparse CSC storage
As.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);
Matrix<float> B = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
Matrix<float> C1(C); // identical starting accumulator for the sparse path
float alpha = 0.3, beta = 0;
bool transposeA = false, transposeB = false;
// C  = alpha*B*Ad + beta*C  (dense reference); C1 = alpha*B*As + beta*C1 (path under test)
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
Assert::IsTrue(C1.IsEqualTo(C, 0.0001));
// Repeat with nonzero beta to exercise accumulation into existing values.
alpha = 3.3, beta = 1.3;
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, Ad, transposeB, beta, C);
Matrix<float>::MultiplyAndWeightedAdd(alpha, B, transposeA, As, transposeB, beta, C1);
// TODO IsEqualTo NYI
// Assert::IsTrue(C1.IsEqualTo(C, 0.00005));
}
// Verifies dense * sparse(CSC)^T producing a sparse (block-column) result
// against the same product accumulated into a dense matrix.
TEST_METHOD(CPUMatrixDenseTimesSparseAsSparse)
{
    // Dense operand with many zeros, plus a CSC sparse copy of it.
    Matrix<float> denseOp(CPUDEVICE);
    denseOp.AssignTruncateBottomOf(Matrix<float>::RandomUniform(2048, 1024, -3, 0.1, 0), 0);
    Matrix<float> sparseOp(denseOp);
    sparseOp.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC);

    Matrix<float> lhs = Matrix<float>::RandomGaussian(2048, 1024, 1, 4, USE_TIME_BASED_SEED, CPUDEVICE);
    Matrix<float> denseResult = Matrix<float>::RandomGaussian(2048, 2048, 1, 2, USE_TIME_BASED_SEED, CPUDEVICE);
    Matrix<float> blockColResult(CPUDEVICE);
    blockColResult.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol);

    float alpha = 0.3f;
    float beta = 0.0f;
    bool noTransA = false;
    bool transB = true;

    // Same product, once into a sparse block-column target, once into dense.
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, noTransA, sparseOp, transB, beta, blockColResult);
    Matrix<float>::MultiplyAndWeightedAdd(alpha, lhs, noTransA, sparseOp, transB, beta, denseResult);

    // TODO IsEqualTo NYI
    //  Assert::IsTrue(blockColResult.IsEqualTo(denseResult, 0.0001));
}
TEST_METHOD(MatrixSparseTimesSparse)
{
Matrix<float> Ad;

Просмотреть файл

@ -111,7 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
m_blockSize = 0;
m_blockVal = NULL;
m_pArray = NULL;
m_blockIds = NULL;
}
}
@ -125,6 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("CPUSparseMatrix: unsupported sparse matrix format");
}
m_format = format;
m_default = defaultElem();
ZeroInit();
}
@ -160,8 +161,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
if(m_blockVal != NULL)
delete[] m_blockVal;
if (m_pArray != NULL)
delete[] m_pArray;
if(m_blockIds != NULL)
delete[] m_blockIds;
}
@ -219,16 +220,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
ElemType* CPUSparseMatrix<ElemType>::BufferPointer() const
{
if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
{
return m_pArray;
}
else
{
return m_blockVal;
}
}
template<class ElemType>
void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
@ -280,16 +274,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (keepExistingValues && m_elemSizeAllocated > 0)
{
assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve);
memcpy(blockVal, m_blockVal, NzSize());
memcpy(blockVal, m_pArray, NzSize());
memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
}
if (m_blockVal != NULL)
delete[] m_blockVal;
if (m_pArray != NULL)
delete[] m_pArray;
if(m_blockIds != NULL)
delete[] m_blockIds;
m_blockVal = blockVal;
m_pArray = blockVal;
m_blockIds = blockIds;
}
@ -457,10 +451,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ // h range over hidden layer
if(first == true)
{
c.m_blockVal[pos] = alpha*lhs(h, j)*val;
c.m_pArray[pos] = alpha*lhs(h, j)*val;
} else
{
c.m_blockVal[pos] += alpha*lhs(h, j)*val;
c.m_pArray[pos] += alpha*lhs(h, j)*val;
}
pos++;
}
@ -522,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j * len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = lhs.m_blockVal[p];
ElemType val = lhs.m_pArray[p];
size_t r = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t c = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
@ -537,6 +531,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// Returns true iff a and b have identical dimensions and every pair of
// corresponding elements differs by no more than threshold.
// Throws std::logic_error if either matrix is empty.
template<class ElemType>
bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
{
    if (a.IsEmpty() || b.IsEmpty())
        throw std::logic_error("AreEqual: one of the input matrices is empty.");

    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;

    bool result = true;

#pragma omp parallel for
    foreach_coord(i, j, a)
    {
        // A 'break' here would only leave the inner (row) loop of the
        // foreach_coord nest and cannot terminate the OpenMP-parallelized
        // outer loop, so instead we skip further comparisons once a mismatch
        // is seen. The shared flag only ever transitions true -> false.
        if (result && abs(a(i, j) - b(i, j)) > threshold)
        {
            result = false;
        }
    }

    return result;
}
// a: H x No: H is hidden layer size and No is mini-batch size
// weight: V x H, V is vocab size
// label: V x No
@ -698,11 +716,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ // h range over hidden layer
if(first == true)
{
grd.m_blockVal[pos] = input(h, j)*error.m_pArray[p];
grd.m_pArray[pos] = input(h, j)*error.m_pArray[p];
}
else
{
grd.m_blockVal[pos] += input(h, j)*error.m_pArray[p];
grd.m_pArray[pos] += input(h, j)*error.m_pArray[p];
}
pos++;
}
@ -735,11 +753,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = m_blockVal[p];
ElemType val = m_pArray[p];
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
c(row, col) = (1-momentum)*val + momentum*c(row, col);
m_blockVal[p] = c(row, col);
m_pArray[p] = c(row, col);
}
}
}
@ -790,14 +808,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for(size_t p = start; p < start+len; p++)
{
ElemType val = m_blockVal[p];
ElemType val = m_pArray[p];
size_t row = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? (p - start) : i;
size_t col = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? i : (p - start);
ElemType adenorm = c(row, col);
adenorm += val * val;
val = val / (floor + sqrt(adenorm));
m_blockVal[p] = val;
m_pArray[p] = val;
c(row, col) = adenorm;
}
}
@ -818,13 +836,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t start = j* len;
for (size_t p = start; p < start+len; p++)
{
if (m_blockVal[p] > locThresholdPos)
if (m_pArray[p] > locThresholdPos)
{
m_blockVal[p] = locThresholdPos;
m_pArray[p] = locThresholdPos;
}
else if (m_blockVal[p] < locTHresholdNeg)
else if (m_pArray[p] < locTHresholdNeg)
{
m_blockVal[p] = locTHresholdNeg;
m_pArray[p] = locTHresholdNeg;
}
}
}

Просмотреть файл

@ -34,6 +34,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSparseMatrix(const MatrixFormat format);
CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
~CPUSparseMatrix();
public:
@ -76,6 +77,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
/// sum(vec(a).*vec(b))
static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/) { NOT_IMPLEMENTED; }
@ -89,6 +92,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
void Reset();
// Returns a zero-initialized ElemType; cached in m_default and handed out
// (by reference from operator()) for positions with no stored entry.
inline ElemType defaultElem()
{
    // 'default' is a C++ keyword and cannot be used as an identifier
    // (the previous local was named 'default', which does not compile).
    ElemType defaultValue;
    memset(&defaultValue, 0, sizeof(ElemType));
    return defaultValue;
}
// Read-only element access. For CSC format, scans the stored row indices of
// column 'col' and returns a reference to the stored value when present;
// otherwise returns a reference to m_default (a zero-initialized element, see
// defaultElem), so the returned reference must never be written through.
// Only CSC is implemented; other formats hit NOT_IMPLEMENTED.
const ElemType& operator() (const size_t row, const size_t col) const
{
    if (col >= m_numCols || row >= m_numRows)
    {
        throw std::runtime_error("Position outside matrix dimensions");
    }

    if (m_format == MatrixFormat::matrixFormatSparseCSC)
    {
        // m_compIndex[col] .. m_compIndex[col+1] delimit this column's entries
        size_t start = m_compIndex[col];
        size_t end = m_compIndex[col + 1];
        for (size_t p = start; p < end; p++)
        {
            size_t i = m_unCompIndex[p]; // row id of the p-th stored value
            if (i == row)
            {
                return m_pArray[p];
            }
        }

        return m_default; // element not stored -> default (zero) element
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
void Adagrad(CPUMatrix<ElemType>& c);
@ -103,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
const ElemType* NzValues() const { return m_pArray; }
ElemType* NzValues() { return m_pArray; }
inline ElemType* NzValues() { return m_pArray; }
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
@ -140,8 +178,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSPARSE_INDEX_TYPE *m_compIndex; //begin ids of col/row in CSC/CSR format
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t *m_blockIds; //block ids
ElemType m_default;
};
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;

Просмотреть файл

@ -3356,45 +3356,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Static BLAS Functions
//#pragma region File << and >> operators
// template<class ElemType>
// File& operator>>(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.GetMarker(fileMarkerBeginSection, std::string("BMAT"));
// size_t elsize;
// stream>>elsize;
// if (sizeof(ElemType)!=elsize)
// throw std::runtime_error("Template argument size doesn't match those in file");
// std::wstring matrixName;
// size_t numRows, numCols;
// stream>>matrixName>>numRows>>numCols;
// ElemType* d_array = new ElemType[numRows*numCols];
// for (long i=0;i<numRows*numCols;++i)
// stream>>d_array[i];
// stream.GetMarker(fileMarkerEndSection, std::string("EMAT"));
// us.SetValue(numRows,numCols,d_array, matrixFlagNormal);
// us.m_matrixName = matrixName;
// return stream;
// }
//
// template<class ElemType>
// File& operator<<(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.PutMarker(fileMarkerBeginSection, std::string("BMAT"));
// stream<<sizeof(ElemType)<<us.m_matrixName<<us.m_numRows<<us.m_numCols;
// ElemType *d_array = us.CopyToArray();
// for (long i=0;i<us.GetNumElements();++i)
// stream<<d_array[i];
// stream.PutMarker(fileMarkerEndSection, std::string("EMAT"));
// return stream;
// }
//
//#pragma endregion File << and >> operators
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class DeviceBoundNumber<float>;

Просмотреть файл

@ -2250,37 +2250,12 @@ __global__ void _sparseCSRElemMulDense(
}
}
// forward pass from feature to hidden layer
/*template<class ElemType>
__global__ void _denseMulSparseCSCToDense(
ElemType alpha,
const ElemType* lhs,
int numrows,
int numcols,
const GPUSPARSE_INDEX_TYPE* row,
ElemType* c)
{
int loadPerThread = (numrows+blockDim.x-1)/blockDim.x;
int tStart = loadPerThread * threadIdx.x;
int tEnd = min(numrows, loadPerThread + tStart);
int p = blockIdx.x;
int i = row[p];
int j = blockIdx.x;
for (int h = tStart; h < tEnd; h++)
{
ElemType res = alpha * lhs[IDX2C(h, i, numrows)];
atomicAdd(&c[IDX2C(h,j,numrows)], res);
}
}*/
//c = alpha * op(a) * op(b) + beta*c
//this function can be further improved by using shared memory
template<class ElemType>
__global__ void _denseMultSparseCSCAndWeightedAddToDense(
int m, //rowDense
int k, //colDense = rowSparse
int n, //colSparse
ElemType alpha,
const ElemType* a, //dense
@ -2311,62 +2286,77 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
}
// backward pass from hidden layer to feature weight
//result (sparse BlockCol)= alpha * (lhs (dense) X rhs^T (sparse CSC)
//assume resultValues are 0-initialized
template<class ElemType>
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
ElemType alpha,
ElemType* lhs,
size_t nrs,
ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* row,
const size_t* rowIdx,
ElemType* blockVal,
size_t* blockIds)
const ElemType alpha,
const ElemType* lhsValues,
const size_t numRowsLhs,
const size_t numColsRhs,
const ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* rhsRows,
const GPUSPARSE_INDEX_TYPE* rhsCols,
const size_t* rhsRowIdx,
ElemType* resultValues,
size_t* resultBlockIds)
{
int p = blockIdx.x;
int i = row[p];
int ii = rowIdx[p];
int j = blockIdx.x;
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
const LONG64 lhsCol = index / numRowsLhs; //rhsCol == lhsCol
if (lhsCol >= numColsRhs)
return;
const LONG64 lhsRow = index - numRowsLhs*lhsCol; //resultRow == lhsRow
int load = (nrs+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)nrs, load + pStart);
//each thread handles one [row, col] combination
ElemType lhsValue = alpha*lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];
for(int h = pStart; h < pEnd; h++)
LONG64 start = rhsCols[lhsCol]; //rhsCol == lhsCol
LONG64 end = rhsCols[lhsCol + 1];
for (LONG64 p = start; p < end; p++)
{
ElemType temp = alpha*lhs[IDX2C(h, j, nrs)]*rhsNZValues[p];
atomicAdd(&blockVal[ii*nrs+h], temp);
blockIds[ii] = i;
LONG64 rhsRow = rhsRows[p];
ElemType rhsVal = rhsNZValues[p];
LONG64 resultCol = rhsRowIdx[p]; //resultCol == rhsRow maps to columnid
resultBlockIds[resultCol] = rhsRow; //indicate which colmn it actually points to
//assume resultValues are 0-initialized
atomicAdd(&resultValues[IDX2C(lhsRow, resultCol, numRowsLhs)], lhsValue * rhsVal);
}
}
// gradients update
template<class ElemType>
__global__ void _scaleSparseAndAddToDense(
ElemType alpha,
bool blockCol,
ElemType* blockVal,
size_t* blockIds,
size_t len,
ElemType* rhs,
size_t numrows)
__global__ void _scaleSparseBlockAndAddToDense(
const ElemType alpha,
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
const ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int ii = blockIdx.x;
int i = blockIds[ii];
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)len, load + pStart);
for(int h = pStart; h < pEnd; h++)
{ ElemType temp = alpha*blockVal[ii*len + h];
if(blockCol)
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
atomicAdd(&rhs[IDX2C(h, i, numrows)], temp);
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
atomicAdd(&rhs[IDX2C(i, h, numrows)], temp);
}
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
}
// compute predictions in cross entory node
@ -2638,30 +2628,36 @@ __global__ void _inplaceTruncate(
}
template<class ElemType>
__global__ void _normalGrad(
bool isBlockCol,
size_t len,
__global__ void _normalGradForSparseBlock(
const ElemType momentum,
size_t* blockIds,
ElemType* blockVal,
ElemType* c,
size_t numrows)
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int j = blockIdx.x;
int i = blockIds[j];
int start = j * len;
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pLen = min((int)len, load + pStart);
for(int p = start+pStart; p < start+pLen; p++)
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
int row = isBlockCol ? (p - start) : i;
int col = isBlockCol ? i: (p - start);
c[IDX2C(row, col, numrows)] = (1-momentum)*blockVal[p] + momentum*c[IDX2C(row, col, numrows)];
blockVal[p] = c[IDX2C(row, col, numrows)];
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] = (1 - momentum)*lhsValues[index] + momentum*rhs[IDX2C(row, col, numRows)];
lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}
static __inline__ __device__ double atomicAdd(double* address, double val)
@ -3263,4 +3259,147 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
}
*/
// Fills us (NTPlusOne x BS, column-major via IDX2C) with elementwise products
// of a and b: row 0 holds a[col]*b[col] (no shift); row r >= 1 holds
// a[col] * b[(col + shift + r - 1) % BS], i.e. b circularly shifted by
// (shift + r - 1) columns. One thread per output element (2-D launch).
template<class ElemType>
__global__ void _assignElementProductOfWithShiftNeg(
    ElemType* us,        // output, NTPlusOne rows x BS columns
    const ElemType* a,   // input vector, indexed by column
    const ElemType* b,   // input vector, indexed by (shifted) column
    const int shift,     // base shift applied from row 1 onward
    const int NTPlusOne, // number of rows in us
    const int BS)        // number of columns in us
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row in us
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column in us
    if (idx >= NTPlusOne || idy >= BS)
        return;

    if (idx == 0)
    {
        // this is row-0. No need to shift
        us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[idy];
    }
    else
    {
        int cs = shift + idx - 1;     // effective circular shift for this row
        int tmpidy = (idy + cs) % BS; // wrapped column index into b
        us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[tmpidy];
    }
}
// Shifted column inner products: c(idx, idy) (c is NTPlusOne x M, column-major)
// is the dot product of column idy of a with column idy of b when idx == 0,
// and with column (idy + shift + idx - 1) % M of b when idx >= 1.
// One thread per output element; each thread sums over the N rows serially.
template<class ElemType>
__global__ void _innerProductWithShiftNeg(
    ElemType* c,
    const ElemType* a,
    const ElemType* b,
    const long N, //a.GetNumRows();
    const long M, //a.GetNumCols();
    const long shift,
    const long NTPlusOne
    )
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row in c (shift index)
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column in c
    if (idx >= NTPlusOne || idy >= M)
        return;

    ElemType sum = 0;
    long index_a = 0;
    long index_b = 0;
    long col_a = 0;
    long col_b = 0;

    if (idx == 0)
    {
        // this is row 0. No need to shift
        // the product of a(:,idy) dot b(:,idy)
        col_a = idy;
        for (long i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            sum += a[index_a] * b[index_a];
        }
    }
    else
    {
        int cs = shift + idx - 1; // effective circular shift for this row
        col_a = idy;
        col_b = (idy + cs) % M;   // wrapped column index into b
        for (int i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            index_b = IDX2C(i, col_b, N);
            sum += a[index_a] * b[index_b];
        }
    }

    c[IDX2C(idx, idy, NTPlusOne)] = sum;
}
// Copies row m of the O x P column-major matrix a into the length-P vector us.
// One thread per output element.
template<class ElemType>
__global__ void _getARowByIndex(
    ElemType* us,
    const ElemType* a,
    const int O, // a's rows
    const int P, // a's cols
    const int m  // the m-th row of a
    )
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= P)
        return;
    //  us[id] = a[id] * b[id];
    us[id] = a[IDX2C(m, id, O)];
}
// Row-wise elementwise multiply with a circular column shift. For each output
// element of the O x P column-major matrix us:
//   isafixed == true:  us(idx, idy) = a[idy]             * b(idx, (idy+shift)%P)
//   isafixed == false: us(idx, idy) = a[(idy+shift)%P]   * b(idx, idy)
// i.e. either a or b is held fixed while the other operand's column index is
// rotated by 'shift'. One thread per output element (2-D launch).
template<class ElemType>
__global__ void _conductRowElementMultiplyWithShift(
    ElemType* us,
    const ElemType* a, // row vector of length P
    const ElemType* b,
    const int O, // b's rows
    const int P, // b's cols
    const int shift,
    const bool isafixed)
{
    LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x; // row index
    LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y; // column index
    if (idx >= O || idy >= P)
        return;

    int tmpidy = (idy + shift) % P; // wrapped (shifted) column
    if (isafixed)
    {
        // we fix a, and shift b
        us[IDX2C(idx, idy, O)] = a[idy] * b[IDX2C(idx, tmpidy, O)];
    }
    else
    {
        // we fix b, but shift a
        us[IDX2C(idx, idy, O)] = a[tmpidy] * b[IDX2C(idx, idy, O)];
    }
}
// Elementwise product with a circular shift on the second operand:
// us[i] = a[i] * b[(i + shift) % N] for i in [0, N). One thread per element.
template<class ElemType>
__global__ void _assignElementProductOfWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const LONG64 N)
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    int tmpidb = (id + shift) % N; // wrapped index into b
    us[id] = a[id] * b[tmpidb];
}
#endif // !CPUONLY

Просмотреть файл

@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=nullptr;
m_blockSize = 0;
m_blockVal = nullptr;
m_blockIds = nullptr;
m_expandedSize = 0;
@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
}
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
}
else if (this->GetFormat() == matrixFormatSparseCSC)
@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
}
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
}
else
NOT_IMPLEMENTED;
@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_blockVal = moveFrom.m_blockVal;
m_blockIds = moveFrom.m_blockIds;
m_expandedSize = moveFrom.m_expandedSize;
@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_blockVal = moveFrom.m_blockVal;
m_blockIds = moveFrom.m_blockIds;
m_expandedSize = moveFrom.m_expandedSize;
@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if(m_pArray != nullptr)
CUDACALL(cudaFree(m_pArray));
if(m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if(m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
if (m_rowToId != nullptr)
@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//-------------------------------------------------------------------------
// Start of new GPU Sparse Matrix code
//-------------------------------------------------------------------------
template<class ElemType>
ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
{
if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
{
return m_pArray;
}
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
return m_blockVal;
}
else
NOT_IMPLEMENTED;
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
{
@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_totalBufferSizeAllocated = bufferSizeNeeded;
m_elemSizeAllocated = numNZElemToReserve;
}
else
{
m_elemSizeAllocated = ElemCountFromBufferSize();
}
}
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
{
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
{
if (m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if (m_pArray != nullptr)
CUDACALL(cudaFree(m_pArray));
if (m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
if (m_block2UniqId != nullptr)
@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
size_t newCompIndexSize = max(numRows, numCols) + 1;
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
@ -828,7 +811,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
rowToId[i] = indexer[row];
}
m_blockSize = indexer.size();
CUDACALL(cudaMemcpy(m_rowToId, rowToId, sizeof(size_t)*nz, cudaMemcpyHostToDevice));
}
@ -928,7 +911,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDACALL(cudaEventCreate(&done));
_denseMultSparseCSCAndWeightedAddToDense<ElemType> <<< blocksPerGrid, threadsPerBlock >>> (
m, //rowDense
k, //colDense = rowSparse
n, //colSparse
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
@ -991,25 +973,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (rhs.GetFormat() != matrixFormatSparseCSC)
NOT_IMPLEMENTED;
lhs.PrepareDevice();
c.SetFormat(matrixFormatSparseBlockCol);
c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
//c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
c.m_blockSize = rhs.m_blockSize;
c.m_nz = m*c.m_blockSize;
c.Resize(m, n, c.m_nz);
CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
int blocksPerGrid = rhs.GetNumNZElements();
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
alpha,
lhs.BufferPointer(),
m,
rhs.BufferPointer(),
l,
rhs.NzValues(),
rhs.RowLocation(),
rhs.ColLocation(),
rhs.m_rowToId,
c.m_blockVal,
c.NzValues(),
c.m_blockIds);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
@ -1028,25 +1017,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& rhs)
{
if (lhs.GetComputeDeviceId()!=rhs.GetComputeDeviceId())
if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols())
throw std::logic_error("ScaleAndAdd: dimension mismatch");
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
throw std::runtime_error("GPUSparseMatrix::ScaleAndAdd: All matrices must be on the same GPU");
if (lhs.m_format == matrixFormatSparseBlockCol || lhs.m_format == matrixFormatSparseBlockRow)
{
size_t len = (lhs.m_format == matrixFormatSparseBlockCol) ? lhs.GetNumRows(): lhs.GetNumCols();
bool blockCol = (lhs.m_format == matrixFormatSparseBlockCol);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
size_t blocksPerGrid = lhs.m_blockSize;
_scaleSparseAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
LONG64 N = (LONG64)lhs.GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_scaleSparseBlockAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
alpha,
blockCol,
lhs.m_blockVal,
lhs.GetNumRows(),
lhs.GetNumCols(),
lhs.m_blockSize,
lhs.NzValues(),
lhs.m_blockIds,
len,
rhs.BufferPointer(),
rhs.GetNumRows());
rhs.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
@ -1104,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
label.m_block2Id,
cls.BufferPointer(),
idx2cls.BufferPointer(),
etp.m_pArray,
etp.NzValues(),
etp.MajorIndexLocation(),
etp.SecondaryIndexLocation());
@ -1184,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
grd.m_blockSize = label.m_blockSize;
grd.m_nz = nz;
CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
cudaEvent_t done = nullptr;
@ -1203,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
idx2cls.BufferPointer(),
input.BufferPointer(),
input.GetNumRows(),
grd.m_blockVal,
grd.BufferPointer(),
grd.m_blockIds);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
@ -1221,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
ElemType * values = NzValues();
if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
values = m_blockVal;
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
@ -1247,19 +1239,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
{
size_t blocksPerGrid = m_blockSize;
bool isBlockCol = (m_format == MatrixFormat::matrixFormatSparseBlockCol);
size_t len = isBlockCol ? GetNumRows(): GetNumCols();
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
_normalGrad<ElemType><<<blocksPerGrid,threadsPerBlock>>>(
isBlockCol,
len,
LONG64 N = (LONG64)GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_normalGradForSparseBlock<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
momentum,
isBlockCol,
GetNumRows(),
GetNumCols(),
m_blockSize,
NzValues(),
m_blockIds,
m_blockVal,
c.BufferPointer(),
c.GetNumRows());
c.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));

Просмотреть файл

@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
const ElemType* NzValues() const {return m_pArray;}
ElemType* NzValues() {return m_pArray;}
size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
inline const ElemType* NzValues() const {return m_pArray;}
inline ElemType* NzValues() {return m_pArray;}
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
size_t MajorIndexCount() const { return m_nz; }
@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t BufferSizeNeeded(const size_t numNZ) const
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
ElemType* BufferPointer() const;
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
inline ElemType* BufferPointer() const { return m_pArray; }
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
public:
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
size_t GetNumNZElements() const {return m_nz;}
inline size_t GetNumNZElements() const {return m_nz;}
//Sets sparse matrix in CSR format. this acts as deep copy
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t m_totalBufferSizeAllocated;
size_t m_blockSize; //block size
ElemType *m_blockVal; //block values
size_t *m_blockIds; //block ids
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.

Просмотреть файл

@ -733,6 +733,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_CPUSparseMatrix == nullptr)
{
m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat);
if (GetMatrixType() == MatrixType::DENSE && m_CPUMatrix != nullptr)
{
m_CPUSparseMatrix->Resize(GetNumRows(), GetNumCols());
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
}
else
{
// TODO: Assign Sparse from Sparse!
}
delete m_CPUMatrix;
m_CPUMatrix = nullptr;
}
@ -801,6 +812,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// Copies every element of a dense CPU matrix into the destination sparse
// matrix via SetValue, iterating in foreach_coord order.
// NOTE(review): zeros are copied too, so the destination stores explicit
// zeros rather than a truly sparse representation — confirm whether SetValue
// should be skipped for zero-valued elements.
template<class ElemType>
void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest)
{
    foreach_coord(row, col, from)
    {
        auto val = from(row, col);
        dest.SetValue(row, col, val);
    }
}
template<class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
@ -3992,7 +4012,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nullptr,
return CPUMatrix<ElemType>::AreEqual(*a.m_CPUMatrix, *b.m_CPUMatrix, threshold),
return GPUMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUMatrix, threshold),
NOT_IMPLEMENTED; return false ,
return CPUSparseMatrix<ElemType>::AreEqual(*a.m_CPUSparseMatrix, *b.m_CPUSparseMatrix, threshold),
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUSparseMatrix, threshold)
);
}

Просмотреть файл

@ -45,6 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
public:
//Constructors, destructors and other static matrix builders