Merge branch 'master' of https://github.com/Microsoft/CNTK into amitaga/cntkv2Library
Conflicts: CNTK.sln
Commit 0ec5b27404
CNTK.sln
@@ -1133,6 +1133,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryTests", "Tests\UnitTests\V2LibraryTests\V2LibraryTests.vcxproj", "{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{68263A2F-1D5F-4C46-B5AF-2304B80FC3D4}"
    ProjectSection(SolutionItems) = preProject
        Scripts\pytest.ini = Scripts\pytest.ini
        Scripts\txt2ctf.py = Scripts\txt2ctf.py
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug_CpuOnly|x64 = Debug_CpuOnly|x64
Makefile
@@ -738,15 +738,15 @@ DEP := $(patsubst %.o, %.d, $(OBJ))
# will result in the rebuild.
-include ${DEP}

MAKEFILES := Makefile $(BUILD_TOP)/Config.make
$(OBJDIR)/%.o : %.cu $(MAKEFILES)
BUILD_CONFIGURATION := Makefile $(BUILD_TOP)/Config.make

$(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
    @echo $(SEPARATOR)
    @echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
    @mkdir -p $(dir $@)
    $(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"

$(OBJDIR)/%.o : %.cpp $(MAKEFILES)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
    @echo $(SEPARATOR)
    @echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
    @mkdir -p $(dir $@)
Scripts/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
python_files = *.py
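Setting ``python_files = *.py`` widens pytest's default collection pattern (normally ``test_*.py``) so the ``test_*`` functions embedded in ``Scripts/txt2ctf.py`` below are collected. A small sketch of driving the same run from Python rather than the ``py.test`` command line (assumes pytest is installed; the path argument is illustrative):

    # Sketch: run pytest programmatically against the Scripts folder so the
    # test_* functions inside txt2ctf.py are collected (python_files = *.py).
    import pytest

    exit_code = pytest.main(["-q", "Scripts"])   # equivalent to running `py.test` in Scripts
    print("pytest exit code:", exit_code)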
Scripts/txt2ctf.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python

# This script takes a list of dictionary files and a plain text file and converts this text input file to CNTK text format.
#
# The input text file must contain N streams per line (N TAB-separated "columns") and should be accompanied by N dictionary files.
# The input text file must be in the following form:
#     text1 TAB text2 TAB ... TAB textN
#     .....
# where each line represents one sequence across all N input streams.
# Each text consists of one or more space-separated word tokens (samples).
#
# Dictionary files are text files that are required to be specified for all streams,
# so the #dictionaries = #columns in the input file.
# A dictionary contains a single token per line. The zero-based line number becomes the numeric index
# of the token in the output CNTK text format file.

# Example usage (i.e. for PennTreebank files):
#     sed -e 's/^<\/s> //' -e 's/ <\/s>$//' < en.txt > en.txt1
#     sed -e 's/^<\/s> //' -e 's/ <\/s>$//' < fr.txt > fr.txt1
#     paste en.txt1 fr.txt1 | txt2ctf.py --map en.dict fr.dict > en-fr.ctf
#

import sys
import argparse

def convert(dictionaryStreams, inputs, output, annotated):
    # create in-memory dictionaries
    dictionaries = [{ line.rstrip('\r\n').strip():index for index, line in enumerate(dic) } for dic in dictionaryStreams]

    # convert inputs
    for input in inputs:
        sequenceId = 0
        for index, line in enumerate(input):
            line = line.rstrip('\r\n')
            columns = line.split("\t")
            if len(columns) != len(dictionaries):
                raise Exception("Number of dictionaries {0} does not correspond to the number of streams in line {1}:'{2}'"
                    .format(len(dictionaries), index, line))
            _convertSequence(dictionaries, columns, sequenceId, output, annotated)
            sequenceId += 1

def _convertSequence(dictionaries, streams, sequenceId, output, annotated):
    tokensPerStream = [[t for t in s.strip(' ').split(' ') if t != ""] for s in streams]
    maxLen = max(len(tokens) for tokens in tokensPerStream)

    # writing to the output file
    for sampleIndex in range(maxLen):
        output.write(str(sequenceId))
        for streamIndex in range(len(tokensPerStream)):
            if len(tokensPerStream[streamIndex]) <= sampleIndex:
                output.write("\t")
                continue
            token = tokensPerStream[streamIndex][sampleIndex]
            if token not in dictionaries[streamIndex]:
                raise Exception("Token '{0}' cannot be found in the dictionary for stream {1}".format(token, streamIndex))
            value = dictionaries[streamIndex][token]
            output.write("\t|S" + str(streamIndex) + " " + str(value) + ":1")
            if annotated:
                output.write(" |# " + token)
        output.write("\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Transforms text file given dictionaries into CNTK text format.")
    parser.add_argument('--map', help='List of dictionaries, given in the same order as streams in the input files',
        nargs="+", required=True)
    parser.add_argument('--annotated', help='Whether to annotate indices with tokens. Default is false',
        choices=["True", "False"], default="False", required=False)
    parser.add_argument('--output', help='Name of the output file, stdout if not given', default="", required=False)
    parser.add_argument('--input', help='Name of the input files, stdin if not given', default="", nargs="*", required=False)
    args = parser.parse_args()

    # creating inputs
    inputs = [sys.stdin]
    if len(args.input) != 0:
        inputs = [open(i) for i in args.input]

    # creating output
    output = sys.stdout
    if args.output != "":
        output = open(args.output, "w")

    convert([open(d) for d in args.map], inputs, output, args.annotated == "True")


#####################################################################################################
# Tests
#####################################################################################################

import StringIO
import pytest

def test_simpleSanityCheck():
    dictionary1 = StringIO.StringIO("hello\nmy\nworld\nof\nnothing\n")
    dictionary2 = StringIO.StringIO("let\nme\nbe\nclear\nabout\nit\n")
    input = StringIO.StringIO("hello my\tclear about\nworld of\tit let clear\n")
    output = StringIO.StringIO()

    convert([dictionary1, dictionary2], [input], output, False)

    expectedOutput = StringIO.StringIO()
    expectedOutput.write("0\t|S0 0:1\t|S1 3:1\n")
    expectedOutput.write("0\t|S0 1:1\t|S1 4:1\n")
    expectedOutput.write("1\t|S0 2:1\t|S1 5:1\n")
    expectedOutput.write("1\t|S0 3:1\t|S1 0:1\n")
    expectedOutput.write("1\t\t|S1 3:1\n")

    assert expectedOutput.getvalue() == output.getvalue()

def test_nonExistingWord():
    dictionary1 = StringIO.StringIO("hello\nmy\nworld\nof\nnothing\n")
    input = StringIO.StringIO("hello my\nworld of nonexistent\n")
    output = StringIO.StringIO()

    with pytest.raises(Exception) as info:
        convert([dictionary1], [input], output, False)
    assert info.value.message == "Token 'nonexistent' cannot be found in the dictionary for stream 0"
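To make the produced CNTK text format concrete, here is a small self-contained sketch of the layout that convert()/_convertSequence() write for one two-stream sequence; the dictionaries and the input line are invented for illustration, and the logic is inlined rather than imported from the script:

    # Minimal illustration of the CTF layout produced by the converter above
    # (plain Python; the example dictionaries and input line are made up).
    import io

    dictionaries = [{"hello": 0, "my": 1}, {"clear": 0, "about": 1}]
    line = "hello my\tclear about"          # one sequence, two tab-separated streams
    tokensPerStream = [s.split(" ") for s in line.split("\t")]

    out = io.StringIO()
    for sampleIndex in range(max(len(t) for t in tokensPerStream)):
        out.write("0")                      # sequence id
        for streamIndex, tokens in enumerate(tokensPerStream):
            value = dictionaries[streamIndex][tokens[sampleIndex]]
            out.write("\t|S%d %d:1" % (streamIndex, value))
        out.write("\n")

    print(out.getvalue())
    # 0	|S0 0:1	|S1 0:1
    # 0	|S0 1:1	|S1 1:1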
@@ -1 +1 @@
Subproject commit 18fcb1a9378432ae179948b0f1e281115a2c7d86
Subproject commit 3e454fc7571e90f5cf866637d43623e900cfee41
MASGD.h
@@ -37,11 +37,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    size_t m_reportFrequency;
    size_t m_totalSamplesProcessedSinceLastReport;
    size_t m_localSamplesProcessedSinceLastReport;
    double m_accumulatedSecondsOnSyncPointInOneEpoch;
    size_t m_syncPointHitCounterInOneEpoch;
    Timer m_Timer;

public:
    MASGDPerfStats(size_t myRank, size_t numWorkers):
        m_numWorkers(numWorkers), m_myRank(myRank), m_numSyncPerformedInCurrentEpoch(0), m_reportFrequency(1)
        m_numWorkers(numWorkers), m_myRank(myRank), m_numSyncPerformedInCurrentEpoch(0), m_reportFrequency(1),
        m_totalSamplesProcessedSinceLastReport(0), m_localSamplesProcessedSinceLastReport(0)
    {
        m_Timer.Start();
    }
@@ -55,6 +58,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        m_Timer.Restart();
        m_numSyncPerformedInCurrentEpoch = 0;
        m_accumulatedSecondsOnSyncPointInOneEpoch = 0;
        m_syncPointHitCounterInOneEpoch = 0;
    }
    void OnEpochEnd()
    {
@@ -65,18 +70,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        m_numSyncPerformedInCurrentEpoch++;
        m_totalSamplesProcessedSinceLastReport += totalSamplesProcessedSinceLastSync;
        m_localSamplesProcessedSinceLastReport += localSamplesProcessedSinceLastSync;
        if ( m_reportFrequency > 0 && m_numSyncPerformedInCurrentEpoch % m_reportFrequency == 0)
        if ( m_reportFrequency > 0 &&
             ( m_numSyncPerformedInCurrentEpoch % m_reportFrequency == 0 || m_numSyncPerformedInCurrentEpoch <= 5 )
           )
        // reporting condition:
        // 1. if m_reportFrequency == 0, no reporting
        // 2. if m_reportFrequency > 0, report MA perf stats every time m_reportFrequency model aggregations have been performed,
        //    and the first 5 perf stats within each epoch are always reported
        {
            ReportMAPerfStats(
                m_totalSamplesProcessedSinceLastReport,
                m_localSamplesProcessedSinceLastReport,
                secondsOnCommunication
            );
            ReportMAPerfStats(m_totalSamplesProcessedSinceLastReport,
                              m_localSamplesProcessedSinceLastReport,
                              secondsOnCommunication );

            m_totalSamplesProcessedSinceLastReport = 0;
            m_localSamplesProcessedSinceLastReport = 0;
        }
    }
    void OnArriveAtSyncPoint(double secondOnSyncPoint, bool printMessage)
    {
        if (printMessage)
        {
            m_accumulatedSecondsOnSyncPointInOneEpoch += secondOnSyncPoint;
            m_syncPointHitCounterInOneEpoch++;
            fprintf(stderr, "\t\t(model aggregation stats): %d-th sync point was hit, introducing a %.2f-seconds latency this time; accumulated time on sync point = %.2f seconds , average latency = %.2f seconds\n",
                    (int)m_syncPointHitCounterInOneEpoch,
                    secondOnSyncPoint,
                    m_accumulatedSecondsOnSyncPointInOneEpoch,
                    m_accumulatedSecondsOnSyncPointInOneEpoch / m_syncPointHitCounterInOneEpoch);
        }
    }

    void ReportMAPerfStats( size_t totalSamplesProcessedSinceLastReport,
                            size_t localSamplesProcessedSinceLastReport,
@@ -93,7 +115,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            "\t\t(model aggregation stats) %d-th sync: totalThroughput = %.2fk samplesPerSecond , throughputPerWorker = %.2fk samplesPerSecond\n";
        fprintf(stderr, prefix.c_str(), (int)m_numSyncPerformedInCurrentEpoch, secondsSinceLastReport, secondOnCommunication, (int)totalSamplesProcessedSinceLastReport, (int)m_numWorkers, (int)localSamplesProcessedSinceLastReport,
                (int)m_numSyncPerformedInCurrentEpoch, totalThroughput, throughputPerWorker);

    }
};
// base class for MA-SGD algorithm family
@@ -131,7 +152,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    )
    {
        m_MAworkerStatus[m_myRank] = MAWorkerStatus::DataEnd;
        bool read2sync=UpdateWorkerStatus(MAWorkerStatus::DataEnd);
        Timer syncPointTimer; syncPointTimer.Start();
        bool read2sync = UpdateWorkerStatus(MAWorkerStatus::DataEnd);
        syncPointTimer.Stop();
        m_perfReporter.OnArriveAtSyncPoint(syncPointTimer.ElapsedSeconds(), true);
        // assert(read2sync);
        size_t totalSamplesProcessed = 0;
        float secondsOnCommunication = 0.0f;
@@ -152,7 +176,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
    )
    {
        Timer syncPointTimer;
        syncPointTimer.Start();
        bool read2Sync=UpdateWorkerStatus(MAWorkerStatus::DataProcessing);
        syncPointTimer.Stop();
        m_perfReporter.OnArriveAtSyncPoint(syncPointTimer.ElapsedSeconds(),read2Sync);

        size_t totalSamplesProcessed=0;
        float secondsOnCommunication = 0.0f;
        if (read2Sync)
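The new MASGDPerfStats counters feed two kinds of numbers: aggregate and per-worker throughput in k samples/second, and the accumulated/average latency spent waiting at sync points. A rough sketch of that arithmetic in plain Python (not part of the commit; all sample numbers are made up):

    # Back-of-the-envelope version of the statistics the new counters produce.
    total_samples_since_report = 81920      # summed over all workers
    seconds_since_report = 4.0
    num_workers = 4

    total_throughput = total_samples_since_report / seconds_since_report / 1000.0   # k samples/sec
    throughput_per_worker = total_throughput / num_workers

    sync_latencies = [0.12, 0.08, 0.10]     # seconds spent waiting at each sync point
    avg_latency = sum(sync_latencies) / len(sync_latencies)

    print("totalThroughput = %.2fk samples/s, perWorker = %.2fk samples/s, avg sync latency = %.2fs"
          % (total_throughput, throughput_per_worker, avg_latency))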
SGD.cpp
@@ -777,9 +777,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,

    // MA-related variables
    size_t nSamplesSinceLastModelSync = 0;
    size_t blockSizePerWorker = 0;
    if (useParallelTrain && m_pMASGDHelper)
    {
        m_pMASGDHelper->OnEpochStart(learnableNodes);
        blockSizePerWorker = m_modelAggregationBlockSize / m_mpi->NumNodesInUse();
    }

    std::vector<Matrix<ElemType>*> learnParamsGradients;
@@ -1074,7 +1076,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    // aggregation by model averaging or block momentum
    if (useModelAggregation)
    {
        if (nSamplesSinceLastModelSync >= m_nFramesBetweenMASync)
        if (nSamplesSinceLastModelSync >= blockSizePerWorker)
        {
            bool synced = m_pMASGDHelper->OnArrivingAtSyncPoint(learnableNodes, smoothedGradients, nSamplesSinceLastModelSync);
            if (synced)
@@ -1835,7 +1837,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
        m_pMASGDHelper = make_shared<BlockMomentumSGD<ElemType>>(m_mpi, traceLevel, devID,
                                                                 m_useNesterovBlockMomentum, m_resetSGDMomentum,
                                                                 m_blockLearningRate, m_blockMomentumAsTimeConstant,
                                                                 m_nFramesBetweenMASync);
                                                                 m_modelAggregationBlockSize);
#endif
    }
}
@@ -2272,30 +2274,6 @@ bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
    return errMsgs.empty();
}

template <class ElemType>
void SGD<ElemType>::InitializeAndCheckBlockMomentumSGDParameters()
{
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
    // if user has not specified the time constant
    if (m_blockMomentumAsTimeConstant <= 0.0)
    {
        double blockMomentum = 1 - 1.0 / m_mpi->NumNodesInUse();
        // This is the recommended value for block momentum, see documents and papers
        m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_nFramesBetweenMASync);
    }
    // final argument checking in case of user specifying a bad parameter
    double blockMomentum = BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstant, m_nFramesBetweenMASync);
    if ((1 - blockMomentum)*m_blockLearningRate*m_mpi->NumNodesInUse() >= 2.0)
    {
        fprintf(stderr, "WARNING: (1-blockMomentumPerSync)*blockLearningRate is larger than 2*numWorkers; it is possible to overshoot.");
    }
#else
    // don't need do anything here
    m_blockMomentumAsTimeConstant = 0.0;
    m_blockLearningRate= 1.0;
#endif
}

template <class ElemType>
void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNetworkPtr& net, const ComputationNodeBasePtr& criterionNode)
{
@@ -2581,76 +2559,144 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
    m_bufferedAsyncGradientAggregation = false;
    m_enableDistributedMBReading = false;
    m_parallelizationStartEpochNum = 0;
    m_nFramesBetweenMASync = 0;
    m_modelAggregationBlockSize = 0;

    if (configSGD.Exists(L"ParallelTrain"))
    {
        const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
        m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
        m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int) 1) - 1; // Epoch numbers internally are 0 based
        m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
        m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int) 0);

        if (configParallelTrain.Exists(L"DataParallelSGD"))
        MPIWrapperPtr pMPI = MPIWrapper::GetInstance();
        if (!pMPI)
        {
            const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
            size_t defaultGradientBits = 8 * sizeofElemType;
            m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
            m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
            m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
            if ((m_numGradientBits < 1) || (m_numGradientBits > (8 * sizeofElemType)))
            {
                InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
            }
            // some users may forget to specify the useParallelTrain option
            // in this case, fall back to normal SGD
            fprintf(stderr, "useParallelTrain option is not enabled. ParallelTrain config will be ignored.");
        }

        if (configParallelTrain.Exists(L"ModelAveragingSGD"))
        else
        {
            const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
            m_nFramesBetweenMASync = configMASGD(L"syncPeriod", (size_t)40000);
            size_t numMPIWorkers = pMPI->NumNodesInUse();
            const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
            m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
            m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int)1) - 1; // Epoch numbers internally are 0 based
            m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
            m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int)0);

            if (configParallelTrain.Exists(L"DataParallelSGD"))
            {
                const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
                size_t defaultGradientBits = 8 * sizeofElemType;
                m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
                m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
                m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
                if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
                {
                    InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
                }
            }
            if (configParallelTrain.Exists(L"ModelAveragingSGD"))
            {
                const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
                if (configMASGD.Exists(L"blockSizePerWorker") && configMASGD.Exists(L"blockSize"))
                {
                    InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
                }
                else if (configMASGD.Exists(L"blockSize"))
                {
                    m_modelAggregationBlockSize = configMASGD(L"blockSize");
                }
                else if (configMASGD.Exists(L"blockSizePerWorker"))
                {
                    m_modelAggregationBlockSize = configMASGD(L"blockSizePerWorker");
                    m_modelAggregationBlockSize *= numMPIWorkers;
                }
                else
                {
                    m_modelAggregationBlockSize = 40000 * numMPIWorkers; // default value
                }
#if 1           // legacy option
                if (configMASGD.Exists(L"syncFrequencyInFrames"))
                {
                    m_nFramesBetweenMASync = configMASGD(L"syncFrequencyInFrames");
                }
                if (configMASGD.Exists(L"syncFrequencyInFrames"))
                {
                    if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                        InvalidArgument("syncFrequencyInFrames is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                    m_modelAggregationBlockSize = configMASGD(L"syncFrequencyInFrames");
                    m_modelAggregationBlockSize *= numMPIWorkers;
                    fprintf(stderr, "WARNING: option syncFrequencyInFrames in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead\n");
                }
                if (configMASGD.Exists(L"syncPeroid"))
                {
                    if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                    {
                        InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                    }
                    m_modelAggregationBlockSize = configMASGD(L"syncPeriod");
                    m_modelAggregationBlockSize *= numMPIWorkers;
                    fprintf(stderr, "WARNING: option syncPeriod in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
                }
#endif
            }

            if (configParallelTrain.Exists(L"BlockMomentumSGD"))
            {
            }
            if (configParallelTrain.Exists(L"BlockMomentumSGD"))
            {
#ifndef CNTK_PARALLEL_TRAINING_SUPPORT
                InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
                InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
#else
                const ConfigRecordType& configBMSGD(configParallelTrain(L"BlockMomentumSGD", ConfigRecordType::Record()));
                m_nFramesBetweenMASync = configBMSGD(L"syncPeriod", 120000);
                m_resetSGDMomentum = configBMSGD(L"resetSGDMomentum", true);
                m_useNesterovBlockMomentum = configBMSGD(L"useNesterovMomentum", true);
                m_blockLearningRate = configBMSGD(L"blockLearningRate", 1.0);
                const ConfigRecordType& configBMSGD(configParallelTrain(L"BlockMomentumSGD", ConfigRecordType::Record()));
                if (configBMSGD.Exists(L"blockSize") && configBMSGD.Exists(L"blockSizePerWorker"))
                {
                    InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
                }
                else if (configBMSGD.Exists(L"blockSizePerWorker"))
                {
                    m_modelAggregationBlockSize = configBMSGD(L"blockSizePerWorker");
                    m_modelAggregationBlockSize *= numMPIWorkers;
                }
                else if (configBMSGD.Exists(L"blockSize"))
                {
                    m_modelAggregationBlockSize = configBMSGD(L"blockSize");
                }
                else
                {
                    m_modelAggregationBlockSize = 120000 * numMPIWorkers; // default value
                }
#if 1           // legacy option
                if (configBMSGD.Exists(L"syncPeriod"))
                {
                    if (configBMSGD.Exists(L"blockSizePerWorker") || configBMSGD.Exists(L"blockSize"))
                    {
                        InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                    }
                    m_modelAggregationBlockSize = configBMSGD(L"syncPeriod");
                    m_modelAggregationBlockSize *= numMPIWorkers;
                    fprintf(stderr, "WARNING: option syncPeriod in BlockMomentumSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
                }
#endif
                m_resetSGDMomentum = configBMSGD(L"resetSGDMomentum", true);
                m_useNesterovBlockMomentum = configBMSGD(L"useNesterovMomentum", true);
                m_blockLearningRate = configBMSGD(L"blockLearningRate", 1.0);

                if (configBMSGD.Exists(L"blockMomentumPerSync") && configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
                {
                    InvalidArgument("It is only allowed to set either blockMomentumPerSync or blockMomentumAsTimeConstant, not both of them");
                }
                else if (configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
                {
                    m_blockMomentumAsTimeConstant = configBMSGD(L"blockMomentumAsTimeConstant");
                }
                if (configBMSGD.Exists(L"blockMomentumPerSync") && configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
                {
                    InvalidArgument("It is only allowed to set either blockMomentumPerSync or blockMomentumAsTimeConstant, not both of them");
                }
                else if (configBMSGD.Exists(L"blockMomentumAsTimeConstant"))
                {
                    m_blockMomentumAsTimeConstant = configBMSGD(L"blockMomentumAsTimeConstant");
                }
#if 1           // This option "blockMomentumPerSync" is going to be deprecated in the future
                else if (configBMSGD.Exists(L"blockMomentumPerSync"))
                {
                    double blockMomentum = configBMSGD(L"blockMomentumPerSync");
                    m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_nFramesBetweenMASync);
                }
                else if (configBMSGD.Exists(L"blockMomentumPerSync"))
                {
                    double blockMomentum = configBMSGD(L"blockMomentumPerSync");
                    m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
                }
#endif
                else /*if (!configBMSGD.Exists(L"blockMomentumPerSync") && !configBMSGD.Exists(L"blockMomentumAsTimeConstant"))*/
                {
                    m_blockMomentumAsTimeConstant = 0.0;
                    // marking them as not specified by the user;
                    // they will be set after MPI is initialized, i.e., in function InitializeAndCheckBlockMomentumSGDParameters
                }
                else /*if (!configBMSGD.Exists(L"blockMomentumPerSync") && !configBMSGD.Exists(L"blockMomentumAsTimeConstant"))*/
                {
                    double blockMomentum = 1.0 - 1.0 / (double)numMPIWorkers; // this is a default value which ensures each block update contributes equally
                    m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
                }
#endif
            }
        }
        InitializeAndCheckBlockMomentumSGDParameters();

    }
        } // if (!pMPI)
    } // if (configSGD.Exists(L"ParallelTrain"))
}

static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
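The rewritten ParallelTrain parsing resolves m_modelAggregationBlockSize from blockSize, blockSizePerWorker, or the deprecated syncPeriod/syncFrequencyInFrames aliases, scaling per-worker values by the MPI worker count. A rough Python sketch of that precedence (the dict-based cfg and the default constant are illustrative stand-ins for the config record, not the actual API):

    # Sketch of the block-size resolution order introduced above.
    def resolve_block_size(cfg, num_workers, default_per_worker=120000):
        if "blockSize" in cfg and "blockSizePerWorker" in cfg:
            raise ValueError("set either blockSize or blockSizePerWorker, not both")
        if "blockSizePerWorker" in cfg:
            return cfg["blockSizePerWorker"] * num_workers
        if "blockSize" in cfg:
            return cfg["blockSize"]                # already a global (all-worker) size
        if "syncPeriod" in cfg:                    # deprecated alias, kept for old configs
            return cfg["syncPeriod"] * num_workers
        return default_per_worker * num_workers

    print(resolve_block_size({"blockSizePerWorker": 2048}, num_workers=4))   # 8192
    print(resolve_block_size({"syncPeriod": 120000}, num_workers=4))         # 480000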
@@ -2669,6 +2715,27 @@ SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
{
}

void SGDParams::InitializeAndCheckBlockMomentumSGDParameters()
{
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
    // final argument checking in case of user specifying a bad parameter
    size_t numMPIWorker = MPIWrapper::GetInstance()->NumNodesInUse();
    double blockMomentum = BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstant, m_modelAggregationBlockSize);
    if ((1 - blockMomentum)*m_blockLearningRate*numMPIWorker >= 2.0)
    {
        fprintf(stderr, "WARNING: (1-blockMomentumPerSync)*blockLearningRate is larger than 2*numWorkers; it is possible to overshoot.");
    }
    if (blockMomentum == 0.0)
    {
        fprintf(stderr, "WARNING: blockMomentum equals to zero. \n");
    }
#else
    // don't need do anything here
    m_blockMomentumAsTimeConstant = 0.0;
    m_blockLearningRate = 1.0;
#endif
}

// register SGD<> with the ScriptableObject system
ScriptableObjects::ConfigurableRuntimeTypeRegister::AddFloatDouble<SGD<float>, SGD<double>> registerSGDOptimizer(L"SGDOptimizer");
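InitializeAndCheckBlockMomentumSGDParameters now derives the block momentum from m_modelAggregationBlockSize and warns when (1 - blockMomentum) * blockLearningRate * numWorkers reaches 2. A sketch of the conversion arithmetic, assuming the exponential relation momentum = exp(-blockSize / timeConstant) that the Momentum2TimeConstant/TimeConstant2Momentum names suggest (an assumption, not verified against the C++ implementation):

    # Sketch of the momentum <-> time-constant conversion the checks above rely on.
    import math

    def momentum_to_time_constant(block_momentum, block_size):
        return -block_size / math.log(block_momentum)

    def time_constant_to_momentum(time_constant, block_size):
        return math.exp(-block_size / time_constant)

    num_workers = 8
    block_size = 120000 * num_workers                 # illustrative global block size
    block_momentum = 1.0 - 1.0 / num_workers          # default: each block update contributes equally
    tc = momentum_to_time_constant(block_momentum, block_size)
    print("block momentum = %.3f, time constant = %.0f samples" % (block_momentum, tc))

    # the overshoot warning fires when this product reaches 2
    block_learning_rate = 1.0
    print((1 - block_momentum) * block_learning_rate * num_workers)   # 1.0 -> OK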
SGD.h
@@ -161,6 +161,8 @@ protected:
        return m_parallelizationMethod;
    }

    // helper function to initialize and check BlockMomentumSGD related parameters
    void InitializeAndCheckBlockMomentumSGDParameters();
    // only true when the user specify LearningRatePerMB and the number of parallel utterances in Reader > 1
    // bool m_needToNormalizeLRByParallUtterance; // TODO: should go away
    // bool m_needToNormalizeMomentumByParallUtterance;
@@ -265,7 +267,7 @@ protected:
    bool m_zeroThresholdFor1Bit;

    // Parallel training related with MA / BM
    size_t m_nFramesBetweenMASync;
    size_t m_modelAggregationBlockSize;
    bool m_resetSGDMomentum;
    bool m_useNesterovBlockMomentum;
    double m_blockLearningRate;
@@ -336,13 +338,6 @@ public:

        if (m_mpi == nullptr)
            m_parallelizationMethod = ParallelizationMethod::none;

        if (m_parallelizationMethod == ParallelizationMethod::blockMomentumSGD)
        {
            // This is used to finish initializing BlockMomentumSGD parameters
            // since some of the parameters may not be specified by the users
            InitializeAndCheckBlockMomentumSGDParameters();
        }
    }

    void Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
@@ -563,21 +558,18 @@ protected:
    shared_ptr<IMASGD<ElemType>> m_pMASGDHelper;

private:
    void InitializeAndCheckBlockMomentumSGDParameters();
    void MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNetworkPtr& net, const ComputationNodeBasePtr& criterionNode);

    bool UsingGradientAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD) && (epochNumber >= m_parallelizationStartEpochNum));
    }

    bool UsingModelAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
                 GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) &&
                (epochNumber >= m_parallelizationStartEpochNum));
    }

    bool UsingParallelTrain(size_t epochNumber) const
    {
        return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber);
@@ -0,0 +1 @@
__COMPLETED__
@@ -0,0 +1,15 @@
#!/bin/bash

SCRIPTSDIR=$TEST_ROOT_DIR/../../Scripts

pushd $SCRIPTSDIR
echo Current dir: $SCRIPTSDIR

py.test

if [ "$?" -eq "0" ]; then
    echo "__COMPLETED__"
fi

popd
@@ -0,0 +1,12 @@
dataDir: .

tags:
  # running on linux BVT job in 'E' (Examples) leg
  - bvt-e (build_sku == 'gpu') and (device=='cpu') and (flavor=='release')
  # running on linux Nightly job in 'E' (Examples) leg
  - nightly-e (build_sku == 'gpu') and (device=='cpu') and (flavor=='release')

testCases:
  PyTest run must finish with error code 0 (outputs __COMPLETED__ in that case):
    patterns:
      - __COMPLETED__
(Diffs for four additional files are not shown here because they are too large.)
@@ -65,7 +65,7 @@ speechTrain = [
        distributedMBReading = true
        syncPerfStats=1
        BlockMomentumSGD = [
            syncPeriod=2048
            blockSizePerWorker=2048
            resetSGDMomentum=true
            useNesterovMomentum=true
        ]
@@ -29,11 +29,11 @@ testCases:
  patterns:
    - ^MPI Rank {{integer}}
    - block momentum = {{float,tolerance=0.1%}}
    - block momentum time constant = {{float,tolerance=1%}}
    - block momentum time constant (per worker) = {{float,tolerance=1%}}
    - block learning rate = {{float,tolerance=0.1%}}
    - sync period = {{integer}} samples
    - block size per worker = {{integer}} samples
    - resetting SGD momentum after sync
    - using Nesterov style block momentum
    - using Nesterov-style block momentum

Per-minibatch training results must match for each MPI Rank:
  patterns:
@@ -4,7 +4,7 @@
# for full license information.
# ==============================================================================

__version__ = '1.4'
__version__ = '1.5'

from .context import *
from .graph import *
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Python Wrapper for CNTK 1.4 documentation build configuration file, created by
# Python Wrapper for CNTK 1.5 documentation build configuration file, created by
# sphinx-quickstart on Wed Apr 6 13:21:01 2016.
#
# This file is execfile()d with the current directory set to its
@@ -54,7 +54,7 @@ source_suffix = '.rst'
master_doc = 'index'

# General information about the project.
project = 'Python Wrapper for CNTK 1.4'
project = 'Python Wrapper for CNTK 1.5'
copyright = '2016, Microsoft'
author = 'Microsoft'

@@ -63,9 +63,9 @@ author = 'Microsoft'
# built documents.
#
# The short X.Y version.
version = '1.4'
version = '1.5'
# The full version, including alpha/beta/rc tags.
release = '1.4'
release = '1.5'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -209,7 +209,7 @@ html_static_path = ['_static']
#html_search_scorer = 'scorer.js'

# Output file base name for HTML help builder.
htmlhelp_basename = 'CNTK14doc'
htmlhelp_basename = 'CNTK15doc'

# -- Options for LaTeX output ---------------------------------------------

@@ -231,7 +231,7 @@ latex_elements = {
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'CNTK14.tex', 'Python Wrapper for CNTK 1.4 Documentation',
    (master_doc, 'CNTK15.tex', 'Python Wrapper for CNTK 1.5 Documentation',
     'Microsoft', 'manual'),
]

@@ -261,7 +261,7 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'cntk14', 'Python Wrapper for CNTK 1.4 Documentation',
    (master_doc, 'cntk15', 'Python Wrapper for CNTK 1.5 Documentation',
     [author], 1)
]

@@ -275,8 +275,8 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'CNTK14', 'Python Wrapper for CNTK 1.4 Documentation',
     author, 'CNTK14', 'One line description of project.',
    (master_doc, 'CNTK15', 'Python Wrapper for CNTK 1.5 Documentation',
     author, 'CNTK15', 'One line description of project.',
     'Miscellaneous'),
]
|
@ -32,15 +32,16 @@ Installing the Python module
|
|||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#. Go to ``<cntkpath>/contrib/Python`` and run ``python setup.py install``
|
||||
#. Set up the environment variable ``CNTK_EXECUTABLE_PATH`` to point to the
|
||||
CNTK executable
|
||||
CNTK executable. Make sure the executable is also included
|
||||
#. Enjoy Python's ease of use with CNTK's speed::
|
||||
|
||||
>>> import cntk as cn
|
||||
>>> cn.__version__
|
||||
1.4
|
||||
>>> with cn.Context('demo', clean_up=False) as ctx:
|
||||
... a = cn.constant([[1,2], [3,4]])
|
||||
... print(ctx.eval(a + [[10,20], [30, 40]]))
|
||||
>>> import cntk as C
|
||||
>>> C.__version__
|
||||
1.5
|
||||
>>> with C.LocalExecutionContext('demo', clean_up=False) as ctx:
|
||||
... a = C.constant([[1,2], [3,4]])
|
||||
... i = C.input_numpy([[[10,20], [30, 40]]])
|
||||
... print(ctx.eval(a + i))
|
||||
[[11.0, 22.0], [33.0, 44.0]]
|
||||
|
||||
In this case, we have set ``clean_up=False`` so that you can now peek into the
|
||||
|
@@ -71,46 +72,43 @@ explained::
    import cntk as C
    import numpy as np

    def simple_network():
    # 500 samples, 250-dimensional data
    N = 500
    d = 250
    # 500 samples, 250-dimensional data
    N = 500
    d = 250

    # create synthetic data using numpy
    X = np.random.randn(N, d)
    Y = np.random.randint(size=(N, 1), low=0, high=2)
    Y = np.hstack((Y, 1-Y))
    # create synthetic data using numpy
    X = np.random.randn(N, d)
    Y = np.random.randint(size=(N, 1), low=0, high=2)
    Y = np.hstack((Y, 1-Y))

    # set up the training data for CNTK
    x = C.input_numpy(X, has_dynamic_axis=False)
    y = C.input_numpy(Y, has_dynamic_axis=False)
    # set up the training data for CNTK
    x = C.input_numpy(X)
    y = C.input_numpy(Y)

    # define our network parameters: a weight tensor and a bias
    W = C.parameter((2, d))
    b = C.parameter((2, 1))

    # create a dense 'layer' by multiplying the weight tensor and
    # the features and adding the bias
    out = C.times(W, x) + b
    # define our network parameters: a weight tensor and a bias
    W = C.parameter((d, 2))
    b = C.parameter((1, 2))

    # setup the criterion node using cross entropy with softmax
    ce = C.cross_entropy_with_softmax(y, out, name='loss')
    ce.tag = 'criterion'
    # create a dense 'layer' by multiplying the weight tensor and
    # the features and adding the bias
    out = C.times(x, W) + b

    # define our SGD parameters and train!
    my_sgd = C.SGDParams(epoch_size=0, minibatch_size=25, learning_rates_per_mb=0.1, max_epochs=3)
    with C.LocalExecutionContext('logreg') as ctx:
        ctx.train(root_nodes=[ce], training_params=my_sgd)
        print(ctx.test(root_nodes=[ce]))
    # setup the criterion node using cross entropy with softmax
    ce = C.cross_entropy_with_softmax(y, out, name='loss')
    ce.tag = 'criterion'

    # define our SGD parameters and train!
    my_sgd = C.SGDParams(epoch_size=0, minibatch_size=25, learning_rates_per_mb=0.1, max_epochs=3)
    with C.LocalExecutionContext('logreg') as ctx:
        ctx.train(root_nodes=[ce], training_params=my_sgd)
        print(ctx.test(root_nodes=[ce]))


In the example above, we first create a synthetic data set of 500 samples, each with a 2-dimensional
one-hot vector representing 0 (``[1 0]``) or 1 (``[0 1]``). We then begin describing the topology of our network
by setting up the data inputs. This is typically done using the :class:`cntk.reader.CNTKTextFormatReader` by reading data
in from a file, but for interactive experimentation and small examples we can use the ``input_numpy`` reader to
access numpy data. Because dealing with dynamic axis data and sequences is where CNTK really shines,
the default input data has a dynamic axis defined. Since we're not dealing with dynamic axes here, we
set ``has_dynamic_axis`` to False.
access numpy data.

Next, we define our network. In this case it's a simple 1-layer network with a weight tensor and a bias.
We multiply our data `x` with the weight tensor `W` and add the bias `b`. We then input the model prediction
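The revised example swaps the parameter shapes and the argument order of ``times``; a small numpy sketch (numpy only, not the CNTK API) of why ``W`` with shape ``(d, 2)`` pairs with ``times(x, W)`` when each row of ``X`` is one sample:

    import numpy as np

    N, d = 500, 250
    X = np.random.randn(N, d)                 # one sample per row
    W = np.random.randn(d, 2)                 # weights: (input dim, num classes)
    b = np.random.randn(1, 2)                 # bias broadcast over samples

    out = X @ W + b                           # shape (N, 2), matching times(x, W) + b
    print(out.shape)                          # (500, 2)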
@@ -151,7 +149,7 @@ on the right.
.. figure:: images/nn_layers.png
    :width: 600px
    :alt: NN Layers


As is apparent from the figure above on the right, RNNs are the natural structure for
dealing with sequences. This includes everything from text to music to video; anything
where the current state is dependent on the previous state. While RNNs are indeed
@@ -169,7 +167,7 @@ concentrate on the central feature of the LSTM model: the `memory cell`.
.. figure:: images/lstm_cell.png
    :width: 400px
    :alt: LSTM cell


    An LSTM cell.

The LSTM cell is associated with three gates that control how information is stored /
@@ -208,48 +206,49 @@ sequence classification. We can think of the network as adding a series of layer

We can define this network as follows in the CNTK Python API::

    import cntk as C
    import cntk as C

    def seqcla():
        # model
        num_labels = 5
        vocab = 2000
        embed_dim = 50

        # LSTM params
        input_dim = 50
        output_dim = 128
        cell_dim = 128
    def seqcla():
        # model
        num_labels = 5
        vocab = 2000
        embed_dim = 50

        t = C.dynamic_axis(name='t')
        # temporarily using cntk1 SpareInput because cntk2's input() will simply allow sparse as a parameter
        features = cntk1.SparseInput(vocab, dynamicAxis=t, name='features')
        labels = C.input(num_labels, name='labels')

        train_reader = C.CNTKTextFormatReader(train_file)
        # LSTM params
        input_dim = 50
        output_dim = 128
        cell_dim = 128

        # setup embedding matrix
        embedding = C.parameter((embed_dim, vocab),
                                learning_rate_multiplier=0.0,
                                init_from_file_path=embedding_file)
        t = C.dynamic_axis(name='t')
        # temporarily using cntk1 SparseInput because cntk2's input() will simply allow sparse as a parameter
        features = cntk1.SparseInput(vocab, dynamicAxis=t, name='features')
        labels = C.input(num_labels, name='labels')

        train_reader = C.CNTKTextFormatReader(train_file)

        # setup embedding matrix
        embedding = C.parameter((embed_dim, vocab),
                                learning_rate_multiplier=0.0,
                                init_from_file_path=embedding_file)

        # get the vector representing the word
        sequence = C.times(embedding, features, name='sequence')

        # add an LSTM layer
        L = lstm_layer(output_dim, cell_dim, sequence, input_dim)

        # add a dense layer on top
        w = C.parameter((num_labels, output_dim), name='w')
        b = C.parameter((num_labels), name='b')
        z = C.plus(C.times(w, L), b, name='z')
        z.tag = "output"

        # and reconcile the shared dynamic axis
        pred = C.reconcile_dynamic_axis(z, labels, name='pred')

        ce = C.cross_entropy_with_softmax(labels, pred)
        ce.tag = "criterion"

        # get the vector representing the word
        sequence = C.times(embedding, features, name='sequence')

        # add an LSTM layer
        L = lstm_layer(output_dim, cell_dim, sequence, input_dim)

        # add a dense layer on top
        w = C.parameter((num_labels, output_dim), name='w')
        b = C.parameter((num_labels), name='b')
        z = C.plus(C.times(w, L), b, name='z')
        z.tag = "output"

        # and reconcile the shared dynamic axis
        pred = C.reconcile_dynamic_axis(z, labels, name='pred')

        ce = C.cross_entropy_with_softmax(labels, pred)
        ce.tag = "criterion"

Let's go through some of the intricacies of the above network definition. First, we define
some parameters of the data and the network. We have 5 possible classes for the sequences;
@@ -282,7 +281,7 @@ the criterion node that adds a softmax and then implements the cross entropy los
we add the criterion node, however, we call :func:`cntk.ops.reconcile_dynamic_axis` which will ensure
that the minibatch layout for the labels and the data with dynamic axes is compatible.

For the full explanation of how ``lstm_layer()`` is defined, please see the full example in the
For the full explanation of how ``lstm_layer()`` is defined, please see the full example (`seqcla.py <https://github.com/Microsoft/CNTK/blob/master/contrib/Python/cntk/examples/LSTM/seqcla.py>`_) in the
Examples section.

How to pass Python data as train/test data
@@ -2,7 +2,7 @@
.. some aliases
.. _CNTK: http://cntk.ai/

Python Wrapper API for CNTK 1.4
Python Wrapper API for CNTK 1.5
===============================

CNTK_, the Computational Network Toolkit, is a system for describing, training,
@@ -12,7 +12,7 @@ neural networks (CNNs), recurrent neural networks (RNNs), long short term
memory (LSTM), logistic regression, and maximum entropy model. CNTK is an
implementation of computational networks that supports both CPU and GPU.

This page describes the Python Wrapper for CNTK_ version 1.4. This is an ongoing effort
This page describes the Python Wrapper for CNTK_ version 1.5. This is an ongoing effort
to expose such an API to the CNTK system, thus enabling the use of higher-level
tools such as IDEs to facilitate the definition of computational networks, to execute
them on sample data in real time.