Merge branch 'wilrich/missingV2Ops' of https://github.com/Microsoft/CNTK into wilrich/missingV2Ops

This commit is contained in:
Willi Richert 2016-09-28 21:01:39 +02:00
Parent 742a8fd08d e41f1955d1
Commit f73f611de3
29 changed files with 15245 additions and 26 deletions

View File

@@ -1,6 +1,18 @@
# CNTK
## Latest news
*2016-09-28.* V 1.7.1 Binary release
Highlights of this Release:
* Two breaking changes related to the Layers library default initialization and the ```fsAdagrad``` gradient-normalization scheme
* Improvements in BrainScript
* Support for enforcing deterministic algorithms
* Improvements in Model Evaluation, including support for Evaluation in Azure Applications
* Various performance improvements
* Multiple bug fixes
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_1_7_1_Release_Notes) (including the full list of bugs fixed)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-08-31.* V 1.7 Binary release
Highlights of this Release:
* Improvements in BrainScript (New library of predefined common layer types, Support of cuDNN5 RNN and Common random-initialization types, improved handling of GRUs)
@@ -22,8 +34,6 @@ Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we announced on Jun 23, 2016.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

View File

@@ -633,11 +633,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// main() for CNTK config language (this is the current way of using CNTK)
// ---------------------------------------------------------------------------
// helper to print a little banner
// CNTK 1.7.1 (fseide/samplebs hash, Sep 3 2016 00:17:33) on FSEIDE-GPU at 2016/09/03 00:25:30
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 1.7.1 (");
fprintf(stderr, "CNTK 1.7.1+ (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif

View File

@@ -2138,7 +2138,7 @@ namespace CNTK
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return Internal::Combine(inputs);
return Internal::Combine(inputs, name);
}
namespace Sequence

View File

@@ -43,16 +43,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Should be unified with StreamDescription from the new reader API
struct InputStreamDescription
{
InputStreamDescription(const std::wstring& name, int deviceId, MatrixType matrixType, MatrixFormat format)
: m_name(name), m_deviceId(deviceId), m_matrixType(matrixType), m_format(format)
{}
const std::wstring& GetStreamName() const
{
return m_name;
}
int GetDeviceId() const
{
return m_deviceId;
}
MatrixType GetMatrixType() const
{
return m_matrixType;
}
MatrixFormat GetMatrixFormat() const
{
return m_format;
}
private:
// Stream name.
std::wstring m_name;
// Device identifier for the resulting matrix of this stream.
int m_deviceId;
// Matrix type.
MatrixType m_matrixType;
// Matrix format.
MatrixFormat m_format;
};
inline bool operator == (const InputStreamDescription& a, const InputStreamDescription& b)
{
return a.m_name == b.m_name && a.m_deviceId == b.m_deviceId;
return a.GetStreamName() == b.GetStreamName() &&
a.GetDeviceId() == b.GetDeviceId() &&
a.GetMatrixType() == b.GetMatrixType() &&
a.GetMatrixFormat() == b.GetMatrixFormat();
};
}}}
@@ -63,7 +97,7 @@ namespace std
size_t operator()(const Microsoft::MSR::CNTK::InputStreamDescription& x) const
{
// The input name is unique, so simply return the hash of the stream name.
return std::hash<std::wstring>()(x.m_name);
return std::hash<std::wstring>()(x.GetStreamName());
}
};
}
@@ -163,7 +197,8 @@ public:
std::unordered_set<InputStreamDescription> streamDescriptions;
for (auto input = begin(); input != end(); ++input)
{
streamDescriptions.insert(InputStreamDescription{ input->first, input->second.matrix->GetDeviceId() });
streamDescriptions.insert(
InputStreamDescription(input->first, input->second.matrix->GetDeviceId(), input->second.matrix->GetMatrixType(), input->second.matrix->GetFormat()));
}
return streamDescriptions;
}
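
Note the hash/equality contract in this file's hunks: operator== compares all four fields, while the std::hash specialization uses only the stream name. That is valid because equal descriptions necessarily share a name, so equal objects still hash equally; descriptions that differ only in device or format merely collide into the same bucket. A minimal self-contained sketch of the contract, assuming simplified placeholder types rather than the CNTK classes:

```cpp
// Sketch of the hash/equality contract used above (placeholder types, not CNTK's).
#include <cassert>
#include <string>
#include <unordered_set>

struct StreamDesc
{
    std::wstring name;
    int deviceId;
};

// Equality compares every field...
inline bool operator==(const StreamDesc& a, const StreamDesc& b)
{
    return a.name == b.name && a.deviceId == b.deviceId;
}

// ...while the hash may legally use any subset of them, as long as equal
// objects produce equal hashes. Hashing only the name satisfies that.
namespace std
{
template <>
struct hash<StreamDesc>
{
    size_t operator()(const StreamDesc& x) const { return hash<wstring>()(x.name); }
};
}

int main()
{
    std::unordered_set<StreamDesc> s;
    s.insert({ L"features", 0 });
    s.insert({ L"features", 0 }); // duplicate: the set stays at size 1
    s.insert({ L"features", 1 }); // same bucket, but not equal: size becomes 2
    assert(s.size() == 2);
    return 0;
}
```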

View File

@@ -117,6 +117,9 @@ const char* CudaErrString<curandStatus>(curandStatus)
namespace Microsoft { namespace MSR { namespace CNTK {
// Todo: After upgrading to VS2015, remove this and make it a local static variable in GetDeviceProps().
std::vector<cudaDeviceProp> GridDim::s_cachedDeviceProps = GridDim::CacheDeviceProps();
/*static*/ bool SyncGuard::s_isSyncEnabled = false;
/*static*/ void SyncGuard::EnableSync()

View File

@@ -159,8 +159,10 @@ struct GridDim
// get device properties of current device
static const cudaDeviceProp& GetDeviceProps()
{
static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
return props[GetCurrentDeviceId()];
// Unfortunately, initialization of local static variables is not thread-safe in VS2013.
// As a workaround, it is moved to the struct level.
// static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
return s_cachedDeviceProps[GetCurrentDeviceId()];
}
// compute our location on the grid
@@ -168,6 +170,10 @@ struct GridDim
{
return blockDim.x * blockIdx.x + threadIdx.x;
}
private:
// Todo: after upgrading to VS2015, move the static variable into GetDeviceProps() as a local static variable there.
static std::vector<cudaDeviceProp> s_cachedDeviceProps;
};
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
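
As background for the workaround above: C++11 guarantees thread-safe initialization of function-local statics ("magic statics"), but VS2013 does not implement that guarantee, hence the struct-level static that is initialized before main() runs. A minimal sketch of the two patterns, assuming a placeholder Props type in place of cudaDeviceProp:

```cpp
// Two ways to cache expensive-to-build data (placeholder types, not the CNTK code).
#include <cstdio>
#include <vector>

struct Props { int id; };

static std::vector<Props> CacheProps() { return { { 0 }, { 1 } }; }

// Pattern 1: function-local static. C++11 makes this initialization thread-safe,
// but VS2013 does not, so two threads calling it concurrently for the first time
// could race on the initialization.
const Props& GetPropsLocalStatic(int device)
{
    static std::vector<Props> props = CacheProps();
    return props[device];
}

// Pattern 2: struct-level static, initialized at program start before any worker
// threads exist. This is the shape of the workaround in the hunk above.
struct Grid
{
    static const Props& GetProps(int device) { return s_cachedProps[device]; }
    static std::vector<Props> s_cachedProps;
};
std::vector<Props> Grid::s_cachedProps = CacheProps();

int main()
{
    std::printf("%d %d\n", GetPropsLocalStatic(1).id, Grid::GetProps(0).id);
    return 0;
}
```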

View File

@@ -912,7 +912,14 @@ void GPUSparseMatrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYP
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
if (transferer)
{
// TODO: All RequireSizeAndAllocate should be async and use a transferer.
// Currently there are some memset operations that can still be executing on the default stream,
// so here we have to wait for them to finish.
transferer->RecordComputeStreamSyncPoint();
transferer->WaitForSyncPointOnAssignStreamAsync();
transferer->CopyCPUToGPUAsync(h_Val, nz, sizeof(ElemType), Data());
}
else
CUDA_CALL(cudaMemcpy(Data(), h_Val, nz * sizeof(ElemType), kind));
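
The RecordComputeStreamSyncPoint / WaitForSyncPointOnAssignStreamAsync pair above orders the asynchronous host-to-GPU copy after whatever work is still queued on the default (compute) stream. In plain CUDA runtime terms this cross-stream ordering is typically expressed with an event, roughly as in the sketch below (hypothetical stream and buffer arguments; not the actual GPUDataTransferer implementation):

```cpp
// Sketch: make a copy issued on copyStream wait for work already queued on the
// default (compute) stream, then perform the copy asynchronously. Error handling omitted.
#include <cuda_runtime.h>

void CopyAfterComputeWork(void* dst, const void* src, size_t bytes, cudaStream_t copyStream)
{
    cudaEvent_t syncPoint;
    cudaEventCreateWithFlags(&syncPoint, cudaEventDisableTiming);

    // "Record a compute-stream sync point": mark the current end of the default stream.
    cudaEventRecord(syncPoint, /*stream=*/0);

    // "Wait for the sync point on the assign stream": work queued on copyStream after
    // this call starts only once the event completes. The host is not blocked.
    cudaStreamWaitEvent(copyStream, syncPoint, 0);

    // The host-to-device copy is now ordered after the pending memsets on the default stream.
    cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, copyStream);

    cudaEventDestroy(syncPoint);
}
```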

View File

@@ -3502,6 +3502,18 @@ int Matrix<ElemType>::GetDeviceId() const
{ return m_GPUSparseMatrix->GetComputeDeviceId(); });
}
template <class ElemType>
MatrixType Matrix<ElemType>::GetMatrixType() const
{
return m_matrixType;
}
template <class ElemType>
MatrixFormat Matrix<ElemType>::GetFormat() const
{
return m_baseMatrix->GetFormat();
}
// TODO: Comment why we need a second ElemType.
// TODO: Move the shared core functions to the front of this source file.
// BUGBUG: This performs a copy operation even for the output matrix that gets overwritten right away.

View File

@@ -49,6 +49,8 @@ template <class ElemType> class DeviceBoundNumber;
struct /*interface*/ MATH_API MatrixBase
{
virtual int GetDeviceId() const = 0;
virtual MatrixType GetMatrixType() const = 0;
virtual MatrixFormat GetFormat() const = 0;
// TODO: Move more generic functions such as getting dims, resizing, and getting/setting as scalars in here.
virtual ~MatrixBase();
};
@@ -147,8 +149,8 @@ public:
return node;
}
MatrixType GetMatrixType() const { return m_matrixType; }
MatrixFormat GetFormat() const { return m_baseMatrix->GetFormat(); }
MatrixType GetMatrixType() const override;
MatrixFormat GetFormat() const override;
bool OwnBuffer() const { return m_baseMatrix->OwnBuffer(); }
int GetDeviceId() const; // -1 if CPU, otherwise GPU CUDA device id
DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; // -1 if CPU, otherwise GPU CUDA device id
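
Promoting GetMatrixType and GetFormat to pure virtuals on MatrixBase lets code that holds only a MatrixBase pointer (such as the reader shim later in this diff) query storage properties without knowing the element type. A minimal sketch of that dispatch pattern, assuming placeholder enums and classes rather than the real CNTK declarations:

```cpp
// Sketch of virtual-accessor dispatch through a type-erased base (placeholder types).
#include <cstdio>
#include <memory>

enum class MatrixType { Dense, Sparse };
enum class MatrixFormat { Dense, SparseCSC };

struct MatrixBase
{
    virtual MatrixType GetMatrixType() const = 0;
    virtual MatrixFormat GetFormat() const = 0;
    virtual ~MatrixBase() = default;
};

template <class ElemType>
struct Matrix : MatrixBase
{
    MatrixType GetMatrixType() const override { return MatrixType::Sparse; }
    MatrixFormat GetFormat() const override { return MatrixFormat::SparseCSC; }
};

int main()
{
    std::unique_ptr<MatrixBase> m = std::make_unique<Matrix<float>>();
    // The caller does not need to know ElemType to pick matching buffer properties.
    std::printf("sparse? %d\n", m->GetMatrixType() == MatrixType::Sparse);
    return 0;
}
```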

View File

@@ -84,15 +84,15 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
// Now we can be sure no prefetch thread is running and there are no outstanding memcopies.
// Let's check that the requested devices are OK and see whether we need to change our data transferers.
auto device = std::find_if(inputs.begin(), inputs.end(),
[](const InputStreamDescription& d) { return d.m_deviceId != CPUDEVICE; });
auto deviceId = device != inputs.end() ? device->m_deviceId : CPUDEVICE;
[](const InputStreamDescription& d) { return d.GetDeviceId() != CPUDEVICE; });
auto deviceId = device != inputs.end() ? device->GetDeviceId() : CPUDEVICE;
// Check that all devices are either the same as m_deviceId or the CPU.
auto secondDevice = std::find_if(inputs.begin(), inputs.end(),
[deviceId](const InputStreamDescription& d) { return d.m_deviceId != CPUDEVICE && d.m_deviceId != deviceId; });
[deviceId](const InputStreamDescription& d) { return d.GetDeviceId() != CPUDEVICE && d.GetDeviceId() != deviceId; });
if (secondDevice != inputs.end())
{
LogicError("Readers do not support running on several GPUs in the same process, at least two devices found '%d', '%d'", deviceId, secondDevice->m_deviceId);
LogicError("Readers do not support running on several GPUs in the same process, at least two devices found '%d', '%d'", deviceId, secondDevice->GetDeviceId());
}
if (m_deviceId != deviceId)
@@ -109,8 +109,13 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
std::map<std::wstring, int> inputDescriptions;
for (const auto& i : inputs)
{
inputDescriptions[i.m_name] = i.m_deviceId;
m_prefetchBuffers[i.m_name] = StreamPrefetchBuffer{ std::make_shared<Matrix<ElemType>>(i.m_deviceId), nullptr };
inputDescriptions[i.GetStreamName()] = i.GetDeviceId();
// Creating buffers with the same properties as the network expects.
m_prefetchBuffers[i.GetStreamName()] = StreamPrefetchBuffer
{
std::make_shared<Matrix<ElemType>>(0, 0, i.GetDeviceId(), i.GetMatrixType(), i.GetMatrixFormat()),
nullptr
};
}
m_endOfEpoch = false;

View File

@@ -13,8 +13,7 @@ python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
python -c "import pytest; print('PyTest: %s'%pytest.__version__)"
cd $PYBINDINGDIR
py.test
# TODO --deviceid $TEST_DEVICE
py.test --deviceid $TEST_DEVICE
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

View File

@@ -1,8 +1,8 @@
dataDir: .
tags:
- bvt-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release') and (device == 'gpu')
- nightly-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release') and (device == 'gpu')
- bvt-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release')
- nightly-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release')
testCases:
BindingPyTest run must finish with error code 0 (outputs __COMPLETED__ in that case):

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

View File

@@ -0,0 +1,37 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
OriginalTestDir=../../../DNN/Dropout
ConfigDir=$TEST_DIR/$OriginalTestDir
LogFileName=stderr
Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -)
if [ $? != 0 ]; then
echo Error: Baselines must match original test. Copy from $OriginalTestDir.
exit 1
fi
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
DeleteModelsAfterTest=0
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads speechTrain=[reader=[readerType=HTKDeserializers]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
if [ "$ExitCode" != "0" ]; then
exit $ExitCode
fi
echo === Deleting last 2 epochs and restarting
rm $TEST_RUN_DIR/models/*.dnn || exit $?
rm $TEST_RUN_DIR/models/*.dnn.4 || exit $?
echo ==== Re-running from checkpoint
DeleteExistingModels=0
DeleteModelsAfterTest=1
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads speechTrain=[reader=[readerType=HTKDeserializers]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
exit $ExitCode

View File

@@ -0,0 +1,40 @@
dataDir: ../../../Data
tags:
# running for gpu build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# running for gpu build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu')
testCases:
Must train epochs in exactly same order and parameters for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
- ce = {{float,tolerance=0.2%}}
- err = {{float,tolerance=0.2%}}
- learningRatePerSample = {{float,tolerance=0.001%}}
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- " * {{integer}}; "
- ce = {{float,tolerance=0.2%}}
- err = {{float,tolerance=0.2%}}
DataParallelSGD training parameters must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting minibatch loop
- DataParallelSGD training
- myRank = {{integer}}
- numNodes = 2
- numGradientBits = 32
- distributed reading is ENABLED

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

View File

@@ -0,0 +1,22 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
OriginalTestDir=../../../DNN/ParallelBM
ConfigDir=$TEST_DIR/$OriginalTestDir
LogFileName=stderr
Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -)
if [ $? != 0 ]; then
echo Error: Baselines must match original test. Copy from $OriginalTestDir.
exit 1
fi
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads precision=double speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
exit $ExitCode

View File

@@ -0,0 +1,46 @@
dataDir: ../../../Data
tags:
# running for 1bitsgd build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# running for 1bitsgd build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == '1bitsgd')
testCases:
Must train epochs in exactly same order and parameters for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
- CrossEntropyWithSoftmax = {{float,tolerance=2%}}
- EvalClassificationError = {{float,tolerance=2%}}
- learningRatePerSample = {{float,tolerance=0.001%}}
BlockMomentumSGD training should have distributed reading enabled:
patterns:
- distributed reading is ENABLED
BlockMomentumSGD training should have the expected parameters:
patterns:
- ^MPI Rank {{integer}}
- block momentum = {{float,tolerance=0.1%}}
- block momentum time constant (per worker) = {{float,tolerance=1%}}
- block learning rate = {{float,tolerance=0.1%}}
- block size per worker = {{integer}} samples
- resetting SGD momentum after sync
- using Nesterov-style block momentum
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- "* {{integer}}; "
- CrossEntropyWithSoftmax = {{float,tolerance=2%}}
- EvalClassificationError = {{float,tolerance=2%}}

View File

@@ -35,8 +35,12 @@ int main()
TrainSequenceToSequenceTranslator();
// Test multi-threads evaluation
// Todo: Also test on GPUDevice()
fprintf(stderr, "Test multi-threaded evaluation on CPU.\n");
EvalMultiThreadsWithNewNetwork(DeviceDescriptor::CPUDevice(), 2);
#ifndef CPUONLY
fprintf(stderr, "Test multi-threaded evaluation on GPU\n");
EvalMultiThreadsWithNewNetwork(DeviceDescriptor::GPUDevice(0), 2);
#endif
fprintf(stderr, "\nCNTKv2Library tests: Passed\n");
fflush(stderr);

View File

@@ -0,0 +1,38 @@
name: cntk-py34
dependencies:
- libgfortran=3.0.0=1
- mkl=11.3.3=0
- numpy=1.11.1=py34_0
- openssl=1.0.2j=0
- pip=8.1.2=py34_0
- python=3.4.4=5
- readline=6.2=2
- scipy=0.18.1=np111py34_0
- setuptools=27.2.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py34_0
- xz=5.2.2=0
- zlib=1.2.8=3
- pip:
- alabaster==0.7.9
- args==0.1.0
- babel==2.3.4
- clint==0.5.1
- docutils==0.12
- imagesize==0.7.1
- jinja2==2.8
- markupsafe==0.23
- pkginfo==1.3.2
- py==1.4.31
- pygments==2.1.3
- pytest==3.0.2
- pytz==2016.6.1
- requests==2.11.1
- requests-toolbelt==0.7.0
- six==1.10.0
- snowballstemmer==1.2.1
- sphinx==1.4.6
- sphinx-rtd-theme==0.1.9
- twine==1.8.1

View File

@@ -0,0 +1,32 @@
name: cntk-py34
dependencies:
- mkl=11.3.3=1
- numpy=1.11.1=py34_1
- pip=8.1.2=py34_0
- python=3.4.4=5
- scipy=0.18.1=np111py34_0
- setuptools=27.2.0=py34_1
- vs2010_runtime=10.00.40219.1=2
- wheel=0.29.0=py34_0
- pip:
- alabaster==0.7.9
- args==0.1.0
- babel==2.3.4
- clint==0.5.1
- colorama==0.3.7
- docutils==0.12
- imagesize==0.7.1
- jinja2==2.8
- markupsafe==0.23
- pkginfo==1.3.2
- py==1.4.31
- pygments==2.1.3
- pytest==3.0.2
- pytz==2016.6.1
- requests==2.11.1
- requests-toolbelt==0.7.0
- six==1.10.0
- snowballstemmer==1.2.1
- sphinx==1.4.6
- sphinx-rtd-theme==0.1.9
- twine==1.8.1

View File

@@ -18,12 +18,15 @@ swig.bat
# a) If you are just building to use it locally:
# Build -> generate .pyd
# 1) go two levels up
# 2) run the following:
python .\setup.py build_ext -if -c msvc --plat-name=win-amd64
# 3) add to PATH the path to cntk dlls (e.g. e:\CNTK\x64\Release)
# 4) add to PYTHONPATH the path to the python api source (e.g. e:\CNTK\bindings\python\)
SET PATH=%PATH%;<your CNTK release path, e.g. e:\CNTK\x64\Release>
# 4) add to PYTHONPATH the path to the python examples (e.g. e:\CNTK\bindings\python\examples)
SET PYTHONPATH=e:\CNTK\bindings\python\examples
# 5) test by running any of the examples or running py.test from inside the bindings\python directory
# b) If you want to package it:

View File

@@ -145,7 +145,7 @@ def seqcla():
# do some manual accuracy testing
acc = calc_accuracy(train_file, ctx.output_filename_base)
# and test for the same number...
TOLERANCE_ABSOLUTE = 2E-02
TOLERANCE_ABSOLUTE = 1E-02
assert np.allclose(acc, 0.6022453889334403, atol=TOLERANCE_ABSOLUTE)
"""