Merge branch 'wilrich/missingV2Ops' of https://github.com/Microsoft/CNTK into wilrich/missingV2Ops

This commit is contained in:
Willi Richert 2016-09-28 21:01:39 +02:00
Parent 742a8fd08d e41f1955d1
Commit f73f611de3
29 changed files with 15245 additions and 26 deletions

View File

@@ -1,6 +1,18 @@
# CNTK
## Latest news
*2016-09-28.* V 1.7.1 Binary release
Highlights of this Release:
* Two breaking changes related to the Layers library default initialization and the ```fsAdagrad``` gradient-normalization scheme
* Improvements in BrainScript
* Support for enforcing deterministic algorithms
* Improvements in Model Evaluation, including support for Evaluation in Azure Applications
* Various performance improvements
* Multiple bug fixes
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_1_7_1_Release_Notes) (including the full list of bugs fixed)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-08-31.* V 1.7 Binary release
Highlights of this Release:
* Improvements in BrainScript (New library of predefined common layer types, Support of cuDNN5 RNN and Common random-initialization types, improved handling of GRUs)
@@ -22,8 +34,6 @@ Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we announced on Jun 23, 2016.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

View File

@@ -633,11 +633,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// main() for CNTK config language (this is the current way of using CNTK)
// ---------------------------------------------------------------------------
// helper to print a little banner
// CNTK 1.7.1 (fseide/samplebs hash, Sep 3 2016 00:17:33) on FSEIDE-GPU at 2016/09/03 00:25:30
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 1.7.1 (");
fprintf(stderr, "CNTK 1.7.1+ (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif

View File

@@ -2138,7 +2138,7 @@ namespace CNTK
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return Internal::Combine(inputs);
return Internal::Combine(inputs, name);
}
namespace Sequence

View File

@@ -43,16 +43,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Should be unified with StreamDescription from the new reader API
struct InputStreamDescription
{
InputStreamDescription(const std::wstring& name, int deviceId, MatrixType matrixType, MatrixFormat format)
: m_name(name), m_deviceId(deviceId), m_matrixType(matrixType), m_format(format)
{}
const std::wstring& GetStreamName() const
{
return m_name;
}
int GetDeviceId() const
{
return m_deviceId;
}
MatrixType GetMatrixType() const
{
return m_matrixType;
}
MatrixFormat GetMatrixFormat() const
{
return m_format;
}
private:
// Stream name.
std::wstring m_name;
// Device identifier for the resulting matrix of this stream.
int m_deviceId;
// Matrix type.
MatrixType m_matrixType;
// Matrix format.
MatrixFormat m_format;
};
inline bool operator == (const InputStreamDescription& a, const InputStreamDescription& b)
{
return a.m_name == b.m_name && a.m_deviceId == b.m_deviceId;
return a.GetStreamName() == b.GetStreamName() &&
a.GetDeviceId() == b.GetDeviceId() &&
a.GetMatrixType() == b.GetMatrixType() &&
a.GetMatrixFormat() == b.GetMatrixFormat();
};
}}}
@@ -63,7 +97,7 @@ namespace std
size_t operator()(const Microsoft::MSR::CNTK::InputStreamDescription& x) const
{
// The input name is unique, so simply return the hash of the stream name.
return std::hash<std::wstring>()(x.m_name);
return std::hash<std::wstring>()(x.GetStreamName());
}
};
}
@@ -163,7 +197,8 @@ public:
std::unordered_set<InputStreamDescription> streamDescriptions;
for (auto input = begin(); input != end(); ++input)
{
streamDescriptions.insert(InputStreamDescription{ input->first, input->second.matrix->GetDeviceId() });
streamDescriptions.insert(
InputStreamDescription(input->first, input->second.matrix->GetDeviceId(), input->second.matrix->GetMatrixType(), input->second.matrix->GetFormat()));
}
return streamDescriptions;
}
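
Note the hash/equality contract in this file's hunks: operator== compares all four fields, while the std::hash specialization uses only the stream name. That is valid because equal descriptions necessarily share a name, so equal objects still hash equally; descriptions that differ only in device or format merely collide into the same bucket. A minimal self-contained sketch of the contract, assuming simplified placeholder types rather than the CNTK classes:

```cpp
// Sketch of the hash/equality contract used above (placeholder types, not CNTK's).
#include <cassert>
#include <string>
#include <unordered_set>

struct StreamDesc
{
    std::wstring name;
    int deviceId;
};

// Equality compares every field...
inline bool operator==(const StreamDesc& a, const StreamDesc& b)
{
    return a.name == b.name && a.deviceId == b.deviceId;
}

// ...while the hash may legally use any subset of them, as long as equal
// objects produce equal hashes. Hashing only the name satisfies that.
namespace std
{
template <>
struct hash<StreamDesc>
{
    size_t operator()(const StreamDesc& x) const { return hash<wstring>()(x.name); }
};
}

int main()
{
    std::unordered_set<StreamDesc> s;
    s.insert({ L"features", 0 });
    s.insert({ L"features", 0 }); // duplicate: the set stays at size 1
    s.insert({ L"features", 1 }); // same bucket, but not equal: size becomes 2
    assert(s.size() == 2);
    return 0;
}
```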

View File

@@ -117,6 +117,9 @@ const char* CudaErrString<curandStatus>(curandStatus)
namespace Microsoft { namespace MSR { namespace CNTK {
// Todo: After upgrading to VS2015, remove this and make it a local static variable in GetDeviceProps().
std::vector<cudaDeviceProp> GridDim::s_cachedDeviceProps = GridDim::CacheDeviceProps();
/*static*/ bool SyncGuard::s_isSyncEnabled = false;
/*static*/ void SyncGuard::EnableSync()

View File

@@ -159,8 +159,10 @@ struct GridDim
// get device properties of current device
static const cudaDeviceProp& GetDeviceProps()
{
static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
return props[GetCurrentDeviceId()];
// Unfortunately, initialization of local static variables is not thread-safe in VS2013.
// As a workaround, it is moved to the struct level.
// static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
return s_cachedDeviceProps[GetCurrentDeviceId()];
}
// compute our location on the grid
@@ -168,6 +170,10 @@ struct GridDim
{
return blockDim.x * blockIdx.x + threadIdx.x;
}
private:
// Todo: after upgrading to VS2015, move the static variable into GetDeviceProps() as a local static variable there.
static std::vector<cudaDeviceProp> s_cachedDeviceProps;
};
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
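
As background for the workaround above: C++11 guarantees thread-safe initialization of function-local statics ("magic statics"), but VS2013 does not implement that guarantee, hence the struct-level static that is initialized before main() runs. A minimal sketch of the two patterns, assuming a placeholder Props type in place of cudaDeviceProp:

```cpp
// Two ways to cache expensive-to-build data (placeholder types, not the CNTK code).
#include <cstdio>
#include <vector>

struct Props { int id; };

static std::vector<Props> CacheProps() { return { { 0 }, { 1 } }; }

// Pattern 1: function-local static. C++11 makes this initialization thread-safe,
// but VS2013 does not, so two threads calling it concurrently for the first time
// could race on the initialization.
const Props& GetPropsLocalStatic(int device)
{
    static std::vector<Props> props = CacheProps();
    return props[device];
}

// Pattern 2: struct-level static, initialized at program start before any worker
// threads exist. This is the shape of the workaround in the hunk above.
struct Grid
{
    static const Props& GetProps(int device) { return s_cachedProps[device]; }
    static std::vector<Props> s_cachedProps;
};
std::vector<Props> Grid::s_cachedProps = CacheProps();

int main()
{
    std::printf("%d %d\n", GetPropsLocalStatic(1).id, Grid::GetProps(0).id);
    return 0;
}
```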

View File

@@ -912,7 +912,14 @@ void GPUSparseMatrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYP
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
if (transferer)
{
// TODO: All RequireSizeAndAllocate should be async and use a transferer.
// Currently there are some memset operations that can still be executing on the default stream,
// so here we have to wait for them to finish.
transferer->RecordComputeStreamSyncPoint();
transferer->WaitForSyncPointOnAssignStreamAsync();
transferer->CopyCPUToGPUAsync(h_Val, nz, sizeof(ElemType), Data());
}
else
CUDA_CALL(cudaMemcpy(Data(), h_Val, nz * sizeof(ElemType), kind));
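
The RecordComputeStreamSyncPoint / WaitForSyncPointOnAssignStreamAsync pair above orders the asynchronous host-to-GPU copy after whatever work is still queued on the default (compute) stream. In plain CUDA runtime terms this cross-stream ordering is typically expressed with an event, roughly as in the sketch below (hypothetical stream and buffer arguments; not the actual GPUDataTransferer implementation):

```cpp
// Sketch: make a copy issued on copyStream wait for work already queued on the
// default (compute) stream, then perform the copy asynchronously. Error handling omitted.
#include <cuda_runtime.h>

void CopyAfterComputeWork(void* dst, const void* src, size_t bytes, cudaStream_t copyStream)
{
    cudaEvent_t syncPoint;
    cudaEventCreateWithFlags(&syncPoint, cudaEventDisableTiming);

    // "Record a compute-stream sync point": mark the current end of the default stream.
    cudaEventRecord(syncPoint, /*stream=*/0);

    // "Wait for the sync point on the assign stream": work queued on copyStream after
    // this call starts only once the event completes. The host is not blocked.
    cudaStreamWaitEvent(copyStream, syncPoint, 0);

    // The host-to-device copy is now ordered after the pending memsets on the default stream.
    cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, copyStream);

    cudaEventDestroy(syncPoint);
}
```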

View File

@@ -3502,6 +3502,18 @@ int Matrix<ElemType>::GetDeviceId() const
{ return m_GPUSparseMatrix->GetComputeDeviceId(); });
}
template <class ElemType>
MatrixType Matrix<ElemType>::GetMatrixType() const
{
return m_matrixType;
}
template <class ElemType>
MatrixFormat Matrix<ElemType>::GetFormat() const
{
return m_baseMatrix->GetFormat();
}
// TODO: Comment why we need a second ElemType.
// TODO: Move the shared core functions to the front of this source file.
// BUGBUG: This performs a copy operation even for the output matrix that gets overwritten right away.

View File

@@ -49,6 +49,8 @@ template <class ElemType> class DeviceBoundNumber;
struct /*interface*/ MATH_API MatrixBase
{
virtual int GetDeviceId() const = 0;
virtual MatrixType GetMatrixType() const = 0;
virtual MatrixFormat GetFormat() const = 0;
// TODO: Move more generic functions such as getting dims, resizing, and getting/setting as scalars in here.
virtual ~MatrixBase();
};
@@ -147,8 +149,8 @@ public:
return node;
}
MatrixType GetMatrixType() const { return m_matrixType; }
MatrixFormat GetFormat() const { return m_baseMatrix->GetFormat(); }
MatrixType GetMatrixType() const override;
MatrixFormat GetFormat() const override;
bool OwnBuffer() const { return m_baseMatrix->OwnBuffer(); }
int GetDeviceId() const; // -1 if CPU, otherwise GPU CUDA device id
DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; // -1 if CPU, otherwise GPU CUDA device id
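
Promoting GetMatrixType and GetFormat to pure virtuals on MatrixBase lets code that holds only a MatrixBase pointer (such as the reader shim later in this diff) query storage properties without knowing the element type. A minimal sketch of that dispatch pattern, assuming placeholder enums and classes rather than the real CNTK declarations:

```cpp
// Sketch of virtual-accessor dispatch through a type-erased base (placeholder types).
#include <cstdio>
#include <memory>

enum class MatrixType { Dense, Sparse };
enum class MatrixFormat { Dense, SparseCSC };

struct MatrixBase
{
    virtual MatrixType GetMatrixType() const = 0;
    virtual MatrixFormat GetFormat() const = 0;
    virtual ~MatrixBase() = default;
};

template <class ElemType>
struct Matrix : MatrixBase
{
    MatrixType GetMatrixType() const override { return MatrixType::Sparse; }
    MatrixFormat GetFormat() const override { return MatrixFormat::SparseCSC; }
};

int main()
{
    std::unique_ptr<MatrixBase> m = std::make_unique<Matrix<float>>();
    // The caller does not need to know ElemType to pick matching buffer properties.
    std::printf("sparse? %d\n", m->GetMatrixType() == MatrixType::Sparse);
    return 0;
}
```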

View File

@@ -84,15 +84,15 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
// Now we can be sure no prefetch thread is running and there are no outstanding memcopies.
// Let's check that the requested devices are OK and see whether we need to change our data transferers.
auto device = std::find_if(inputs.begin(), inputs.end(),
[](const InputStreamDescription& d) { return d.m_deviceId != CPUDEVICE; });
auto deviceId = device != inputs.end() ? device->m_deviceId : CPUDEVICE;
[](const InputStreamDescription& d) { return d.GetDeviceId() != CPUDEVICE; });
auto deviceId = device != inputs.end() ? device->GetDeviceId() : CPUDEVICE;
// Check that all devices are either the same as m_deviceId or the CPU.
auto secondDevice = std::find_if(inputs.begin(), inputs.end(),
[deviceId](const InputStreamDescription& d) { return d.m_deviceId != CPUDEVICE && d.m_deviceId != deviceId; });
[deviceId](const InputStreamDescription& d) { return d.GetDeviceId() != CPUDEVICE && d.GetDeviceId() != deviceId; });
if (secondDevice != inputs.end())
{
LogicError("Readers do not support running on several GPUs in the same process, at least two devices found '%d', '%d'", deviceId, secondDevice->m_deviceId);
LogicError("Readers do not support running on several GPUs in the same process, at least two devices found '%d', '%d'", deviceId, secondDevice->GetDeviceId());
}
if (m_deviceId != deviceId)
@@ -109,8 +109,13 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
std::map<std::wstring, int> inputDescriptions;
for (const auto& i : inputs)
{
inputDescriptions[i.m_name] = i.m_deviceId;
m_prefetchBuffers[i.m_name] = StreamPrefetchBuffer{ std::make_shared<Matrix<ElemType>>(i.m_deviceId), nullptr };
inputDescriptions[i.GetStreamName()] = i.GetDeviceId();
// Creating buffers with the same properties as the network expects.
m_prefetchBuffers[i.GetStreamName()] = StreamPrefetchBuffer
{
std::make_shared<Matrix<ElemType>>(0, 0, i.GetDeviceId(), i.GetMatrixType(), i.GetMatrixFormat()),
nullptr
};
}
m_endOfEpoch = false;

View File

@@ -13,8 +13,7 @@ python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
python -c "import pytest; print('PyTest: %s'%pytest.__version__)"
cd $PYBINDINGDIR
py.test
# TODO --deviceid $TEST_DEVICE
py.test --deviceid $TEST_DEVICE
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

View File

@@ -1,8 +1,8 @@
dataDir: .
tags:
- bvt-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release') and (device == 'gpu')
- nightly-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release') and (device == 'gpu')
- bvt-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release')
- nightly-l (os == 'windows') and (build_sku == 'gpu') and (flavor == 'release')
testCases:
BindingPyTest run must finish with error code 0 (outputs __COMPLETED__ in that case):

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

View File

@@ -0,0 +1,37 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
OriginalTestDir=../../../DNN/Dropout
ConfigDir=$TEST_DIR/$OriginalTestDir
LogFileName=stderr
Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -)
if [ $? != 0 ]; then
echo Error: Baselines must match original test. Copy from $OriginalTestDir.
exit 1
fi
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
DeleteModelsAfterTest=0
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads speechTrain=[reader=[readerType=HTKDeserializers]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
if [ "$ExitCode" != "0" ]; then
exit $ExitCode
fi
echo === Deleting last 2 epochs and restarting
rm $TEST_RUN_DIR/models/*.dnn || exit $?
rm $TEST_RUN_DIR/models/*.dnn.4 || exit $?
echo ==== Re-running from checkpoint
DeleteExistingModels=0
DeleteModelsAfterTest=1
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads speechTrain=[reader=[readerType=HTKDeserializers]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
exit $ExitCode

View File

@@ -0,0 +1,40 @@
dataDir: ../../../Data
tags:
# running for gpu build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# running for gpu build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu')
testCases:
Must train epochs in exactly same order and parameters for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
- ce = {{float,tolerance=0.2%}}
- err = {{float,tolerance=0.2%}}
- learningRatePerSample = {{float,tolerance=0.001%}}
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- " * {{integer}}; "
- ce = {{float,tolerance=0.2%}}
- err = {{float,tolerance=0.2%}}
DataParallelSGD training parameters must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting minibatch loop
- DataParallelSGD training
- myRank = {{integer}}
- numNodes = 2
- numGradientBits = 32
- distributed reading is ENABLED

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

Diff not rendered because the file is too large.

View File

@@ -0,0 +1,22 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
OriginalTestDir=../../../DNN/ParallelBM
ConfigDir=$TEST_DIR/$OriginalTestDir
LogFileName=stderr
Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -)
if [ $? != 0 ]; then
echo Error: Baselines must match original test. Copy from $OriginalTestDir.
exit 1
fi
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" cntk.cntk "numCPUThreads=$NumCPUThreads precision=double speechTrain=[SGD=[ParallelTrain=[parallelizationStartEpoch=2]]]"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
exit $ExitCode

View File

@@ -0,0 +1,46 @@
dataDir: ../../../Data
tags:
# running for 1bitsgd build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# running for 1bitsgd build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == '1bitsgd')
testCases:
Must train epochs in exactly same order and parameters for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
- CrossEntropyWithSoftmax = {{float,tolerance=2%}}
- EvalClassificationError = {{float,tolerance=2%}}
- learningRatePerSample = {{float,tolerance=0.001%}}
BlockMomentumSGD training should have distributed reading enabled:
patterns:
- distributed reading is ENABLED
BlockMomentumSGD training should have the expected parameters:
patterns:
- ^MPI Rank {{integer}}
- block momentum = {{float,tolerance=0.1%}}
- block momentum time constant (per worker) = {{float,tolerance=1%}}
- block learning rate = {{float,tolerance=0.1%}}
- block size per worker = {{integer}} samples
- resetting SGD momentum after sync
- using Nesterov-style block momentum
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- "* {{integer}}; "
- CrossEntropyWithSoftmax = {{float,tolerance=2%}}
- EvalClassificationError = {{float,tolerance=2%}}

View File

@@ -35,8 +35,12 @@ int main()
TrainSequenceToSequenceTranslator();
// Test multi-threads evaluation
// Todo: Also test on GPUDevice()
fprintf(stderr, "Test multi-threaded evaluation on CPU.\n");
EvalMultiThreadsWithNewNetwork(DeviceDescriptor::CPUDevice(), 2);
#ifndef CPUONLY
fprintf(stderr, "Test multi-threaded evaluation on GPU\n");
EvalMultiThreadsWithNewNetwork(DeviceDescriptor::GPUDevice(0), 2);
#endif
fprintf(stderr, "\nCNTKv2Library tests: Passed\n");
fflush(stderr);

View File

@@ -0,0 +1,38 @@
name: cntk-py34
dependencies:
- libgfortran=3.0.0=1
- mkl=11.3.3=0
- numpy=1.11.1=py34_0
- openssl=1.0.2j=0
- pip=8.1.2=py34_0
- python=3.4.4=5
- readline=6.2=2
- scipy=0.18.1=np111py34_0
- setuptools=27.2.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py34_0
- xz=5.2.2=0
- zlib=1.2.8=3
- pip:
- alabaster==0.7.9
- args==0.1.0
- babel==2.3.4
- clint==0.5.1
- docutils==0.12
- imagesize==0.7.1
- jinja2==2.8
- markupsafe==0.23
- pkginfo==1.3.2
- py==1.4.31
- pygments==2.1.3
- pytest==3.0.2
- pytz==2016.6.1
- requests==2.11.1
- requests-toolbelt==0.7.0
- six==1.10.0
- snowballstemmer==1.2.1
- sphinx==1.4.6
- sphinx-rtd-theme==0.1.9
- twine==1.8.1

View File

@@ -0,0 +1,32 @@
name: cntk-py34
dependencies:
- mkl=11.3.3=1
- numpy=1.11.1=py34_1
- pip=8.1.2=py34_0
- python=3.4.4=5
- scipy=0.18.1=np111py34_0
- setuptools=27.2.0=py34_1
- vs2010_runtime=10.00.40219.1=2
- wheel=0.29.0=py34_0
- pip:
- alabaster==0.7.9
- args==0.1.0
- babel==2.3.4
- clint==0.5.1
- colorama==0.3.7
- docutils==0.12
- imagesize==0.7.1
- jinja2==2.8
- markupsafe==0.23
- pkginfo==1.3.2
- py==1.4.31
- pygments==2.1.3
- pytest==3.0.2
- pytz==2016.6.1
- requests==2.11.1
- requests-toolbelt==0.7.0
- six==1.10.0
- snowballstemmer==1.2.1
- sphinx==1.4.6
- sphinx-rtd-theme==0.1.9
- twine==1.8.1

View File

@@ -18,12 +18,15 @@ swig.bat
# a) If you are just building to use it locally:
# Build -> generate .pyd
# 1) go two levels up
# 2) run the following:
python .\setup.py build_ext -if -c msvc --plat-name=win-amd64
# 3) add to PATH the path to cntk dlls (e.g. e:\CNTK\x64\Release)
# 4) add to PYTHONPATH the path to the python api source (e.g. e:\CNTK\bindings\python\)
SET PATH=%PATH%;<your CNTK release path, e.g. e:\CNTK\x64\Release>
# 4) add to PYTHONPATH the path to the python examples (e.g. e:\CNTK\bindings\python\examples)
SET PYTHONPATH=e:\CNTK\bindings\python\examples
# 5) test by running any of the examples or running py.test from inside the bindings\python directory
# b) If you want to package it:

View File

@@ -145,7 +145,7 @@ def seqcla():
# do some manual accuracy testing
acc = calc_accuracy(train_file, ctx.output_filename_base)
# and test for the same number...
TOLERANCE_ABSOLUTE = 2E-02
TOLERANCE_ABSOLUTE = 1E-02
assert np.allclose(acc, 0.6022453889334403, atol=TOLERANCE_ABSOLUTE)
"""