Dong Yu 2015-08-12 12:08:43 -07:00
Parents: d06170f0f2 bba8a92d69
Commit: a0645c9cdf
26 changed files with 33247 additions and 37268 deletions

1
.gitattributes Vendored Normal file
View File

@@ -0,0 +1 @@
run-test text eol=lf

7
.gitignore Vendored
View File

@@ -6,6 +6,11 @@
*.user
*.sln.docstates
*.orig
\#*
.\#*
# Local build configuration
Config.make
# Build results
@@ -16,6 +21,8 @@ build/
[Bb]in/
[Oo]bj/
.run-*
lib/
bin/
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/

View File

@@ -54,7 +54,7 @@ typedef void* HANDLE;
#define __forceinline inline
//string and io conversion
#define strtok_s strtok_r
#define sprintf_s sprintf
#define sprintf_s snprintf
#define sscanf_s sscanf
#define _strdup strdup
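The sprintf_s to snprintf change above matters because snprintf honors the buffer-size argument that sprintf_s callers already pass, so the Linux build keeps bounds checking; mapping to plain sprintf silently dropped it. A minimal standalone sketch (not CNTK code; note the edge-case difference that MSVC's real sprintf_s invokes the invalid-parameter handler on overflow instead of truncating):

#include <cstdio>

int main()
{
    char buf[8];
    // sprintf_s(buf, sizeof(buf), fmt, ...) maps 1:1 onto
    // snprintf(buf, sizeof(buf), fmt, ...): same argument order, and the size
    // is honored. snprintf truncates and NUL-terminates; sprintf would overflow.
    snprintf(buf, sizeof(buf), "%s", "a very long string");
    printf("%s\n", buf); // prints "a very " (7 chars + NUL)
    return 0;
}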

View File

@@ -24,12 +24,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::string::size_type npos = (std::string::size_type) -1;
// These are the constants associated with the "ResolveVariables" method.
static const std::string openBraceVar = "$";
static const std::string closingBraceVar = "$";
static const std::string forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n";
static const std::string forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n";
static const std::size_t openBraceVarSize = openBraceVar.size();
static const std::size_t closingBraceVarSize = openBraceVar.size();
static const char* openBraceVar = "$";
static const char* closingBraceVar = "$";
static const char* forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n";
static const char* forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n";
static const std::size_t openBraceVarSize = strlen(openBraceVar);
static const std::size_t closingBraceVarSize = strlen(closingBraceVar);
// Trim - trim white space off the start and end of the string
// str - string to trim
@@ -991,7 +991,7 @@ public:
while (start != std::string::npos)
{
// search for whitespace or closing brace.
end = newConfigLine.find_first_of(closingBraceVar + forbiddenCharactersInVarName,
end = newConfigLine.find_first_of(std::string(closingBraceVar) + forbiddenCharactersInVarName,
start + openBraceVarSize);
// ensure that a closing brace exists for every opening brace.
@@ -999,15 +999,15 @@ public:
if (end == std::string::npos)
{
RuntimeError("\"%s\" found without corresponding closing \"%s\": %s:%s",
openBraceVar.c_str(), closingBraceVar.c_str(),
openBraceVar, closingBraceVar,
m_configName.c_str(), newConfigLine.c_str());
}
if (newConfigLine[end] != '$')
{
RuntimeError("Forbidden characters found between \"%s\" and \"%s\". Variable names cannot any of the following characters: %s. %s:%s",
openBraceVar.c_str(), closingBraceVar.c_str(),
forbiddenCharactersInVarNameEscapeWhitespace.c_str(),
openBraceVar, closingBraceVar,
forbiddenCharactersInVarNameEscapeWhitespace,
m_configName.c_str(), newConfigLine.c_str());
}
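The switch from static const std::string constants to const char* in a shared header presumably avoids running std::string constructors at static-initialization time in every translation unit that includes it; the trade-off, visible in the find_first_of hunk, is that concatenation now needs an explicit std::string. A minimal sketch of the resulting pattern (not the actual CNTK code):

#include <cstring>
#include <iostream>
#include <string>

// Plain pointers need no dynamic initialization; sizes come from strlen
// instead of std::string::size().
static const char* openBraceVar = "$";
static const char* forbiddenCharactersInVarName = ",/<>?;'";
static const std::size_t openBraceVarSize = strlen(openBraceVar);

int main()
{
    std::string newConfigLine = "lr=$learnRate$";
    std::string::size_type start = newConfigLine.find(openBraceVar);
    // const char* has no operator+, so build a std::string for the search set,
    // mirroring the find_first_of change in the hunk above.
    std::string::size_type end = newConfigLine.find_first_of(
        std::string(openBraceVar) + forbiddenCharactersInVarName,
        start + openBraceVarSize);
    std::cout << newConfigLine.substr(start + 1, end - start - 1) << "\n"; // learnRate
    return 0;
}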

View File

@@ -138,7 +138,7 @@ void BinaryReader<ElemType>::DisplayProperties()
for (auto pair : m_sections)
{
Section* section = pair.second;
fprintf(stderr,"Section: %ls, Elements: %lld, ElementsPerRecord: %lld, ElementSize: %lld\n", pair.first.c_str(), section->GetElementCount(), section->GetElementsPerRecord(), section->GetElementSize());
fprintf(stderr,"Section: %ls, Elements: %zd, ElementsPerRecord: %zd, ElementSize: %zd\n", pair.first.c_str(), section->GetElementCount(), section->GetElementsPerRecord(), section->GetElementSize());
if (section->GetSectionType() == sectionTypeStats)
{
vector<NumericStatistics> stats;
@@ -434,4 +434,4 @@ bool BinaryReader<ElemType>::DataEnd(EndDataType endDataType)
// instantiate all the combinations we expect to be used
template class BinaryReader<double>;
template class BinaryReader<float>;
}}}
}}}
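The %lld to %zd change above (and the %l to %ld and %d to %zd fixes in the hunks that follow) replaces mismatched printf specifiers for values of type size_t and long. The standard C99/C++11 specifier for size_t is %zu; %zd is its signed counterpart, which glibc accepts at the same width. A minimal sketch:

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t elementCount = 33247; // size_t wants %zu (or %zd as used here)
    long tickDelta = 42L;             // long wants %ld, not the bare "%l" being fixed
    std::printf("Elements: %zu, %ld ms\n", elementCount, tickDelta);
    return 0;
}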

View File

@@ -1161,11 +1161,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Sets the utterance boundary.
if (m_framemode == false)
{
// If <m_truncated> is false, then the whole utterance
// will be loaded into the minibatch.
if (m_truncated == false)
if (startFrame == 0)
{
assert(startFrame == 0);
m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START);
m_minibatchPackingFlag[0] |= MinibatchPackingFlag::SequenceStart;
}
@@ -1227,11 +1224,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// minibatch, and then load it.
if (m_framemode == false)
{
// If <m_truncated> is false, then the whole utterance
// will be loaded into the minibatch.
if (m_truncated == false)
if (startFrame == 0)
{
assert(startFrame == 0);
m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START);
m_minibatchPackingFlag[0] |= MinibatchPackingFlag::SequenceStart;
}
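Both hunks above replace a truncation check plus assert with the condition the assert was encoding: a stream is flagged SEQUENCE_START exactly when its utterance is loaded from frame 0, whether or not the reader is truncating. A minimal sketch of that boundary-marking idea (hypothetical names and values, not the reader's real data structures):

#include <cstdio>
#include <vector>

enum MinibatchPackingFlag { None = 0, SequenceStart = 1 };

int main()
{
    // Per-stream offsets into the current utterances (hypothetical values).
    std::vector<int> startFrame = {0, 0, 128};
    std::vector<int> flags(startFrame.size(), None);
    for (std::size_t i = 0; i < startFrame.size(); ++i)
        if (startFrame[i] == 0)        // first frame of the utterance
            flags[i] |= SequenceStart; // mark the sequence boundary
    for (std::size_t i = 0; i < flags.size(); ++i)
        std::printf("stream %zu: %s\n", i,
                    (flags[i] & SequenceStart) ? "sequence start" : "mid-utterance");
    return 0;
}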

View File

@@ -445,7 +445,7 @@ public:
long TickDelta = TickStop - TickStart;
if (m_traceLevel > 2)
fprintf(stderr, "\n%l ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
fprintf(stderr, "\n%ld ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
return lineCount;
}
@@ -608,7 +608,7 @@ public:
long TickDelta = TickStop - TickStart;
if (m_traceLevel > 2)
fprintf(stderr, "\n%l ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
fprintf(stderr, "\n%ld ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
return lineCount;
}

View File

@@ -795,7 +795,7 @@ void SequenceReader<ElemType>::SetupEpoch()
if (m_totalSamples == 0)
{
if (m_traceLevel > 0)
fprintf(stderr, "starting at epoch %d parsing all data to determine record count\n", m_epoch);
fprintf(stderr, "starting at epoch %zd parsing all data to determine record count\n", m_epoch);
// choose a large number to read
m_parser.SetFilePosition(0);
m_mbStartSample = 0;
@@ -805,7 +805,7 @@ void SequenceReader<ElemType>::SetupEpoch()
m_seqIndex = m_sequence.size();
}
if (m_traceLevel > 0)
fprintf(stderr, "\n %lld records found\n", m_totalSamples);
fprintf(stderr, "\n %zd records found\n", m_totalSamples);
}
m_seqIndex = 0;
@@ -2129,4 +2129,4 @@ int BatchSequenceReader<ElemType>::GetSentenceEndIdFromOutputLabel()
template class BatchSequenceReader<double>;
template class BatchSequenceReader<float>;
}}}
}}}

View File

@@ -653,7 +653,7 @@ long UCIParser<NumType, LabelType>::Parse(size_t recordsRequested, std::vector<N
long TickDelta = TickStop - TickStart;
if (m_traceLevel > 2)
fprintf(stderr, "\n%ld ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
fprintf(stderr, "\n%ld ms, %ld numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
return recordCount;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -25,7 +25,7 @@
\font_roman times
\font_sans helvet
\font_typewriter courier
\font_math auto
\font_math default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
@@ -56,11 +56,11 @@
\use_package cancel 0
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 0
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 0
\use_package stmaryrd 0
\use_package undertilde 0
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
@@ -99,23 +99,35 @@ An Introduction to Computational Networks and the Computational Network
\begin_layout Author
Dong Yu, Adam Eversole, Michael L.
Seltzer, Kaisheng Yao, Zhiheng Huang,
Seltzer, Kaisheng Yao,
\begin_inset Newline newline
\end_inset
Brian Guenter, Oleksii Kuchaiev, Yu Zhang, Frank Seide, Huaming Wang,
Brian Guenter, Oleksii Kuchaiev, Yu Zhang, Frank Seide, Guoguo Chen,
\begin_inset Newline newline
\end_inset
Jasha Droppo, Geoffrey Zweig, Chris Rossbach, Jon Currey,
Huaming Wang, Jasha Droppo, Amit Agarwal, Chris Basoglu,
\begin_inset Newline newline
\end_inset
Jie Gao, Avner May, Baolin Peng, Andreas Stolcke, Malcolm Slaney
Marko Padmilac, Alexey Kamenev, Vladimir Ivanov, Scott Cypher,
\begin_inset Newline newline
\end_inset
Hari Parthasarathi, Bhaskar Mitra, Zhiheng Huang, Geoffrey Zweig,
\begin_inset Newline newline
\end_inset
Chris Rossbach, Jon Currey, Jie Gao, Avner May, Baolin Peng,
\begin_inset Newline newline
\end_inset
Andreas Stolcke, Malcolm Slaney, Xuedong Huang
\end_layout
\begin_layout Date
MSR-TR-2014-112 (DRAFT v0.8: May 19, 2015)
MSR-TR-2014-112 (DRAFT v0.8: Aug 5, 2015)
\end_layout
\begin_layout Standard

View File

@@ -2073,6 +2073,45 @@ SumElements(m)
\end_inset
\end_layout
\begin_layout Itemize
m - input matrix
\end_layout
\begin_layout Subsubsection
SumColumnElements
\begin_inset Index idx
status open
\begin_layout Plain Layout
SumColumnElements
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Calculate the sum of all elements in each column of the input matrix.
The result is a row vector.
The syntax is
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
SumColumnElements(m)
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
@@ -2118,6 +2157,55 @@ Negate(m)
m - input matrix.
\end_layout
\begin_layout Subsubsection
Reshape
\begin_inset Index idx
status open
\begin_layout Plain Layout
Reshape
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Reshape the input matrix.
The resulting matrix has the same number of elements as that of the input
matrix but is interpreted differently.
The syntax is
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
Reshape(m, numRows, [imageWidth=], [imageHeight=], [imageChannels=])
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
m - input matrix.
\end_layout
\begin_layout Itemize
numRows - reshape to a matrix with numRows rows.
\end_layout
\begin_layout Itemize
imageWidth, imageHeight, imageChannels - are named optional parameters that
allow you to interpret each column as images with different dimensions.
\end_layout
\begin_layout Subsubsection
RowSlice
\begin_inset Index idx
@@ -2401,23 +2489,13 @@ Minus
\end_inset
, ElementTimes
\begin_inset Index idx
status open
\begin_layout Plain Layout
ElementTimes
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Calculate the sum (Plus), difference (Minus), or element-wise product (ElementTimes) of two matrices.
The resulting matrices have the same dimension as that of the input matrices.
Calculate the sum (Plus) and difference (Minus) of two matrices.
The resulting matrices have the same dimension as that of the input matrix
with larger dimension.
The syntax is
\end_layout
@@ -2436,19 +2514,95 @@ Plus(m1, m2)
Minus(m1, m2)
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
m1, m2 - input matrices.
\end_layout
\begin_layout Subsubsection
ElementTimes
\begin_inset Index idx
status open
\begin_layout Plain Layout
ElementTimes
\end_layout
\end_inset
, RowElementTimes
\begin_inset Index idx
status open
\begin_layout Plain Layout
RowElementTimes
\end_layout
\end_inset
, ColumnElementTimes
\begin_inset Index idx
status open
\begin_layout Plain Layout
ColumnElementTimes
\end_layout
\end_inset
,
\end_layout
\begin_layout Standard
Calculate the element-wise product of two matrices.
The resulting matrices have the same dimension as that of the input matrix
with larger dimension.
The syntax is
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
ElementTimes(m1, m2)
\end_layout
\begin_layout Plain Layout
RowElementTimes(m, vrow)
\end_layout
\begin_layout Plain Layout
ColumnElementTimes(m, vcol)
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
m1, m2 - input matrices.
Must be the same dimensions.
m1, m2, m - input matrices.
\end_layout
\begin_layout Itemize
vrow, vcol - row and column vectors
\end_layout
\begin_layout Standard
m1 and m2 must have same dimension in ElementTimes.
m and vrow must have same number of columns in RowElementTimes, and m and
vcol must have same number of rows in ColumnElementTimes.
\end_layout
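A minimal sketch of the broadcasting described above, assuming column-major storage (an assumption): RowElementTimes scales every element of column j of m by vrow's j-th entry, which is why vrow must match m's column count, and symmetrically for ColumnElementTimes.

#include <cstdio>
#include <vector>

int main()
{
    int rows = 2, cols = 3;
    std::vector<double> m    = {1, 2, 3, 4, 5, 6}; // 2x3, column-major
    std::vector<double> vrow = {10, 100, 1000};    // 1x3 row vector

    for (int c = 0; c < cols; ++c)
        for (int r = 0; r < rows; ++r)
            m[c * rows + r] *= vrow[c]; // RowElementTimes(m, vrow)

    for (int r = 0; r < rows; ++r, std::printf("\n"))
        for (int c = 0; c < cols; ++c)
            std::printf("%g ", m[c * rows + r]); // 10 300 5000 / 20 400 6000
    return 0;
}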
\begin_layout Subsubsection
@@ -3547,7 +3701,7 @@ stepH - step (or stride) used in the height direction
\end_layout
\begin_layout Subsubsection
Delay
PastValue (or Delay
\begin_inset Index idx
status open
@@ -3557,11 +3711,21 @@ Delay
\end_inset
) and FutureValue
\begin_inset Index idx
status open
\begin_layout Plain Layout
FutureValue
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Used to apply a value in the past to the current time.
Used to get the past or future value of a node.
It is most often used to create recurrent networks.
The resulting matrix has the same dimension as that of the input matrix.
The syntax is
@@ -3574,7 +3738,17 @@ status open
\begin_layout Plain Layout
Delay(rows, [cols], m, [delayTime=1, defaultPastValue=0.1])
PastValue(rows, [cols], m, [timeStep=1, defaultHiddenActivity=0.1])
\end_layout
\begin_layout Plain Layout
Delay(rows, [cols], m, [delayTime=1, defaultPastValue=0.1]) #deprecated
\end_layout
\begin_layout Plain Layout
FutureValue(rows, [cols], m, [timeStep=1, defaultHiddenActivity=0.1])
\end_layout
\end_inset
@@ -3583,19 +3757,19 @@ Delay(rows, [cols], m, [delayTime=1, defaultPastValue=0.1])
\end_layout
\begin_layout Itemize
rows - the number of rows in the delay node (and in the input matrix).
rows - the number of rows in the input matrix.
This parameter is needed because under some loopy conditions the dimensions
cannot be automatically inferred from the input matrix.
\end_layout
\begin_layout Itemize
cols - the number of columns in the delay node (and in the input matrix).
cols - the number of columns in the input matrix.
This parameter is optional since it will be set based on the minibatch
size during training and testing.
\end_layout
\begin_layout Itemize
m - input matrix to be delayed.
m - input matrix from which the past or future value is obtained.
Each column is a sample.
The samples may be from different utterances as explained in Chapter .
\begin_inset CommandInset ref
@@ -3608,13 +3782,14 @@ reference "chap:CN"
\end_layout
\begin_layout Itemize
delayTime - [named optional] the amount of delay.
timeStep, delayTime - [named optional] the number of time steps to look into
the past or future.
Default is 1.
\end_layout
\begin_layout Itemize
defaultPastValue - [named optional] the default value to use if the past
value is not available.
defaultHiddenActivity, defaultPastValue - [named optional] the default value
to use if the past or future values are not available.
Default is 0.1.
\end_layout

View File

@@ -60,6 +60,9 @@ using namespace Microsoft::MSR::CNTK;
template <typename ElemType>
void TestCn(const ConfigParameters& config);
template <typename ElemType>
void DoEvalBeamSearch(const ConfigParameters& config, IDataReader<ElemType>& reader);
template <typename T>
struct compare_second
{
@@ -726,41 +729,37 @@ void DoTrain(const ConfigParameters& config)
ConfigParameters readerConfig(config("reader"));
readerConfig.Insert("traceLevel", config("traceLevel", "0"));
IComputationNetBuilder<ElemType>* netBuilder = NULL;
unique_ptr<IComputationNetBuilder<ElemType> > netBuilder;
if (config.Exists("NDLNetworkBuilder"))
{
ConfigParameters configNDL(config("NDLNetworkBuilder"));
netBuilder = (IComputationNetBuilder<ElemType>*)new NDLBuilder<ElemType>(configNDL);
netBuilder = unique_ptr<IComputationNetBuilder<ElemType> >( static_cast<IComputationNetBuilder<ElemType>*>(new NDLBuilder<ElemType>(configNDL)));
}
else if (config.Exists("SimpleNetworkBuilder"))
{
ConfigParameters configSNB(config("SimpleNetworkBuilder"));
netBuilder = (IComputationNetBuilder<ElemType>*)new SimpleNetworkBuilder<ElemType>(configSNB);
netBuilder = unique_ptr<IComputationNetBuilder<ElemType> >{ static_cast<IComputationNetBuilder<ElemType>*>(new SimpleNetworkBuilder<ElemType>(configSNB)) };
}
else
{
RuntimeError("No network builder found in the config file. NDLNetworkBuilder or SimpleNetworkBuilde must be specified");
}
DataReader<ElemType>* dataReader = new DataReader<ElemType>(readerConfig);
unique_ptr<DataReader<ElemType> > dataReader { new DataReader<ElemType>(readerConfig) };
DataReader<ElemType>* cvDataReader = nullptr;
unique_ptr<DataReader<ElemType> > cvDataReader;
ConfigParameters cvReaderConfig(config("cvReader", L""));
if (cvReaderConfig.size() != 0)
{
cvReaderConfig.Insert("traceLevel", config("traceLevel", "0"));
cvDataReader = new DataReader<ElemType>(cvReaderConfig);
cvDataReader = unique_ptr<DataReader<ElemType> >{ new DataReader<ElemType>(cvReaderConfig) };
}
SGD<ElemType> sgd(configSGD);
sgd.Train(netBuilder, dataReader, cvDataReader, makeMode);
delete netBuilder;
delete dataReader;
delete cvDataReader;
sgd.Train(netBuilder.get(), dataReader.get(), cvDataReader.get(), makeMode);
}
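The DoTrain rewrite above replaces raw new/delete pairs with unique_ptr so the builder and readers are released on every exit path, including the RuntimeError throw; SGD::Train keeps its raw-pointer signature and receives non-owning pointers via .get(). A minimal sketch of the pattern (hypothetical types):

#include <cstdio>
#include <memory>

struct Reader { ~Reader() { std::puts("reader destroyed"); } };

// Non-owning consumer, mirroring sgd.Train(netBuilder.get(), ...).
void train(Reader* reader) { std::puts(reader ? "training" : "no reader"); }

int main()
{
    std::unique_ptr<Reader> reader{new Reader};
    std::unique_ptr<Reader> cvReader; // stays empty, like the optional cvDataReader
    train(reader.get());
    train(cvReader.get());            // .get() on an empty unique_ptr is nullptr
    return 0;                         // destructor runs on every exit path
}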
template <typename ElemType>
@@ -1477,7 +1476,8 @@ int wmain(int argc, wchar_t* argv[])
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n");
}
fflush(stderr);
}
catch (const std::exception &err)
{
fprintf(stderr, "EXCEPTION occurred: %s\n", err.what());

View File

@@ -1515,7 +1515,7 @@ public:
ComputationNodePtr newNode(new PairNetworkNode<ElemType>(m_deviceId, nodeName));
if (this->GetNodeFromName(a->NodeName(), nullptr, false) != nullptr)
{
fprintf(stderr, "PairNetwork : asked to pair a node with name l%s in another network.However, this network has already a node with the same name.Should avoid this case.\n", a->NodeName().c_str());
fprintf(stderr, "PairNetwork : asked to pair a node with name %ls in another network.However, this network has already a node with the same name.Should avoid this case.\n", a->NodeName().c_str());
throw std::runtime_error("PairNetwork : asked to pair a node with name in another network.However, this network has already a node with the same name.Should avoid this case.\n");
}
newNode->AttachInputs(a);
@@ -2441,8 +2441,7 @@ public:
}
int FindInRecurrentLoop(const ComputationNodePtr startNode,
std::vector<ComputationNodePtr>& recurrentNodes,
bool isForwardComputing = false)
std::vector<ComputationNodePtr>& recurrentNodes)
{
int iFound = -1;
@@ -2451,14 +2450,8 @@ public:
if (std::find((*iter).m_recurrentNodes.begin(), (*iter).m_recurrentNodes.end(), startNode) != (*iter).m_recurrentNodes.end())
{
iFound = (*iter).m_loopId;
if (isForwardComputing)
{
recurrentNodes = (*iter).m_recurrentNodesForForward;
}
else
{
recurrentNodes = (*iter).m_recurrentNodesForForward;
}
recurrentNodes = (*iter).m_recurrentNodesForForward;
break;
}
}
@@ -2499,7 +2492,7 @@ public:
void EvaluateLoop(std::list<ComputationNodePtr>& /*allNodes*/, const ComputationNodePtr startNode)
{
std::vector<ComputationNodePtr> recurrentNodes;
int iLoopId = FindInRecurrentLoop(startNode, recurrentNodes, true);
int iLoopId = FindInRecurrentLoop(startNode, recurrentNodes);
if (iLoopId != -1 && IsFuncValueOlderThanInputs(recurrentNodes) &&
m_recurrentInfo[iLoopId].m_completedEvaluate == false)
{
@@ -3337,7 +3330,7 @@ public:
bool UnitTest(const ComputationNodePtr rootNode)
{
fprintf(stderr, "\n\n Unit test node %ws \n", rootNode->NodeName().c_str());
fprintf(stderr, "\n\n Unit test node %ls \n", rootNode->NodeName().c_str());
std::list<ComputationNodePtr>& nodes = GetEvalOrder(rootNode);

View File

@@ -33,41 +33,50 @@ namespace Microsoft {
namespace MSR {
namespace CNTK {
extern std::wstring GetEncoderModelNameForEpoch(int epoch, bool b = false);
extern std::wstring GetDecoderModelNameForEpoch(int epoch, bool b = false);
template<class ElemType>
class MultiNetworksSGD : SGD<ElemType>
{
ElemType m_default_activity;
typedef SGD<ElemType> SGD;
using SGDBase = SGD<ElemType>;
public:
using SGD::m_modelPath;
using SGD::m_maxEpochs;
using SGD::m_doUnitTest;
using SGD::m_learnRateAdjustInterval;
using SGD::m_mbSize;
using SGD::m_momentumPerSample;
using SGD::m_learningRatesPerSample;
using SGD::m_dropoutRates;
using SGD::m_autoLearnRateSearchType;
using SGD::m_minLearnRate;
using SGD::m_loadBestModel;
using SGD::m_validateAfterModelReloading;
using SGD::m_continueReduce;
using SGD::m_reduceLearnRateIfImproveLessThan;
using SGD::m_epochSize;
using SGD::m_learnRateDecreaseFactor;
using SGD::m_increaseLearnRateIfImproveMoreThan;
using SGD::m_learnRateIncreaseFactor;
using SGD::m_keepCheckPointFiles;
using SGD::m_doGradientCheck;
using SGD::m_L2RegWeight;
using SGD::m_L1RegWeight;
using SGD::m_needAveMultiplier;
using SGD::m_traceLevel;
using SGD::m_numMBsToShowResult;
using SGD::m_gradientCheckSigDigit;
using SGD::m_prevChosenMinibatchSize;
using SGDBase::m_modelPath;
using SGDBase::m_maxEpochs;
using SGDBase::m_doUnitTest;
using SGDBase::m_learnRateAdjustInterval;
using SGDBase::m_mbSize;
using SGDBase::m_momentumPerSample;
using SGDBase::m_learningRatesPerSample;
using SGDBase::m_dropoutRates;
using SGDBase::m_autoLearnRateSearchType;
using SGDBase::m_minLearnRate;
using SGDBase::m_loadBestModel;
using SGDBase::m_validateAfterModelReloading;
using SGDBase::m_continueReduce;
using SGDBase::m_reduceLearnRateIfImproveLessThan;
using SGDBase::m_epochSize;
using SGDBase::m_learnRateDecreaseFactor;
using SGDBase::m_increaseLearnRateIfImproveMoreThan;
using SGDBase::m_learnRateIncreaseFactor;
using SGDBase::m_keepCheckPointFiles;
using SGDBase::m_doGradientCheck;
using SGDBase::m_L2RegWeight;
using SGDBase::m_L1RegWeight;
using SGDBase::m_needAveMultiplier;
using SGDBase::m_traceLevel;
using SGDBase::m_numMBsToShowResult;
using SGDBase::m_gradientCheckSigDigit;
using SGDBase::m_prevChosenMinibatchSize;
using SGDBase::GetTrainCriterionNodes;
using SGDBase::GetEvalCriterionNodes;
using SGDBase::SetDropoutRate;
using SGDBase::UpdateEvalTimeStamps;
using SGDBase::UpdateWeights;
using SGDBase::GetCheckPointFileNameForEpoch;
typedef ComputationNode<ElemType>* ComputationNodePtr;
@@ -80,7 +89,7 @@ namespace Microsoft {
list<pair<ComputationNodePtr, ComputationNodePtr>> m_lst_pair_encoder_decoder_nodes;
public:
MultiNetworksSGD(const ConfigParameters& configSGD) : SGD(configSGD)
MultiNetworksSGD(const ConfigParameters& configSGD) : SGDBase(configSGD)
{
}
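The typedef SGD<ElemType> SGD being removed above reused a name that already denotes the base class (via its injected class name), which is fragile; and because the base is a dependent type in a template, its members are not visible without qualification. The SGDBase alias plus using-declarations address both at once. A minimal sketch (hypothetical members, public inheritance for brevity):

#include <cstdio>

template <class T>
struct SGD
{
protected:
    int m_traceLevel = 1;
    void UpdateWeights() { std::puts("update"); }
};

template <class T>
struct MultiNetworksSGD : SGD<T>
{
    using SGDBase = SGD<T>;      // alias instead of "typedef SGD<T> SGD;"
    using SGDBase::m_traceLevel; // make dependent base members visible
    using SGDBase::UpdateWeights; // without writing this-> at each use

    void Train()
    {
        if (m_traceLevel > 0)
            UpdateWeights(); // found via the using-declarations
    }
};

int main()
{
    MultiNetworksSGD<float> sgd;
    sgd.Train();
    return 0;
}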
@@ -663,7 +672,7 @@ namespace Microsoft {
if (learnRatePerSample < m_minLearnRate)
{
fprintf(stderr, "Learn Rate Per Sample for Epoch[%lu] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
break;
}
@@ -692,7 +701,7 @@ namespace Microsoft {
IDataReader<ElemType>* decoderTrainSetDataReader = trainDataReader[decoderIdx];
ComputationNetwork<ElemType>* decoderNet = nets[decoderIdx];
fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
fprintf(stderr, "Finished Epoch[%d]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
if (epochEvalErrors.size() == 1)
{
fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime);
@@ -703,9 +712,9 @@ namespace Microsoft {
for (size_t j = 0; j<epochEvalErrors.size(); j++)
fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]);
fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime);
fprintf(stderr, "Finished Epoch[%lu]: Criterion Node Per Sample = %.8g\n", i + 1, epochCriterion);
fprintf(stderr, "Finished Epoch[%d]: Criterion Node Per Sample = %.8g\n", i + 1, epochCriterion);
for (size_t j = 0; j<epochEvalErrors.size(); j++)
fprintf(stderr, "Finished Epoch[%lu]: Evaluation Node [%ws] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
}
if (decoderValidationSetDataReader != decoderTrainSetDataReader && decoderValidationSetDataReader != nullptr)
@@ -717,7 +726,7 @@ namespace Microsoft {
validationDataReader,
m_mbSize[i]);
fprintf(stderr, "Finished Epoch[%lu]: [Validation Set] Loss Per Sample = %.8g \n ", vScore );
fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Loss Per Sample = %.8g \n ", i+1, vScore );
epochCriterion = vScore;
}
@@ -1013,7 +1022,7 @@ namespace Microsoft {
{
epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0, i);
}
fprintf(stderr, "total samples in epoch[%d] = %d\n", epochNumber, totalEpochSamples);
fprintf(stderr, "total samples in epoch[%d] = %zd\n", epochNumber, totalEpochSamples);
}
bool EncoderDecoderGradientCheck(
@@ -1053,7 +1062,7 @@ namespace Microsoft {
irow = max(0, irow);
icol = max(0, icol);
fprintf(stderr, "\n###### d%ws######\n", node->NodeName().c_str());
fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
deviceId = node->FunctionValues().GetDeviceId(); // original device id
node->FunctionValues().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
@@ -1124,7 +1133,7 @@ namespace Microsoft {
if (wrong)
{
char serr[2048];
sprintf_s((char*)serr, 2048, "Decoder %ws Numeric gradient = %e, Error BP gradient = %e", node->NodeName().c_str(), grdNum, grdErr);
sprintf_s((char*)serr, 2048, "Decoder %ls Numeric gradient = %e, Error BP gradient = %e", node->NodeName().c_str(), static_cast<double>(grdNum), static_cast<double>(grdErr));
fprintf(stdout, "%s\n", serr);
verror_msgs.push_back(serr);
}
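The static_cast<double> added above makes the variadic promotion explicit: %e consumes a double, and while a float argument is default-promoted anyway, spelling out the cast keeps the call well-defined for any ElemType and quiets format warnings. A minimal sketch:

#include <cstdio>

template <class ElemType>
void report(ElemType grdNum, ElemType grdErr)
{
    // %e expects a double; make the promotion explicit for any ElemType.
    std::printf("Numeric gradient = %e, Error BP gradient = %e\n",
                static_cast<double>(grdNum), static_cast<double>(grdErr));
}

int main()
{
    report(0.125f, 0.124f); // float instantiation
    report(0.125, 0.124);   // double instantiation
    return 0;
}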

View File

@@ -1532,7 +1532,7 @@ protected:
if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
{
// newly started training: any previous MB size stored in the model is to be ignored
fprintf(stderr, "before epoch .2, previous minibatchSize %d is "
fprintf(stderr, "before epoch .2, previous minibatchSize %zd is "
"considered invalid -> resetting\n", m_prevChosenMinibatchSize);
m_prevChosenMinibatchSize = 0;
}
@@ -1543,7 +1543,7 @@
(epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
{
fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
"in epoch %d skipped, keeping minibatchSize of %d\n",
"in epoch %d skipped, keeping minibatchSize of %zd\n",
epochNumber + 1, m_prevChosenMinibatchSize);
chosenMinibatchSize = m_prevChosenMinibatchSize;
}
@@ -1568,7 +1568,7 @@
assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
"previous minibatchSize %d*2\n", m_prevChosenMinibatchSize);
"previous minibatchSize %zd*2\n", m_prevChosenMinibatchSize);
maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
}
@@ -1634,7 +1634,7 @@
// round mbsize to something meaningful
trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%d out of range %d..%d ...\n\n",
fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
size_t totalSamplesSeen;

View File

@@ -911,7 +911,7 @@ namespace Microsoft {
ComputeTimeInMBs += MBComputeTime;
fprintf(stderr, "Sentenes Seen = %d; Samples seen = %d; Total Compute Time = %.8g ; Time Per Sample=%.8g\n", numMBsRun, totalEpochSamples, ComputeTimeInMBs, ComputeTimeInMBs / totalEpochSamples);
fprintf(stderr, "Sentences Seen = %zd; Samples seen = %zd; Total Compute Time = %.8g ; Time Per Sample=%.8g\n", numMBsRun, totalEpochSamples, ComputeTimeInMBs, ComputeTimeInMBs / totalEpochSamples);
}
startReadMBTime = clock();
@@ -1229,7 +1229,7 @@ namespace Microsoft {
{
ElemType score = result_queue.top().score;
best_score = score;
fprintf(stderr, "best[%d] score = %.4e\t", ibest, score);
fprintf(stderr, "best[%zd] score = %.4e\t", ibest, score);
if (best_path.size() > 0)
WriteNbest(ibest, best_path, outputNodes, dataWriter);
}

520
Makefile
View File

@@ -7,228 +7,402 @@
#
# This makefile will be extended/completed as we go.
#
# You will need to modify PATH and LD_LIBRARY_PATH environment variables to run CNTK
# export LD_LIBRARY_PATH=<path_to_math_lib>/ifort64/lib:<path_to_cuda>/lib64:/usr/local/lib
# export PATH=$PATH:/usr/local/bin:<path_to_cuda>/bin
#
# In order to deviate from the default settings in this Makefile, please specify options on
# the make command line, like this, for example (to build release):
#
# make BUILDTYPE=release -j
# To use this Makefile, create a directory to build in and make a Config.make in the directory
# that provides
# ACML_PATH= path to ACML library installation
# only needed if MATHLIB=acml
# MKL_PATH= path to MKL library installation
# only needed if MATHLIB=mkl
# GDK_PATH= path to cuda gdk installation, so $(GDK_PATH)/include/nvidia/gdk/nvml.h exists
# defaults to /usr
# BUILDTYPE= One of release or debug
# defaults to release
# MATHLIB= One of acml or mkl
# defaults to acml
# CUDA_PATH= Path to CUDA
# If not specified, GPU will not be enabled
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
CC = g++
NVCC = nvcc
ARCH = x86_64
# DEVICE can also be cpu
DEVICE = gpu
# BUILDTYPE can also be release
BUILDTYPE = debug
# MATHLIB can also be mkl
MATHLIB = acml
# This is a suggested/default location for ACML library
MATHLIB_PATH = /usr/local/acml5.3.1/ifort64
# This is a suggested/default location for CUDA
CUDA_PATH = /usr/local/cuda-7.0
# This is a suggested/default location for NVML
NVML_INCLUDE = /usr/include/nvidia/gdk
NVML_LIB = /usr/src/gdk/nvml/lib
#######
BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
OBJDIR = .build/$(BUILDFOR)
BINDIR = bin/$(BUILDFOR)
# Set up debug vs release compiler settings, both nvcc and gcc
ifeq ($(BUILDTYPE),debug)
BUILDTYPE_OPT = -g
GPU_BUILDTYPE_OPT = -O0 -G -lineinfo
else
BUILDTYPE_OPT = -O3 -flto
GPU_BUILDTYPE_OPT = -O3 -use_fast_math -lineinfo
ifndef BUILD_TOP
BUILD_TOP=.
endif
# Set up math library defines and libraries
ifeq ($(MATHLIB),mkl)
MATHLIB_INCLUDE = $(MATHLIB_PATH)/mkl/include
MATHLIB_LIB = -L$(MATHLIB_PATH)/compiler/lib/intel64 -L$(MATHLIB_PATH)/mkl/lib/intel64 -L$(MATHLIB_PATH)/compiler/lib/mic -L$(MATHLIB_PATH)/mkl/lib/mic -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lm -liomp5 -lpthread
MATHLIB_DEFINE = -DUSE_MKL
ifneq ("$(wildcard $(BUILD_TOP)/Config.make)","")
include $(BUILD_TOP)/Config.make
else
MATHLIB_INCLUDE = $(MATHLIB_PATH)/include
MATHLIB_LIB = -L$(MATHLIB_PATH)/lib -lacml -lm -lpthread
MATHLIB_DEFINE = -DUSE_ACML
$(error Cannot find $(BUILD_TOP)/Config.make. Please see the README file for configuration instructions.)
endif
# Set up CUDA includes and libraries
CUDA_INCLUDE = $(CUDA_PATH)/include
CUDA_LIB = -L$(CUDA_PATH)/lib64 -L$(NVML_LIB) -lcublas -lcudart -lcurand -lcusparse -lnvidia-ml
# Set up final list of libs to use
ifeq ($(DEVICE),gpu)
LINK_LIBS = $(CUDA_LIB) $(MATHLIB_LIB)
else
LINK_LIBS = $(MATHLIB_LIB)
ifndef BUILDTYPE
$(info Defaulting BUILDTYPE=release)
BUILDTYPE=release
endif
# Compile CNTK math into its own shared library to ensure that any change to its
# global variables, like CUDA streams is made in one place and has global effect.
# Otherwise, different clients of CNTK math would observe different states.
CNTKMATH_LINK_LIB = -L$(BINDIR) -lcntkmath
CNTKMATH_LIB = $(BINDIR)/libcntkmath.so
# Set up gcc includes and libraries
INCFLAGS_COMMON = -I Common/Include -I Math/Math -I MachineLearning/CNTK -I $(MATHLIB_INCLUDE)
CFLAGS_COMMON = -msse3 -std=c++0x -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K $(MATHLIB_DEFINE) -fopenmp -fpermissive -fPIC
ifeq ($(DEVICE),gpu)
INCFLAGS = $(INCFLAGS_COMMON) -I $(CUDA_INCLUDE) -I $(NVML_INCLUDE)
CFLAGS = $(CFLAGS_COMMON)
else
INCFLAGS = $(INCFLAGS_COMMON)
CFLAGS = $(CFLAGS_COMMON) -DCPUONLY
ifndef MATHLIB
$(info DEFAULTING MATHLIB=acml)
MATHLIB=acml
endif
#### Configure based on options above
CXX = g++
INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK
CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC
LIBPATH:=
LIBS:=
LDFLAGS:=
SEPARATOR = "=-----------------------------------------------------------="
ALL:=
SRC:=
# Make sure all is the first (i.e. default) target, but we can't actually define it
# this early in the file, so let buildall do the work.
all : buildall
# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary)
GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35)
# Set up basic nvcc options and add GPU targets from above
NVCCFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64 $(GENCODE_FLAGS)
# Set up basic nvcc options and add CUDA targets from above
CUFLAGS = -std=c++11 -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -m 64 $(GENCODE_FLAGS)
# Set up linker option to embed ORIGIN, i.e. directory where cntk is into the search path option
# at runtime. This will try to resolve all dependent binaries in the same directory where cntk binary resides
LDFLAGS=-Wl,-rpath,'$$ORIGIN'
ifdef CUDA_PATH
ifndef GDK_PATH
$(info defaulting GDK_PATH to /usr)
GDK_PATH=/usr
endif
# Define all sources that need to be built
COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Common/DataReader.cpp \
Common/Eval.cpp Common/File.cpp Common/BestGpu.cpp Common/TimerUtility.cpp
DEVICE = gpu
MATH_COMMON_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp
NVCC = $(CUDA_PATH)/bin/nvcc
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
NVMLPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
LIBPATH += $(CUDA_PATH)/lib64
LIBS += -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
ifeq ($(DEVICE),gpu)
MATH_SRC = $(MATH_COMMON_SRC) Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKernels.cu Math/Math/GPUSparseMatrix.cu Math/Math/GPUWatcher.cu
else
MATH_SRC = $(MATH_COMMON_SRC) Math/Math/NoGPU.cpp
DEVICE = cpu
CPPFLAGS +=-DCPUONLY
endif
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp \
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
ifeq ("$(MATHLIB)","acml")
INCLUDEPATH += $(ACML_PATH)/include
LIBPATH += $(ACML_PATH)/lib
LIBS += -lacml -lm -lpthread
CPPFLAGS += -DUSE_ACML
endif
BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
SEQUENCEREADER_SRC = DataReader/LMSequenceReader/SequenceReader.cpp DataReader/LMSequenceReader/SequenceParser.cpp DataReader/LMSequenceReader/Exports.cpp
LUSEQUENCEREADER_SRC = DataReader/LUSequenceReader/LUSequenceReader.cpp DataReader/LUSequenceReader/LUSequenceParser.cpp DataReader/LUSequenceReader/Exports.cpp
UCIFASTREADER_SRC = DataReader/UCIFastReader/UCIParser.cpp DataReader/UCIFastReader/UCIFastReader.cpp DataReader/UCIFastReader/Exports.cpp
ifeq ("$(MATHLIB)","mkl")
INCLUDEPATH += $(MKL_PATH)/mkl/include
LIBPATH += $(MKL_PATH)/compiler/lib/intel64 $(MKL_PATH)/mkl/lib/intel64 $(MKL_PATH)/compiler/lib/mic $(MKL_PATH)/mkl/lib/mic
LIBS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lm -liomp5 -lpthread
CPPFLAGS += -DUSE_MKL
endif
READER_SRC = $(UCIFASTREADER_SRC) $(LUSEQUENCEREADER_SRC) $(HTKMLFREADER_SRC) $(SEQUENCEREADER_SRC) $(BINARYREADER_SRC)
CORE_SRC = $(CN_SRC) $(COMMON_SRC)
SRC = $(READER_SRC) $(CORE_SRC) $(MATH_SRC)
ifdef KALDI_PATH
########## Copy includes and defines from $(KALDI_PATH)/src/kaldi.mk ##########
FSTROOT = $(KALDI_PATH)/tools/openfst
ATLASINC = $(KALDI_PATH)/tools/ATLAS/include
INCLUDEPATH += $(KALDI_PATH)/src $(ATLASINC) $(FSTROOT)/include
CPPFLAGS+= -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -DHAVE_OPENFST_GE_10400
KALDI_LIBPATH += $(KALDI_PATH)/src/lib
KALDI_LIBS += -lkaldi-util -lkaldi-matrix -lkaldi-base -lkaldi-hmm -lkaldi-cudamatrix -lkaldi-nnet -lkaldi-lat
endif
ifeq ("$(BUILDTYPE)","debug")
CXXFLAGS += -g
CUFLAGS += -O0 -G -lineinfo
endif
ifeq ("$(BUILDTYPE)","release")
CXXFLAGS += -O4
CUFLAGS += -O3 -use_fast_math -lineinfo
endif
#######
OBJDIR:= $(BUILD_TOP)/.build
BINDIR:= $(BUILD_TOP)/bin
LIBDIR:= $(BUILD_TOP)/lib
ORIGINLIBDIR:='$$ORIGIN/../lib'
ORIGINDIR:='$$ORIGIN'
CNTKMATH:=cntkmath
########################################
# Math library
########################################
# Define all sources that need to be built
COMMON_SRC =\
Common/BestGpu.cpp \
Common/ConfigFile.cpp \
Common/DataReader.cpp \
Common/DataWriter.cpp \
Common/Eval.cpp \
Common/File.cpp \
Common/TimerUtility.cpp \
Common/fileutil.cpp \
MATH_SRC =\
Math/Math/CPUMatrix.cpp \
Math/Math/CPUSparseMatrix.cpp \
Math/Math/Matrix.cpp \
ifdef CUDA_PATH
MATH_SRC +=\
Math/Math/GPUMatrix.cu \
Math/Math/GPUMatrixCUDAKernels.cu \
Math/Math/GPUSparseMatrix.cu \
Math/Math/GPUWatcher.cu \
else
MATH_SRC +=\
Math/Math/NoGPU.cpp
endif
MATH_SRC+=$(COMMON_SRC)
MATH_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(MATH_SRC)))
CNTKMATH_LIB:= $(LIBDIR)/lib$(CNTKMATH).so
ALL += $(CNTKMATH_LIB)
SRC+=$(MATH_SRC)
RPATH=-Wl,-rpath,
$(CNTKMATH_LIB): $(MATH_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
########################################
# BinaryReader plugin
########################################
BINARYREADER_SRC =\
DataReader/BinaryReader/BinaryFile.cpp \
DataReader/BinaryReader/BinaryReader.cpp \
DataReader/BinaryReader/BinaryWriter.cpp \
BINARYREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(BINARYREADER_SRC))
BINARY_READER:= $(LIBDIR)/BinaryReader.so
#ALL += $(BINARY_READER)
#SRC+=$(BINARYREADER_SRC)
$(BINARY_READER): $(BINARYREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# HTKMLFReader plugin
########################################
HTKMLFREADER_SRC =\
DataReader/HTKMLFReader_linux/DataReader.cpp \
DataReader/HTKMLFReader_linux/DataWriter.cpp \
DataReader/HTKMLFReader_linux/HTKMLFReader.cpp \
DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp \
HTKMLREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(HTKMLFREADER_SRC))
HTKMLREADER:=$(LIBDIR)/HTKMLFReader.so
ALL+=$(HTKMLREADER)
SRC+=$(HTKMLREADER_SRC)
$(LIBDIR)/HTKMLFReader.so: $(HTKMLREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# LMSequenceReader plugin
########################################
LMSEQUENCEREADER_SRC =\
DataReader/LMSequenceReader/Exports.cpp \
DataReader/LMSequenceReader/SequenceParser.cpp \
DataReader/LMSequenceReader/SequenceReader.cpp \
LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC))
LMSEQUENCEREADER:= $(LIBDIR)/LMSequenceReader.so
ALL+=$(LMSEQUENCEREADER)
SRC+=$(LMSEQUENCEREADER_SRC)
$(LMSEQUENCEREADER): $(LMSEQUENCEREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# LUSequenceReader plugin
########################################
LUSEQUENCEREADER_SRC =\
DataReader/LUSequenceReader/Exports.cpp \
DataReader/LUSequenceReader/LUSequenceParser.cpp \
DataReader/LUSequenceReader/LUSequenceReader.cpp \
LUSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LUSEQUENCEREADER_SRC))
LUSEQUENCEREADER:=$(LIBDIR)/LUSequenceReader.so
ALL+=$(LUSEQUENCEREADER)
SRC+=$(LUSEQUENCEREADER_SRC)
$(LUSEQUENCEREADER): $(LUSEQUENCEREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# UCIFastReader plugin
########################################
UCIFASTREADER_SRC =\
DataReader/UCIFastReader/Exports.cpp \
DataReader/UCIFastReader/UCIFastReader.cpp \
DataReader/UCIFastReader/UCIParser.cpp \
UCIFASTREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UCIFASTREADER_SRC))
UCIFASTREADER:=$(LIBDIR)/UCIFastReader.so
ALL += $(UCIFASTREADER)
SRC+=$(UCIFASTREADER_SRC)
$(UCIFASTREADER): $(UCIFASTREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# Kaldi plugins
########################################
ifdef KALDI_PATH
KALDIREADER_SRC = \
DataReader/KaldiReader/DataReader.cpp \
DataReader/KaldiReader/DataWriter.cpp \
DataReader/KaldiReader/HTKMLFReader.cpp \
DataReader/KaldiReader/HTKMLFWriter.cpp \
KALDIREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(KALDIREADER_SRC))
KALDIREADER:=$(LIBDIR)/KaldiReader.so
ALL+=$(KALDIREADER)
SRC+=$(KALDIREADER_SRC)
$(KALDIREADER): $(KALDIREADER_OBJ)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(KALDI_LIBPATH) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(KALDI_LIBPATH) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH) $(KALDI_LIBS)
KALDIWRITER:=$(LIBDIR)/KaldiWriter.so
ALL+=$(KALDIWRITER)
$(KALDIWRITER): $(KALDIREADER_OBJ)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
KALDI2READER_SRC = \
DataReader/Kaldi2Reader/DataReader.cpp \
DataReader/Kaldi2Reader/DataWriter.cpp \
DataReader/Kaldi2Reader/HTKMLFReader.cpp \
DataReader/Kaldi2Reader/HTKMLFWriter.cpp \
DataReader/Kaldi2Reader/KaldiSequenceTrainingDerivative.cpp \
DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.cpp \
KALDI2READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(KALDI2READER_SRC))
KALDI2READER:=$(LIBDIR)/Kaldi2Reader.so
ALL+=$(KALDI2READER)
SRC+=$(KALDI2READER_SRC)
$(KALDI2READER): $(KALDI2READER_OBJ)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(KALDI_LIBPATH) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(KALDI_LIBPATH) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH) $(KALDI_LIBS)
endif
########################################
# cntk
########################################
CNTK_SRC =\
MachineLearning/CNTK/CNTK.cpp \
MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp \
MachineLearning/CNTK/NetworkDescriptionLanguage.cpp \
MachineLearning/CNTK/Profiler.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/tests.cpp \
MachineLearning/CNTKEval/CNTKEval.cpp \
CNTK_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTK_SRC))
CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
########################################
# General compile and dependency rules
########################################
VPATH := $(sort $(dir $(SRC)))
# Define object files
OBJ_TMP := $(patsubst %.cpp, $(OBJDIR)/%.o, $(SRC))
ifeq ($(DEVICE),gpu)
OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(OBJ_TMP))
else
OBJ := $(OBJ_TMP)
endif
CORE_OBJ_TMP := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CORE_SRC))
ifeq ($(DEVICE),gpu)
CORE_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(CORE_OBJ_TMP))
else
CORE_OBJ := $(CORE_OBJ_TMP)
endif
COMMON_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(COMMON_SRC))
MATH_OBJ_TMP := $(patsubst %.cpp, $(OBJDIR)/%.o, $(MATH_SRC))
ifeq ($(DEVICE),gpu)
MATH_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(MATH_OBJ_TMP))
else
MATH_OBJ := $(MATH_OBJ_TMP)
endif
UCIFASTREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UCIFASTREADER_SRC))
LUSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LUSEQUENCEREADER_SRC))
SEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(SEQUENCEREADER_SRC))
HTKMLFREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(HTKMLFREADER_SRC))
BINARYREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(BINARYREADER_SRC))
OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(SRC)))
# C++ include dependencies generated by -MF compiler option
DEP := $(patsubst %.o, %.d, $(OBJ))
SEPARATOR = "=-----------------------------------------------------------="
# Define build targets
all: $(BINDIR)/cntk $(BINDIR)/UCIFastReader.so $(BINDIR)/LMSequenceReader.so $(BINDIR)/LUSequenceReader.so $(BINDIR)/HTKMLFReader.so
@echo $(SEPARATOR)
@echo finished building for $(ARCH) with build type $(BUILDTYPE)
$(BINDIR)/UCIFastReader.so: $(UCIFASTREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(CNTKMATH_LINK_LIB)
$(BINDIR)/LMSequenceReader.so: $(SEQUENCEREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(CNTKMATH_LINK_LIB)
$(BINDIR)/LUSequenceReader.so: $(LUSEQUENCEREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(CNTKMATH_LINK_LIB)
$(BINDIR)/HTKMLFReader.so: $(HTKMLFREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(CNTKMATH_LINK_LIB)
$(BINDIR)/BinaryReader.so: $(BINARYREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(CNTKMATH_LINK_LIB)
$(BINDIR)/cntk: $(CORE_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CC) $(BUILDTYPE_OPT) $(LDFLAGS) -o $@ $^ $(LINK_LIBS) $(CNTKMATH_LINK_LIB) -fopenmp -ldl -fPIC
$(CNTKMATH_LIB): $(MATH_OBJ) $(COMMON_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(LINK_LIBS) -fopenmp
# Include all C++ dependencies, like header files, to ensure that a change in those
# will result in the rebuild.
-include ${DEP}
ifeq ($(DEVICE),gpu)
$(OBJDIR)/%.o : %.cu Makefile
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC
endif
$(NVCC) -c $< -o $@ $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler -fPIC
$(OBJDIR)/%.o : %.cpp Makefile
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CC) -c $< -o $@ $(BUILDTYPE_OPT) $(CPPFLAGS) $(CFLAGS) $(INCFLAGS) -MD -MP -MF ${@:.o=.d}
$(CXX) -c $< -o $@ $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean
.PHONY: clean buildall all
clean:
@echo $(SEPARATOR)
@rm -rf $(OBJDIR)
@rm -rf $(BINDIR)
@rm -rf $(ALL)
@echo finished cleaning up the project
buildall : $(ALL)
@echo $(SEPARATOR)
@echo finished building for $(ARCH) with build type $(BUILDTYPE)

65
README
View File

@@ -33,55 +33,62 @@ To build the cpu version, you have to install intel MKL blas library or ACML lib
for MKL:
1. Download from https://software.intel.com/en-us/intel-mkl
2. You can modify variable MKL_PATH in makefile.cpu to change your mkl path.
Then add ${MKL_PATH}/mkl/lib/intel64, ${MKL_PATH}/mkl/lib/mic, ${MKL_PATH}/compiler/lib/intel64. ${MKL_PATH}/compiler/lib/mic to your ${LD_LIBRARY_PATH} to make sure the program links the library correctly.
for ACML:
1. Download from http://developer.amd.com/tools-and-sdks/cpu-development/amd-core-math-library-acml/
2. Modify ACML_PATH in the makefile.cpu and makefile.gpu to provide your ACML library path.
You need to add ${ACML_PATH}/lib to your ${LD_LIBRARY_PATH}.
for Kaldi:
1. In kaldi-trunk/tools/Makefile, uncomment # OPENFST_VERSION = 1.4.1, and
re-install OpenFst using the makefile.
2. In kaldi-trunk/src/, do ./configure --shared; make depend -j 8; make -j 8;
and re-compile Kaldi (the -j option is for parallelization).
3. Set KALDI_PATH in kaldi_vars.mk
4. When running the binaries, make sure you add $KALDI_PATH/tools/openfst/lib
and $KALDI_PATH/src/lib/ to your $LD_LIBRARY_PATH
To build the gpu version, you have to install NVIDIA CUDA first
You can modify the path CUDA_PATH in makefile.cpu to change your cuda path
We use cuda-7.0 as default.
Then add ${CUDA_PATH}/lib, ${CUDA_PATH}/lib64 to your ${LD_LIBRARY_PATH} to make sure the program links to the library correctly.
... TODO: add documentation on nvml lib
== Build Preparation ==
Let $CNTK be the CNTK directory.
>mkdir build
>$CNTK/configure -h
== Build ==
To build the cpu version, run
make DEVICE=cpu
To build the gpu version, run
make
To clean the compile, just run
make DEVICE=cpu clean
or
make clean
You will see various options for configure, as well as their default
values. CNTK needs a CPU math directory, either acml or mkl. If you
do not specify one and both are available, acml will be used. For GPU
use, a cuda and gdk directory are also required. Similarly, to build
the kaldi plugin a kaldi directory is required. You may also specify
whether you want a debug or release build. Rerun configure with the
desired options.
For release version, just add BUILDTYPE=release to the make command line.
>$CNTK/configure ...
This will create a Config.make and a Makefile (if you are in the $CNTK
directory, a Makefile will not be created). The Config.make file
records the configuration parameters and the Makefile reinvokes the
$CNTK/Makefile, passing it the build directory where it can find the
Config.make.
After make completes, you will have the following directories:
.build will contain object files, and can be deleted
bin contains the cntk program
lib contains libraries and plugins
The bin and lib directories can safely be moved as long as they remain siblings.
To clean
>make clean
== Run ==
All executables are in bin/ directory:
cn.exe: The main executable for CNTK
All executables are in bin directory:
cntk: The main executable for CNTK
*.so: shared library for corresponding reader, these readers will be linked and loaded dynamically at runtime.
To run the executable, make sure bin/ is in your ${LD_LIBRARY_PATH}, if not, running cn.exe will fail when cn.exe tries to link the corresponding reader. Once it's done, run in command line:
./cn.exe configFile=${your config file}
./cntk configFile=${your cntk config file}
== Kaldi Reader ==
This is a HTKMLF reader and kaldi writer (for decode)
To build the cpu/gpu version, run
make -f Makefile_kaldi.cpu/gpu
To build, set KALDI_PATH in your Config.make
The feature section is like:
@@ -102,9 +109,7 @@ writer=[
== Kaldi2 Reader ==
This is a kaldi reader and kaldi writer (for decode)
To build the cpu/gpu version, run
make -f Makefile_kaldi2.cpu/gpu
To build, set KALDI_PATH in your Config.make
The features section is different:

View File

@@ -96,10 +96,11 @@ if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
exit 1
fi
elif [[ $OSTYPE == "linux-gnu" ]]; then
DEBUG_DIR=x86_64.gpu.debug.acml
RELEASE_DIR=x86_64.gpu.release.acml
PREFIX_DIR=bin
BIN_NAME=cntk
DEBUG_DIR=build/debug
RELEASE_DIR=build/release
PREFIX_DIR=
# Make sure no dependencies on current directory
BIN_NAME=bin/cntk
MAKEFILE=Makefile
BUILD_OS="linux"
else
@@ -179,12 +180,16 @@ if [[ $BUILD == 1 ]]; then
fi
msbuild.exe /property:Configuration=$FLAVOR /m 1>&6 2>&7 || exit $?
else
BUILD_DIR=build/$FLAVOR
./configure --with-build-top=$BUILD_DIR --with-acml=$ACML_PATH --with-buildtype=$FLAVOR
if [[ $CLEAN_BEFORE == 1 ]]; then
make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
make -C $BUILD_DIR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
fi
make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $?
make -C $BUILD_DIR -j -f $MAKEFILE 1>&6 2>&7 || exit $?
fi
if [[ $QUIET_BUILD == 1 ]]; then
chmod a+r $BUILD_FILE.*
fi
chmod a+r $BUILD_FILE.*
done
fi
@@ -212,18 +217,21 @@ if [[ $RUN == 1 ]]; then
fi
OUT_FILE="$RUN_FILE.$FLAVOR.$TARGET.out"
if ! [[ -f "./$FLAVOR_DIR/$BIN_NAME" ]]; then
BIN_PATH=$CNTK_ROOT/$FLAVOR_DIR/$BIN_NAME
if ! [[ -f $BIN_PATH ]]; then
echo "============ ERROR: CNTK did not build properly for flavor ($FLAVOR) ============"
echo "Missing file: $BIN_PATH"
exit 1
fi
echo "============ Running CNTK for ($FLAVOR) ($TARGET), output in ($RUN_FILE.*) ============"
cd $CNTK_ROOT/Demos
rm -rf models
if [[ $OS == "Windows_NT" ]]; then
# We have to use cygpath on Windows to modify the file paths into the format readable by cntk.
time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $CONF_FILE`" &>$OUT_FILE || exit $?
time $BIN_PATH configFile="`cygpath -w $CONF_FILE`" &>$OUT_FILE || exit $?
else
time ./$FLAVOR_DIR/$BIN_NAME configFile=$CONF_FILE &>$OUT_FILE || exit $?
time $BIN_PATH configFile=$CONF_FILE &>$OUT_FILE || exit $?
fi
chmod a+r $RUN_FILE.*

View File

@@ -0,0 +1,738 @@
=== Running /cygdrive/c/Users/svcphil/workspace.vlivan/CNTK-Build-Windows/x64/release/cntk.exe configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
-------------------------------------------------------------------
Build info:
Built time: Aug 11 2015 16:18:17
Last modified date: Tue Aug 11 16:16:08 2015
Built by svcphil on dphaim-26-new
Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
Build Branch: master
Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
-------------------------------------------------------------------
running on dphaim-26-new at 2015/08/11 17:47:10
command line options:
configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=$DeviceId$
parallelTrain=false
speechTrain=[
action=train
modelPath=$RunDir$/models/cntkSpeech.dnn
deviceId=$DeviceId$
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=$DataDir$/glob_0000.mlf
labelMappingFile=$DataDir$/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=Auto
parallelTrain=false
speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.config:command=speechTrain
configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
configparameters: cntk.config:deviceId=Auto
configparameters: cntk.config:parallelTrain=false
configparameters: cntk.config:precision=float
configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
configparameters: cntk.config:speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
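For orientation, the resolved config above specifies a 363:512:512:132 feed-forward network with sigmoid hidden layers, per-dimension mean/variance normalization of the input (applyMeanVarNorm=true), and a cross-entropy-with-softmax training criterion. A minimal NumPy sketch of that forward pass follows; the weight/bias arrays and the mean/invstd statistics are hypothetical placeholders, not values from this run:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def forward(feat, mean, invstd, W0, B0, W1, B1, W2, B2):
    x = (feat - mean) * invstd            # PerDimMeanVarNormalization, [363, T]
    h1 = sigmoid(W0 @ x + B0)             # hidden layer 1, [512, T]
    h2 = sigmoid(W1 @ h1 + B1)            # hidden layer 2, [512, T]
    return W2 @ h2 + B2                   # HLast (pre-softmax scores), [132, T]

def cross_entropy_with_softmax(hlast, onehot_labels):
    z = hlast - hlast.max(axis=0, keepdims=True)             # stabilized softmax
    logp = z - np.log(np.exp(z).sum(axis=0, keepdims=True))
    return -(onehot_labels * logp).sum()                     # summed over the minibatch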
command: speechTrain
precision = float
LockDevice: Capture device 1 and lock it for exclusive use
LockDevice: Capture device 2 and lock it for exclusive use
LockDevice: Capture device 3 and lock it for exclusive use
LockDevice: Capture device 0 and lock it for exclusive use
LockDevice: Capture device 1 and lock it for exclusive use
SimpleNetworkBuilder Using GPU 1
reading script file glob_0000.scp ... 948 entries
trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
GetTrainCriterionNodes ...
GetEvalCriterionNodes ...
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 3])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 3])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 3])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 3])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3])
Found 3 PreCompute nodes
NodeName: InvStdOfFeatures
NodeName: MeanOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Validating node InvStdOfFeatures
Validating --> features = InputValue
Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
Validating node MeanOfFeatures
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 64])
Validating node Prior
Validating --> labels = InputValue
Validating --> Prior = Mean(labels[132, 64])
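The three PreCompute nodes found above are data-dependent constants evaluated once over the training data before SGD begins; they are not updated by gradients. A sketch of what they amount to, assuming feats is a [363, N] feature matrix and labels a [132, N] one-hot matrix (hypothetical arrays):

import numpy as np

def precompute(feats, labels):
    mean   = feats.mean(axis=1, keepdims=True)        # MeanOfFeatures,   [363, 1]
    invstd = 1.0 / feats.std(axis=1, keepdims=True)   # InvStdOfFeatures, [363, 1]
    prior  = labels.mean(axis=1, keepdims=True)       # Prior (class frequencies), [132, 1]
    return mean, invstd, prior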
Set Max Temp Mem Size For Convolution Nodes to 0 samples.
Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000
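The per-sample learning rates logged at each epoch start follow from the per-minibatch config values divided by that epoch's minibatch size (learningRatesPerMB=1.0:0.5:0.1 against minibatchSize=64:256:1024); the momentum values are read off the momentumPerMB schedule directly. A quick check of the arithmetic:

lr_per_mb = [1.0, 0.5, 0.1]
mb_size   = [64, 256, 1024]
for epoch, (lr, mb) in enumerate(zip(lr_per_mb, mb_size), start=1):
    print(epoch, lr / mb)
# -> 1 0.015625, 2 0.001953125, 3 9.765625e-05, matching the three epoch headers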
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 64])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 64])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 64])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64])
Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646143; EvalErr[0]PerSample = 0.92500001; TotalTime = 0.01913s; TotalTimePerSample = 0.02988ms; SamplesPerSecond = 33462
Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315693; EvalErr[0]PerSample = 0.90156251; TotalTime = 0.01453s; TotalTimePerSample = 0.02270ms; SamplesPerSecond = 44043
Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180511; EvalErr[0]PerSample = 0.84687501; TotalTime = 0.01459s; TotalTimePerSample = 0.02279ms; SamplesPerSecond = 43874
Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94157934; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.01459s; TotalTimePerSample = 0.02280ms; SamplesPerSecond = 43859
Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668945; EvalErr[0]PerSample = 0.91093749; TotalTime = 0.01456s; TotalTimePerSample = 0.02275ms; SamplesPerSecond = 43953
Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866368; EvalErr[0]PerSample = 0.89531249; TotalTime = 0.01450s; TotalTimePerSample = 0.02265ms; SamplesPerSecond = 44140
Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51809072; EvalErr[0]PerSample = 0.82968748; TotalTime = 0.01453s; TotalTimePerSample = 0.02271ms; SamplesPerSecond = 44034
Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48454905; EvalErr[0]PerSample = 0.80781251; TotalTime = 0.01452s; TotalTimePerSample = 0.02269ms; SamplesPerSecond = 44074
Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829641; EvalErr[0]PerSample = 0.76875001; TotalTime = 0.01453s; TotalTimePerSample = 0.02271ms; SamplesPerSecond = 44037
Epoch[ 1 of 3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167227; EvalErr[0]PerSample = 0.79843748; TotalTime = 0.01447s; TotalTimePerSample = 0.02261ms; SamplesPerSecond = 44229
WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861624; EvalErr[0]PerSample = 0.80000001; TotalTime = 0.01459s; TotalTimePerSample = 0.02279ms; SamplesPerSecond = 43874
Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616878; EvalErr[0]PerSample = 0.79062498; TotalTime = 0.01449s; TotalTimePerSample = 0.02264ms; SamplesPerSecond = 44174
Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897583; EvalErr[0]PerSample = 0.77968752; TotalTime = 0.01448s; TotalTimePerSample = 0.02262ms; SamplesPerSecond = 44201
Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08891916; EvalErr[0]PerSample = 0.77656251; TotalTime = 0.01442s; TotalTimePerSample = 0.02253ms; SamplesPerSecond = 44385
Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004953; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01454s; TotalTimePerSample = 0.02271ms; SamplesPerSecond = 44031
Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.01446s; TotalTimePerSample = 0.02259ms; SamplesPerSecond = 44272
Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90172124; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01450s; TotalTimePerSample = 0.02266ms; SamplesPerSecond = 44128
Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73261714; EvalErr[0]PerSample = 0.65312499; TotalTime = 0.01447s; TotalTimePerSample = 0.02261ms; SamplesPerSecond = 44232
Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515493; EvalErr[0]PerSample = 0.68437499; TotalTime = 0.01453s; TotalTimePerSample = 0.02270ms; SamplesPerSecond = 44061
Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67383432; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.01449s; TotalTimePerSample = 0.02264ms; SamplesPerSecond = 44165
Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869272; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01450s; TotalTimePerSample = 0.02266ms; SamplesPerSecond = 44134
Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032344; EvalErr[0]PerSample = 0.66718751; TotalTime = 0.01450s; TotalTimePerSample = 0.02266ms; SamplesPerSecond = 44128
Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134038; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.01452s; TotalTimePerSample = 0.02268ms; SamplesPerSecond = 44086
Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362544; EvalErr[0]PerSample = 0.63749999; TotalTime = 0.01452s; TotalTimePerSample = 0.02269ms; SamplesPerSecond = 44068
Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640615; EvalErr[0]PerSample = 0.61562502; TotalTime = 0.01445s; TotalTimePerSample = 0.02258ms; SamplesPerSecond = 44287
Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745474; EvalErr[0]PerSample = 0.62812501; TotalTime = 0.01447s; TotalTimePerSample = 0.02261ms; SamplesPerSecond = 44229
Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16415405; EvalErr[0]PerSample = 0.56718749; TotalTime = 0.01454s; TotalTimePerSample = 0.02272ms; SamplesPerSecond = 44013
Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30347300; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01454s; TotalTimePerSample = 0.02272ms; SamplesPerSecond = 44016
Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.01446s; TotalTimePerSample = 0.02260ms; SamplesPerSecond = 44253
Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322256; EvalErr[0]PerSample = 0.57968748; TotalTime = 0.01447s; TotalTimePerSample = 0.02262ms; SamplesPerSecond = 44214
Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664429; EvalErr[0]PerSample = 0.59531248; TotalTime = 0.01448s; TotalTimePerSample = 0.02262ms; SamplesPerSecond = 44208
Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246572; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.01442s; TotalTimePerSample = 0.02253ms; SamplesPerSecond = 44392
Finished Epoch[1]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=0.4851
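The per-minibatch columns above are consistent up to display rounding: TotalTimePerSample is TotalTime divided by SamplesSeen, and SamplesPerSecond is its reciprocal. Checking the first row of the epoch as an example:

samples, total_time = 640, 0.01913        # Minibatch[ 1- 10]: SamplesSeen, TotalTime (s)
print(1e3 * total_time / samples)         # ~0.02989 ms/sample (logged 0.02988)
print(samples / total_time)               # ~33456 samples/s (logged 33462; the log
                                          # divides by the unrounded elapsed time)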
Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses
Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151960; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.03149s; TotalTimePerSample = 0.01230ms; SamplesPerSecond = 81290
Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395634; EvalErr[0]PerSample = 0.54257810; TotalTime = 0.02336s; TotalTimePerSample = 0.00913ms; SamplesPerSecond = 109570
Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575521; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.02325s; TotalTimePerSample = 0.00908ms; SamplesPerSecond = 110116
Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90484965; EvalErr[0]PerSample = 0.53164065; TotalTime = 0.02321s; TotalTimePerSample = 0.00906ms; SamplesPerSecond = 110316
Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324130; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.02328s; TotalTimePerSample = 0.00909ms; SamplesPerSecond = 109975
Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109266; EvalErr[0]PerSample = 0.53359377; TotalTime = 0.02325s; TotalTimePerSample = 0.00908ms; SamplesPerSecond = 110093
Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496076; EvalErr[0]PerSample = 0.52890623; TotalTime = 0.02326s; TotalTimePerSample = 0.00909ms; SamplesPerSecond = 110055
Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944366; EvalErr[0]PerSample = 0.52265626; TotalTime = 0.02296s; TotalTimePerSample = 0.00897ms; SamplesPerSecond = 111473
Finished Epoch[2]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=0.195263
Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.08160s; TotalTimePerSample = 0.00797ms; SamplesPerSecond = 125485
Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05742s; TotalTimePerSample = 0.00561ms; SamplesPerSecond = 178319
Finished Epoch[3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.142001
COMPLETED
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
Build info:
Built time: Aug 11 2015 16:18:17
Last modified date: Tue Aug 11 16:16:08 2015
Built by svcphil on dphaim-26-new
Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
Build Branch: master
Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
-------------------------------------------------------------------
running on dphaim-26-new at 2015/08/11 17:47:19
command line options:
configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=$DeviceId$
parallelTrain=false
speechTrain=[
action=train
modelPath=$RunDir$/models/cntkSpeech.dnn
deviceId=$DeviceId$
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=$DataDir$/glob_0000.mlf
labelMappingFile=$DataDir$/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=Auto
parallelTrain=false
speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.config:command=speechTrain
configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
configparameters: cntk.config:deviceId=Auto
configparameters: cntk.config:parallelTrain=false
configparameters: cntk.config:precision=float
configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu
configparameters: cntk.config:speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
command: speechTrain
precision = float
LockDevice: Capture device 1 and lock it for exclusive use
LockDevice: Capture device 2 and lock it for exclusive use
LockDevice: Capture device 3 and lock it for exclusive use
LockDevice: Capture device 0 and lock it for exclusive use
LockDevice: Capture device 1 and lock it for exclusive use
SimpleNetworkBuilder Using GPU 1
reading script file glob_0000.scp ... 948 entries
trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Starting from checkpoint. Load Network From File C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_cpu/models/cntkSpeech.dnn.2.
Printing Gradient Computation Node Order ...
CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0])
HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1])
B2[132, 1] = LearnableParameter
W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0])
H2[0, 0] = Sigmoid(W1*H1+B1[0, 0])
W1*H1+B1[0, 0] = Plus(W1*H1[0, 0], B1[512, 1])
B1[512, 1] = LearnableParameter
W1*H1[0, 0] = Times(W1[512, 512], H1[0, 0])
H1[0, 0] = Sigmoid(W0*features+B0[0, 0])
W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1])
B0[512, 1] = LearnableParameter
W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0])
MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256])
MeanOfFeatures[363, 1] = Mean(features[363, 256])
features[363, 256] = InputValue
W0[512, 363] = LearnableParameter
W1[512, 512] = LearnableParameter
W2[132, 512] = LearnableParameter
labels[132, 256] = InputValue
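The gradient computation order printed above is the reverse of a valid forward (dependency) order: CrossEntropyWithSoftmax first, inputs and parameters last, so every node's gradient is available before the gradients of its inputs are formed. A generic sketch of deriving such an order, assuming inputs_of maps each node name to the names of its inputs (a hypothetical graph structure, not the CNTK API):

def gradient_order(root, inputs_of):
    seen, forward = set(), []
    def visit(node):                      # depth-first post-order = forward order
        if node in seen:
            return
        seen.add(node)
        for child in inputs_of.get(node, ()):
            visit(child)
        forward.append(node)
    visit(root)
    return forward[::-1]                  # reversed: gradients flow root -> leaves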
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
Validating node ScaledLogLikelihood
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> labels = InputValue
Validating --> Prior = Mean(labels[132, 256])
Validating --> LogOfPrior = Log(Prior[132, 1])
Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1])
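ScaledLogLikelihood = HLast - LogOfPrior is the standard hybrid DNN-HMM output conversion: dividing the network's state posterior p(s|x) by the state prior p(s) yields a quantity proportional to the acoustic likelihood p(x|s), which is what an HMM decoder consumes. Since the softmax log-partition term is constant per frame, subtracting log(Prior) from the pre-softmax HLast is enough. A one-line sketch under those assumptions:

import numpy as np

def scaled_log_likelihood(hlast, prior):
    # hlast: [132, T] pre-softmax scores; prior: [132, 1] precomputed class frequencies
    return hlast - np.log(prior)          # log p(x|s) up to a per-frame constant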
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256])
GetTrainCriterionNodes ...
GetEvalCriterionNodes ...
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
No PreCompute nodes found, skipping PreCompute step
Set Max Temp Mem Size For Convolution Nodes to 0 samples.
Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 1024])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 1024])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024])
Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.40600s; TotalTimePerSample = 0.03965ms; SamplesPerSecond = 25221
Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05538s; TotalTimePerSample = 0.00541ms; SamplesPerSecond = 184900
Finished Epoch[3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.692077
COMPLETED
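Note that the re-run from the epoch-2 checkpoint reproduces epoch 3 exactly (TrainLossPerSample = 1.8705578 and EvalErrPerSample = 0.5186035 in both runs), which is the property this test exercises: a checkpoint must capture enough state (model weights plus learner and reader position) for training to continue deterministically. A minimal sketch of that resume pattern, with a hypothetical picklable model and the epoch body elided:

import os, pickle

def train(model, max_epochs, ckpt="model.ckpt"):
    start = 0
    if os.path.exists(ckpt):                       # the second run takes this path
        with open(ckpt, "rb") as f:
            model, start = pickle.load(f)
    for epoch in range(start, max_epochs):
        # ... one epoch of SGD over this epoch's frame range ...
        with open(ckpt, "wb") as f:                # checkpoint after every epoch
            pickle.dump((model, epoch + 1), f)
    return model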

Просмотреть файл

@ -0,0 +1,738 @@
=== Running /cygdrive/c/Users/svcphil/workspace.vlivan/CNTK-Build-Windows/x64/release/cntk.exe configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
-------------------------------------------------------------------
Build info:
Built time: Aug 11 2015 16:18:17
Last modified date: Tue Aug 11 16:16:08 2015
Built by svcphil on dphaim-26-new
Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
Build Branch: master
Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
-------------------------------------------------------------------
running on dphaim-26-new at 2015/08/11 17:47:26
command line options:
configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=$DeviceId$
parallelTrain=false
speechTrain=[
action=train
modelPath=$RunDir$/models/cntkSpeech.dnn
deviceId=$DeviceId$
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=$DataDir$/glob_0000.mlf
labelMappingFile=$DataDir$/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=Auto
parallelTrain=false
speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.config:command=speechTrain
configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
configparameters: cntk.config:deviceId=Auto
configparameters: cntk.config:parallelTrain=false
configparameters: cntk.config:precision=float
configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
configparameters: cntk.config:speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
command: speechTrain
precision = float
LockDevice: Capture device 1 and lock it for exclusive use
LockDevice: Capture device 2 and lock it for exclusive use
LockDevice: Capture device 3 and lock it for exclusive use
LockDevice: Capture device 0 and lock it for exclusive use
LockDevice: Capture device 1 and lock it for exclusive use
SimpleNetworkBuilder Using GPU 1
reading script file glob_0000.scp ... 948 entries
trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
GetTrainCriterionNodes ...
GetEvalCriterionNodes ...
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 3])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 3])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 3])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 3])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3])
Found 3 PreCompute nodes
NodeName: InvStdOfFeatures
NodeName: MeanOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Validating node InvStdOfFeatures
Validating --> features = InputValue
Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
Validating node MeanOfFeatures
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 64])
Validating node Prior
Validating --> labels = InputValue
Validating --> Prior = Mean(labels[132, 64])
Set Max Temp Mem Size For Convolution Nodes to 0 samples.
Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 64])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 64])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 64])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64])
Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646143; EvalErr[0]PerSample = 0.92500001; TotalTime = 0.03190s; TotalTimePerSample = 0.04985ms; SamplesPerSecond = 20061
Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315693; EvalErr[0]PerSample = 0.90156251; TotalTime = 0.02454s; TotalTimePerSample = 0.03835ms; SamplesPerSecond = 26075
Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180511; EvalErr[0]PerSample = 0.84687501; TotalTime = 0.02438s; TotalTimePerSample = 0.03809ms; SamplesPerSecond = 26254
Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94157934; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26181
Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668945; EvalErr[0]PerSample = 0.91093749; TotalTime = 0.02429s; TotalTimePerSample = 0.03795ms; SamplesPerSecond = 26352
Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866368; EvalErr[0]PerSample = 0.89531249; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26178
Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51809072; EvalErr[0]PerSample = 0.82968748; TotalTime = 0.02423s; TotalTimePerSample = 0.03786ms; SamplesPerSecond = 26415
Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48454905; EvalErr[0]PerSample = 0.80781251; TotalTime = 0.02249s; TotalTimePerSample = 0.03514ms; SamplesPerSecond = 28457
Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829641; EvalErr[0]PerSample = 0.76875001; TotalTime = 0.02169s; TotalTimePerSample = 0.03390ms; SamplesPerSecond = 29501
Epoch[ 1 of 3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167227; EvalErr[0]PerSample = 0.79843748; TotalTime = 0.02178s; TotalTimePerSample = 0.03403ms; SamplesPerSecond = 29386
WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861624; EvalErr[0]PerSample = 0.80000001; TotalTime = 0.02166s; TotalTimePerSample = 0.03385ms; SamplesPerSecond = 29546
Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616878; EvalErr[0]PerSample = 0.79062498; TotalTime = 0.02063s; TotalTimePerSample = 0.03224ms; SamplesPerSecond = 31018
Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897583; EvalErr[0]PerSample = 0.77968752; TotalTime = 0.01950s; TotalTimePerSample = 0.03048ms; SamplesPerSecond = 32813
Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08891916; EvalErr[0]PerSample = 0.77656251; TotalTime = 0.01961s; TotalTimePerSample = 0.03063ms; SamplesPerSecond = 32644
Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004953; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01950s; TotalTimePerSample = 0.03046ms; SamplesPerSecond = 32825
Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.01965s; TotalTimePerSample = 0.03070ms; SamplesPerSecond = 32571
Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90172124; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01828s; TotalTimePerSample = 0.02857ms; SamplesPerSecond = 35003
Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73261714; EvalErr[0]PerSample = 0.65312499; TotalTime = 0.01799s; TotalTimePerSample = 0.02811ms; SamplesPerSecond = 35569
Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515493; EvalErr[0]PerSample = 0.68437499; TotalTime = 0.01789s; TotalTimePerSample = 0.02796ms; SamplesPerSecond = 35766
Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67383432; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.01792s; TotalTimePerSample = 0.02800ms; SamplesPerSecond = 35708
Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869272; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01805s; TotalTimePerSample = 0.02821ms; SamplesPerSecond = 35451
Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032344; EvalErr[0]PerSample = 0.66718751; TotalTime = 0.01696s; TotalTimePerSample = 0.02650ms; SamplesPerSecond = 37738
Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134038; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.01658s; TotalTimePerSample = 0.02591ms; SamplesPerSecond = 38598
Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362544; EvalErr[0]PerSample = 0.63749999; TotalTime = 0.01663s; TotalTimePerSample = 0.02598ms; SamplesPerSecond = 38491
Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640615; EvalErr[0]PerSample = 0.61562502; TotalTime = 0.01670s; TotalTimePerSample = 0.02610ms; SamplesPerSecond = 38321
Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745474; EvalErr[0]PerSample = 0.62812501; TotalTime = 0.01672s; TotalTimePerSample = 0.02612ms; SamplesPerSecond = 38279
Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16415405; EvalErr[0]PerSample = 0.56718749; TotalTime = 0.01621s; TotalTimePerSample = 0.02533ms; SamplesPerSecond = 39481
Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30347300; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01583s; TotalTimePerSample = 0.02474ms; SamplesPerSecond = 40427
Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.01579s; TotalTimePerSample = 0.02467ms; SamplesPerSecond = 40542
Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322256; EvalErr[0]PerSample = 0.57968748; TotalTime = 0.01582s; TotalTimePerSample = 0.02472ms; SamplesPerSecond = 40447
Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664429; EvalErr[0]PerSample = 0.59531248; TotalTime = 0.01570s; TotalTimePerSample = 0.02453ms; SamplesPerSecond = 40761
Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246572; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.01556s; TotalTimePerSample = 0.02431ms; SamplesPerSecond = 41139
Finished Epoch[1]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=0.657568
Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses
Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151960; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.03143s; TotalTimePerSample = 0.01228ms; SamplesPerSecond = 81456
Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395634; EvalErr[0]PerSample = 0.54257810; TotalTime = 0.02295s; TotalTimePerSample = 0.00896ms; SamplesPerSecond = 111561
Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575521; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.02287s; TotalTimePerSample = 0.00893ms; SamplesPerSecond = 111951
Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90484965; EvalErr[0]PerSample = 0.53164065; TotalTime = 0.02284s; TotalTimePerSample = 0.00892ms; SamplesPerSecond = 112069
Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324130; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.02277s; TotalTimePerSample = 0.00889ms; SamplesPerSecond = 112448
Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109266; EvalErr[0]PerSample = 0.53359377; TotalTime = 0.02287s; TotalTimePerSample = 0.00894ms; SamplesPerSecond = 111917
Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496076; EvalErr[0]PerSample = 0.52890623; TotalTime = 0.02279s; TotalTimePerSample = 0.00890ms; SamplesPerSecond = 112325
Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944366; EvalErr[0]PerSample = 0.52265626; TotalTime = 0.02265s; TotalTimePerSample = 0.00885ms; SamplesPerSecond = 113044
Finished Epoch[2]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=0.192318
Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.08080s; TotalTimePerSample = 0.00789ms; SamplesPerSecond = 126735
Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05544s; TotalTimePerSample = 0.00541ms; SamplesPerSecond = 184694
Finished Epoch[3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.139063
COMPLETED
=== Deleting last epoch data
==== Re-running from checkpoint
-------------------------------------------------------------------
Build info:
Built time: Aug 11 2015 16:18:17
Last modified date: Tue Aug 11 16:16:08 2015
Built by svcphil on dphaim-26-new
Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
Build Branch: master
Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
-------------------------------------------------------------------
running on dphaim-26-new at 2015/08/11 17:47:34
command line options:
configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=$DeviceId$
parallelTrain=false
speechTrain=[
action=train
modelPath=$RunDir$/models/cntkSpeech.dnn
deviceId=$DeviceId$
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=$DataDir$/glob_0000.mlf
labelMappingFile=$DataDir$/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision=float
command=speechTrain
deviceId=Auto
parallelTrain=false
speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
DeviceId=Auto
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.config:command=speechTrain
configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
configparameters: cntk.config:deviceId=Auto
configparameters: cntk.config:parallelTrain=false
configparameters: cntk.config:precision=float
configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
configparameters: cntk.config:speechTrain=[
action=train
modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
deviceId=Auto
traceLevel=1
SimpleNetworkBuilder=[
layerSizes=363:512:512:132
trainingCriterion=CrossEntropyWithSoftmax
evalCriterion=ErrorPrediction
layerTypes=Sigmoid
initValueScale=1.0
applyMeanVarNorm=true
uniformInit=true
needPrior=true
]
SGD=[
epochSize=20480
minibatchSize=64:256:1024:
learningRatesPerMB=1.0:0.5:0.1
numMBsToShowResult=10
momentumPerMB=0.9:0.656119
dropoutRate=0.0
maxEpochs=3
keepCheckPointFiles=true
AutoAdjust=[
reduceLearnRateIfImproveLessThan=0
loadBestModel=true
increaseLearnRateIfImproveMoreThan=1000000000
learnRateDecreaseFactor=0.5
learnRateIncreaseFactor=1.382
autoAdjustLR=AdjustAfterEpoch
]
clippingThresholdPerSample=1#INF
]
reader=[
readerType=HTKMLFReader
readMethod=blockRandomize
miniBatchMode=Partial
randomize=Auto
verbosity=0
features=[
dim=363
type=Real
scpFile=glob_0000.scp
]
labels=[
mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
labelDim=132
labelType=Category
]
]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
command: speechTrain
precision = float
LockDevice: Capture device 1 and lock it for exclusive use
LockDevice: Capture device 2 and lock it for exclusive use
LockDevice: Capture device 3 and lock it for exclusive use
LockDevice: Capture device 0 and lock it for exclusive use
LockDevice: Capture device 1 and lock it for exclusive use
SimpleNetworkBuilder Using GPU 1
reading script file glob_0000.scp ... 948 entries
trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Starting from checkpoint. Load Network From File C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn.2.
Printing Gradient Computation Node Order ...
CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0])
HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1])
B2[132, 1] = LearnableParameter
W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0])
H2[0, 0] = Sigmoid(W1*H1+B1[0, 0])
W1*H1+B1[0, 0] = Plus(W1*H1[0, 0], B1[512, 1])
B1[512, 1] = LearnableParameter
W1*H1[0, 0] = Times(W1[512, 512], H1[0, 0])
H1[0, 0] = Sigmoid(W0*features+B0[0, 0])
W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1])
B0[512, 1] = LearnableParameter
W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0])
MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256])
MeanOfFeatures[363, 1] = Mean(features[363, 256])
features[363, 256] = InputValue
W0[512, 363] = LearnableParameter
W1[512, 512] = LearnableParameter
W2[132, 512] = LearnableParameter
labels[132, 256] = InputValue
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
Validating node ScaledLogLikelihood
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> labels = InputValue
Validating --> Prior = Mean(labels[132, 256])
Validating --> LogOfPrior = Log(Prior[132, 1])
Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1])
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256])
GetTrainCriterionNodes ...
GetEvalCriterionNodes ...
Validating node CrossEntropyWithSoftmax
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 256])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
No PreCompute nodes found, skipping PreCompute step
Set Max Temp Mem Size For Convolution Nodes to 0 samples.
Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Validating node EvalErrorPrediction
Validating --> labels = InputValue
Validating --> W2 = LearnableParameter
Validating --> W1 = LearnableParameter
Validating --> W0 = LearnableParameter
Validating --> features = InputValue
Validating --> MeanOfFeatures = Mean(features[363, 1024])
Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024])
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024])
Validating --> B0 = LearnableParameter
Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1])
Validating --> H1 = Sigmoid(W0*features+B0[512, 1024])
Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024])
Validating --> B1 = LearnableParameter
Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1])
Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024])
Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024])
Validating --> B2 = LearnableParameter
Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1])
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024])
Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.42093s; TotalTimePerSample = 0.04111ms; SamplesPerSecond = 24327
Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05521s; TotalTimePerSample = 0.00539ms; SamplesPerSecond = 185480
Finished Epoch[3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.690137
COMPLETED

View file

@@ -1,17 +1,28 @@
#!/bin/bash
CNTK_BINARY=$TEST_BUILD_LOCATION/x86_64.gpu.$TEST_FLAVOR.acml/cntk
if [ "$TEST_DEVICE" == "CPU" ]; then
CNTK_DEVICE_ID=-1
else
CNTK_DEVICE_ID=Auto
fi
CNTK_ARGS="configFile=$TEST_DIR/cntk.config RunDir=$TEST_RUN_DIR DataDir=$TEST_DATA_DIR DeviceId=$CNTK_DEVICE_ID"
configFile=$TEST_DIR/cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR
if [ "$OS" == "Windows_NT" ]; then
# When running on cygwin, translate /cygdrive/xxx paths to proper Windows paths:
configFile=$(cygpath -aw $configFile)
RunDir=$(cygpath -aw $RunDir)
DataDir=$(cygpath -aw $DataDir)
fi
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running $CNTK_BINARY $CNTK_ARGS
$CNTK_BINARY $CNTK_ARGS || exit $?
echo === Running $TEST_CNTK_BINARY $CNTK_ARGS
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
echo === Deleting last epoch data
rm $TEST_RUN_DIR/models/*.dnn
echo ==== Re-running from checkpoint
$CNTK_BINARY $CNTK_ARGS || exit $?
$TEST_CNTK_BINARY $CNTK_ARGS || exit $?
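For reference, a minimal transcript of what the cygpath translation above does (the path is a hypothetical example; -a requests an absolute path, -w a Windows-style one):

$ cygpath -aw /cygdrive/c/Users/svcphil/workspace/cntk.config
C:\Users\svcphil\workspace\cntk.config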

View file

@@ -15,7 +15,7 @@
#
# Each test directory has the following components:
# - testcases.yml - main test configuration file, which defines all test cases
# - run-test - (run-test) script
# - baseline*.txt - baseline files with a captured expected output of the run-test script
#
# ----- testcases.yml format -------
@@ -52,10 +52,14 @@
# ---- Baseline files ----
# Order of searching baseline files depends on the current mode for a given test:
#
# 1. baseline.<flavor>.<device>.txt
# 2. baseline.<flavor>.txt
# 3. baseline.<device>.txt
# 4. baseline.txt
# 1. baseline.<os>.<flavor>.<device>.txt
# 2. baseline.<os>.<flavor>.txt
# 3. baseline.<os>.<device>.txt
# 4. baseline.<os>.txt
# 5. baseline.<flavor>.<device>.txt
# 6. baseline.<flavor>.txt
# 7. baseline.<device>.txt
# 8. baseline.txt
# where <flavor> = { debug | release }
# <device> = { cpu | gpu }
#
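As a concrete illustration of the order listed above, a short shell sketch (assuming a Windows run with the release flavor on gpu) that prints the eight candidate names from most to least specific:

for o in .windows ""; do
  for f in .release ""; do
    for d in .gpu ""; do
      # most specific name wins; plain baseline.txt is the final fallback
      echo "baseline$o$f$d.txt"
    done
  done
done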
@@ -79,6 +83,7 @@
import sys, os, argparse, traceback, yaml, subprocess, random, re, time
thisDir = os.path.dirname(os.path.realpath(__file__))
windows = os.getenv("OS")=="Windows_NT"
# This class encapsulates an instance of the test
class Test:
@@ -169,6 +174,10 @@ class Test:
os.environ["TEST_FLAVOR"] = flavor
os.environ["TEST_DEVICE"] = device
os.environ["TEST_BUILD_LOCATION"] = args.build_location
if windows:
os.environ["TEST_CNTK_BINARY"] = os.path.join(args.build_location, flavor, "cntk.exe")
else:
os.environ["TEST_CNTK_BINARY"] = os.path.join(args.build_location, flavor, "bin", "cntk")
os.environ["TEST_DIR"] = self.testDir
os.environ["TEST_DATA_DIR"] = self.dataDir
os.environ["TEST_RUN_DIR"] = runDir
@@ -237,17 +246,22 @@ class Test:
return result
# Finds a location of a baseline file by probing different names in the following order:
# baseline.$os.$flavor.$device.txt
# baseline.$os.$flavor.txt
# baseline.$os.$device.txt
# baseline.$os.txt
# baseline.$flavor.$device.txt
# baseline.$flavor.txt
# baseline.$device.txt
# baseline.txt
def findBaselineFile(self, flavor, device):
for f in ["." + flavor.lower(), ""]:
for d in ["." + device.lower(), ""]:
candidateName = "baseline" + f + d + ".txt";
fullPath = os.path.join(self.testDir, candidateName)
if os.path.isfile(fullPath):
return fullPath
for o in ["." + ("windows" if windows else "linux"), ""]:
for f in ["." + flavor.lower(), ""]:
for d in ["." + device.lower(), ""]:
candidateName = "baseline" + o + f + d + ".txt"
fullPath = os.path.join(self.testDir, candidateName)
if os.path.isfile(fullPath):
return fullPath
return None
# This class encapsulates one testcase (in testcases.yml file)
@@ -521,13 +535,13 @@ runSubparser.add_argument("test", nargs="*",
help="optional test name(s) to run, specified as Suite/TestName. "
"Use list command to list available tests. "
"If not specified then all tests will be run.")
#TODO: port paths to Windows
defaultBuildLocation=os.path.realpath(os.path.join(thisDir, "..", "bin"))
defaultBuildLocation=os.path.realpath(os.path.join(thisDir, "..", "x64" if windows else "build"))
runSubparser.add_argument("-b", "--build-location", default=defaultBuildLocation, help="location of the CNTK build to run")
runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specific device")
runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specific flavor")
#TODO: port paths to Windows
defaultRunDir=os.path.join("/tmp", "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000)))
tmpDir = os.getenv("TEMP") if windows else "/tmp"
defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000)))
runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp")
runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them")
runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script")

412
configure vendored Executable file
View file

@@ -0,0 +1,412 @@
#!/bin/bash
configure=$0
build_top=$PWD
have_cuda=no
cuda_path=
cuda_check=include/cuda.h
enable_cuda=
have_acml=no
acml_path=
acml_check=include/acml.h
have_mkl=no
mkl_path=
mkl_check=mkl/include/mkl.h
have_kaldi=no
kaldi_path=
kaldi_check=src/kaldi.mk
have_buildtype=no
buildtype=
default_buildtype=release
have_gdk=no
gdk_path=
gdk_check=include/nvidia/gdk/nvml.h
mathlib=
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local"
# List from best to worst choice
default_acmls="acml5.3.1/ifort64"
default_mkls=""
# NOTE: Will get compilation errors with cuda-6.0
default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
default_kaldis="kaldi-trunk"
default_gdks="."
function default_paths ()
{
echo $build_top $HOME $default_path_list
}
# $1 is directory
# $2 is file that must be present
function check_dir ()
{
if test -e $1/$2
then
echo yes
else
echo no
fi
}
# $1 is the list of tails to search, ordered by preference
# $2 is some file that must exist in $1
function find_dir ()
{
for tail in $1
do
for head in $(default_paths)
do
if test x$(check_dir "$head/$tail" $2) = xyes
then
echo $head/$tail
return 0
fi
done
done
}
function find_acml ()
{
find_dir "$default_acmls" "$acml_check"
}
function find_mkl ()
{
find_dir "$default_mkls" "$mkl_check"
}
function find_cuda ()
{
find_dir "$default_cudas" "$cuda_check"
}
function find_kaldi ()
{
find_dir "$default_kaldis" "$kaldi_check"
}
function find_gdk ()
{
find_dir "$default_gdks" "$gdk_check"
}
function is_hardlinked ()
{
r=no
if test -e $1 && test -e $2
then
r=yes
[ "`stat -c '%i' $1`" != "`stat -c '%i' $2`" ] && r=no
fi
echo $r
}
function default_use_cuda ()
{
if test x$(find_cuda) = x || test x$(find_gdk) = x
then
echo no
else
echo yes
fi
}
enable_cuda=$(default_use_cuda)
function show_default ()
{
if test x$1 = x
then
echo "(no default)"
else
echo "(default $1)"
fi
}
function show_help ()
{
echo "Usage: configure [options]"
echo "Options:"
echo " -h|--help this help"
echo " --with-build-top=directory build directory $(show_default $build_top)"
echo " --add directory add directory to library search path"
echo " --cuda[=(yes|no)] use cuda GPU $(show_default $(default_use_cuda))"
echo " --with-cuda[=directory] $(show_default $(find_cuda))"
echo " --with-gdk[=directory] $(show_default $(find_gdk))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
echo " --with-kaldi[=directory] $(show_default $(find_kaldi))"
echo "Libraries search path:"
for head in $(default_paths)
do
echo " $head"
done
}
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
*=?*) optarg=`expr "X$key" : '[^=]*=\(.*\)'` ;;
*=) optarg= ;;
*) optarg= ;;
esac
case $key in
-h|--help)
show_help
exit 1
;;
--with-build-top*)
if test x$optarg != x
then
build_top=$optarg
mkdir -p $build_top
fi
;;
--add*)
if test x$optarg = x
then
shift ; optarg=$1
fi
default_path_list="$optarg $default_path_list"
;;
--cuda)
if test x$optarg = xyes || test x$optarg = xno
then
enable_cuda=$optarg
else
echo "Invalid value for --cuda"
show_help
exit
fi
;;
--with-cuda*)
have_cuda=yes
enable_cuda=yes
if test x$optarg = x
then
cuda_path=$(find_cuda)
if test x$cuda_path = x
then
echo "Cannot find cuda directory."
echo "Please specify a value for --with-cuda"
exit 1
fi
else
if test $(check_dir $optarg $cuda_check) = yes
then
cuda_path=$optarg
else
echo "Invalid cuda directory $optarg"
exit 1
fi
fi
;;
--with-gdk*)
have_gdk=yes
if test x$optarg = x
then
gdk_path=$(find_gdk)
if test x$gdk_path = x
then
echo "Cannot find gdk directory."
echo "Please specify a value for --with-gdk"
exit 1
fi
else
if test $(check_dir $optarg $gdk_check) = yes
then
gdk_path=$optarg
else
echo "Invalid gdk directory $optarg"
exit 1
fi
fi
;;
--with-acml*)
have_acml=yes
mathlib=acml
if test x$optarg = x
then
acml_path=$(find_acml)
if test x$acml_path = x
then
echo "Cannot fine acml directory"
echo "Please specify a value for --with-acml"
exit 1
fi
else
if test $(check_dir $optarg $acml_check) = yes
then
acml_path=$optarg
else
echo "Invalid acml directory $optarg"
exit 1
fi
fi
;;
--with-mkl*)
have_mkl=yes
mathlib=mkl
if test x$optarg = x
then
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot fine mkl directory"
echo "Please specify a value for --with-mkl"
exit 1
fi
else
if test $(check_dir $optarg $mkl_check) = yes
then
mkl_path=$optarg
else
echo "Invalid mkl directory $optarg"
exit 1
fi
fi
;;
--with-buildtype*)
have_buildtype=yes
case $optarg in
debug|release)
buildtype=$optarg
;;
*)
echo Invalid buildtype $optarg
echo Must be debug or release
exit 1
esac
;;
--with-kaldi*)
have_kaldi=yes
if test x$optarg = x
then
kaldi_path=$(find_kaldi)
if test x$kaldi_path = x
then
echo "Cannot find kaldi directory"
echo "Please specify a value for --with-kaldi"
exit 1
fi
else
if test $(check_dir $optarg $kaldi_check) = yes
then
kaldi_path=$optarg
else
echo "Invalid kaldi directory $optarg"
exit 1
fi
fi
;;
*)
echo Invalid option $key
show_help
exit 1
esac
shift
done
if test x$buildtype = x
then
buildtype=$default_buildtype
echo Defaulting to --with-buildtype=release
fi
# If no math library was specified, search for acml and then mkl
if test x$have_acml = xno && test x$have_mkl = xno
then
acml_path=$(find_acml)
if test x$acml_path = x
then
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-acml or --with-mkl with a path."
exit 1
else
mathlib=mkl
fi
else
mathlib=acml
fi
fi
# If no cuda library specified, search for one
if test x$enable_cuda = xyes && test x$cuda_path = x
then
cuda_path=$(find_cuda)
if test x$cuda_path = x ; then
echo Cannot locate a cuda directory
echo GPU will be disabled
enable_cuda=no
else
echo Found cuda at $cuda_path
fi
fi
if test $enable_cuda = yes && test x$gdk_path = x
then
gdk_path=$(find_gdk)
if test x$gdk_path = x ; then
echo Cannot locate a gdk directory
echo GPU will be disabled
enable_cuda=no
else
echo Found gdk at $gdk_path
fi
fi
config=$build_top/Config.make
echo Generating $config
echo "#Configuration file for cntk" > $config
echo BUILDTYPE=$buildtype >> $config
echo MATHLIB=$mathlib >> $config
case $mathlib in
acml)
echo ACML_PATH=$acml_path >> $config
;;
mkl)
echo MKL_PATH=$mkl_path >> $config
;;
esac
if test $enable_cuda = yes ; then
echo CUDA_PATH=$cuda_path >> $config
echo GDK_PATH=$gdk_path >> $config
fi
if test x$kaldi_path != x ; then
echo KALDI_PATH=$kaldi_path >> $config
fi
# If we are not in the configure directory, generate a trampoline Makefile
makefile=$build_top/Makefile
if test $(is_hardlinked "$configure" "$build_top/configure") = no
then
echo Generating $makefile
realconf=`readlink -f $configure`
dir=`dirname $realconf`
echo "#Generate Makefile" > $makefile
echo dir=$dir >> $makefile
echo BUILD_TOP=$build_top >> $makefile
echo >> $makefile
echo all clean : >> $makefile
printf '\t$(MAKE) -C $(dir) BUILD_TOP=$(BUILD_TOP) $@\n' >> $makefile
fi
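When configure is run from a separate build directory, the trampoline it writes looks roughly like this (directories are hypothetical examples; the tab before $(MAKE) is required by make):

$ cat /home/user/cntk-build/Makefile
#Generate Makefile
dir=/home/user/cntk
BUILD_TOP=/home/user/cntk-build

all clean :
	$(MAKE) -C $(dir) BUILD_TOP=$(BUILD_TOP) $@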
echo run
echo '>make -j all'
echo to build
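A typical session, assuming ACML is installed under one of the default search paths so --with-acml needs no explicit directory:

$ ./configure --with-buildtype=release --with-acml
$ make -j all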