Integrate clemensm/s2sfix into master

Project Philly committed on 2016-05-19 12:32:28 -07:00
Parents: 2f2ca021c9 832655a154
Commit: b9f3c541b5
11 changed files: 569 additions and 194 deletions

.gitignore (vendored)
View file

@@ -152,7 +152,9 @@ ModelManifest.xml
# Python
*.pyc
__pychache__/
__pycache__/
contrib/Python/doc/_build/*
contrib/Python/_cntk_default/*
# =========================
# Windows detritus

View file

@@ -28,6 +28,32 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElemType>
class IEvaluateModelBase
{
public:
//
// Load a model based on configuration. The syntax is the same as when calling the cntk executable.
// e.g. "modelFile=model.dat deviceId=0".
// numCPUThreads can be used to set the thread count of BLAS.
//
virtual void Init(const std::string& config) = 0;
//
// Create a network based on an (NDL) network description.
//
virtual void CreateNetwork(const std::string& networkDescription) = 0;
//
// Free resources
//
virtual void Destroy() = 0;
};
// ------------------------------------------------------------------------
// Basic (legacy) interface
// ------------------------------------------------------------------------
enum NodeGroup
{
nodeInput, // an input node
@@ -39,33 +65,54 @@ enum NodeGroup
// NOTICE: This interface is a public interface for evaluating models in CNTK.
// Changes to this interface may affect other projects, such as Argon and LatGen,
// and therefore need to be communicated with such groups.
template <class ElemType>
class IEvaluateModel // Evaluate Model Interface
template <typename ElemType>
class IEvaluateModel : public IEvaluateModelBase<ElemType> // Evaluate Model Interface
{
public:
virtual void Init(const std::string& config) = 0;
virtual void Destroy() = 0;
virtual void CreateNetwork(const std::string& networkDescription) = 0;
//
// Retrieves the (flattened) dimensions
//
virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup) = 0;
//
// Allocate resources for a particular output.
//
virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName) = 0;
//
// Evaluate a model in frame mode. This does not support dynamic axes or sparse input data.
// Given a feature vector of dimension d, the inputs may contain n * d elements. The output will then be computed
// for n samples.
// inputs - map from node name to array of input tensors, flattened to vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
//
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs) = 0;
//
// Evaluate - Evaluate using the network without input and provide the outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
//
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs) = 0;
virtual void ResetState() = 0;
};
// GetEval - get an evaluator type from the DLL
// since we have 2 evaluator types based on template parameters, this exposes 2 exports
// could be done directly with the templated name, but that requires mangled C++ names
template <class ElemType>
template <typename ElemType>
void EVAL_API GetEval(IEvaluateModel<ElemType>** peval);
extern "C" EVAL_API void GetEvalF(IEvaluateModel<float>** peval);
extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval);
// Data Reader class
// interface for clients of the Data Reader
// mirrors the IEvaluateModel interface, except the Init method is private (use the constructor)
template <class ElemType>
template <typename ElemType>
class Eval : public IEvaluateModel<ElemType>, protected Plugin
{
private:
@@ -84,6 +131,7 @@ public:
// modelPath=c:\models\model.dnn (model path; if not specified, the LoadModel() method must be called before Evaluate())
// minibatchSize=1024 (minibatch size used during evaluation if < passed data size)
Eval(const std::string& config);
virtual ~Eval();
// CreateNetwork - create a network based on the network description
@@ -101,14 +149,146 @@ public:
// Evaluate - Evaluate using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
// Evaluate - Evaluate using the network without input, and provide the outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Init(const std::string& config);
virtual void ResetState();
};
// ------------------------------------------------------------------------
// Extended interface
// ------------------------------------------------------------------------
//
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input or output.
// This is used for both dense and sparse data.
//
template<typename ElemType>
struct VariableBuffer
{
size_t m_numberOfSamples = 0;
//
// All elements of a sequence, concatenated.
//
std::vector<ElemType> m_buffer;
// In case of sparse data, the following is also used. Otherwise, the
// contents are ignored.
// E.g. a sequence of three sparse vectors with 2 / 4 / 2 non-zero values
// could be represented as the following:
// colIdx: 0 2 6 8
// v v v v
// indices 1 3 2 3 5 6 2 7
// buffer 0 1 2 3 4 5 6 7
//
// For every element in buffer, an entry in this array gives its position.
// For every vector the entries must be ascending.
//
std::vector<int> m_indices;
//
// Contains m_numberOfSamples + 1 indices into the buffer. The first entry
// is always 0. The last entry points after the last element.
// See http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc
//
std::vector<int> m_colIndices;
};
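To make the sparse layout concrete, here is a short sketch (not from the commit) that fills a VariableBuffer with exactly the 2 / 4 / 2 example from the comment above; the non-zero values themselves are arbitrary:
VariableBuffer<float> buf;
buf.m_numberOfSamples = 3;                              // three sparse column vectors
buf.m_buffer     = { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f }; // 8 non-zero values, concatenated
buf.m_indices    = { 1, 3, 2, 3, 5, 6, 2, 7 };          // position of each value within its vector
buf.m_colIndices = { 0, 2, 6, 8 };                      // m_numberOfSamples + 1 entries, starting at 0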
//
// Meta data
//
struct VariableLayout
{
enum DataType
{
Float32,
Float64
};
enum StorageType
{
Undetermined,
Dense,
Sparse,
};
// Name of the input
std::wstring m_name;
DataType m_dataType;
StorageType m_storageType;
// Dimension of the tensor, flattened to 1 dimension, for one entry on the dynamic axis.
// E.g. for a tensor [2,3,*] this would be 6.
int m_numElements;
// Name of the axis, potentially shared between inputs. For any two inputs sharing the same
// dynamic axis, the sequence cardinality must be the same.
std::wstring m_dynamicAxisName;
};
template <typename ElemType>
using Variables = std::vector<VariableBuffer<ElemType>>;
using VariableSchema = std::vector<VariableLayout>;
//
// Extended interface, allowing for sparse input.
//
template <typename ElemType>
class IEvaluateModelExtended : public IEvaluateModelBase<ElemType>
{
public:
//
// GetOutputSchema - retrieve information about tensor shapes and memory layout of the outputs for this
// model.
//
virtual VariableSchema GetOutputSchema() const = 0;
//
// Allocate internal state for calling ForwardPass(). The call restricts the network (inputs and outputs)
// to the functions represented by the output name.
//
virtual void StartForwardEvaluation(std::vector<std::wstring> outputs) = 0;
//
// GetInputSchema - retrieve information about tensor shapes and memory layout of inputs necessary for a
// particular output. By default this returns all available inputs. After StartForwardEvaluation(), this
// returns all the inputs necessary to compute the outputs.
//
virtual VariableSchema GetInputSchema() const = 0;
//
// Evaluate - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// outputs.
// The layout and shape of the data in the inputs vector must match the schema returned by GetInputSchema().
// This method is not reentrant, as the forward pass keeps internal state.
// outputId - output to compute values for. See GetOutputSchema()
// inputs - vector of input buffers, one for every input as given by GetInputSchema()
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing
// will happen during evaluation.
// Called after StartForwardEvaluation()
//
virtual void ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output) = 0;
};
template <typename ElemType>
void EVAL_API GetEvalExtended(IEvaluateModelExtended<ElemType>** peval);
extern "C" EVAL_API void GetEvalExtendedF(IEvaluateModelExtended<float>** peval);
extern "C" EVAL_API void GetEvalExtendedD(IEvaluateModelExtended<double>** peval);
} } }
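Putting the extended interface together, a hedged end-to-end sketch; the model file and the output node name "out.z" are placeholder assumptions, and the inputs here are dense with a single sample each:
// Sketch only: one forward pass through the extended interface declared above.
void ForwardOnce()
{
    IEvaluateModelExtended<float>* eval = nullptr;
    GetEvalExtendedF(&eval);
    eval->Init("modelFile=model.dat deviceId=0");
    eval->CreateNetwork("modelPath=model.dat"); // assumption: same syntax as the cntk executable
    eval->StartForwardEvaluation({ L"out.z" }); // restricts the network to this output
    VariableSchema inputSchema = eval->GetInputSchema(); // now only the inputs needed for "out.z"
    Variables<float> inputs(inputSchema.size());
    for (size_t i = 0; i < inputSchema.size(); ++i)
    {
        inputs[i].m_numberOfSamples = 1; // one dense sample per input
        inputs[i].m_buffer.assign(inputSchema[i].m_numElements, 0.0f);
    }
    Variables<float> outputs(eval->GetOutputSchema().size());
    eval->ForwardPass(inputs, outputs); // output buffers are sized during the pass
    eval->Destroy();
}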

View file

@@ -478,6 +478,47 @@ public:
return std::vector<ComputationNodeBasePtr>{node};
}
std::vector<ComputationNodeBasePtr> OutputNodesByName(const std::vector<std::wstring>& outputNodeNames)
{
std::vector<ComputationNodeBasePtr> outputNodes;
if (outputNodeNames.size() == 0)
{
if (OutputNodes().size() == 0)
RuntimeError("There is no default output node specified in the network.");
outputNodes = OutputNodes();
}
else
{
for (int i = 0; i < outputNodeNames.size(); i++)
outputNodes.push_back(GetNodeFromName(outputNodeNames[i]));
}
return outputNodes;
}
// Collect all input nodes that outputNodes depend on.
std::vector<ComputationNodeBasePtr> InputNodesForOutputs(const std::vector<std::wstring>& outputNodeNames)
{
// use map to remove duplicated items
auto outputNodes = OutputNodesByName(outputNodeNames);
std::set<ComputationNodeBasePtr> inputNodesMap;
for (auto& onode : outputNodes)
{
for (auto& inode : InputNodes(onode))
inputNodesMap.insert(inode);
}
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto& inode : inputNodesMap)
inputNodes.push_back(inode);
return inputNodes;
}
// these are specified as such by the user
const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_featureNodes ; }
const std::vector<ComputationNodeBasePtr>& LabelNodes() const { return m_labelNodes ; }
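The two helpers added here are exactly the pattern the extended evaluator and SimpleOutputWriter use later in this commit; in isolation it is simply (node name hypothetical):
std::vector<std::wstring> names = { L"out.z" };      // empty vector = default output nodes
auto outputNodes = net->OutputNodesByName(names);    // resolve outputs by name
auto inputNodes  = net->InputNodesForOutputs(names); // de-duplicated input dependencies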

View file

@@ -18,6 +18,11 @@
#endif
#include "BestGpu.h"
#include "MPIWrapper.h"
#include "DataDeserializer.h"
#include "SequencePacker.h"
#include "NoRandomizer.h"
#include "HeapMemoryProvider.h"
#include "InputAndParamNodes.h"
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
@@ -26,7 +31,50 @@ bool g_shareNodeValueMatrices = false;
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
template <typename ElemType>
void CNTKEvalBase<ElemType>::Init(const std::string& config)
{
m_config.Parse(config);
size_t nThreads = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThreads);
g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
}
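As a sketch, the keys parsed here combine into a single space-separated Init string on the client side (the model file name is hypothetical, and `eval` is an evaluator obtained via the exports below):
eval->Init("modelFile=model.dat numCPUThreads=4 shareNodeValueMatrices=true");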
// CreateNetwork - create a network based on the network description
// networkDescription - network description
template <typename ElemType>
void CNTKEvalBase<ElemType>::CreateNetwork(const std::string& networkDescription)
{
ConfigParameters config;
config.Parse(networkDescription);
std::vector<wstring> outputNodeNames;
m_net = GetModelFromConfig<ConfigParameters, ElemType>(config, outputNodeNames);
if (m_net == nullptr)
{
LogicError("Unable to construct network from description");
}
}
// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
template <typename ElemType>
void CNTKEvalBase<ElemType>::Destroy()
{
// cleanup everything
m_net.reset();
}
// ----------------------------------------------------------------------------
// Basic interface
// ----------------------------------------------------------------------------
template <typename ElemType>
void EVAL_API GetEval(IEvaluateModel<ElemType>** peval)
{
*peval = new CNTKEval<ElemType>();
@@ -41,51 +89,11 @@ extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval)
GetEval(peval);
}
template <class ElemType>
void CNTKEval<ElemType>::Init(const std::string& config)
{
m_start = 0;
m_config.Parse(config);
size_t nThreads = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThreads);
g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
}
// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
template <class ElemType>
void CNTKEval<ElemType>::Destroy()
{
// cleanup everything
m_net.reset();
delete m_reader;
delete m_writer;
delete this;
}
// CreateNetwork - create a network based on the network description
// networkDescription - network description
template <class ElemType>
void CNTKEval<ElemType>::CreateNetwork(const std::string& networkDescription)
{
ConfigParameters config;
config.Parse(networkDescription);
std::vector<wstring> outputNodeNames;
m_net = GetModelFromConfig<ConfigParameters, ElemType>(config, outputNodeNames);
if (m_net == nullptr)
{
LogicError("Unable to construct network from description");
}
}
// GetNodeDimensions - Get the node dimensions of the specified nodes
// dimensions - map from name of node to dimension of the node, will be appended to for Input/Output scenarios
// nodeGroup - type of node we are requesting (input/output/specified)
// NOTE: when nodeGroup==specified the dimensions map is expected to be populated with the string names of the nodes requested; dimensions will be modified to return the current values.
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup)
{
if (m_net == NULL)
@@ -137,7 +145,7 @@ void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimen
// StartEvaluateMinibatchLoop - Prepare network for Evaluate() calls.
// outputNodeName - name of node that will be evaluated
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::StartEvaluateMinibatchLoop(const std::wstring& outputNodeName)
{
m_net->StartEvaluateMinibatchLoop(m_net->GetNodeFromName(outputNodeName));
@@ -146,7 +154,7 @@ void CNTKEval<ElemType>::StartEvaluateMinibatchLoop(const std::wstring& outputNo
// Evaluate - Evaluate using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs)
{
size_t minibatchSize = m_config(L"minibatchSize", (size_t) 10240);
@@ -183,7 +191,7 @@ void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>
// Evaluate - Evaluate using the model with the given inputs and outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs)
{
// get the evaluation names from the output string
@@ -206,14 +214,168 @@ void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>
eval.WriteOutput(*m_writer, outNodeNames);
}
// ResetState - Reset the cell state when we get start of an utterance
template <class ElemType>
void CNTKEval<ElemType>::ResetState()
template <typename ElemType>
void CNTKEval<ElemType>::Destroy()
{
m_start = 1 - m_start;
CNTKEvalBase<ElemType>::Destroy();
delete m_reader;
delete m_writer;
delete this;
}
// instantiate all the combinations we expect to be used
template class CNTKEval<double>;
template class CNTKEval<float>;
// ----------------------------------------------------------------------------
// Extended interface
// ----------------------------------------------------------------------------
template<typename ElemType>
VariableLayout CNTKEvalExtended<ElemType>::ToVariableLayout(const ComputationNodeBasePtr n)
{
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(n->ValuePtr());
return VariableLayout
{
/* name */ n->GetName(),
/* type */ sizeof(ElemType) == sizeof(float) ? VariableLayout::Float32 : VariableLayout::Float64,
/* storage */ matrix ? matrix->GetMatrixType() == MatrixType::DENSE ? VariableLayout::Dense :
matrix->GetMatrixType() == MatrixType::SPARSE ? VariableLayout::Sparse :
VariableLayout::Undetermined :
VariableLayout::Undetermined,
/* dimension */ n->GetSampleLayout().GetNumElements(),
/* dynamic axis */ wstring(n->GetMBLayout()->GetAxisName())
};
}
template<typename ElemType>
void CNTKEvalExtended<ElemType>::StartForwardEvaluation(std::vector<wstring> outputNodeNames)
{
m_scopedNetworkOperationMode = make_shared<ScopedNetworkOperationMode>(m_net, NetworkOperationMode::inferring);
m_outputNodes = m_net->OutputNodesByName(outputNodeNames);
m_inputNodes = m_net->InputNodesForOutputs(outputNodeNames);
// allocate memory for forward computation
m_net->AllocateAllMatrices({}, m_outputNodes, nullptr);
m_net->StartEvaluateMinibatchLoop(m_outputNodes);
m_inputMatrices = DataReaderHelpers::RetrieveInputMatrices(m_inputNodes);
}
template<typename ElemType>
VariableSchema CNTKEvalExtended<ElemType>::GetOutputSchema() const
{
VariableSchema schema;
for (const auto& n : m_net->OutputNodes())
{
schema.push_back(ToVariableLayout(n));
}
return schema;
}
template<typename ElemType>
VariableSchema CNTKEvalExtended<ElemType>::GetInputSchema() const
{
VariableSchema inputLayouts;
auto nodes = m_inputNodes;
if (nodes.size() == 0)
{
// Default to all nodes
nodes = m_net->InputNodesForOutputs({});
}
for (const auto& n : nodes)
{
inputLayouts.push_back(ToVariableLayout(n));
}
return inputLayouts;
}
template<typename ElemType>
void CNTKEvalExtended<ElemType>::ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output)
{
if (inputs.size() != (size_t)std::distance(m_inputMatrices.begin(), m_inputMatrices.end()))
{
RuntimeError("Expected %d inputs, but got %d", (int)std::distance(m_inputMatrices.begin(), m_inputMatrices.end()), (int)inputs.size());
}
int i = 0;
for (auto& input : m_inputMatrices)
{
VariableBuffer<ElemType> buffer = inputs[i];
int numRows = input.second.sampleLayout.GetNumElements();
int numCols = buffer.m_numberOfSamples;
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto type = matrix->GetMatrixType();
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
{
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
}
else if (type == MatrixType::SPARSE)
{
// In the sparse case the m_buffer layout is identical to CUDA's CSC layout
// (see http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc).
matrix->SetMatrixFromCSCFormat(buffer.m_colIndices.data(), buffer.m_indices.data(), buffer.m_buffer.data(), buffer.m_buffer.size(), numRows, numCols);
}
++i;
}
ComputationNetwork::BumpEvalTimeStamp(m_inputNodes);
for (int i = 0; i < m_outputNodes.size(); ++i)
{
auto node = m_outputNodes[i];
m_net->ForwardProp(node);
shared_ptr<Matrix<ElemType>> outputMatrix = dynamic_pointer_cast<Matrix<ElemType>>(node->ValuePtr());
auto pMBLayout = node->GetMBLayout();
if (!pMBLayout)
{
pMBLayout = make_shared<MBLayout>();
pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
}
const auto& seq = pMBLayout->GetAllSequences();
if (seq.size() != 1)
{
RuntimeError("Only 1 sequence supported by this API"); // TODO
}
std::vector<ElemType>& vec = output[i].m_buffer;
vec.resize(outputMatrix->GetNumElements());
ElemType* data = const_cast<ElemType*>(vec.data());
size_t numElements = outputMatrix->GetNumElements();
outputMatrix->CopyToArray(data, numElements);
}
}
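To exercise the sparse branch above, a caller would hand ForwardPass a CSC-formatted buffer like the VariableBuffer sketched earlier; the values, and a network with a single sparse input, are assumptions:
Variables<float> inputs(1), outputs(1);
inputs[0].m_numberOfSamples = 3;
inputs[0].m_indices    = { 1, 3, 2, 3, 5, 6, 2, 7 };
inputs[0].m_colIndices = { 0, 2, 6, 8 };
inputs[0].m_buffer.assign(8, 0.5f);  // the 8 non-zero values
eval->ForwardPass(inputs, outputs);  // takes the MatrixType::SPARSE path above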
template <typename ElemType>
void CNTKEvalExtended<ElemType>::Destroy()
{
CNTKEvalBase<ElemType>::Destroy();
delete this;
}
template <typename ElemType>
void EVAL_API GetEvalExtended(IEvaluateModelExtended<ElemType>** peval)
{
*peval = new CNTKEvalExtended<ElemType>();
}
extern "C" EVAL_API void GetEvalExtendedF(IEvaluateModelExtended<float>** peval)
{
GetEvalExtended(peval);
}
extern "C" EVAL_API void GetEvalExtendedD(IEvaluateModelExtended<double>** peval)
{
GetEvalExtended(peval);
}
template class CNTKEvalExtended<double>;
template class CNTKEvalExtended<float>;
} } }

View file

@@ -22,48 +22,97 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CNTKEval : public IEvaluateModel<ElemType>
template <typename ElemType>
class CNTKEvalBase : public IEvaluateModelBase<ElemType>
{
protected:
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
EvalReader<ElemType>* m_reader;
EvalWriter<ElemType>* m_writer;
ConfigParameters m_config;
ComputationNetworkPtr m_net;
std::map<std::wstring, size_t> m_dimensions;
size_t m_start;
public:
// constructor
CNTKEval()
: m_reader(nullptr), m_net(nullptr)
{
}
CNTKEvalBase() : m_net(nullptr) { }
public:
// CreateNetwork - create a network based on the network description
// networkDescription - network description
virtual void CreateNetwork(const std::string& networkDescription);
// GetNodeDimensions - Get the node dimensions of the specified nodes
// dimensions - map from name of node to dimension of the node
// nodeGroup - type of node we are requesting (input/output/specified)
virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup);
// StartEvaluateMinibatchLoop - Prepare network for Evaluate() calls.
// outputNodeName - name of node that will be evaluated
virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName);
// Evaluate - Evaluate using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
// Evaluate - Evaluate using the model with the given inputs and outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Init(const std::string& config);
virtual void Destroy();
virtual void ResetState();
};
// ------------------------------------------------------------------------
// Basic interface
// ------------------------------------------------------------------------
template <typename ElemType>
class CNTKEval : public CNTKEvalBase<ElemType>, public IEvaluateModel<ElemType>
{
EvalReader<ElemType>* m_reader;
EvalWriter<ElemType>* m_writer;
std::map<std::wstring, size_t> m_dimensions;
size_t m_start;
public:
CNTKEval() : CNTKEvalBase<ElemType>(), m_reader(nullptr), m_writer(nullptr) {}
virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup);
virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName);
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Destroy() override;
virtual void CreateNetwork(const std::string& networkDescription) override
{
CNTKEvalBase<ElemType>::CreateNetwork(networkDescription);
}
virtual void Init(const std::string& config) override
{
CNTKEvalBase<ElemType>::Init(config);
m_start = 0;
}
virtual void ResetState() override
{
m_start = 1 - m_start;
}
};
// ------------------------------------------------------------------------
// Extended interface
// ------------------------------------------------------------------------
template <typename ElemType>
class CNTKEvalExtended : public CNTKEvalBase<ElemType>, public IEvaluateModelExtended<ElemType>
{
virtual VariableSchema GetOutputSchema() const override;
virtual void StartForwardEvaluation(std::vector<wstring> outputs) override;
virtual VariableSchema GetInputSchema() const override;
virtual void ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output) override;
virtual void Destroy() override;
virtual void CreateNetwork(const std::string& networkDescription) override
{
CNTKEvalBase<ElemType>::CreateNetwork(networkDescription);
}
virtual void Init(const std::string& config) override
{
CNTKEvalBase<ElemType>::Init(config);
}
private:
static VariableLayout ToVariableLayout(const ComputationNodeBasePtr n);
std::vector<ComputationNodeBasePtr> m_outputNodes;
std::shared_ptr<ScopedNetworkOperationMode> m_scopedNetworkOperationMode;
std::vector<ComputationNodeBasePtr> m_inputNodes;
StreamMinibatchInputs m_inputMatrices;
};
} } }

View file

@@ -55,8 +55,8 @@
<TargetName>EvalDll</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>
@@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib;ReaderLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
@@ -153,4 +153,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@@ -2,39 +2,18 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="CNTKEval.cpp" />
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\Config.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\Eval.cpp">
<Filter>For External Use</Filter>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptParser.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />

View file

@@ -715,11 +715,12 @@ void CPUMatrix<ElemType>::SetValue(const ElemType v)
}
else
{
ElemType* bufPtr = Data();
ElemType* bufPtr = Data();
long m = (long) GetNumElements();
// 2-way thread parallelism is sufficient for the memory bound
// operation of just setting the values of an array.
const unsigned SETVALUE_NUM_THREADS = 2;
UNUSED(SETVALUE_NUM_THREADS); // in case OMP is turned off.
#pragma omp parallel for num_threads(SETVALUE_NUM_THREADS)
// four-way unrolling
for (long i = 0; i < (m & ~3); i += 4)

View file

@@ -2,12 +2,6 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="Matrix.cpp" />
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="CPUMatrix.cpp">
<Filter>CPU</Filter>
</ClCompile>

View file

@@ -15,12 +15,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*static*/ struct DataReaderHelpers
{
template <class ElemType>
static void NotifyChangedNodes(ComputationNetworkPtr net, StreamMinibatchInputs& inputMatrices)
{
// reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that.
// TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler.
std::set<MatrixBasePtr> matrices;
for (const auto& iter : inputMatrices)
matrices.insert(iter.second.matrix);
for (auto& node : net->FeatureNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
for (auto& node : net->LabelNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
}
// -------------------------------------------------------------------
// GetMinibatchIntoNetwork() -- get one minibatch from Reader (this->trainSetDataReader) into Network (this->net)
// Returns false if no data is read. In that case, no other return value can be expected to contain meaningful values (e.g. actualMBSize will be unchanged).
// Sets actualMBSize to the number of matrix columns. Note that 0 is a valid value to be returned for actualMBSize, caller must handle that correctly.
// -------------------------------------------------------------------
// Note: This will go away with the redesigned reader interface.
// TODO: callers of this often do ComputationNetwork::BumpEvalTimeStamp(featureNodes) and also for labels; we should eliminate the need for this.
template <class ElemType>
@@ -78,17 +93,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
DecimateMinibatchInPlace<ElemType>(inputMatrices, mpi->NumNodesInUse(), mpi->CurrentNodeRank(), pMBLayout);
}
// reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that.
// TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler.
std::set<MatrixBasePtr> matrices;
for (const auto& iter : inputMatrices)
matrices.insert(iter.second.matrix);
for (auto& node : net->FeatureNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
for (auto& node : net->LabelNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
NotifyChangedNodes<ElemType>(net, inputMatrices);
// get MB size and tell Network to update its nodes' buffers based on what's in the input matrices
// Note: Decimation may have reduced this to 0 frames. We still must return 'true'.
@@ -99,6 +104,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return true;
}
// get StreamMinibatchInputs for a given set of input nodes
static StreamMinibatchInputs RetrieveInputMatrices(const std::vector<ComputationNodeBasePtr>& inputNodes)
{
StreamMinibatchInputs inputMatrices;
for (auto& node : inputNodes)
inputMatrices.AddInput(node->NodeName(), node->ValuePtr(), node->GetMBLayout(), node->GetSampleLayout());
return inputMatrices;
}
// -------------------------------------------------------------------
// DecimateMinibatch - decimate minibatch for parallelization
// -------------------------------------------------------------------

View file

@@ -23,63 +23,12 @@ using namespace std;
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class SimpleOutputWriter
{
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
private:
std::vector<ComputationNodeBasePtr> DetermineOutputNodes(const std::vector<std::wstring>& outputNodeNames)
{
std::vector<ComputationNodeBasePtr> outputNodes;
if (outputNodeNames.size() == 0)
{
if (m_verbosity > 0)
fprintf(stderr, "OutputNodeNames are not specified, using the default output nodes.\n");
if (m_net->OutputNodes().size() == 0)
LogicError("There is no default output node specified in the network.");
outputNodes = m_net->OutputNodes();
}
else
{
for (int i = 0; i < outputNodeNames.size(); i++)
outputNodes.push_back(m_net->GetNodeFromName(outputNodeNames[i]));
}
return outputNodes;
}
// collect all input nodes that outputNodes depend on
// TODO: This is rather generic, we should move this to a shared place. DataReaderHelpers.h?
std::vector<ComputationNodeBasePtr> DetermineInputNodes(const std::vector<ComputationNodeBasePtr>& outputNodes)
{
// use map to remove duplicated items
std::set<ComputationNodeBasePtr> inputNodesMap;
for (auto& onode : outputNodes)
{
for (auto& inode : m_net->InputNodes(onode))
inputNodesMap.insert(inode);
}
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto& inode : inputNodesMap)
inputNodes.push_back(inode);
return inputNodes;
}
// get StreamMinibatchInputs for a given set of input nodes
// TODO: This seems generic, we should have that in a shared place.
StreamMinibatchInputs RetrieveInputMatrices(const std::vector<ComputationNodeBasePtr>& inputNodes)
{
StreamMinibatchInputs inputMatrices;
for (auto& node : inputNodes)
inputMatrices.AddInput(node->NodeName(), node->ValuePtr(), node->GetMBLayout(), node->GetSampleLayout());
return inputMatrices;
}
public:
SimpleOutputWriter(ComputationNetworkPtr net, int verbosity = 0)
: m_net(net), m_verbosity(verbosity)
@@ -90,13 +39,16 @@ public:
{
ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring);
std::vector<ComputationNodeBasePtr> outputNodes = DetermineOutputNodes(outputNodeNames);
std::vector<ComputationNodeBasePtr> inputNodes = DetermineInputNodes(outputNodes);
if (outputNodeNames.size() == 0 && m_verbosity > 0)
fprintf(stderr, "OutputNodeNames are not specified, using the default output nodes.\n");
std::vector<ComputationNodeBasePtr> outputNodes = m_net->OutputNodesByName(outputNodeNames);
std::vector<ComputationNodeBasePtr> inputNodes = m_net->InputNodesForOutputs(outputNodeNames);
// allocate memory for forward computation
m_net->AllocateAllMatrices({}, outputNodes, nullptr);
StreamMinibatchInputs inputMatrices = RetrieveInputMatrices(inputNodes);
StreamMinibatchInputs inputMatrices = DataReaderHelpers::RetrieveInputMatrices(inputNodes);
// evaluate with minibatches
dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples);
@@ -148,7 +100,7 @@ public:
// Perform a single forward pass to obtain the output values from a network
void WriteOutput(IDataWriter& dataWriter, const std::vector<std::wstring>& outputNodeNames, size_t numOutputSamples = requestDataSize, bool doUnitTest = false)
{
std::vector<ComputationNodeBasePtr> outputNodes = DetermineOutputNodes(outputNodeNames);
std::vector<ComputationNodeBasePtr> outputNodes = m_net->OutputNodesByName(outputNodeNames);
// allocate memory for forward computation
m_net->AllocateAllMatrices({}, outputNodes, nullptr);
@@ -203,8 +155,8 @@ public:
// In case of unit test, make sure backprop works
ScopedNetworkOperationMode modeGuard(m_net, nodeUnitTest ? NetworkOperationMode::training : NetworkOperationMode::inferring);
std::vector<ComputationNodeBasePtr> outputNodes = DetermineOutputNodes(outputNodeNames);
std::vector<ComputationNodeBasePtr> inputNodes = DetermineInputNodes(outputNodes);
std::vector<ComputationNodeBasePtr> outputNodes = m_net->OutputNodesByName(outputNodeNames);
std::vector<ComputationNodeBasePtr> inputNodes = m_net->InputNodesForOutputs(outputNodeNames);
std::vector<ComputationNodePtr> gradientNodes;
std::vector<ComputationNodeBasePtr> allOutputNodes = outputNodes;
@@ -244,7 +196,7 @@ public:
m_net->AllocateAllMatrices({}, outputNodes, outputNodes[0]);
}
StreamMinibatchInputs inputMatrices = RetrieveInputMatrices(inputNodes);
StreamMinibatchInputs inputMatrices = DataReaderHelpers::RetrieveInputMatrices(inputNodes);
// load a label mapping if requested
std::vector<std::string> labelMapping;