399 строки
16 KiB
399 строки
16 KiB
// <copyright file="PTaskGraphBuilder.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
#pragma once
#ifdef USE_PTASK
#include "PTask.h"
// Placeholder declarations for the PTask types and control-signal codes used throughout this header.
// NOTE(review): these follow the PTask.h include under USE_PTASK; no #else/#endif is visible in this
// view, so confirm against the full file which preprocessor branch these belong to.
typedef void Port;
typedef void Graph;
typedef void Task;
typedef void Channel;
typedef int CONTROLSIGNAL;
// control-signal codes passed as CONTROLSIGNAL values (DBCTLC_NONE is the default used by the Push* methods below)
#define DBCTLC_BOF 1
#define DBCTLC_EOF 2
#define DBCTLC_NONE 3
#include <string>
#include <cuda_runtime.h>
namespace Microsoft { namespace MSR { namespace CNTK {
// CUDA stream variable with per-thread storage (__declspec(thread)); only declared here, the definition lives elsewhere.
extern __declspec(thread) cudaStream_t t_stream;
// onstream - scoped stream override for PTask.
// Constructing an instance installs the given stream; destroying it puts the previous stream back.
// Usage, at the top of each function that calls CUDA:
//     onstream override(stream);
// NOTE(review): the class braces and any access specifiers are not visible in this view.
class onstream
cudaStream_t prevStream; // stream that was current when this object was constructed
// remember the stream returned by GetStream(), then make `stream` current via SetStream()
onstream (cudaStream_t stream) { prevStream = GetStream(); SetStream(stream);}
// restore the stream saved by the constructor
~onstream() { SetStream(prevStream); }
// ParamType - kind of value passed as a task parameter.
// Any pointer/reference and all scalar types that fit into 8 bytes can be parameters.
// NOTE(review): the enumerators aliased below (paramTypePointer, paramTypeInteger, paramTypeSizet)
// are not visible in this view; they must be declared earlier in this enum.
enum ParamType
paramTypeReference = paramTypePointer, // both the same in memory
paramTypeLong = paramTypeInteger, // alias: same enumerator value as paramTypeInteger
paramTypeLongLong = paramTypeSizet, // alias: same enumerator value as paramTypeSizet
paramTypeNode, // pass the node as first parameter, to be used ONLY for CONSTANT values
// ParamOptions - option flags for a task parameter.
// Every non-null value is a distinct power of two, so the flags can be OR-ed together
// (ParamData stores the combination in its UINT `options` member).
enum ParamOptions
paramOptionsNull = 0, // invalid value
paramOptionsInput = 1, // an input value for a task
paramOptionsOutput = 2, // an output value from a task
paramOptionsTemporary = 4, // for variables only used within one task (and then thrown away)
paramOptionsInitialize = 8, // initialize buffer with value
paramOptionsRecurrantIterator = 16, // iterator for recurrence
paramOptionsConstant = 32, // constant value
paramOptionsMaintainValue = 64, // maintain the value for in/out parameters; if not specified, in/out means a pass-through variable
paramOptionsNoPush = 128, // flag for parameter routines to not push the values
paramOptionsInitOnBOF = 256, // initialize the buffer on the BOF signal
paramOptionsSaveOnEOF = 512, // save on the EOF signal
paramOptionsInitalValuesOnDestinations = 1024, // initial values need to be set on all destination ports, needed for Update
// TaskType - kind of task a TaskDescriptor represents.
enum TaskType
taskNull, // invalid value
taskEvaluate, // EvaluateThisNode() task
taskComputeInputPartial, // ComputeInputPartial() task
taskEvaluateRecurrent, // EvaluateThisNode() Recurrent task
taskComputeInputPartialRecurrent, // ComputeInputPartial() Recurrent task
taskUpdate, // update weight matrices
taskOutput, // output node, copy back to ComputationNode structure
// ParamData - parameter data: describes one task parameter (its type, options, name,
// associated data, and the PTask port(s) created for it)
// NOTE(review): braces, access specifiers and parts of the method bodies are not visible in this view.
template<class ElemType>
class ParamData
ParamType type; // data type of the parameter
void *assocData; // associated data (e.g. for a Matrix, the matrix is the node this corresponds to)
Port* port; // PTask port that is created for this parameter
Port* portOut; // output port for in/out parameters
UINT options; // parameter options: bitwise combination of ParamOptions values (see above)
ElemType initValue; // initialization data for "paramOptionsInitialize"
std::string name; // name of the parameter
// constructors
// Both leave port and portOut NULL and initValue 0; this overload also leaves assocData NULL.
// NOTE(review): the initializer list names `name` second although it is declared last; members are
// always initialized in declaration order, so the written order is only cosmetic (some compilers warn).
ParamData(ParamType type, const std::string& name, UINT options) : type(type), name(name), assocData(NULL), port(NULL), portOut(NULL), options(options), initValue(0)
// overload that additionally records `data` as the associated data
ParamData(ParamType type, const std::string& name, void* data, UINT options) : type(type), name(name), assocData(data), port(NULL), portOut(NULL), options(options), initValue(0)
// SetInitialize - initialize a parameter with an initial value
// initVal - value to initialize the port with
void SetInitialize(ElemType initVal)
initValue = initVal;
options |= paramOptionsInitialize; // add the flag, keeping all other option bits
// SetConstant - Set a constant value to a port
// data - pointer to the data
// sizeData - size in bytes of the data
void SetConstant(void* data, int sizeData)
#ifdef USE_PTASK
// channel 0 of this parameter's port, treated as a graph input channel
GraphInputChannel* channel = (GraphInputChannel*)port->GetChannel(0);
// allocate a datablock from the port's template, passing the caller's data pointer and byte size
Datablock* pblock = PTask::Runtime::AllocateDatablock(port->GetTemplate(), (void *)data, sizeData, NULL);
// NOTE(review): the remainder of this method (what is done with channel/pblock) is not visible in this view
// forward declarations: only pointers/references to these templates are used in this header's declarations
template<class ElemType>
class ComputationNetwork;
template<class ElemType>
class ComputationNode;
// Describes a PTask task.
// One instance is created for each actual task that will be added to the PTask graph.
// The descriptors are created first, to support phased assembly of the information
// about the tasks and their relationships.
// NOTE(review): braces, access specifiers and some declaration lines are not visible in this view.
template<class ElemType>
class TaskDescriptor
typedef ComputationNode<ElemType>* ComputationNodePtr;
// NOTE(review): the next three lines look like the constructor's parameter list; the declaring
// line and the closing parenthesis are not visible here.
const ComputationNode<ElemType>* node,
TaskType taskType,
size_t input=0
virtual ~TaskDescriptor();
// task name accessors (read-only and mutable)
const std::string& TaskName() const { return m_taskName; }
std::string& TaskName() { return m_taskName; }
// true for the two evaluate task types (taskEvaluate, taskEvaluateRecurrent)
bool IsForwardTask() const { return m_taskType == taskEvaluate || m_taskType == taskEvaluateRecurrent; }
TaskType GetTaskType() const { return m_taskType;}
// NOTE(review): the leading const applies to the pointer typedef (a const pointer returned by value),
// not to the pointed-to node, so it has no effect for callers.
const ComputationNodePtr GetNode() const {return m_node;}
const Task* GetTask() const {return m_task;}
// Parameter-declaration helpers; each returns a ParamData pointer for the declared parameter.
// Defaults: gradient parameters are inputs, function parameters are outputs.
ParamData<ElemType>* GradientParam(int index=-1, UINT options=paramOptionsInput, ElemType initValue=ElemType(0.0));
ParamData<ElemType>* FunctionParam(int index=-1, UINT options=paramOptionsOutput);
ParamData<ElemType>* MatrixParam(const Matrix<ElemType>& matrix, const std::string& name, UINT options=paramOptionsInput);
ParamData<ElemType>* Param(ParamType paramType, const std::string& name, UINT options=paramOptionsInput, void* data=nullptr);
// store the function pointer associated with this task
void SetFunction(FARPROC function) {m_function = function;}
#ifdef USE_PTASK
FARPROC GetFunction() {return m_function;}
// graph-assembly steps; PTaskGraphBuilder calls these for every descriptor
void ConfigureInputsAndOutputs(UINT& uidCounter, std::map<const std::string, Port*>& valueNameToProducerPortMap);
void CreateTask(Graph* graph);
void CreateChannelsForInputs(
Graph* graph,
std::map<const std::string, Port*>& valueNameToProducerPortMap,
std::map<const std::string, std::vector<PTask::GraphInputChannel*>*>& inputNameToChannelsMap,
int verbosity);
void CreateInitializerChannel(
Graph* graph,
Port* port,
Matrix<ElemType>& matrix,
const std::string& name
void CreateBackAndInitChannel(Graph* graph, std::map<const std::string, PTask::GraphOutputChannel*>& outputNameToChannelsMap);
void FindEmptyOutPorts(Graph* graph);
// GetParamData - return the parameter data in parameter order
const std::vector<ParamData<ElemType>*>& GetParameters() const {return m_paramData;}
// NOTE(review): the explicit TaskDescriptor<ElemType>:: qualification on an in-class declaration is
// non-standard (rejected by GCC/Clang); confirm which compilers this must build with.
Port* TaskDescriptor<ElemType>::CreatePortForTemplate(DatablockTemplate* dt,
UINT portType,
std::string& valueName,
UINT portIndex, UINT inoutPort, bool gpuBuffer,
UINT& uidCounter,
std::map<const std::string, Port*>& valueNameToProducerPortMap
// NOTE(review): std::vector with a const element type does not meet the standard allocator
// requirements and fails to compile with most standard libraries.
std::vector<const std::string> m_inputNames;
std::vector<const std::string> m_outputNames;
// input/output port arrays with their element counts
UINT m_numInputPorts;
Port** m_inputPorts;
UINT m_numOutputPorts;
Port** m_outputPorts;
static DatablockTemplate* s_descriptorTemplate;
std::vector<ParamData<ElemType>*> m_paramData; // parameter data for CNTK task
ComputationNodePtr m_node; // returned by GetNode()
TaskType m_taskType; // returned by GetTaskType()
std::string m_taskName; // returned by TaskName()
Task* m_task; // returned by GetTask()
FARPROC m_function; // set by SetFunction(), returned by GetFunction()
// PTaskGraphBuilder - declares the operations for building a PTask graph from a ComputationNetwork
// (BuildFromComputationNetwork), starting it, pushing data into it and reading values back.
// The inline helpers below each walk m_taskNameToTaskDescriptorMap and run one assembly step per descriptor.
// NOTE(review): braces, access specifiers, #endif lines and some statements are not visible in this view.
template<class ElemType>
class PTaskGraphBuilder
typedef ComputationNode<ElemType>* ComputationNodePtr;
typedef TaskDescriptor<ElemType>* TaskDescriptorPtr;
virtual ~PTaskGraphBuilder();
virtual void BuildFromComputationNetwork(ComputationNetwork<ElemType>* cn);
void StartPTaskGraph();
void UpdateParameters(void* sgd, const ElemType learnRatePerSample, const size_t expectedMBSize);
// Push* methods take an optional control signal, defaulting to DBCTLC_NONE
void PushActualMBSize(const std::list<ComputationNodePtr>& learnableNodes, size_t actualMBSize, CONTROLSIGNAL signal=DBCTLC_NONE);
void PushData(std::map<std::wstring, Matrix<ElemType>*>& data, CONTROLSIGNAL signal=DBCTLC_NONE);
void PushMatrix(const Matrix<ElemType>& matrix, Channel* channel, CONTROLSIGNAL signal=DBCTLC_NONE);
// GetValue: one overload returns a single ElemType, the other fills the caller's matrix `matTo`
ElemType GetValue(ComputationNodePtr node);
void GetValue(ComputationNode<ElemType>* node, Matrix<ElemType>& matTo);
#ifdef USE_PTASK
//static void WINAPI OutputParameter(const ComputationalNode<ElemType>* node, Matrix<ElemType> &functionValues);
static void __stdcall ApplicationContextCallback(
const Datablock * pDatablock,
void ** ppApplicationContext
// Copy constructor, should never be called.
// NOTE(review): unlike operator= below, this body has no assert(false), so an accidental copy goes unnoticed.
PTaskGraphBuilder(const PTaskGraphBuilder<ElemType>& deepCopyFrom) {};
// Assignment operator, should never be called.
PTaskGraphBuilder<ElemType>& operator=(const PTaskGraphBuilder<ElemType>& deepCopyFrom) {assert(false); return *this; /* NOTE: just doing this to appease the compiler*/};
// Create descriptors to model PTask tasks that ComputationNodes will be mapped to.
void CreateTaskDescriptorsForComputationNodes();
void CreateTaskDescriptorForNode(ComputationNodePtr node, TaskType taskFlag);
void CreateInputName(std::string inputName);
// Configure task inputs and outputs.
// Calls ConfigureInputsAndOutputs() on every descriptor, sharing the port UID counter and the
// value-name -> producer-port map across all of them.
void ConfigureTaskInputsAndOutputs()
if (m_verbosity >= 1) fprintf(stderr, "\nConfiguring task inputs and outputs ...\n");
for (auto taskIter=m_taskNameToTaskDescriptorMap.begin(); taskIter != m_taskNameToTaskDescriptorMap.end(); taskIter++)
if (m_verbosity >= 2) fprintf(stderr, " Task %s\n", taskIter->first.c_str());
TaskDescriptorPtr taskDescriptor = taskIter->second;
taskDescriptor->ConfigureInputsAndOutputs(m_portUIDCounter, m_valueNameToProducerPortMap);
// Create actual PTask tasks from task descriptors.
// NOTE(review): no call on taskDescriptor is visible in this view; presumably CreateTask() is invoked - confirm.
void CreateTasksFromDescriptors()
if (m_verbosity >= 1) fprintf(stderr, "\nCreating tasks from descriptors ...\n");
for (auto taskIter=m_taskNameToTaskDescriptorMap.begin(); taskIter != m_taskNameToTaskDescriptorMap.end(); taskIter++)
if (m_verbosity >= 2) fprintf(stderr, " Task %s\n", taskIter->first.c_str());
TaskDescriptorPtr taskDescriptor = taskIter->second;
// Create PTask channels.
// Calls CreateChannelsForInputs() on every descriptor, passing the graph and the shared port/channel maps.
void CreateChannels()
if (m_verbosity >= 1) fprintf(stderr, "\nCreating PTask channels ...\n");
for (auto taskIter=m_taskNameToTaskDescriptorMap.begin(); taskIter != m_taskNameToTaskDescriptorMap.end(); taskIter++)
if (m_verbosity >= 2) fprintf(stderr, " Task %s\n", taskIter->first.c_str());
TaskDescriptorPtr taskDescriptor = taskIter->second;
taskDescriptor->CreateChannelsForInputs(m_PTaskGraph, m_valueNameToProducerPortMap, m_inputNameToChannelsMap, m_verbosity);
// Calls CreateBackAndInitChannel() on every descriptor (logged as "LearnableParameter extra channels").
void CreateBackAndInitChannels()
if (m_verbosity >= 1) fprintf(stderr, "\nCreating LearnableParameter extra channels...\n");
for (auto taskIter=m_taskNameToTaskDescriptorMap.begin(); taskIter != m_taskNameToTaskDescriptorMap.end(); taskIter++)
if (m_verbosity >= 2) fprintf(stderr, " Task %s\n", taskIter->first.c_str());
TaskDescriptorPtr taskDescriptor = taskIter->second;
taskDescriptor->CreateBackAndInitChannel(m_PTaskGraph, m_outputNameToChannelsMap);
// Walks every descriptor looking for empty output ports to plug.
// NOTE(review): no call on taskDescriptor is visible in this view; presumably FindEmptyOutPorts() is invoked - confirm.
void FindEmptyOutPorts()
if (m_verbosity >= 1) fprintf(stderr, "\nFinding empty ports to plug...\n");
for (auto taskIter=m_taskNameToTaskDescriptorMap.begin(); taskIter != m_taskNameToTaskDescriptorMap.end(); taskIter++)
TaskDescriptorPtr taskDescriptor = taskIter->second;
void CreatePropogationPath();
void CreateOutputChannels(const vector<ComputationNodePtr>& nodes);
// LimitAccelerators - Limit the Accelerators to the ones chosen by the user (or decided by the "bestGPU" algorithm)
void LimitAccelerators()
UINT uiIndex = 0; // NOTE(review): not used in the lines visible here
// get the devices from here
std::vector<int>::iterator vi;
std::vector<int> devices = g_bestGpu->GetDevices(BestGpu::RequeryDevices, bestGpuRequery);
// enable each returned device id as a CUDA accelerator for PTask
for(vi=devices.begin(); vi!=devices.end(); vi++)
if (m_verbosity >= 1)
// NOTE(review): this message goes to stdout, while every other diagnostic in this class uses stderr
printf("-- CUDA accelerator with CUDA id %d ENABLED for PTask\n", *vi);
PTask::Runtime::EnableAccelerator(ACCELERATOR_CLASS_CUDA, *vi);
// Start the PTask graph executing.
void StartGraph()
// Output the graph in Graphviz 'dot' format.
// Obtain Graphviz from http://www.graphviz.org/Download_windows.php
// Currently using version 2.34. Add C:\Program Files (x86)\Graphviz2.34\bin to PATH.
// Use:
// dot -Tpng C:\temp\PTaskGraph.dot -o PTaskGraph.png
// to render as PNG image.
// NOTE(review): the statement that actually writes the dot file is not visible in this view.
if (m_verbosity >= 1)
fprintf(stderr, "Outputting graph to %s in Graphviz 'dot' format ...\n", PTASK_GRAPH_VIZ_FILE);
fprintf(stderr, " Convert to .png with: dot -Tpng C:\\temp\\PTaskGraph.dot -o PTaskGraph.png\n");
fprintf(stderr, " See PTaskGraphBuilder::StartGraph() for details.\n");
// validate the graph before running it
if (m_verbosity >= 1) fprintf(stderr, "Checking graph semantics ...\n");
Runtime::CheckGraphSemantics(m_PTaskGraph, TRUE, TRUE);
if (m_verbosity >= 1) fprintf(stderr, "Starting graph ...\n");
m_PTaskGraph->Run(TRUE); // single threaded for debugging
// reports whether the underlying PTask graph is running
bool IsRunning()
return m_PTaskGraph->IsRunning();
// NOTE(review): the explicit PTaskGraphBuilder<ElemType>:: qualification on an in-class declaration is
// non-standard (rejected by GCC/Clang).
TaskDescriptor<ElemType>* PTaskGraphBuilder<ElemType>::GetPTaskDescriptorOutput(ComputationNodePtr node) const;
std::vector<ComputationNodePtr> m_computationNodes;
// lookup tables shared across the assembly steps above
std::map<const std::string, TaskDescriptorPtr> m_taskNameToTaskDescriptorMap; // task name -> descriptor
std::map<const std::string, Port*> m_valueNameToProducerPortMap; // value name -> producer port
std::map<const std::string,
std::vector<PTask::GraphInputChannel*>*> m_inputNameToChannelsMap; // input name -> graph input channels
std::map<const std::string,
PTask::GraphOutputChannel*> m_outputNameToChannelsMap; // output name -> graph output channel
UINT m_portUIDCounter; // passed by reference to ConfigureInputsAndOutputs()
Graph* m_PTaskGraph; // the PTask graph being built and run
int m_verbosity; // logging level: >= 1 prints phase messages, >= 2 also prints per-task lines
// state for PTask nodes
ComputationNetwork<ElemType>* m_cn;
#ifdef USE_PTASK
// the Host Task driver
// Declared only when USE_PTASK is defined; takes the dependent context supplied for the host task.
// NOTE(review): the matching #endif is not visible in this view.
template <class ElemType>
static void __stdcall
HostTaskDriver(LPDEPENDENTCONTEXT depContext);