WinMLModelRunner sample
This commit is contained in:

Parent: 1ef6ae729d
Commit: 9fd8281724
@@ -0,0 +1,109 @@ BindingUtilities.h
#pragma once
#include "Common.h"
#include "ModelBinding.h"

using namespace winrt::Windows::AI::MachineLearning;

namespace BindingUtilities
{
    // Creates a tensor of uninitialized ("garbage") data for every input feature
    // of the model and binds it to the given binding context.
    void BindTensorsFromGarbageData(LearningModelBinding context, LearningModel model) {
        for (auto&& description : model.InputFeatures())
        {
            if (description == nullptr)
            {
                ThrowFailure(L"Learning model has no binding description.");
            }

            hstring name = description.Name();
            TensorFeatureDescriptor tensorDescriptor = description.as<TensorFeatureDescriptor>();
            TensorKind tensorKind = tensorDescriptor.TensorKind();

            switch (tensorKind) {
                case TensorKind::Undefined:
                {
                    ThrowFailure(L"TensorKind is undefined.");
                }
                case TensorKind::Float:
                {
                    ModelBinding<float> binding(description);
                    ITensor tensor = TensorFloat::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::Double:
                {
                    ModelBinding<double> binding(description);
                    ITensor tensor = TensorDouble::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::Int8:
                {
                    ModelBinding<uint8_t> binding(description);
                    ITensor tensor = TensorInt8Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::UInt8:
                {
                    ModelBinding<uint8_t> binding(description);
                    ITensor tensor = TensorUInt8Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::Int16:
                {
                    ModelBinding<int16_t> binding(description);
                    ITensor tensor = TensorInt16Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::UInt16:
                {
                    ModelBinding<uint16_t> binding(description);
                    ITensor tensor = TensorUInt16Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::Int32:
                {
                    ModelBinding<int32_t> binding(description);
                    ITensor tensor = TensorInt32Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::UInt32:
                {
                    ModelBinding<uint32_t> binding(description);
                    ITensor tensor = TensorUInt32Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::Int64:
                {
                    ModelBinding<int64_t> binding(description);
                    ITensor tensor = TensorInt64Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                case TensorKind::UInt64:
                {
                    ModelBinding<uint64_t> binding(description);
                    ITensor tensor = TensorUInt64Bit::CreateFromArray(binding.GetShapeBuffer(), binding.GetDataBuffer());
                    context.Bind(name, tensor);
                }
                break;
                default:
                {
                    ThrowFailure(L"TensorKind has not been implemented.");
                    break;
                }
            }
        }
    }

    // Clears any previous bindings and rebinds garbage input data for every model input.
    void BindGarbageDataToContext(LearningModelBinding context, LearningModel model) {
        context.Clear();
        BindTensorsFromGarbageData(context, model);
    }
};
@@ -0,0 +1,42 @@ CommandLineArgs.cpp
#include <Windows.h>
#include <string>
#include "CommandLineArgs.h"

CommandLineArgs::CommandLineArgs()
{
    int numArgs = 0;
    LPWSTR* args = CommandLineToArgvW(GetCommandLineW(), &numArgs);
    if (args == nullptr)
    {
        return;
    }

    for (int i = 0; i < numArgs; i++)
    {
        if (_wcsicmp(args[i], L"-CPU") == 0)
        {
            m_useCPU = true;
        }
        else if (_wcsicmp(args[i], L"-GPU") == 0)
        {
            m_useGPU = true;
        }
        else if ((_wcsicmp(args[i], L"-iterations") == 0) && (i + 1 < numArgs))
        {
            m_numIterations = static_cast<UINT>(_wtoi(args[++i]));
        }
        else if ((_wcsicmp(args[i], L"-model") == 0) && (i + 1 < numArgs))
        {
            m_modelPath = args[++i];
        }
        else if ((_wcsicmp(args[i], L"-folder") == 0) && (i + 1 < numArgs))
        {
            m_folderPath = args[++i];
        }
        else if (_wcsicmp(args[i], L"-disableMetacommands") == 0)
        {
            m_metacommandsEnabled = false;
        }
        else if ((_wcsicmp(args[i], L"-csv") == 0) && (i + 1 < numArgs))
        {
            m_csvFileName = args[++i];
        }
    }

    LocalFree(args);

    // Run on both devices when the user specifies both -CPU and -GPU, or neither.
    m_useCPUandGPU = m_useCPU == m_useGPU;
}
@@ -0,0 +1,28 @@ CommandLineArgs.h
#pragma once
#include <string>

class CommandLineArgs
{
public:
    CommandLineArgs();

    bool UseCPU() const { return m_useCPU; }
    bool UseGPU() const { return m_useGPU; }
    bool UseCPUandGPU() const { return m_useCPUandGPU; }

    const std::wstring& ModelPath() const { return m_modelPath; }
    void SetModelPath(std::wstring path) { m_modelPath = path; }
    const std::wstring& FolderPath() const { return m_folderPath; }
    UINT NumIterations() const { return m_numIterations; }
    std::wstring CsvFileName() const { return m_csvFileName; }
    bool MetacommandsEnabled() const { return m_metacommandsEnabled; }

private:
    bool m_useCPU = false;
    bool m_useGPU = false;
    bool m_useCPUandGPU = false;
    std::wstring m_folderPath;
    std::wstring m_modelPath;
    std::wstring m_csvFileName;
    UINT m_numIterations = 1;
    bool m_metacommandsEnabled = true; // metacommands stay on unless -disableMetacommands is passed
};
@@ -0,0 +1,91 @@ Common.h
#pragma once

#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
#include <winrt/Windows.AI.MachineLearning.h>
#include <winrt/Windows.Foundation.h>
#include <winrt/Windows.Media.h>
#include <winrt/Windows.Graphics.Imaging.h>
#include <winrt/Windows.Storage.h>
#include <winrt/Windows.Storage.Streams.h>
#include "TimerHelper.h"
#include <vector>
#include <string>
#include <iostream>
#include <atlbase.h>
#include <sstream>
#include <comdef.h>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <fstream>
#include <future>
#include <thread>
#include <dxgi1_6.h>

enum WINML_MODEL_TEST_PERF
{
    ENTIRE_TEST = 0,
    LOAD_MODEL,
    CREATE_SESSION,
    BIND_VALUE,
    EVAL_MODEL,
    EVAL_MODEL_FIRST_RUN,
    COUNT
};

static std::vector<std::wstring> WINML_MODEL_TEST_PERF_NAMES =
{
    L"ENTIRE TEST ",
    L" LOAD MODEL ",
    L" CREATE SESSION ",
    L" BIND VALUE ",
    L" EVAL MODEL ",
    L" EVAL MODEL FIRST RUN ",
};

#define MAX_PROFILING_LOOP 100

using namespace winrt;

inline std::wstring MakeErrorMsg(HRESULT hr)
{
    std::wostringstream ss;
    ss << L"0x" << std::hex << hr << ": " << _com_error(hr).ErrorMessage();
    return ss.str();
}

inline std::wstring MakeErrorMsg(HRESULT hr, const std::wstring &errorMsg)
{
    std::wostringstream ss;
    ss << errorMsg << L" (" << MakeErrorMsg(hr) << L")";
    return ss.str();
}

inline void WriteErrorMsg(const std::wstring &errorMsg)
{
    std::wostringstream ss;
    ss << L"ERROR: " << errorMsg << std::endl;
    OutputDebugStringW(ss.str().c_str());
    std::wcout << ss.str() << std::endl;
}

inline void WriteErrorMsg(HRESULT hr, const std::wstring &errorMsg = L"")
{
    std::wostringstream ss;
    ss << errorMsg << L" (" << MakeErrorMsg(hr) << L")";
    WriteErrorMsg(ss.str());
}

inline void ThrowIfFailed(HRESULT hr, const std::wstring &errorMsg = L"")
{
    if (FAILED(hr))
    {
        throw MakeErrorMsg(hr, errorMsg);
    }
}

inline void ThrowFailure(const std::wstring &errorMsg)
{
    throw errorMsg;
}
@@ -0,0 +1,153 @@ Main.cpp
#include "Common.h"
#include "OutputHelper.h"
#include "ModelBinding.h"
#include "BindingUtilities.h"
#include "Stopwatch.h"
#include "CommandLineArgs.h"
#include <filesystem>

#define CheckHr(expr, errorMsg) hr = (expr); if (FAILED(hr)) { WriteErrorMsg(hr, errorMsg); return 1; }

Profiler<WINML_MODEL_TEST_PERF> g_Profiler;
int g_GarbageRuns = 10;

// Loads, binds, and evaluates the user-specified model and outputs the GPU/CPU and
// wall-clock times (in ms) for each step to the command line.
void EvaluateModel(CommandLineArgs args, std::wstring modelName, OutputHelper * output, LearningModelDeviceKind deviceKind)
{
    Stopwatch timer;
    output->PrintModelInfo(modelName, deviceKind);

    WINML_PROFILING_START(g_Profiler, WINML_MODEL_TEST_PERF::LOAD_MODEL);
    timer.Click();

    LearningModel model = nullptr;

    try
    {
        model = LearningModel::LoadFromFilePath(args.ModelPath());
    }
    catch (const std::wstring &msg)
    {
        WriteErrorMsg(msg);
        return;
    }
    WINML_PROFILING_STOP(g_Profiler, WINML_MODEL_TEST_PERF::LOAD_MODEL);
    timer.Click();
    output->m_clockLoadTime = timer.GetElapsedMilliseconds();

    LearningModelSession session(model, LearningModelDevice(deviceKind));
    LearningModelBinding binding(session);

    // Initialize model input and bind garbage data.
    WINML_PROFILING_START(g_Profiler, WINML_MODEL_TEST_PERF::BIND_VALUE);
    timer.Click();
    try
    {
        BindingUtilities::BindGarbageDataToContext(binding, model);
    }
    catch (const std::wstring &msg)
    {
        WriteErrorMsg(msg);
        return;
    }
    timer.Click();
    WINML_PROFILING_STOP(g_Profiler, WINML_MODEL_TEST_PERF::BIND_VALUE);
    output->m_clockBindTime = timer.GetElapsedMilliseconds();

    // Warm-up evaluations that are not included in the measurements.
    for (int i = 0; i < g_GarbageRuns; i++) {
        auto result = session.Evaluate(binding, L"");
    }

    // Measured evaluations.
    for (UINT i = 0; i < args.NumIterations(); i++)
    {
        WINML_PROFILING_START(g_Profiler, WINML_MODEL_TEST_PERF::EVAL_MODEL);
        timer.Click();
        auto result = session.Evaluate(binding, L"");
        timer.Click();
        WINML_PROFILING_STOP(g_Profiler, WINML_MODEL_TEST_PERF::EVAL_MODEL);
        output->m_clockEvalTimes.push_back(timer.GetElapsedMilliseconds());
    }

    output->PrintWallClockTimes(args.NumIterations());
    if (deviceKind == LearningModelDeviceKind::DirectX)
    {
        output->PrintGPUTimes(g_Profiler, args.NumIterations());
    }
    else
    {
        output->PrintCPUTimes(g_Profiler, args.NumIterations());
    }
    g_Profiler.Reset();
}

void EvaluateModelsInDirectory(CommandLineArgs args, OutputHelper * output)
{
    std::wstring folderPath = args.FolderPath();
    for (const auto & entry : std::filesystem::directory_iterator(folderPath))
    {
        std::string path = entry.path().string();
        if (path.find(".onnx") != std::string::npos ||
            path.find(".pb") != std::string::npos)
        {
            std::wstring fileName = entry.path().wstring();
            args.SetModelPath(fileName);
            try
            {
                if (args.UseCPUandGPU() || args.UseGPU())
                {
                    EvaluateModel(args, args.ModelPath(), output, LearningModelDeviceKind::DirectX);
                }
                if (args.UseCPUandGPU() || args.UseCPU())
                {
                    EvaluateModel(args, args.ModelPath(), output, LearningModelDeviceKind::Cpu);
                }
                output->WritePerformanceDataToCSV(g_Profiler, args, fileName);
                output->Reset();
            }
            catch (const std::wstring &msg)
            {
                WriteErrorMsg(msg);
                continue;
            }
        }
    }
}

int main(int argc, char** argv)
{
    CommandLineArgs args;
    OutputHelper output;

    winrt::init_apartment();
    output.PrintHardwareInfo();
    g_Profiler.Enable();

    std::wstring csvFileName = args.CsvFileName();
    if (csvFileName.empty())
    {
        output.SetDefaultCSVFileName();
    }
    else
    {
        output.m_csvFileName = csvFileName;
    }
    if (!args.ModelPath().empty())
    {
        if (args.UseCPUandGPU() || args.UseGPU())
        {
            EvaluateModel(args, args.ModelPath(), &output, LearningModelDeviceKind::DirectX);
        }
        if (args.UseCPUandGPU() || args.UseCPU())
        {
            EvaluateModel(args, args.ModelPath(), &output, LearningModelDeviceKind::Cpu);
        }
        output.WritePerformanceDataToCSV(g_Profiler, args, args.ModelPath());
        output.Reset();
    }
    else if (!args.FolderPath().empty())
    {
        EvaluateModelsInDirectory(args, &output);
    }
    return 0;
}
@@ -0,0 +1,95 @@ ModelBinding.h
#pragma once
#include "Common.h"

// Data storage for a model input or output variable.
template <typename T>
class ModelBinding
{
public:
    ModelBinding(winrt::Windows::AI::MachineLearning::ILearningModelFeatureDescriptor variableDesc) : m_bindingDesc(variableDesc)
    {
        UINT numElements = 0;
        if (variableDesc.Kind() == winrt::Windows::AI::MachineLearning::LearningModelFeatureKind::Tensor)
        {
            InitTensorBinding(variableDesc, numElements);
        }
        else
        {
            ThrowFailure(L"ModelBinding: Binding feature type not implemented");
        }
    }

    winrt::Windows::AI::MachineLearning::ILearningModelFeatureDescriptor GetDesc()
    {
        return m_bindingDesc;
    }

    UINT GetNumElements() const
    {
        return m_numElements;
    }

    UINT GetElementSize() const
    {
        return static_cast<UINT>(sizeof(T)); // element size in bytes
    }

    std::vector<INT64> GetShapeBuffer()
    {
        return m_shapeBuffer;
    }

    void* GetData()
    {
        return m_dataBuffer.data();
    }

    std::vector<T> GetDataBuffer()
    {
        return m_dataBuffer;
    }

    size_t GetDataBufferSize()
    {
        return m_dataBuffer.size();
    }

private:
    // Computes the element count and shape buffer from the descriptor's shape.
    // Free (unknown) dimensions are reported as <= 0 and are treated as size 1 here.
    void InitNumElementsAndShape(const winrt::Windows::Foundation::Collections::IVectorView<int64_t>& shape, UINT numDimensions, UINT numElements)
    {
        int unknownDim = -1;
        UINT numKnownElements = 1;
        for (UINT dim = 0; dim < numDimensions; dim++)
        {
            INT64 dimSize = shape.GetAt(dim);

            if (dimSize <= 0)
            {
                if (unknownDim == -1)
                {
                    dimSize = 1;
                }
            }
            else
            {
                numKnownElements *= static_cast<UINT>(dimSize);
            }

            m_shapeBuffer.push_back(dimSize);
        }
        m_numElements = numKnownElements;
    }

    void InitTensorBinding(winrt::Windows::AI::MachineLearning::ILearningModelFeatureDescriptor descriptor, UINT numElements)
    {
        auto tensorDescriptor = descriptor.as<winrt::Windows::AI::MachineLearning::TensorFeatureDescriptor>();
        InitNumElementsAndShape(tensorDescriptor.Shape(), tensorDescriptor.Shape().Size(), 1);
        m_dataBuffer.resize(m_numElements);
    }

    winrt::Windows::AI::MachineLearning::ILearningModelFeatureDescriptor m_bindingDesc;
    std::vector<INT64> m_shapeBuffer;
    UINT m_numElements = 0;
    std::vector<T> m_dataBuffer;
};
@@ -0,0 +1,221 @@ OutputHelper.h
#pragma once
#include "Common.h"
#include <time.h>
#include "CommandLineArgs.h"
#include <fstream>
#include <ctime>
#include <locale>
#include <utility>
#include <codecvt>
#include <iomanip>
#include <windows.h>
#include <stdio.h>

using namespace winrt::Windows::AI::MachineLearning;

// Stores performance information and handles output to the command line and CSV files.
class OutputHelper
{
public:
    OutputHelper() {}

    void PrintWallClockTimes(UINT iterations)
    {
        double totalEvalTime = std::accumulate(m_clockEvalTimes.begin(), m_clockEvalTimes.end(), 0.0);
        m_clockEvalTime = totalEvalTime / (double)iterations;

        std::cout << "Wall-clock Time Averages (iterations = " << iterations << "):" << std::endl;
        std::cout << " Load: " << m_clockLoadTime << " ms" << std::endl;
        std::cout << " Bind: " << m_clockBindTime << " ms" << std::endl;
        std::cout << " Evaluate: " << m_clockEvalTime << " ms" << std::endl;
        std::cout << " Total time: " << m_clockLoadTime + m_clockBindTime + m_clockEvalTime << " ms" << std::endl;
        std::cout << std::endl;
    }

    void PrintCPUTimes(Profiler<WINML_MODEL_TEST_PERF> &profiler, UINT iterations)
    {
        m_CPULoadTime = profiler[LOAD_MODEL].GetAverage(CounterType::TIMER);
        m_CPUBindTime = profiler[BIND_VALUE].GetAverage(CounterType::TIMER);
        m_CPUEvalTime = profiler[EVAL_MODEL].GetAverage(CounterType::TIMER);
        // Report the working-set delta during evaluation so the value matches the "mb" label below.
        m_CPUEvalMemoryUsage = profiler[EVAL_MODEL].GetAverage(CounterType::WORKING_SET_USAGE);

        std::cout << "CPU Time Averages (iterations = " << iterations << "):" << std::endl;
        std::cout << " Load: " << m_CPULoadTime << " ms" << std::endl;
        std::cout << " Bind: " << m_CPUBindTime << " ms" << std::endl;
        std::cout << " Evaluate: " << m_CPUEvalTime << " ms" << std::endl;
        std::cout << " Total time: " << m_CPULoadTime + m_CPUBindTime + m_CPUEvalTime << " ms" << std::endl;
        std::cout << " Evaluate memory usage: " << m_CPUEvalMemoryUsage << " mb" << std::endl;
        std::cout << std::endl;
    }

    void PrintGPUTimes(Profiler<WINML_MODEL_TEST_PERF> &profiler, UINT iterations)
    {
        m_GPULoadTime = profiler[LOAD_MODEL].GetAverage(CounterType::TIMER);
        m_GPUBindTime = profiler[BIND_VALUE].GetAverage(CounterType::TIMER);
        m_GPUEvalTime = profiler[EVAL_MODEL].GetAverage(CounterType::TIMER);
        // Report the dedicated GPU memory delta during evaluation so the value matches the "mb" label below.
        m_GPUEvalMemoryUsage = profiler[EVAL_MODEL].GetAverage(CounterType::GPU_DEDICATED_MEM_USAGE);

        std::cout << "GPU Time Averages (iterations = " << iterations << "):" << std::endl;
        std::cout << " Load: " << m_GPULoadTime << " ms" << std::endl;
        std::cout << " Bind: " << m_GPUBindTime << " ms" << std::endl;
        std::cout << " Evaluate: " << m_GPUEvalTime << " ms" << std::endl;
        std::cout << " Total time: " << m_GPULoadTime + m_GPUBindTime + m_GPUEvalTime << " ms" << std::endl;
        std::cout << " Evaluate memory usage: " << m_GPUEvalMemoryUsage << " mb" << std::endl;
        std::cout << std::endl;
    }

    void PrintModelInfo(std::wstring modelName, LearningModelDeviceKind deviceKind)
    {
        std::wstring device = deviceKind == LearningModelDeviceKind::Cpu ? L" [CPU]" : L" [GPU]";
        std::wcout << modelName << device << std::endl;
        std::cout << "=================================================================" << std::endl;
        std::cout << std::endl;
    }

    void PrintHardwareInfo()
    {
        std::cout << "WinML Model Runner" << std::endl;

        com_ptr<IDXGIFactory6> factory;
        if (SUCCEEDED(CreateDXGIFactory1(__uuidof(IDXGIFactory6), factory.put_void())))
        {
            com_ptr<IDXGIAdapter> adapter;
            if (SUCCEEDED(factory->EnumAdapters(0, adapter.put())))
            {
                DXGI_ADAPTER_DESC description;
                if (SUCCEEDED(adapter->GetDesc(&description)))
                {
                    std::wcout << L"GPU: " << description.Description << std::endl;
                    std::cout << std::endl;
                }
            }
        }
    }

    void SetDefaultCSVFileName()
    {
        auto time = std::time(nullptr);
        struct tm localTime;
        localtime_s(&localTime, &time);

        std::ostringstream oss;
        oss << std::put_time(&localTime, "%Y-%m-%d");
        std::string fileName = "WinML Model Run [" + oss.str() + "].csv";
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
        m_csvFileName = converter.from_bytes(fileName);
    }

    void WritePerformanceDataToCSV(Profiler<WINML_MODEL_TEST_PERF> &profiler, CommandLineArgs args, std::wstring model)
    {
        if (m_csvFileName.length() > 0)
        {
            // Check whether the CSV already has a header row.
            bool bNewFile = false;
            std::ifstream fin;
            fin.open(m_csvFileName);
            std::filebuf* outbuf = fin.rdbuf();
            if (EOF == outbuf->sbumpc())
            {
                bNewFile = true;
            }
            fin.close();

            std::ofstream fout;
            fout.open(m_csvFileName, std::ios_base::app);

            std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
            std::string modelName = converter.to_bytes(model);

            if (bNewFile)
            {
                fout << "Model Name" << ","
                     << "Iterations" << ",";

                if (args.UseCPUandGPU() || args.UseCPU())
                {
                    fout << "CPU Load (ms)" << ","
                         << "CPU Bind (ms)" << ","
                         << "CPU Evaluate (ms)" << ","
                         << "CPU total time (ms)" << ","
                         << "CPU Usage (Evaluate) (mb)" << ",";
                }
                if (args.UseCPUandGPU() || args.UseGPU())
                {
                    fout << "GPU Load (ms)" << ","
                         << "GPU Bind (ms)" << ","
                         << "GPU Evaluate (ms)" << ","
                         << "GPU total time (ms)" << ","
                         << "GPU Usage (Evaluate) (mb)" << ",";
                }

                fout << "Wall-clock Load (ms)" << ","
                     << "Wall-clock Bind (ms)" << ","
                     << "Wall-clock Evaluate (ms)" << ","
                     << "Wall-clock total time (ms)" << ","
                     << std::endl;
            }

            fout << modelName << "," << args.NumIterations() << ",";

            if (args.UseCPUandGPU() || args.UseCPU())
            {
                fout << m_CPULoadTime << ","
                     << m_CPUBindTime << ","
                     << m_CPUEvalTime << ","
                     << m_CPULoadTime + m_CPUBindTime + m_CPUEvalTime << ","
                     << m_CPUEvalMemoryUsage << ",";
            }
            if (args.UseCPUandGPU() || args.UseGPU())
            {
                fout << m_GPULoadTime << ","
                     << m_GPUBindTime << ","
                     << m_GPUEvalTime << ","
                     << m_GPULoadTime + m_GPUBindTime + m_GPUEvalTime << ","
                     << m_GPUEvalMemoryUsage << ",";
            }

            fout << m_clockLoadTime << ","
                 << m_clockBindTime << ","
                 << m_clockEvalTime << ","
                 << m_clockLoadTime + m_clockBindTime + m_clockEvalTime << ","
                 << std::endl;
            fout.close();
        }
    }

    void Reset()
    {
        m_GPULoadTime = 0;
        m_GPUBindTime = 0;
        m_GPUEvalTime = 0;
        m_GPUEvalMemoryUsage = 0;

        m_CPULoadTime = 0;
        m_CPUBindTime = 0;
        m_CPUEvalTime = 0;
        m_CPUEvalMemoryUsage = 0;

        m_clockLoadTime = 0;
        m_clockBindTime = 0;
        m_clockEvalTime = 0;
        // Clear the per-iteration samples so the next model's average is not skewed.
        m_clockEvalTimes.clear();
    }

    double m_clockLoadTime = 0;
    double m_clockBindTime = 0;
    std::vector<double> m_clockEvalTimes;

    std::wstring m_csvFileName;

private:
    double m_GPULoadTime = 0;
    double m_GPUBindTime = 0;
    double m_GPUEvalTime = 0;
    double m_GPUEvalMemoryUsage = 0;

    double m_CPULoadTime = 0;
    double m_CPUBindTime = 0;
    double m_CPUEvalTime = 0;
    double m_CPUEvalMemoryUsage = 0;

    double m_clockEvalTime = 0;
};
@@ -0,0 +1,16 @@ Stopwatch.cpp
#include <Windows.h>
#include "Stopwatch.h"

Stopwatch::Stopwatch()
{
    QueryPerformanceFrequency(&m_frequency);
    // Establish a baseline so the first Click() measures from construction time.
    QueryPerformanceCounter(&m_lastClickTime);
}

void Stopwatch::Click()
{
    LARGE_INTEGER currentTime;
    QueryPerformanceCounter(&currentTime);
    auto delta = static_cast<double>(currentTime.QuadPart - m_lastClickTime.QuadPart);
    m_elapsedMilliseconds = delta / (static_cast<double>(m_frequency.QuadPart) / 1000.0);
    m_lastClickTime = currentTime;
}
@@ -0,0 +1,20 @@ Stopwatch.h
#pragma once

class Stopwatch
{
public:
    Stopwatch();

    void Click();

    // Time elapsed between last two clicks.
    inline double GetElapsedMilliseconds() const
    {
        return m_elapsedMilliseconds;
    }

private:
    LARGE_INTEGER m_lastClickTime;
    LARGE_INTEGER m_frequency;
    double m_elapsedMilliseconds = 0.0;
};
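
// Usage sketch: Click() marks a point in time, so bracketing the work of interest
// with two Click() calls makes GetElapsedMilliseconds() return the wall-clock time
// of that work. DoWork() below is a hypothetical placeholder for the measured code.
//
//   Stopwatch timer;
//   timer.Click();                               // start point
//   DoWork();                                    // measured work (placeholder)
//   timer.Click();                               // end point
//   double ms = timer.GetElapsedMilliseconds();  // elapsed time between the two clicks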
@@ -0,0 +1,617 @@ TimerHelper.h
#pragma once

#include <windows.h>
#include <cmath>
#include <cfloat>
#include <cstring>
#ifndef DISABLE_GPU_COUNTERS
#include <Pdh.h>
#include <PdhMsg.h>
#endif
#include <psapi.h>
#include <string>
#include <vector>

#define TIMER_SLOT_SIZE (128)
#define CONVERT_100NS_TO_SECOND(x) ((x) * 0.0000001)
#define BYTE_TO_MB(x) ((x)/(1024.0*1024.0))

// A stopwatch that measures the time passed (in seconds) between a Stop() call and the most recent Start() call before it.
class Timer
{
public:
    void Start()
    {
        LARGE_INTEGER t;
        QueryPerformanceCounter(&t);
        m_startTime = static_cast<double>(t.QuadPart);
    }

    double Stop()
    {
        LARGE_INTEGER stopTime;
        QueryPerformanceCounter(&stopTime);
        double t = static_cast<double>(stopTime.QuadPart) - m_startTime;
        LARGE_INTEGER tps;
        QueryPerformanceFrequency(&tps);
        return t / static_cast<double>(tps.QuadPart);
    }

private:
    double m_startTime;
};

class CpuPerfCounter
{
public:
    CpuPerfCounter()
    {
        Reset();
    }

    ~CpuPerfCounter() {}

    void Reset()
    {
        SYSTEM_INFO sysInfo = { 0 };
        GetSystemInfo(&sysInfo);

        m_startKernelTime = { 0 };
        m_startUserTime = { 0 };
        m_numProcessors = sysInfo.dwNumberOfProcessors;
        m_procHandle = GetCurrentProcess();
        m_pid = GetCurrentProcessId();
        m_previousStartCallFailed = true;
        m_processTime = 0;
        m_startPageFaultCount = 0;
        m_startPagefileUsage = 0;
        m_startPeakPagefileUsage = 0;
        m_startWorkingSetSize = 0;
        m_startPeakWorkingSetSize = 0;
        m_deltaPageFaultCount = 0;
        m_deltaPagefileUsage = 0;
        m_deltaPeakPagefileUsage = 0;
        m_deltaWorkingSetSize = 0;
        m_deltaPeakWorkingSetSize = 0;
    }

    void Start()
    {
        FILETIME ftIgnore, ftKernel, ftUser;

        if (!GetProcessTimes(m_procHandle, &ftIgnore, &ftIgnore, &ftKernel, &ftUser) ||
            !GetProcessMemoryCounters(m_pid, m_startPageFaultCount, m_startPagefileUsage, m_startPeakPagefileUsage, m_startWorkingSetSize, m_startPeakWorkingSetSize))
        {
            m_previousStartCallFailed = true;
        }
        else
        {
            memcpy(&m_startKernelTime, &ftKernel, sizeof(FILETIME));
            memcpy(&m_startUserTime, &ftUser, sizeof(FILETIME));
            m_previousStartCallFailed = false;
        }
    }

    void Stop()
    {
        FILETIME ftIgnore, ftKernel, ftUser;
        ULARGE_INTEGER stopKernelTime, stopUserTime;
        ULONG stopPageFaultCount = 0;
        SIZE_T stopPagefileUsage = 0;
        SIZE_T stopPeakPagefileUsage = 0;
        SIZE_T stopWorkingSetSize = 0;
        SIZE_T stopPeakWorkingSetSize = 0;

        if (m_previousStartCallFailed ||
            m_numProcessors == 0 ||
            !GetProcessTimes(m_procHandle, &ftIgnore, &ftIgnore, &ftKernel, &ftUser) ||
            !GetProcessMemoryCounters(m_pid, stopPageFaultCount, stopPagefileUsage, stopPeakPagefileUsage, stopWorkingSetSize, stopPeakWorkingSetSize))
        {
            return;
        }

        memcpy(&stopKernelTime, &ftKernel, sizeof(FILETIME));
        memcpy(&stopUserTime, &ftUser, sizeof(FILETIME));
        m_processTime = CONVERT_100NS_TO_SECOND((stopKernelTime.QuadPart - m_startKernelTime.QuadPart) + (stopUserTime.QuadPart - m_startUserTime.QuadPart)) / m_numProcessors;

        m_deltaPageFaultCount = stopPageFaultCount - m_startPageFaultCount;
        m_deltaPagefileUsage = (double)BYTE_TO_MB((double)stopPagefileUsage - (double)m_startPagefileUsage);
        m_deltaPeakPagefileUsage = (double)BYTE_TO_MB((double)stopPeakPagefileUsage - (double)m_startPeakPagefileUsage);
        m_deltaWorkingSetSize = (double)BYTE_TO_MB((double)stopWorkingSetSize - (double)m_startWorkingSetSize);
        m_deltaPeakWorkingSetSize = (double)BYTE_TO_MB((double)stopPeakWorkingSetSize - (double)m_startPeakWorkingSetSize);
    }

    double GetProcessTime() { return m_processTime; }
    ULONG GetDeltaPageFaultCount() { return m_deltaPageFaultCount; }
    double GetDeltaPageFileUsage() { return m_deltaPagefileUsage; }
    double GetDeltaPeakPageFileUsage() { return m_deltaPeakPagefileUsage; }
    double GetDeltaWorkingSetUsage() { return m_deltaWorkingSetSize; }
    double GetDeltaPeakWorkingSetUsage() { return m_deltaPeakWorkingSetSize; }

private:

    bool GetProcessMemoryCounters(DWORD pid, ULONG& pageFaultCount, SIZE_T& pageFileUsage, SIZE_T& peakPageFileUsage, SIZE_T& workingSetSize, SIZE_T& peakWorkingSetSize)
    {
        HANDLE hProcess = NULL;

        hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid);
        if (NULL == hProcess)
            return false;

        PROCESS_MEMORY_COUNTERS pmc = { 0 };

        bool result = GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc));
        if (result)
        {
            pageFaultCount = pmc.PageFaultCount;
            pageFileUsage = pmc.PagefileUsage;
            peakPageFileUsage = pmc.PeakPagefileUsage;
            workingSetSize = pmc.WorkingSetSize;
            peakWorkingSetSize = pmc.PeakWorkingSetSize;
        }

        CloseHandle(hProcess);

        return result;
    }

    ULARGE_INTEGER m_startKernelTime;
    ULARGE_INTEGER m_startUserTime;
    UINT m_numProcessors;
    HANDLE m_procHandle;
    DWORD m_pid;
    bool m_previousStartCallFailed;
    double m_processTime;             // in seconds
    ULONG m_startPageFaultCount;
    SIZE_T m_startPagefileUsage;      // in bytes
    SIZE_T m_startPeakPagefileUsage;  // in bytes
    SIZE_T m_startWorkingSetSize;     // in bytes
    SIZE_T m_startPeakWorkingSetSize; // in bytes
    ULONG m_deltaPageFaultCount;
    double m_deltaPagefileUsage;      // in MB
    double m_deltaPeakPagefileUsage;  // in MB
    double m_deltaWorkingSetSize;     // in MB
    double m_deltaPeakWorkingSetSize; // in MB
};
#ifndef DISABLE_GPU_COUNTERS

class GpuPerfCounter
{
public:
    GpuPerfCounter() :
        m_hPDH(NULL),
        m_pfnPdhOpenQuery(NULL),
        m_pfnPdhAddCounter(NULL),
        m_pfnPdhCollectQueryData(NULL),
        m_pfnPdhGetFormattedCounterArray(NULL),
        m_pfnPdhGetFormattedCounterValue(NULL),
        m_pfnPdhCloseQuery(NULL),
        m_query(NULL)
    {
        //#ifdef DISABLE_LOADLIBRARY
        m_hPDH = LoadLibraryEx(L"pdh.dll", NULL, 0);
        //#endif
        if (m_hPDH != NULL)
        {
            m_pfnPdhOpenQuery = (PFNPdhOpenQuery)GetProcAddress(m_hPDH, "PdhOpenQueryW");
            m_pfnPdhAddCounter = (PFNPdhAddCounter)GetProcAddress(m_hPDH, "PdhAddCounterW");
            m_pfnPdhCollectQueryData = (PFNPdhCollectQueryData)GetProcAddress(m_hPDH, "PdhCollectQueryData");
            m_pfnPdhGetFormattedCounterArray = (PFNPdhGetFormattedCounterArray)GetProcAddress(m_hPDH, "PdhGetFormattedCounterArrayW");
            m_pfnPdhGetFormattedCounterValue = (PFNPdhGetFormattedCounterValue)GetProcAddress(m_hPDH, "PdhGetFormattedCounterValue");
            m_pfnPdhCloseQuery = (PFNPdhCloseQuery)GetProcAddress(m_hPDH, "PdhCloseQuery");
        }

        Reset();
    }
    ~GpuPerfCounter()
    {
        if (m_query)
        {
            CloseQuery(m_query);
            m_query = NULL;
        }

        if (m_hPDH)
        {
            FreeLibrary(m_hPDH);
            m_hPDH = NULL;
        }
    }

    void Reset()
    {
        m_gpuUsage = 0;
        m_deltaGpuDedicatedMemory = 0;
        m_deltaGpuSharedMemory = 0;

        // Set up the PDH performance query
        std::wstring pidStr = std::to_wstring(GetCurrentProcessId());
        std::wstring gpuUsageQueryStr = L"\\GPU Engine(pid_*_*)\\Utilization Percentage";
        std::wstring gpuDedicatedMemQueryStr = L"\\GPU Process Memory(pid_*_*)\\Dedicated Usage";
        std::wstring gpuSharedMemQueryStr = L"\\GPU Process Memory(pid_*_*)\\Shared Usage";
        gpuUsageQueryStr.replace(gpuUsageQueryStr.find('*'), 1, pidStr);
        gpuDedicatedMemQueryStr.replace(gpuDedicatedMemQueryStr.find('*'), 1, pidStr);
        gpuSharedMemQueryStr.replace(gpuSharedMemQueryStr.find('*'), 1, pidStr);

        // Open the query
        if (m_query) CloseQuery(m_query);
        m_query = NULL;
        OpenQuery(NULL, NULL, &m_query);
        AddCounter(m_query, gpuUsageQueryStr.c_str(), NULL, &m_gpuUsageCounter);
        AddCounter(m_query, gpuDedicatedMemQueryStr.c_str(), NULL, &m_gpuDedicatedMemUsageCounter);
        AddCounter(m_query, gpuSharedMemQueryStr.c_str(), NULL, &m_gpuSharedMemUsageCounter);
    }

    void Start()
    {
        PDH_FMT_COUNTERVALUE gpuDedicatedMemUsageCounterValue = {};
        PDH_FMT_COUNTERVALUE gpuSharedMemUsageCounterValue = {};
        PDH_STATUS status = S_OK;

        // Usage-rate counters require two samples: take the first here and the second in Stop().
        CollectQueryData(m_query);

        // GPU dedicated memory
        status = GetFormattedCounterValue(m_gpuDedicatedMemUsageCounter, PDH_FMT_LARGE, NULL, &gpuDedicatedMemUsageCounterValue);
        m_startGpuDedicatedMemory = (ERROR_SUCCESS == status) ? (double)BYTE_TO_MB(gpuDedicatedMemUsageCounterValue.largeValue) : m_startGpuDedicatedMemory;

        // GPU shared memory
        status = GetFormattedCounterValue(m_gpuSharedMemUsageCounter, PDH_FMT_LARGE, NULL, &gpuSharedMemUsageCounterValue);
        m_startGpuSharedMemory = (ERROR_SUCCESS == status) ? (double)BYTE_TO_MB(gpuSharedMemUsageCounterValue.largeValue) : m_startGpuSharedMemory;
    }

    void Stop()
    {
        PDH_FMT_COUNTERVALUE_ITEM* gpuUsageCounterValue = nullptr;
        PDH_FMT_COUNTERVALUE gpuDedicatedMemUsageCounterValue = {};
        PDH_FMT_COUNTERVALUE gpuSharedMemUsageCounterValue = {};
        DWORD bufferSize = 0;
        DWORD itemCount = 0;
        PDH_STATUS status = S_OK;

        // Query the GPU usage.
        // Depending on the IHV, compute-shader usage can be counted as either 3D or compute engine usage.
        // Here we simply pick the maximum usage across all engine types to see whether the bottleneck is the GPU.
        // Task Manager uses the same approach to display GPU usage.
        status = CollectQueryData(m_query);
        if (S_OK != status && PDH_NO_DATA != status)
            return;

        status = GetFormattedCounterArray(m_gpuUsageCounter, PDH_FMT_DOUBLE, &bufferSize, &itemCount, gpuUsageCounterValue);
        if (PDH_MORE_DATA != status)
            return;

        gpuUsageCounterValue = (PDH_FMT_COUNTERVALUE_ITEM *)malloc(bufferSize);
        if (gpuUsageCounterValue != nullptr)
        {
            status = GetFormattedCounterArray(m_gpuUsageCounter, PDH_FMT_DOUBLE, &bufferSize, &itemCount, gpuUsageCounterValue);
            if (ERROR_SUCCESS == status)
            {
                double maxValue = 0;
                for (size_t i = 0; i < itemCount; ++i)
                {
                    maxValue = (gpuUsageCounterValue[i].FmtValue.doubleValue > maxValue) ? gpuUsageCounterValue[i].FmtValue.doubleValue : maxValue;
                }
                m_gpuUsage = maxValue;
            }
        }

        free(gpuUsageCounterValue);
        gpuUsageCounterValue = NULL;
        bufferSize = 0;
        itemCount = 0;

        double stopGpuDedicatedMemory; // in MB
        double stopGpuSharedMemory; // in MB

        // GPU dedicated memory delta. Don't update the value if the counter doesn't return a value correctly.
        status = GetFormattedCounterValue(m_gpuDedicatedMemUsageCounter, PDH_FMT_LARGE, NULL, &gpuDedicatedMemUsageCounterValue);
        if (ERROR_SUCCESS == status)
        {
            stopGpuDedicatedMemory = (double)BYTE_TO_MB(gpuDedicatedMemUsageCounterValue.largeValue);
            m_deltaGpuDedicatedMemory = stopGpuDedicatedMemory - m_startGpuDedicatedMemory;
        }

        // GPU shared memory delta. Don't update the value if the counter doesn't return a value correctly.
        status = GetFormattedCounterValue(m_gpuSharedMemUsageCounter, PDH_FMT_LARGE, NULL, &gpuSharedMemUsageCounterValue);
        if (ERROR_SUCCESS == status)
        {
            stopGpuSharedMemory = (double)BYTE_TO_MB(gpuSharedMemUsageCounterValue.largeValue);
            m_deltaGpuSharedMemory = stopGpuSharedMemory - m_startGpuSharedMemory;
        }
    }

    double GetGpuUsage() const { return m_gpuUsage; }
    double GetDedicatedMemory() const { return m_deltaGpuDedicatedMemory; }
    double GetSharedMemory() const { return m_deltaGpuSharedMemory; }

private:
    // Pdh function prototypes
    typedef PDH_STATUS(WINAPI *PFNPdhOpenQuery)(_In_opt_ LPCWSTR szDataSource, _In_ DWORD_PTR dwUserData, _Out_ PDH_HQUERY * phQuery);
    typedef PDH_STATUS(WINAPI *PFNPdhAddCounter)(_In_ PDH_HQUERY hQuery, _In_ LPCWSTR szFullCounterPath, _In_ DWORD_PTR dwUserData, _Out_ PDH_HCOUNTER * phCounter);
    typedef PDH_STATUS(WINAPI *PFNPdhCollectQueryData)(_Inout_ PDH_HQUERY hQuery);
    typedef PDH_STATUS(WINAPI *PFNPdhGetFormattedCounterArray)(_In_ PDH_HCOUNTER hCounter, _In_ DWORD dwFormat, _Inout_ LPDWORD lpdwBufferSize, _Out_ LPDWORD lpdwItemCount, _Out_writes_bytes_opt_(*lpdwBufferSize) PPDH_FMT_COUNTERVALUE_ITEM_W ItemBuffer);
    typedef PDH_STATUS(WINAPI *PFNPdhGetFormattedCounterValue)(_In_ PDH_HCOUNTER hCounter, _In_ DWORD dwFormat, _Out_opt_ LPDWORD lpdwType, _Out_ PPDH_FMT_COUNTERVALUE pValue);
    typedef PDH_STATUS(WINAPI *PFNPdhCloseQuery)(_Inout_ PDH_HQUERY hQuery);

    PDH_STATUS OpenQuery(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY * phQuery)
    {
        return (m_pfnPdhOpenQuery) ? m_pfnPdhOpenQuery(szDataSource, dwUserData, phQuery) : ERROR_MOD_NOT_FOUND;
    }
    PDH_STATUS AddCounter(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER * phCounter)
    {
        return (m_pfnPdhAddCounter) ? m_pfnPdhAddCounter(hQuery, szFullCounterPath, dwUserData, phCounter) : ERROR_MOD_NOT_FOUND;
    }
    PDH_STATUS CollectQueryData(PDH_HQUERY hQuery)
    {
        return (m_pfnPdhCollectQueryData) ? m_pfnPdhCollectQueryData(hQuery) : ERROR_MOD_NOT_FOUND;
    }
    PDH_STATUS GetFormattedCounterArray(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwBufferSize, LPDWORD lpdwItemCount, PPDH_FMT_COUNTERVALUE_ITEM_W ItemBuffer)
    {
        return (m_pfnPdhGetFormattedCounterArray) ? m_pfnPdhGetFormattedCounterArray(hCounter, dwFormat, lpdwBufferSize, lpdwItemCount, ItemBuffer) : ERROR_MOD_NOT_FOUND;
    }
    PDH_STATUS GetFormattedCounterValue(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)
    {
        return (m_pfnPdhGetFormattedCounterValue) ? m_pfnPdhGetFormattedCounterValue(hCounter, dwFormat, lpdwType, pValue) : ERROR_MOD_NOT_FOUND;
    }
    PDH_STATUS CloseQuery(PDH_HQUERY hQuery)
    {
        return (m_pfnPdhCloseQuery) ? m_pfnPdhCloseQuery(hQuery) : ERROR_MOD_NOT_FOUND;
    }

    // PDH performance query
    HMODULE m_hPDH;
    PFNPdhOpenQuery m_pfnPdhOpenQuery;
    PFNPdhAddCounter m_pfnPdhAddCounter;
    PFNPdhCollectQueryData m_pfnPdhCollectQueryData;
    PFNPdhGetFormattedCounterArray m_pfnPdhGetFormattedCounterArray;
    PFNPdhGetFormattedCounterValue m_pfnPdhGetFormattedCounterValue;
    PFNPdhCloseQuery m_pfnPdhCloseQuery;
    HQUERY m_query;
    HCOUNTER m_gpuUsageCounter;
    HCOUNTER m_gpuDedicatedMemUsageCounter;
    HCOUNTER m_gpuSharedMemUsageCounter;
    // Process info
    DWORD m_pid;
    // Data
    double m_gpuUsage;
    double m_startGpuDedicatedMemory; // in MB
    double m_startGpuSharedMemory;    // in MB
    double m_deltaGpuDedicatedMemory; // in MB
    double m_deltaGpuSharedMemory;    // in MB
};

#endif
typedef enum CounterType
{
    TIMER = 0,
    CPU_USAGE,
    PAGE_FAULT_COUNT,
    PAGE_FILE_USAGE,
    PEAK_PAGE_FILE_USAGE,
    WORKING_SET_USAGE,
    PEAK_WORKING_SET_USAGE,
    GPU_USAGE,
    GPU_DEDICATED_MEM_USAGE,
    GPU_SHARED_MEM_USAGE,
    TYPE_COUNT
} CounterType;

const static std::vector<std::wstring> CounterTypeName =
{
    L"TIMER",
    L"CPU USAGE",
    L"PAGE FAULT COUNT",
    L"PAGE FILE USAGE",
    L"PEAK PAGE FILE USAGE",
    L"WORKING SET USAGE",
    L"PEAK WORKING SET USAGE",
    L"GPU USAGE",
    L"GPU DEDICATED MEM USAGE",
    L"GPU SHARED MEM USAGE"
};

// A statistics helper for the Timer/CpuPerfCounter/GpuPerfCounter classes.
// It keeps the latest TIMER_SLOT_SIZE measured samples in a ring buffer.
// The statistics functions (e.g. GetVariance) assume the data always starts at index 0 of the buffer.
class PerfCounterStatistics
{
public:
    PerfCounterStatistics() : m_pos(0), m_bBufferFull(false), m_bDisabled(true)
    {
        // Zero the ring buffers so the first Stop() works with defined values.
        for (int i = 0; i < CounterType::TYPE_COUNT; ++i)
        {
            m_data[i].Reset();
        }
    }

    void Enable()
    {
        m_bDisabled = false;
    }

    void Disable()
    {
        m_bDisabled = true;
    }

    void Reset()
    {
        if (m_bDisabled)
            return;

        m_pos = 0;
        m_bBufferFull = false;
        m_cpuCounter.Reset();
#ifndef DISABLE_GPU_COUNTERS
        m_gpuCounter.Reset();
#endif
        for (int i = 0; i < CounterType::TYPE_COUNT; ++i)
        {
            m_data[i].Reset();
        }
    }

    void Start()
    {
        if (m_bDisabled)
            return;

        m_timer.Start();
        m_cpuCounter.Start();
#ifndef DISABLE_GPU_COUNTERS
        m_gpuCounter.Start();
#endif
    }

    void Stop()
    {
        if (m_bDisabled)
            return;

        double counterValue[CounterType::TYPE_COUNT];

        // Query the counters
        double time = m_timer.Stop();
        m_cpuCounter.Stop();
#ifndef DISABLE_GPU_COUNTERS
        m_gpuCounter.Stop();
#endif

        // Get the counter values
        counterValue[CounterType::TIMER] = time;
        counterValue[CounterType::CPU_USAGE] = 100.0 * m_cpuCounter.GetProcessTime() / time;
        counterValue[CounterType::PAGE_FAULT_COUNT] = m_cpuCounter.GetDeltaPageFaultCount();
        counterValue[CounterType::PAGE_FILE_USAGE] = m_cpuCounter.GetDeltaPageFileUsage();
        counterValue[CounterType::PEAK_PAGE_FILE_USAGE] = m_cpuCounter.GetDeltaPeakPageFileUsage();
        counterValue[CounterType::WORKING_SET_USAGE] = m_cpuCounter.GetDeltaWorkingSetUsage();
        counterValue[CounterType::PEAK_WORKING_SET_USAGE] = m_cpuCounter.GetDeltaPeakWorkingSetUsage();
#ifndef DISABLE_GPU_COUNTERS
        counterValue[CounterType::GPU_USAGE] = m_gpuCounter.GetGpuUsage();
        counterValue[CounterType::GPU_DEDICATED_MEM_USAGE] = m_gpuCounter.GetDedicatedMemory();
        counterValue[CounterType::GPU_SHARED_MEM_USAGE] = m_gpuCounter.GetSharedMemory();
#endif
        // Update the data blocks
        for (int i = 0; i < CounterType::TYPE_COUNT; ++i)
        {
            m_data[i].total = m_data[i].total - m_data[i].measured[m_pos] + counterValue[i];
            m_data[i].measured[m_pos] = counterValue[i];
            m_data[i].max = (counterValue[i] > m_data[i].max) ? counterValue[i] : m_data[i].max;
            m_data[i].min = (counterValue[i] < m_data[i].min) ? counterValue[i] : m_data[i].min;
        }

        // Update the buffer index
        if (m_pos + 1 >= TIMER_SLOT_SIZE)
        {
            m_pos = 0;
            m_bBufferFull = true;
        }
        else
        {
            ++m_pos;
        }
    }

    int GetCount() const { return (m_bBufferFull) ? TIMER_SLOT_SIZE : m_pos; }
    double GetAverage(CounterType t) const { return (m_bDisabled) ? 0 : m_data[t].total / GetCount(); }
    double GetMin(CounterType t) const { return (m_bDisabled) ? 0 : m_data[t].min; }
    double GetMax(CounterType t) const { return (m_bDisabled) ? 0 : m_data[t].max; }
    double GetValues(CounterType t, int index) const { return (m_bDisabled) ? 0 : m_data[t].measured[index]; }
    double GetStdev(CounterType t) const { return (m_bDisabled) ? 0 : sqrt(GetVariance(t)); }
    double GetVariance(CounterType t) const
    {
        if (m_bDisabled)
            return 0;

        int count = GetCount();
        double average = m_data[t].total / count;
        double var = 0;
        for (int i = 0; i < count; ++i)
        {
            var += (m_data[t].measured[i] - average) * (m_data[t].measured[i] - average);
        }
        return var / count;
    }

private:
    struct DataBlock
    {
        void Reset()
        {
            max = 0;
            min = DBL_MAX;
            total = 0;
            memset(measured, 0, sizeof(double)*TIMER_SLOT_SIZE);
        }

        double max;
        double min;
        double total;
        double measured[TIMER_SLOT_SIZE];
    };

    int m_pos;
    bool m_bBufferFull;
    bool m_bDisabled;

    Timer m_timer;
    CpuPerfCounter m_cpuCounter;
#ifndef DISABLE_GPU_COUNTERS
    GpuPerfCounter m_gpuCounter;
#endif
    DataBlock m_data[CounterType::TYPE_COUNT];
};

// A class that wraps up multiple PerfCounterStatistics objects.
// To create a profiler, define the intervals in an enum (ending with COUNT) and use it to create the profiler object.
// See an example in engine/test/Model/ModelTest.cpp
template<typename T>
class Profiler
{
public:
    void Reset(int begin, int end)
    {
        for (int i = begin; i < end; ++i)
        {
            m_perfCounterStat[i].Reset();
        }
    }

    void Reset()
    {
        Reset(0, T::COUNT);
    }

    PerfCounterStatistics& GetCounter(int t)
    {
        return m_perfCounterStat[t];
    }

    PerfCounterStatistics& operator [] (int t)
    {
        return m_perfCounterStat[t];
    }

    void Enable()
    {
        for (int i = 0; i < T::COUNT; ++i)
        {
            m_perfCounterStat[i].Enable();
        }
    }

    void Disable()
    {
        for (int i = 0; i < T::COUNT; ++i)
        {
            m_perfCounterStat[i].Disable();
        }
    }

private:
    PerfCounterStatistics m_perfCounterStat[T::COUNT];
};

#define WINML_PROFILING

#ifdef WINML_PROFILING
#define WINML_PROFILING_START(profiler, interval) profiler[interval].Start()
#define WINML_PROFILING_STOP(profiler, interval) profiler[interval].Stop()
#else
#define WINML_PROFILING_START(profiler, interval) do {} while(0)
#define WINML_PROFILING_STOP(profiler, interval) do {} while(0)
#endif
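
// Usage sketch: define an interval enum that ends with COUNT, instantiate
// Profiler<YourEnum>, enable it, and bracket the code of interest with the
// WINML_PROFILING_START/STOP macros. MY_PERF and DoWork() below are
// hypothetical placeholders rather than part of this project.
//
//   enum MY_PERF { WORK = 0, COUNT };
//   Profiler<MY_PERF> profiler;
//   profiler.Enable();
//   WINML_PROFILING_START(profiler, MY_PERF::WORK);
//   DoWork();                                              // code being profiled (placeholder)
//   WINML_PROFILING_STOP(profiler, MY_PERF::WORK);
//   double avgTime = profiler[MY_PERF::WORK].GetAverage(CounterType::TIMER);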
@@ -0,0 +1,31 @@ WinMLModelRunner.sln

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.27004.2005
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinMLModelRunner", "WinMLModelRunner.vcxproj", "{81EA9CC6-8A26-4583-B1A4-84740EF815C8}"
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|x64 = Debug|x64
        Debug|x86 = Debug|x86
        Release|x64 = Release|x64
        Release|x86 = Release|x86
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Debug|x64.ActiveCfg = Debug|x64
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Debug|x64.Build.0 = Debug|x64
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Debug|x86.ActiveCfg = Debug|Win32
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Debug|x86.Build.0 = Debug|Win32
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Release|x64.ActiveCfg = Release|x64
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Release|x64.Build.0 = Release|x64
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Release|x86.ActiveCfg = Release|Win32
        {81EA9CC6-8A26-4583-B1A4-84740EF815C8}.Release|x86.Build.0 = Release|Win32
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {D193B2D4-1FF5-4E14-9334-E5EF4C8F9069}
    EndGlobalSection
EndGlobal
@@ -0,0 +1,151 @@ WinMLModelRunner.vcxproj
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="CommandLineArgs.cpp" />
    <ClCompile Include="Main.cpp" />
    <ClCompile Include="Stopwatch.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="BindingUtilities.h" />
    <ClInclude Include="CommandLineArgs.h" />
    <ClInclude Include="Common.h" />
    <ClInclude Include="OutputHelper.h" />
    <ClInclude Include="ModelBinding.h" />
    <ClInclude Include="Stopwatch.h" />
    <ClInclude Include="TimerHelper.h" />
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <CppWinRTEnabled>true</CppWinRTEnabled>
    <VCProjectVersion>15.0</VCProjectVersion>
    <ProjectGuid>{81EA9CC6-8A26-4583-B1A4-84740EF815C8}</ProjectGuid>
    <RootNamespace>Benchmark</RootNamespace>
    <WindowsTargetPlatformVersion>10.0.17713.0</WindowsTargetPlatformVersion>
    <ProjectName>WinMLModelRunner</ProjectName>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <IntDir>$(Platform)\$(Configuration)\$(Benchmark)\</IntDir>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <LanguageStandard>stdcpp17</LanguageStandard>
      <ShowIncludes>true</ShowIncludes>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>dxgi.lib;d3d12.lib;winml.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <LanguageStandard>stdcpp17</LanguageStandard>
      <ConformanceMode>true</ConformanceMode>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>dxgi.lib;d3d12.lib;windowsapp.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <LanguageStandard>stdcpp17</LanguageStandard>
      <ShowIncludes>true</ShowIncludes>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>dxgi.lib;d3d12.lib;winml.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <LanguageStandard>stdcpp17</LanguageStandard>
      <ShowIncludes>true</ShowIncludes>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>dxgi.lib;d3d12.lib;winml.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
@@ -0,0 +1,33 @@
The WinMLModelRunner program can run .onnx or .pb models where the input and output variables
are tensors. It lets you run WinML on the GPU or the CPU; if neither is specified, it runs the
test on both so that separate GPU and CPU performance measurements are generated. The GPU, CPU,
and wall-clock times for loading, binding, and evaluating, as well as the CPU and GPU memory
usage during evaluation, are printed to the command line and to a CSV file.

If no CSV file name is specified, the program creates a CSV file titled
"WinML Model Run [Today's date].csv" in the same folder as the .exe file.

Command-line options:
---------------------------------------------------------------------------------------
Required command-line arguments (use one of the following):
-model <path>     : Path to a .onnx or .pb model file.

-folder <path>    : Path to a folder of .onnx or .pb models; all models in the folder are run.

Optional command-line arguments:
-iterations <int> : Number of times to evaluate the model.
-CPU              : Create a session on the CPU.
-GPU              : Create a session on the GPU.
-csv <file name>  : Create a CSV file and output the performance measurements to it.

Examples:
---------------------------------------------------------------------------------------
Run the 'concat' operator on the CPU and GPU separately 5 times:
> WinMLModelRunner.exe -model c:\\data\\concat.onnx -iterations 5

Run all the models in the data folder 3 times using only the CPU:
> WinMLModelRunner.exe -folder c:\\data -iterations 3 -CPU

Run all of the models in the data folder on the GPU and CPU once and output the
performance data to benchmarkdata.csv:
> WinMLModelRunner.exe -folder c:\\data -csv benchmarkdata.csv