Add CreateDeviceOnClient and CreateDeviceInWinML flags to WinMLRunner (#70)

* Add client device creation to WinMLRunner

* Add CreateDeviceOnClient and CreateDeviceInWinML flags

* Fix outdated comment

* Change CreateDeviceInWinML default
This commit is contained in:
Patrice Vignola 2018-10-30 14:55:17 -07:00 committed by Ryan Lai
Parent d2d9c949c5
Commit bb34ea8393
9 changed files: 348 additions and 80 deletions

View file

@ -133,120 +133,240 @@ namespace WinMLRunnerTest
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceCpuBoundRGBImage)
TEST_METHOD(GarbageInputCpuClientDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-RGB" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-RGB", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceCpuBoundBGRImage)
TEST_METHOD(GarbageInputCpuWinMLDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-BGR" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-RGB", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceCpuBoundTensor)
TEST_METHOD(GarbageInputCpuClientDeviceCpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-tensor" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-BGR", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceGpuBoundRGBImage)
TEST_METHOD(GarbageInputCpuWinMLDeviceCpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-RGB" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-BGR", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceGpuBoundBGRImage)
TEST_METHOD(GarbageInputCpuClientDeviceCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-BGR" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-tensor", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuDeviceGpuBoundTensor)
TEST_METHOD(GarbageInputCpuWinMLDeviceCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-tensor" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-CPUBoundInput", L"-tensor", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceCpuBoundRGBImage)
TEST_METHOD(GarbageInputCpuClientDeviceGpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-RGB" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-RGB", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceCpuBoundBGRImage)
TEST_METHOD(GarbageInputCpuWinMLDeviceGpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-BGR" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-RGB", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceCpuBoundTensor)
TEST_METHOD(GarbageInputCpuClientDeviceGpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-tensor" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-BGR", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceGpuBoundRGBImage)
TEST_METHOD(GarbageInputCpuWinMLDeviceGpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-RGB" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-BGR", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceGpuBoundBGRImage)
TEST_METHOD(GarbageInputCpuClientDeviceGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-BGR" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-tensor", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuDeviceGpuBoundTensor)
TEST_METHOD(GarbageInputCpuWinMLDeviceGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-tensor" });
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-CPU", L"-GPUBoundInput", L"-tensor", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-RGB", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-RGB", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceCpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-BGR", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceCpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-BGR", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-tensor", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-CPUBoundInput", L"-tensor", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceGpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-RGB", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceGpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-RGB", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceGpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-BGR", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceGpuBoundBGRImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-BGR", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuClientDeviceGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-tensor", L"-CreateDeviceOnClient" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputGpuWinMLDeviceGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-output", OUTPUT_PATH, L"-perf", L"-GPU", L"-GPUBoundInput", L"-tensor", L"-CreateDeviceInWinML" });
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
@ -265,6 +385,8 @@ namespace WinMLRunnerTest
L"-perf",
L"-CPU",
L"-GPU",
L"-CreateDeviceOnClient",
L"-CreateDeviceInWinML",
L"-CPUBoundInput",
L"-GPUBoundInput",
L"-RGB",
@ -274,7 +396,7 @@ namespace WinMLRunnerTest
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(13), GetOutputCSVLineCount());
Assert::AreEqual(static_cast<size_t>(25), GetOutputCSVLineCount());
}
TEST_METHOD(RunAllModelsInFolderGarbageInput)
@ -297,6 +419,8 @@ namespace WinMLRunnerTest
L"-perf",
L"-CPU",
L"-GPU",
L"-CreateDeviceOnClient",
L"-CreateDeviceInWinML",
L"-CPUBoundInput",
L"-GPUBoundInput",
L"-RGB",
@ -306,7 +430,7 @@ namespace WinMLRunnerTest
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(25), GetOutputCSVLineCount());
Assert::AreEqual(static_cast<size_t>(49), GetOutputCSVLineCount());
}
};
@ -386,8 +510,7 @@ namespace WinMLRunnerTest
TEST_METHOD(TestPrintUsage)
{
auto const curPath = FileHelper::GetModulePath();
std::wstring command = curPath +
L"./WinMLRunner";
std::wstring command = curPath + L"./WinMLRunner";
Assert::AreEqual(0, RunProc((wchar_t *)command.c_str()));
}
};

View file

@ -5,16 +5,19 @@
#include "ModelBinding.h"
#include "CommandLineArgs.h"
using namespace Windows::Media;
using namespace Windows::Storage;
using namespace winrt::Windows::Media;
using namespace winrt::Windows::Storage;
using namespace winrt::Windows::AI::MachineLearning;
using namespace winrt::Windows::Foundation;
using namespace winrt::Windows::Foundation::Collections;
using namespace winrt::Windows::Graphics::DirectX;
using namespace winrt::Windows::Graphics::Imaging;
using namespace winrt::Windows::Graphics::DirectX::Direct3D11;
namespace BindingUtilities
{
static unsigned int seed = 0;
static std::independent_bits_engine<std::default_random_engine, CHAR_BIT, unsigned int> randomBitsEngine;
SoftwareBitmap GenerateGarbageImage(const TensorFeatureDescriptor& imageDescriptor, InputDataType inputDataType)
{
assert(inputDataType != InputDataType::Tensor);
@ -34,10 +37,9 @@ namespace BindingUtilities
// We have to create RGBA8 or BGRA8 images, so we need 4 channels
uint32_t totalByteSize = static_cast<uint32_t>(width) * static_cast<uint32_t>(height) * 4;
// Generate random values for the image
// Generate values for the image based on a seed
std::vector<uint8_t> data(totalByteSize);
static std::independent_bits_engine<std::default_random_engine, CHAR_BIT, unsigned int> randomBitsEngine;
randomBitsEngine.seed(static_cast<unsigned int>(time(nullptr)));
randomBitsEngine.seed(seed++);
std::generate(data.begin(), data.end(), randomBitsEngine);
// Write the values to a buffer
@ -75,13 +77,16 @@ namespace BindingUtilities
}
}
VideoFrame CreateVideoFrame(const SoftwareBitmap& softwareBitmap, InputBindingType inputBindingType, InputDataType inputDataType)
VideoFrame CreateVideoFrame(const SoftwareBitmap& softwareBitmap, InputBindingType inputBindingType, InputDataType inputDataType, const IDirect3DDevice winrtDevice)
{
VideoFrame inputImage = VideoFrame::CreateWithSoftwareBitmap(softwareBitmap);
if (inputBindingType == InputBindingType::GPU)
{
VideoFrame gpuImage = VideoFrame::CreateAsDirect3D11SurfaceBacked(TypeHelper::GetDirectXPixelFormat(inputDataType), softwareBitmap.PixelWidth(), softwareBitmap.PixelHeight());
VideoFrame gpuImage = winrtDevice
? VideoFrame::CreateAsDirect3D11SurfaceBacked(TypeHelper::GetDirectXPixelFormat(inputDataType), softwareBitmap.PixelWidth(), softwareBitmap.PixelHeight(), winrtDevice)
: VideoFrame::CreateAsDirect3D11SurfaceBacked(TypeHelper::GetDirectXPixelFormat(inputDataType), softwareBitmap.PixelWidth(), softwareBitmap.PixelHeight());
inputImage.CopyToAsync(gpuImage).get();
return gpuImage;
@ -254,7 +259,14 @@ namespace BindingUtilities
throw hresult_not_implemented();
}
ImageFeatureValue CreateBindableImage(const ILearningModelFeatureDescriptor& featureDescriptor, const std::wstring& imagePath, InputBindingType inputBindingType, InputDataType inputDataType)
ImageFeatureValue CreateBindableImage(
const ILearningModelFeatureDescriptor&
featureDescriptor,
const std::wstring& imagePath,
InputBindingType inputBindingType,
InputDataType inputDataType,
const IDirect3DDevice winrtDevice
)
{
auto imageDescriptor = featureDescriptor.try_as<TensorFeatureDescriptor>();
@ -268,13 +280,13 @@ namespace BindingUtilities
? GenerateGarbageImage(imageDescriptor, inputDataType)
: LoadImageFile(imagePath.c_str(), inputDataType);
auto videoFrame = CreateVideoFrame(softwareBitmap, inputBindingType, inputDataType);
auto videoFrame = CreateVideoFrame(softwareBitmap, inputBindingType, inputDataType, winrtDevice);
return ImageFeatureValue::CreateFromVideoFrame(videoFrame);
}
template<typename K, typename V>
void OutputSequenceBinding(IMapView<hstring, Windows::Foundation::IInspectable> results, hstring name)
void OutputSequenceBinding(IMapView<hstring, winrt::Windows::Foundation::IInspectable> results, hstring name)
{
auto map = results.Lookup(name).as<IVectorView<IMap<K, V>>>().GetAt(0);
auto iter = map.First();
@ -295,7 +307,7 @@ namespace BindingUtilities
std::cout << " " << maxKey << " " << maxVal << std::endl;
}
void PrintEvaluationResults(const LearningModel& model, const CommandLineArgs& args, const IMapView<hstring, Windows::Foundation::IInspectable>& results)
void PrintEvaluationResults(const LearningModel& model, const CommandLineArgs& args, const IMapView<hstring, winrt::Windows::Foundation::IInspectable>& results)
{
if (args.Silent()) return;

View file

@ -15,6 +15,8 @@ void CommandLineArgs::PrintUsage() {
std::cout << " -GPU : run model on default GPU" << std::endl;
std::cout << " -GPUHighPerformance : run model on GPU with highest performance" << std::endl;
std::cout << " -GPUMinPower : run model on GPU with the least power" << std::endl;
std::cout << " -CreateDeviceOnClient : create the device on the client and pass it to WinML" << std::endl;
std::cout << " -CreateDeviceInWinML : create the device inside WinML" << std::endl;
std::cout << " -CPUBoundInput : bind the input to the CPU" << std::endl;
std::cout << " -GPUBoundInput : bind the input to the GPU" << std::endl;
std::cout << " -RGB : load the input as an RGB image" << std::endl;
@ -52,7 +54,15 @@ CommandLineArgs::CommandLineArgs()
{
m_useGPUMinPower = true;
}
if ((_wcsicmp(args[i], L"-iterations") == 0) && (i + 1 < numArgs))
else if ((_wcsicmp(args[i], L"-CreateDeviceOnClient") == 0))
{
m_createDeviceOnClient = true;
}
else if ((_wcsicmp(args[i], L"-CreateDeviceInWinML") == 0))
{
m_createDeviceInWinML = true;
}
else if ((_wcsicmp(args[i], L"-iterations") == 0) && (i + 1 < numArgs))
{
m_numIterations = static_cast<UINT>(_wtoi(args[++i]));
}

View file

@ -15,6 +15,7 @@ public:
bool PerfCapture() const { return m_perfCapture; }
bool EnableDebugOutput() const { return m_debug; }
bool Silent() const { return m_silent; }
bool CreateDeviceOnClient() const { return m_createDeviceOnClient; }
const std::wstring& ImagePath() const { return m_imagePath; }
const std::wstring& CsvPath() const { return m_csvData; }
@ -53,6 +54,12 @@ public:
return m_useCPUBoundInput || !m_useGPUBoundInput;
}
bool CreateDeviceInWinML() const
{
// By Default we create the device in WinML if no flag is specified
return m_createDeviceInWinML || !m_createDeviceOnClient;
}
uint32_t NumIterations() const { return m_numIterations; }
private:
@ -61,6 +68,8 @@ private:
bool m_useGPU = false;
bool m_useGPUHighPerformance = false;
bool m_useGPUMinPower = false;
bool m_createDeviceOnClient = false;
bool m_createDeviceInWinML = false;
bool m_useRGB = false;
bool m_useBGR = false;
bool m_useTensor = false;

View file

@ -1,6 +1,8 @@
#pragma once
#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
// unknwn.h needs to be included before any winrt headers
#include <unknwn.h>
#include <winrt/Windows.AI.MachineLearning.h>
#include <winrt/Windows.Foundation.h>
#include <winrt/Windows.Media.h>

View file

@ -4,14 +4,17 @@
#include "BindingUtilities.h"
#include "CommandLineArgs.h"
#include <filesystem>
#include <d3d11.h>
#include <Windows.Graphics.DirectX.Direct3D11.interop.h>
Profiler<WINML_MODEL_TEST_PERF> g_Profiler;
using namespace winrt::Windows::Graphics::DirectX::Direct3D11;
LearningModel LoadModel(const std::wstring path, bool capturePerf, bool silent, OutputHelper& output)
{
Timer timer;
LearningModel model = nullptr;
output.PrintLoadingInfo(path);
try
@ -63,7 +66,12 @@ std::vector<std::wstring> GetModelsInDirectory(CommandLineArgs& args, OutputHelp
return modelPaths;
}
std::vector<ILearningModelFeatureValue> GenerateInputFeatures(const LearningModel& model, const CommandLineArgs& args, InputBindingType inputBindingType, InputDataType inputDataType)
std::vector<ILearningModelFeatureValue> GenerateInputFeatures(
const LearningModel& model,
const CommandLineArgs& args,
InputBindingType inputBindingType,
InputDataType inputDataType,
const IDirect3DDevice winrtDevice)
{
std::vector<ILearningModelFeatureValue> inputFeatures;
@ -80,7 +88,7 @@ std::vector<ILearningModelFeatureValue> GenerateInputFeatures(const LearningMode
}
else
{
auto imageFeature = BindingUtilities::CreateBindableImage(description, args.ImagePath(), inputBindingType, inputDataType);
auto imageFeature = BindingUtilities::CreateBindableImage(description, args.ImagePath(), inputBindingType, inputDataType, winrtDevice);
inputFeatures.push_back(imageFeature);
}
}
@ -191,18 +199,56 @@ HRESULT EvaluateModel(
OutputHelper& output,
DeviceType deviceType,
InputBindingType inputBindingType,
InputDataType inputDataType
InputDataType inputDataType,
DeviceCreationLocation deviceCreationLocation
)
{
if (model == nullptr)
{
return hresult_invalid_argument().code();
}
LearningModelSession session = nullptr;
IDirect3DDevice winrtDevice = nullptr;
try
{
session = LearningModelSession(model, TypeHelper::GetWinmlDeviceKind(deviceType));
if (deviceCreationLocation == DeviceCreationLocation::ClientCode)
{
// Creating the device on the client and using it to create the video frame and initialize the session makes sure that everything is on
// the same device. This usually avoids an expensive cross-device and cross-videoframe copy via the VideoFrame pipeline.
com_ptr<ID3D11Device> d3d11Device;
HRESULT hr = D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, D3D11_CREATE_DEVICE_BGRA_SUPPORT, nullptr, 0, D3D11_SDK_VERSION, d3d11Device.put(), nullptr, nullptr);
if (FAILED(hr))
{
throw hresult(hr);
}
com_ptr<IDXGIDevice> dxgiDevice;
hr = d3d11Device->QueryInterface(IID_PPV_ARGS(dxgiDevice.put()));
if (FAILED(hr))
{
throw hresult(hr);
}
com_ptr<IInspectable> inspectableDevice;
hr = CreateDirect3D11DeviceFromDXGIDevice(dxgiDevice.get(), inspectableDevice.put());
if (FAILED(hr))
{
throw hresult(hr);
}
winrtDevice = inspectableDevice.as<IDirect3DDevice>();
LearningModelDevice learningModelDevice = LearningModelDevice::CreateFromDirect3D11Device(winrtDevice);
session = LearningModelSession(model, learningModelDevice);
}
else
{
session = LearningModelSession(model, TypeHelper::GetWinmlDeviceKind(deviceType));
}
}
catch (hresult_error hr)
{
@ -231,9 +277,9 @@ HRESULT EvaluateModel(
{
bool captureIterationPerf = args.PerfCapture() && (!args.IgnoreFirstRun() || i > 0);
output.PrintBindingInfo(i + 1, deviceType, inputBindingType, inputDataType);
output.PrintBindingInfo(i + 1, deviceType, inputBindingType, inputDataType, deviceCreationLocation);
std::vector<ILearningModelFeatureValue> inputFeatures = GenerateInputFeatures(model, args, inputBindingType, inputDataType);
std::vector<ILearningModelFeatureValue> inputFeatures = GenerateInputFeatures(model, args, inputBindingType, inputDataType, winrtDevice);
HRESULT bindInputResult = BindInputFeatures(model, context, inputFeatures, args, output, captureIterationPerf);
if (FAILED(bindInputResult))
@ -241,7 +287,7 @@ HRESULT EvaluateModel(
return bindInputResult;
}
output.PrintEvaluatingInfo(i + 1, deviceType, inputBindingType, inputDataType);
output.PrintEvaluatingInfo(i + 1, deviceType, inputBindingType, inputDataType, deviceCreationLocation);
HRESULT evalResult = EvaluateModel(model, context, session, isGarbageData, args, output, captureIterationPerf);
@ -261,6 +307,7 @@ HRESULT EvaluateModels(
const std::vector<DeviceType>& deviceTypes,
const std::vector<InputBindingType>& inputBindingTypes,
const std::vector<InputDataType>& inputDataTypes,
const std::vector<DeviceCreationLocation> deviceCreationLocations,
const CommandLineArgs& args,
OutputHelper& output
)
@ -296,32 +343,44 @@ HRESULT EvaluateModels(
{
for (auto inputDataType : inputDataTypes)
{
if (args.PerfCapture())
for (auto deviceCreationLocation : deviceCreationLocations)
{
output.ResetBindAndEvalTImes();
g_Profiler.Reset();
}
if (inputDataType != InputDataType::Tensor)
{
// Currently GPU binding only work with 4D tensors and RGBA/BGRA images
if (tensorDescriptor.Shape().Size() != 4 || tensorDescriptor.Shape().GetAt(1) != 3)
if (args.PerfCapture())
{
continue;
output.ResetBindAndEvalTImes();
g_Profiler.Reset();
}
}
HRESULT evalHResult = EvaluateModel(model, args, output, deviceType, inputBindingType, inputDataType);
if (inputDataType != InputDataType::Tensor)
{
// Currently GPU binding only work with 4D tensors and RGBA/BGRA images
if (tensorDescriptor.Shape().Size() != 4 || tensorDescriptor.Shape().GetAt(1) != 3)
{
continue;
}
}
if (FAILED(evalHResult))
{
return evalHResult;
}
HRESULT evalHResult = EvaluateModel(model, args, output, deviceType, inputBindingType, inputDataType, deviceCreationLocation);
if (args.PerfCapture())
{
output.PrintResults(g_Profiler, args.NumIterations(), deviceType, inputBindingType, inputDataType);
output.WritePerformanceDataToCSV(g_Profiler, args.NumIterations(), path, TypeHelper::Stringify(deviceType), TypeHelper::Stringify(inputDataType), TypeHelper::Stringify(inputBindingType), args.IgnoreFirstRun());
if (FAILED(evalHResult))
{
return evalHResult;
}
if (args.PerfCapture())
{
output.PrintResults(g_Profiler, args.NumIterations(), deviceType, inputBindingType, inputDataType, deviceCreationLocation);
output.WritePerformanceDataToCSV(
g_Profiler,
args.NumIterations(),
path,
TypeHelper::Stringify(deviceType),
TypeHelper::Stringify(inputDataType),
TypeHelper::Stringify(inputBindingType),
TypeHelper::Stringify(deviceCreationLocation),
args.IgnoreFirstRun()
);
}
}
}
}
@ -399,6 +458,23 @@ std::vector<InputBindingType> FetchInputBindingTypes(const CommandLineArgs& args
return inputBindingTypes;
}
std::vector<DeviceCreationLocation> FetchDeviceCreationLocations(const CommandLineArgs& args)
{
std::vector<DeviceCreationLocation> deviceCreationLocations;
if (args.CreateDeviceInWinML())
{
deviceCreationLocations.push_back(DeviceCreationLocation::WinML);
}
if (args.CreateDeviceOnClient())
{
deviceCreationLocations.push_back(DeviceCreationLocation::ClientCode);
}
return deviceCreationLocations;
}
int main(int argc, char** argv)
{
// Initialize COM in a multi-threaded environment.
@ -425,8 +501,10 @@ int main(int argc, char** argv)
std::vector<DeviceType> deviceTypes = FetchDeviceTypes(args);
std::vector<InputBindingType> inputBindingTypes = FetchInputBindingTypes(args);
std::vector<InputDataType> inputDataTypes = FetchInputDataTypes(args);
std::vector<DeviceCreationLocation> deviceCreationLocations = FetchDeviceCreationLocations(args);
std::vector<std::wstring> modelPaths = args.ModelPath().empty() ? GetModelsInDirectory(args, &output) : std::vector<std::wstring>(1, args.ModelPath());
return EvaluateModels(modelPaths, deviceTypes, inputBindingTypes, inputDataTypes, args, output);
return EvaluateModels(modelPaths, deviceTypes, inputBindingTypes, inputDataTypes, deviceCreationLocations, args, output);
}
return 0;

View file

@ -24,30 +24,32 @@ public:
}
}
void PrintBindingInfo(uint32_t iteration, DeviceType deviceType, InputBindingType inputBindingType, InputDataType inputDataType) const
void PrintBindingInfo(uint32_t iteration, DeviceType deviceType, InputBindingType inputBindingType, InputDataType inputDataType, DeviceCreationLocation deviceCreationLocation) const
{
if (!m_silent)
{
printf(
"Binding (device = %s, iteration = %d, inputBinding = %s, inputDataType = %s)...",
"Binding (device = %s, iteration = %d, inputBinding = %s, inputDataType = %s, deviceCreationLocation = %s)...",
TypeHelper::Stringify(deviceType).c_str(),
iteration,
TypeHelper::Stringify(inputBindingType).c_str(),
TypeHelper::Stringify(inputDataType).c_str()
TypeHelper::Stringify(inputDataType).c_str(),
TypeHelper::Stringify(deviceCreationLocation).c_str()
);
}
}
void PrintEvaluatingInfo(uint32_t iteration, DeviceType deviceType, InputBindingType inputBindingType, InputDataType inputDataType) const
void PrintEvaluatingInfo(uint32_t iteration, DeviceType deviceType, InputBindingType inputBindingType, InputDataType inputDataType, DeviceCreationLocation deviceCreationLocation) const
{
if (!m_silent)
{
printf(
"Evaluating (device = %s, iteration = %d, inputBinding = %s, inputDataType = %s)...",
"Evaluating (device = %s, iteration = %d, inputBinding = %s, inputDataType = %s, deviceCreationLocation = %s)...",
TypeHelper::Stringify(deviceType).c_str(),
iteration,
TypeHelper::Stringify(inputBindingType).c_str(),
TypeHelper::Stringify(inputDataType).c_str()
TypeHelper::Stringify(inputDataType).c_str(),
TypeHelper::Stringify(deviceCreationLocation).c_str()
);
}
}
@ -122,7 +124,14 @@ public:
}
}
void PrintResults(const Profiler<WINML_MODEL_TEST_PERF> &profiler, uint32_t numIterations, DeviceType deviceType, InputBindingType inputBindingType, InputDataType inputDataType) const
void PrintResults(
const Profiler<WINML_MODEL_TEST_PERF> &profiler,
uint32_t numIterations,
DeviceType deviceType,
InputBindingType inputBindingType,
InputDataType inputDataType,
DeviceCreationLocation deviceCreationLocation
) const
{
double loadTime = profiler[LOAD_MODEL].GetAverage(CounterType::TIMER);
double bindTime = profiler[BIND_VALUE].GetAverage(CounterType::TIMER);
@ -143,11 +152,12 @@ public:
std::cout << std::endl;
printf("Results (device = %s, numIterations = %d, inputBinding = %s, inputDataType = %s):\n",
printf("Results (device = %s, numIterations = %d, inputBinding = %s, inputDataType = %s, deviceCreationLocation = %s):\n",
TypeHelper::Stringify(deviceType).c_str(),
numIterations,
TypeHelper::Stringify(inputBindingType).c_str(),
TypeHelper::Stringify(inputDataType).c_str()
TypeHelper::Stringify(inputDataType).c_str(),
TypeHelper::Stringify(deviceCreationLocation).c_str()
);
std::cout << " Load: " << (isnan(loadTime) ? "N/A" : std::to_string(loadTime) + "ms") << std::endl;
@ -281,7 +291,15 @@ public:
m_csvFileName = fileName;
}
void WritePerformanceDataToCSV(const Profiler<WINML_MODEL_TEST_PERF> &profiler, int numIterations, std::wstring model, std::string modelBinding, std::string inputBinding, std::string inputType, bool firstRunIgnored) const
void WritePerformanceDataToCSV(
const Profiler<WINML_MODEL_TEST_PERF> &profiler,
int numIterations, std::wstring model,
std::string modelBinding,
std::string inputBinding,
std::string inputType,
std::string deviceCreationLocation,
bool firstRunIgnored
) const
{
double loadTime = profiler[LOAD_MODEL].GetAverage(CounterType::TIMER);
double bindTime = profiler[BIND_VALUE].GetAverage(CounterType::TIMER);
@ -323,6 +341,7 @@ public:
<< "Model Binding" << ","
<< "Input Binding" << ","
<< "Input Type" << ","
<< "Device Creation Location" << ","
<< "Iterations" << ","
<< "First Run Ignored" << ","
<< "Load (ms)" << ","
@ -342,6 +361,7 @@ public:
<< modelBinding << ","
<< inputBinding << ","
<< inputType << ","
<< deviceCreationLocation << ","
<< numIterations << ","
<< firstRunIgnored << ","
<< (isnan(loadTime) ? "N/A" : std::to_string(loadTime)) << ","

Просмотреть файл

@ -28,6 +28,8 @@ Required command-Line arguments:
-GPU : Will create a session on the GPU.
-GPUHighPerformance : Will create a session with the most powerful GPU device available.
-GPUMinPower : Will create a session with GPU with the least power.
-CreateDeviceOnClient : Will create the device on the client and explicitly pass it to WinML via the API. GPU runs using this flag will usually be faster than -CreateDeviceInWinML since we avoid a cross-device copy by creating the video frame on the same device that DML uses to bind inputs.
-CreateDeviceInWinML : Will create the device inside WinML. GPU runs using this flag will usually be slower than -CreateDeviceOnClient since we have to copy the video frame to a different device.
-CPUBoundInput : Will bind the input to the CPU.
-GPUBoundInput : Will bind the input to the GPU.
-BGR : Will load the input as a BGR image.

Просмотреть файл

@ -9,6 +9,7 @@ enum class InputBindingType { CPU, GPU };
enum class InputDataType { Tensor, ImageRGB, ImageBGR };
enum class InputSourceType { ImageFile, CSVFile, GeneratedData };
enum class DeviceType { CPU, DefaultGPU, MinPowerGPU, HighPerfGPU };
// Where the D3D device used for evaluation is created: inside WinML itself,
// or by the client, which then passes the device to WinML via the API
// (per the tool's flag docs, client-side creation avoids a cross-device copy).
enum class DeviceCreationLocation { WinML, ClientCode };
class TypeHelper
{
@ -61,6 +62,17 @@ public:
throw "No name found for this DeviceType.";
}
// Returns the human-readable label for a DeviceCreationLocation value,
// matching the labels used in console output and the CSV report.
static std::string Stringify(DeviceCreationLocation deviceCreationLocation)
{
    if (deviceCreationLocation == DeviceCreationLocation::WinML)
    {
        return "WinML";
    }
    if (deviceCreationLocation == DeviceCreationLocation::ClientCode)
    {
        return "Client Code";
    }
    // Mirrors the error style of the sibling Stringify overloads.
    throw "No name found for this DeviceCreationLocation.";
}
static LearningModelDeviceKind GetWinmlDeviceKind(DeviceType deviceType)
{
switch (deviceType)