* Add feature to bind GPU tensors

* Add test to check for invalid arg

* Spacing

* Make heap properties and resource desc simpler

* Move GPU tensor upload code under if statement
Ryan Lai 2019-04-15 16:38:21 -07:00 committed by GitHub
Parent ba5885c74e
Commit f2a21529f1
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 155 additions and 48 deletions
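
The feature itself: when -GPUBoundInput is combined with -tensor, WinMLRunner now uploads the tensor data into a D3D12 default-heap buffer and binds the resulting GPU tensor instead of a CPU one. A minimal sketch of the allocate-and-wrap pattern used below, assuming the CD3DX12_* helpers from d3dx12.h and the WinML native interop header (the CreateGpuTensor helper name is illustrative):

#include <cstdint>
#include <vector>
#include <d3d12.h>
#include "d3dx12.h"                                // CD3DX12_* helpers
#include <winrt/Windows.AI.MachineLearning.h>
#include "Windows.AI.Machinelearning.Native.h"     // ITensorStaticsNative

using winrt::com_ptr;
using winrt::Windows::AI::MachineLearning::TensorFloat;

// Sketch: allocate a default-heap buffer and wrap it as a WinML tensor.
// Error handling is reduced to check_hresult.
TensorFloat CreateGpuTensor(ID3D12Device* device, std::vector<int64_t> shape,
                            UINT64 sizeInBytes)
{
    // Default-heap buffer with UAV access, matching the flags the code below uses.
    com_ptr<ID3D12Resource> gpuResource;
    CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_DEFAULT);
    CD3DX12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(
        sizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
    winrt::check_hresult(device->CreateCommittedResource(
        &heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
        D3D12_RESOURCE_STATE_COMMON, nullptr,
        __uuidof(ID3D12Resource), gpuResource.put_void()));

    // Wrap the raw resource as a TensorFloat via the native interop factory.
    com_ptr<ITensorStaticsNative> factory =
        winrt::get_activation_factory<TensorFloat, ITensorStaticsNative>();
    com_ptr<::IUnknown> unk;
    winrt::check_hresult(factory->CreateFromD3D12Resource(
        gpuResource.get(), shape.data(), static_cast<int>(shape.size()), unk.put()));
    return unk.as<TensorFloat>();
}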

View file

@@ -284,7 +284,6 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuWinMLDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
@@ -320,7 +319,6 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuWinMLDeviceGpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
@@ -344,19 +342,15 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
}
TEST_METHOD(GarbageInputCpuWinMLDeviceGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring command =
BuildCommand({ EXE_PATH, L"-model", modelPath, L"-PerfOutput", OUTPUT_PATH, L"-perf", L"-CPU",
L"-GPUBoundInput", L"-tensor", L"-CreateDeviceInWinML" });
Assert::AreEqual(S_OK, RunProc((wchar_t*)command.c_str()));
// We need to expect one more line because of the header
Assert::AreEqual(static_cast<size_t>(2), GetOutputCSVLineCount());
// Binding a GPU tensor with a session created on the CPU device isn't supported.
Assert::AreEqual(E_INVALIDARG, RunProc((wchar_t*)command.c_str()));
}
TEST_METHOD(GarbageInputGpuClientDeviceCpuBoundRGBImage)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
@@ -646,7 +640,7 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-input", inputPath });
Assert::AreEqual(HRESULT_FROM_WIN32(ERROR_INVALID_PARAMETER), RunProc((wchar_t *)command.c_str()));
}
TEST_METHOD(ProvidedCSVInputGPUSaveTensor)
TEST_METHOD(ProvidedCSVInputGPUSaveCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring inputPath = CURRENT_PATH + L"fish.csv";
@@ -656,7 +650,17 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
Assert::AreEqual(true, CompareTensors(L"OutputTensorData\\Squeezenet_fish_input_GPU.csv",
TENSOR_DATA_PATH + L"\\softmaxout_1GpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputCPUSaveTensor)
TEST_METHOD(ProvidedCSVInputGPUSaveGpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring inputPath = CURRENT_PATH + L"fish.csv";
const std::wstring command = BuildCommand({ EXE_PATH, L"-model", modelPath, L"-input", inputPath,
L"-SaveTensorData", L"First", TENSOR_DATA_PATH, L"-GPU", L"-GPUBoundInput" });
Assert::AreEqual(S_OK, RunProc((wchar_t*)command.c_str()));
Assert::AreEqual(true, CompareTensors(L"OutputTensorData\\Squeezenet_fish_input_GPU.csv",
TENSOR_DATA_PATH + L"\\softmaxout_1GpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputCPUSaveCpuBoundTensor)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet.onnx";
const std::wstring inputPath = CURRENT_PATH + L"fish.csv";
@@ -666,7 +670,7 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
Assert::AreEqual(true, CompareTensors(L"OutputTensorData\\Squeezenet_fish_input_CPU.csv",
TENSOR_DATA_PATH + L"\\softmaxout_1CpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputGPUSaveTensorFp16)
TEST_METHOD(ProvidedCSVInputGPUSaveCpuBoundTensorFp16)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet_fp16.onnx";
const std::wstring inputPath = CURRENT_PATH + L"fish.csv";
@@ -676,7 +680,7 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
Assert::AreEqual(true, CompareTensorsFP16(L"OutputTensorData\\Squeezenet_fp16_fish_input_GPU.csv",
TENSOR_DATA_PATH + L"\\softmaxout_1GpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputCPUSaveTensorFp16)
TEST_METHOD(ProvidedCSVInputCPUSaveCpuBoundTensorFp16)
{
const std::wstring modelPath = CURRENT_PATH + L"SqueezeNet_fp16.onnx";
const std::wstring inputPath = CURRENT_PATH + L"fish.csv";
@@ -687,7 +691,7 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
TENSOR_DATA_PATH + L"\\softmaxout_1CpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputOnlyGpuSaveTensorImageDenotation)
TEST_METHOD(ProvidedCSVInputOnlyGpuSaveCpuBoundTensorImageDenotation)
{
const std::wstring modelPath = CURRENT_PATH + L"mnist.onnx";
const std::wstring inputPath = CURRENT_PATH + L"mnist_28.csv";
@@ -697,7 +701,7 @@ public: TEST_CLASS_INITIALIZE(SetupClass) {
Assert::AreEqual(true, CompareTensors(L"OutputTensorData\\Mnist_8_input_GPU.csv",
TENSOR_DATA_PATH + L"\\Plus214_Output_0GpuIteration1.csv"));
}
TEST_METHOD(ProvidedCSVInputOnlyCpuSaveTensorImageDenotation)
TEST_METHOD(ProvidedCSVInputOnlyCpuSaveCpuBoundTensorImageDenotation)
{
const std::wstring modelPath = CURRENT_PATH + L"mnist.onnx";
const std::wstring inputPath = CURRENT_PATH + L"mnist_28.csv";

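The renamed and added tests above pin both directions: CPU-bound and GPU-bound tensors must save identical outputs, and a GPU-bound tensor against a CPU session must fail with E_INVALIDARG. Reduced to WinRT calls, the happy path the runner drives looks roughly like this (a hypothetical sketch; the helper name is illustrative, and the D3D12 resource behind gpuTensor must come from the device the session uses):

#include <winrt/Windows.AI.MachineLearning.h>

using namespace winrt::Windows::AI::MachineLearning;

// Sketch: evaluate a model with a GPU-bound input tensor.
LearningModelEvaluationResult EvaluateGpuBound(const LearningModel& model,
                                               const TensorFloat& gpuTensor)
{
    LearningModelDevice device(LearningModelDeviceKind::DirectX);
    LearningModelSession session(model, device);
    LearningModelBinding binding(session);
    binding.Bind(model.InputFeatures().GetAt(0).Name(), gpuTensor);
    return session.Evaluate(binding, L"GpuBoundRun");
}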
View file

@@ -31,7 +31,6 @@
<ClInclude Include="src/CommandLineArgs.h" />
<ClInclude Include="src/Common.h" />
<ClInclude Include="src/Filehelper.h" />
<ClInclude Include="src/ModelBinding.h" />
<ClInclude Include="src/OutputHelper.h" />
<ClInclude Include="src/Run.h" />
<ClInclude Include="src/TimerHelper.h" />
@@ -234,6 +233,7 @@
<LanguageStandard>stdcpp17</LanguageStandard>
<PrecompiledHeaderFile />
<PrecompiledHeaderOutputFile />
<AdditionalIncludeDirectories>..\..\Samples\CustomTensorization\CustomTensorization;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -254,6 +254,7 @@
<PrecompiledHeaderFile />
<PrecompiledHeaderOutputFile />
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<AdditionalIncludeDirectories>..\..\Samples\CustomTensorization\CustomTensorization;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>

View file

@@ -36,9 +36,6 @@
<ClInclude Include="src/TypeHelper.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="src/ModelBinding.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="src/Run.h">
<Filter>Header Files</Filter>
</ClInclude>

View file

@@ -3,7 +3,7 @@
#include <time.h>
#include "Common.h"
#include "Windows.AI.Machinelearning.Native.h"
#include "d3dx12.h"
using namespace winrt::Windows::Media;
using namespace winrt::Windows::Storage;
using namespace winrt::Windows::AI::MachineLearning;
@@ -327,7 +327,7 @@ namespace BindingUtilities
template <TensorKind T>
static ITensor CreateTensor(const CommandLineArgs& args, const std::vector<std::string>& tensorStringInput,
const IVectorView<int64_t>& tensorShape)
const IVectorView<int64_t>& tensorShape, const InputBindingType inputBindingType)
{
using TensorValue = typename TensorKindToValue<T>::Type;
using DataType = typename TensorKindToType<T>::Type;
@@ -372,11 +372,106 @@ namespace BindingUtilities
// Creating tensors for input images hasn't been implemented yet.
throw hresult_not_implemented(L"Creating tensors for input images hasn't been implemented yet!");
}
return tensorValue;
if (inputBindingType == InputBindingType::CPU)
{
return tensorValue;
}
else // GPU Tensor
{
com_ptr<ID3D12Resource> pGPUResource = nullptr;
try
{
// Create the D3D12 device.
com_ptr<ID3D12Device> pD3D12Device = nullptr;
D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL::D3D_FEATURE_LEVEL_11_0, __uuidof(ID3D12Device),
reinterpret_cast<void**>(&pD3D12Device));
pD3D12Device->CreateCommittedResource(
&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
D3D12_HEAP_FLAG_NONE,
&CD3DX12_RESOURCE_DESC::Buffer(
actualSizeInBytes,
D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS),
D3D12_RESOURCE_STATE_COMMON, nullptr,
__uuidof(ID3D12Resource), pGPUResource.put_void());
if (!args.IsGarbageInput())
{
com_ptr<ID3D12Resource> imageUploadHeap;
// Create the GPU upload buffer.
pD3D12Device->CreateCommittedResource(
&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD), D3D12_HEAP_FLAG_NONE,
&CD3DX12_RESOURCE_DESC::Buffer(actualSizeInBytes), D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
__uuidof(ID3D12Resource), imageUploadHeap.put_void());
// Create the command queue.
com_ptr<ID3D12CommandQueue> dxQueue = nullptr;
D3D12_COMMAND_QUEUE_DESC commandQueueDesc = {};
commandQueueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
pD3D12Device->CreateCommandQueue(&commandQueueDesc, __uuidof(ID3D12CommandQueue),
reinterpret_cast<void**>(&dxQueue));
com_ptr<ILearningModelDeviceFactoryNative> devicefactory =
get_activation_factory<LearningModelDevice, ILearningModelDeviceFactoryNative>();
com_ptr<::IUnknown> spUnk;
devicefactory->CreateFromD3D12CommandQueue(dxQueue.get(), spUnk.put());
// Create ID3D12GraphicsCommandList and Allocator
D3D12_COMMAND_LIST_TYPE queuetype = dxQueue->GetDesc().Type;
com_ptr<ID3D12CommandAllocator> allocator;
com_ptr<ID3D12GraphicsCommandList> cmdList;
pD3D12Device->CreateCommandAllocator(queuetype, winrt::guid_of<ID3D12CommandAllocator>(),
allocator.put_void());
pD3D12Device->CreateCommandList(0, queuetype, allocator.get(), nullptr,
winrt::guid_of<ID3D12GraphicsCommandList>(), cmdList.put_void());
// Copy from CPU to GPU
D3D12_SUBRESOURCE_DATA CPUData = {};
CPUData.pData = actualData;
CPUData.RowPitch = actualSizeInBytes;
CPUData.SlicePitch = actualSizeInBytes;
UpdateSubresources(cmdList.get(), pGPUResource.get(), imageUploadHeap.get(), 0, 0, 1, &CPUData);
// Close the command list and execute it to begin the initial GPU setup.
cmdList->Close();
ID3D12CommandList* ppCommandLists[] = { cmdList.get() };
dxQueue->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists);
// Create Event
HANDLE directEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
// Create Fence
Microsoft::WRL::ComPtr<ID3D12Fence> spDirectFence = nullptr;
THROW_IF_FAILED(pD3D12Device->CreateFence(0, D3D12_FENCE_FLAG_NONE,
IID_PPV_ARGS(spDirectFence.ReleaseAndGetAddressOf())));
// Queue a fence signal
THROW_IF_FAILED(dxQueue->Signal(spDirectFence.Get(), 1));
THROW_IF_FAILED(spDirectFence->SetEventOnCompletion(1, directEvent));
// Wait for signal
DWORD retVal = WaitForSingleObject(directEvent, INFINITE);
if (retVal != WAIT_OBJECT_0)
{
THROW_IF_FAILED(E_UNEXPECTED);
}
}
}
catch (...)
{
std::cout << "Couldn't create and copy CPU tensor resource to GPU resource" << std::endl;
throw;
}
com_ptr<ITensorStaticsNative> tensorfactory = get_activation_factory<TensorValue, ITensorStaticsNative>();
com_ptr<::IUnknown> spUnkTensor;
tensorfactory->CreateFromD3D12Resource(pGPUResource.get(), vecShape.data(), static_cast<int>(vecShape.size()), spUnkTensor.put());
TensorValue returnTensor(nullptr);
spUnkTensor.try_as(returnTensor);
return returnTensor;
}
}
// Binds tensors of floats, ints, and doubles from CSV data.
ITensor CreateBindableTensor(const ILearningModelFeatureDescriptor& description, const CommandLineArgs& args)
ITensor CreateBindableTensor(const ILearningModelFeatureDescriptor& description, const CommandLineArgs& args,
const InputBindingType inputBindingType)
{
std::vector<std::string> elementStrings;
if (!args.CsvPath().empty())
@@ -407,7 +502,7 @@ namespace BindingUtilities
std::vector<int64_t> shape = { 1, channels, imageFeatureDescriptor.Height(),
imageFeatureDescriptor.Width() };
IVectorView<int64_t> shapeVectorView = single_threaded_vector(std::move(shape)).GetView();
return CreateTensor<TensorKind::Float>(args, elementStrings, shapeVectorView);
return CreateTensor<TensorKind::Float>(args, elementStrings, shapeVectorView, inputBindingType);
}
auto tensorDescriptor = description.try_as<TensorFeatureDescriptor>();
@@ -422,57 +517,68 @@ namespace BindingUtilities
}
case TensorKind::Float:
{
return CreateTensor<TensorKind::Float>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Float>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Float16:
{
return CreateTensor<TensorKind::Float16>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Float16>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Double:
{
return CreateTensor<TensorKind::Double>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Double>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Int8:
{
return CreateTensor<TensorKind::Int8>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Int8>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::UInt8:
{
return CreateTensor<TensorKind::UInt8>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::UInt8>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Int16:
{
return CreateTensor<TensorKind::Int16>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Int16>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::UInt16:
{
return CreateTensor<TensorKind::UInt16>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::UInt16>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Int32:
{
return CreateTensor<TensorKind::Int32>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Int32>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::UInt32:
{
return CreateTensor<TensorKind::UInt32>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::UInt32>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::Int64:
{
return CreateTensor<TensorKind::Int64>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::Int64>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
case TensorKind::UInt64:
{
return CreateTensor<TensorKind::UInt64>(args, elementStrings, tensorDescriptor.Shape());
return CreateTensor<TensorKind::UInt64>(args, elementStrings, tensorDescriptor.Shape(),
inputBindingType);
}
break;
}

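Before wrapping the buffer, the upload path above copies CPU data through an upload heap on a direct queue and then blocks until the copy finishes. Condensed to its core, the fence-and-event wait it performs looks like this (a sketch assuming valid device and queue pointers):

#include <d3d12.h>
#include <wrl/client.h>
#include <winrt/base.h>

// Block the CPU until the GPU has executed everything submitted to `queue`.
void WaitForQueue(ID3D12Device* device, ID3D12CommandQueue* queue)
{
    Microsoft::WRL::ComPtr<ID3D12Fence> fence;
    winrt::check_hresult(device->CreateFence(
        0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.ReleaseAndGetAddressOf())));

    // Auto-reset event that fires once the fence value reaches 1.
    HANDLE fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
    winrt::check_pointer(fenceEvent);

    winrt::check_hresult(queue->Signal(fence.Get(), 1));              // GPU-side signal
    winrt::check_hresult(fence->SetEventOnCompletion(1, fenceEvent)); // CPU-side hook

    DWORD waitResult = WaitForSingleObject(fenceEvent, INFINITE);
    CloseHandle(fenceEvent);
    if (waitResult != WAIT_OBJECT_0)
    {
        winrt::throw_hresult(E_UNEXPECTED);
    }
}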
View file

@@ -20,10 +20,10 @@ std::vector<ILearningModelFeatureValue> GenerateInputFeatures(const LearningMode
{
auto&& description = model.InputFeatures().GetAt(i);
if (inputDataType == InputDataType::Tensor || i > 0)
if (inputDataType == InputDataType::Tensor)
{
// If CSV data is provided, then every input will contain the same CSV data
auto tensorFeature = BindingUtilities::CreateBindableTensor(description, args);
auto tensorFeature = BindingUtilities::CreateBindableTensor(description, args, inputBindingType);
inputFeatures.push_back(tensorFeature);
}
else
@@ -214,6 +214,11 @@ HRESULT BindInputs(LearningModelBinding &context, const LearningModel& model, co
const IDirect3DDevice& winrtDevice, DeviceCreationLocation deviceCreationLocation, uint32_t iteration,
Profiler<WINML_MODEL_TEST_PERF>& profiler)
{
if (deviceType == DeviceType::CPU && inputDataType == InputDataType::Tensor && inputBindingType == InputBindingType::GPU)
{
std::cout << "Cannot create D3D12 device on client if CPU device type is selected." << std::endl;
return E_INVALIDARG;
}
bool useInputData = false;
bool isGarbageData = args.IsGarbageInput();
std::string completionString = "\n";
@@ -315,15 +320,6 @@ HRESULT CheckIfModelAndConfigurationsAreSupported(LearningModel& model, const st
}
}
}
// Creating D3D12 device on client doesn't make sense for CPU deviceType
if (deviceType == DeviceType::CPU && std::any_of(deviceCreationLocations.begin(), deviceCreationLocations.end(),
[](const DeviceCreationLocation deviceCreationLocation) {
return deviceCreationLocation == DeviceCreationLocation::UserD3DDevice; }))
{
std::cout << "Cannot create D3D12 device on client if CPU device type is selected." << std::endl;
return E_INVALIDARG;
}
return S_OK;
}
@@ -508,10 +504,10 @@ int run(CommandLineArgs& args, Profiler<WINML_MODEL_TEST_PERF>& profiler) try
for (auto deviceType : deviceTypes)
{
lastHr = CheckIfModelAndConfigurationsAreSupported(model, path, deviceType, inputDataTypes,
deviceCreationLocations);
deviceCreationLocations);
if (FAILED(lastHr))
{
continue;
continue;
}
for (auto deviceCreationLocation : deviceCreationLocations)
{
@@ -550,7 +546,10 @@ int run(CommandLineArgs& args, Profiler<WINML_MODEL_TEST_PERF>& profiler) try
LearningModelBinding context(session);
lastHr = BindInputs(context, model, session, output, deviceType, args, inputBindingType,
inputDataType, winrtDevice, deviceCreationLocation, i, profiler);
if (FAILED(lastHr))
{
break;
}
LearningModelEvaluationResult result = nullptr;
bool capture_perf = args.IsPerformanceCapture() || args.IsPerIterationCapture();
lastHr = EvaluateModel(result, model, context, session, args, output,