// DirectML/Samples/DirectMLSuperResolution/DirectMLResourceBuilder.cpp
// (908 lines, 46 KiB, C++)

#include "pch.h"
#include "DirectMLSuperResolution.h"
#include "ATGColors.h"
#include "ControllerFont.h"
#include "FindMedia.h"
#include "ReadData.h"
#include "Float16Compressor.h"
#include "DirectMLX.h"
using Microsoft::WRL::ComPtr;
using namespace DirectX;
#pragma warning(disable : 4238)
// Creates every GPU resource the super-resolution model needs:
//  * a compute pipeline that converts the source texture into an FP16 tensor,
//  * a graphics pipeline (plus render target/RTV/SRV) that draws the result
//    tensor back to a displayable texture,
//  * the DirectML device, compiled operators implementing the network
//    (upsample + 7 convolutions + residual addition), and their weights,
//  * the input/output/intermediate tensor buffers with UAV/SRV views.
// Blocks at the end until all uploads have completed on the GPU.
void Sample::CreateDirectMLResources()
{
    auto device = m_deviceResources->GetD3DDevice();

    // Shader for converting texture to tensor
    {
        auto computeShaderBlob = DX::ReadData(L"ImageToTensor.cso");

        // Define root table layout
        CD3DX12_DESCRIPTOR_RANGE descRange[2];
        descRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0); // t0
        descRange[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0); // u0

        CD3DX12_ROOT_PARAMETER rootParameters[3];
        rootParameters[e_crpIdxCB].InitAsConstants(3, 0);
        rootParameters[e_crpIdxSRV].InitAsDescriptorTable(1, &descRange[0], D3D12_SHADER_VISIBILITY_ALL);
        rootParameters[e_crpIdxUAV].InitAsDescriptorTable(1, &descRange[1], D3D12_SHADER_VISIBILITY_ALL);

        CD3DX12_ROOT_SIGNATURE_DESC rootSignature(_countof(rootParameters), rootParameters);

        ComPtr<ID3DBlob> serializedSignature;
        DX::ThrowIfFailed(
            D3D12SerializeRootSignature(&rootSignature, D3D_ROOT_SIGNATURE_VERSION_1, serializedSignature.GetAddressOf(), nullptr));

        // Create the root signature
        DX::ThrowIfFailed(
            device->CreateRootSignature(
                0,
                serializedSignature->GetBufferPointer(),
                serializedSignature->GetBufferSize(),
                IID_PPV_ARGS(m_computeRootSignature.ReleaseAndGetAddressOf())));
        m_computeRootSignature->SetName(L"Compute RS");

        // Create compute pipeline state
        D3D12_COMPUTE_PIPELINE_STATE_DESC descComputePSO = {};
        descComputePSO.pRootSignature = m_computeRootSignature.Get();
        descComputePSO.CS.pShaderBytecode = computeShaderBlob.data();
        descComputePSO.CS.BytecodeLength = computeShaderBlob.size();

        DX::ThrowIfFailed(
            device->CreateComputePipelineState(&descComputePSO, IID_PPV_ARGS(m_computePSO.ReleaseAndGetAddressOf())));
        m_computePSO->SetName(L"Compute PSO");
    }

    // Shader for rendering DML result tensor to texture
    // This can also be done with a compute shader, depending on the app's needs.
    {
        auto vsShaderBlob = DX::ReadData(L"TensorToImageVS.cso");
        auto psShaderBlob = DX::ReadData(L"TensorToImagePS.cso");

        static const D3D12_INPUT_ELEMENT_DESC s_inputElementDesc[1] =
        {
            { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0 },
        };

        // Define root table layout
        CD3DX12_DESCRIPTOR_RANGE descRange[1];
        descRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_FLAG_NONE); // t0

        CD3DX12_ROOT_PARAMETER rootParameters[2];
        rootParameters[e_rrpIdxCB].InitAsConstants(3, 0, 0, D3D12_SHADER_VISIBILITY_PIXEL);
        rootParameters[e_rrpIdxSRV].InitAsDescriptorTable(1, &descRange[0], D3D12_SHADER_VISIBILITY_PIXEL);

        CD3DX12_ROOT_SIGNATURE_DESC rootSignature(_countof(rootParameters), rootParameters,
            0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);

        ComPtr<ID3DBlob> serializedSignature;
        DX::ThrowIfFailed(
            D3D12SerializeRootSignature(&rootSignature, D3D_ROOT_SIGNATURE_VERSION_1, serializedSignature.GetAddressOf(), nullptr));

        // Create the root signature
        DX::ThrowIfFailed(
            device->CreateRootSignature(
                0,
                serializedSignature->GetBufferPointer(),
                serializedSignature->GetBufferSize(),
                IID_PPV_ARGS(m_tensorRenderRootSignature.ReleaseAndGetAddressOf())));
        m_tensorRenderRootSignature->SetName(L"Tensor Render RS");

        // Create pipeline state
        D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
        psoDesc.InputLayout = { s_inputElementDesc, _countof(s_inputElementDesc) };
        psoDesc.pRootSignature = m_tensorRenderRootSignature.Get();
        psoDesc.VS = { vsShaderBlob.data(), vsShaderBlob.size() };
        psoDesc.PS = { psShaderBlob.data(), psShaderBlob.size() };
        psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
        psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
        psoDesc.DepthStencilState.DepthEnable = FALSE;
        psoDesc.DepthStencilState.StencilEnable = FALSE;
        psoDesc.DSVFormat = m_deviceResources->GetDepthBufferFormat();
        psoDesc.SampleMask = UINT_MAX;
        psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
        psoDesc.NumRenderTargets = 1;
        psoDesc.RTVFormats[0] = DXGI_FORMAT_B8G8R8A8_UNORM;
        psoDesc.SampleDesc.Count = 1;
        DX::ThrowIfFailed(
            device->CreateGraphicsPipelineState(&psoDesc,
                IID_PPV_ARGS(m_tensorRenderPipelineState.ReleaseAndGetAddressOf())));
        m_tensorRenderPipelineState->SetName(L"Tensor Render PSO");

        // Resource to hold the rendered texture. The model upscales 2x in each
        // dimension, so the texture is twice the original size.
        D3D12_RESOURCE_DESC txtDesc = {};
        txtDesc.MipLevels = txtDesc.DepthOrArraySize = 1;
        txtDesc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
        txtDesc.SampleDesc.Count = 1;
        txtDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
        txtDesc.Width = m_origTextureWidth * 2;
        txtDesc.Height = m_origTextureHeight * 2;
        txtDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;

        DX::ThrowIfFailed(
            device->CreateCommittedResource(
                &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
                D3D12_HEAP_FLAG_NONE,
                &txtDesc,
                D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE,
                &CD3DX12_CLEAR_VALUE(DXGI_FORMAT_B8G8R8A8_UNORM, DirectX::Colors::Black),
                IID_PPV_ARGS(m_finalResultTexture.ReleaseAndGetAddressOf())));

        // Create an RTV for rendering to the texture, and an SRV for rendering it back to the screen
        D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
        rtvDesc.Format = txtDesc.Format;
        rtvDesc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
        device->CreateRenderTargetView(m_finalResultTexture.Get(), &rtvDesc, m_RTVDescriptorHeap->GetCpuHandle(e_descFinalResultTextureRtv));

        D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
        srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
        srvDesc.Format = txtDesc.Format;
        srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
        srvDesc.Texture2D.MipLevels = 1;
        device->CreateShaderResourceView(m_finalResultTexture.Get(), &srvDesc, m_SRVDescriptorHeap->GetCpuHandle(e_descFinalResultTextureSrv));
    }

    // DirectML device
    {
// Use #ifdef rather than #if: `#if _DEBUG` is a preprocessor error if _DEBUG
// is defined with an empty expansion (e.g. a bare `#define _DEBUG`).
#ifdef _DEBUG
        DX::ThrowIfFailed(DMLCreateDevice(device, DML_CREATE_DEVICE_FLAG_DEBUG, IID_PPV_ARGS(&m_dmlDevice)));
#else
        DX::ThrowIfFailed(DMLCreateDevice(device, DML_CREATE_DEVICE_FLAG_NONE, IID_PPV_ARGS(&m_dmlDevice)));
#endif

#if FORCE_NCHW
        m_tensorLayout = TensorLayout::Default;
#else
        // Determine the best tensor layout based on the GPU vendor.
        // This is a fairly coarse-grained method, but recent Nvidia GPUs tend to use NHWC
        // layouts, while others use the default NCHW.
        ComPtr<IDXGIAdapter1> adapter;
        DX::ThrowIfFailed(m_deviceResources->GetDXGIFactory()->EnumAdapterByLuid(device->GetAdapterLuid(), IID_PPV_ARGS(&adapter)));
        DXGI_ADAPTER_DESC adapterDesc;
        adapter->GetDesc(&adapterDesc);

        if (adapterDesc.VendorId == 0x10DE) // Nvidia
        {
            // This is faster on recent Nvidia hardware, but may be a problem on older hardware.
            // If necessary, set FORCE_NCHW to override this.
            m_tensorLayout = TensorLayout::NHWC;
        }
        else
        {
            m_tensorLayout = TensorLayout::Default;
        }
#endif

        // All tensors in this sample are FP16, so the device must support it.
        DML_FEATURE_QUERY_TENSOR_DATA_TYPE_SUPPORT fp16Query = { DML_TENSOR_DATA_TYPE_FLOAT16 };
        DML_FEATURE_DATA_TENSOR_DATA_TYPE_SUPPORT fp16Supported = {};
        DX::ThrowIfFailed(m_dmlDevice->CheckFeatureSupport(DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT, sizeof(fp16Query), &fp16Query, sizeof(fp16Supported), &fp16Supported));

        if (!fp16Supported.IsSupported)
        {
            throw std::exception("FP16 data type support is required for this sample.");
        }

        DX::ThrowIfFailed(m_dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_dmlCommandRecorder)));
    }

    // Buffer sizes accumulated by the layer-creation helpers below; used to
    // allocate the tensor resources at the end.
    uint64_t modelInputBufferSize = 0;
    uint64_t modelOutputBufferSize = 0;
    uint64_t intermediateBufferMaxSize[] = { 0, 0 };

    // DirectML operator resources--implementation of the super-resolution model
    {
        // Create an upscaled (nearest neighbor) version of the image first
        uint32_t modelInputSizes[] = { 1, 3, m_origTextureHeight, m_origTextureWidth };
        uint32_t upscaledInputSizes[4];
        CreateUpsampleLayer(modelInputSizes, &modelInputBufferSize, &modelOutputBufferSize, upscaledInputSizes, &m_dmlUpsampleOps[0]);

        // Create the residual with three convolutions, an upsample, and four more convolutions
        WeightMapType weights;
        if (!LoadWeights("Assets\\weights.bin", weights))
        {
            throw std::exception("loadWeights");
        }

        DirectX::ResourceUploadBatch weightUploadBatch(device);
        weightUploadBatch.Begin();

        uint32_t const filterSizes1[] = { 32, 3, 5, 5 };
        uint32_t intermediateInputSizes[2][4];
        CreateConvolutionLayer(modelInputSizes, filterSizes1, true, &modelInputBufferSize,
            &intermediateBufferMaxSize[0], intermediateInputSizes[0], &m_dmlConvOps[0]);
        CreateWeightTensors(weights, "conv1/weights", "conv1/BatchNorm/scale", "conv1/BatchNorm/shift",
            filterSizes1, weightUploadBatch, &m_modelConvFilterWeights[0], &m_modelConvBiasWeights[0]);

        // Which intermediate resource to use as input for the current operation. The other will be
        // used as output. Then the next op will swap the order.
        int inputIndex = 0;

        uint32_t const filterSizes2[] = { 64, 32, 3, 3 }; // output filters
        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes2, true, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[1]);
        CreateWeightTensors(weights, "conv2/weights", "conv2/BatchNorm/scale", "conv2/BatchNorm/shift",
            filterSizes2, weightUploadBatch, &m_modelConvFilterWeights[1], &m_modelConvBiasWeights[1]);
        inputIndex = 1 - inputIndex;

        uint32_t const filterSizes3[] = { 64, 64, 3, 3 };
        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes3, true, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[2]);
        CreateWeightTensors(weights, "conv3/weights", "conv3/BatchNorm/scale", "conv3/BatchNorm/shift",
            filterSizes3, weightUploadBatch, &m_modelConvFilterWeights[2], &m_modelConvBiasWeights[2]);
        inputIndex = 1 - inputIndex;

        CreateUpsampleLayer(intermediateInputSizes[inputIndex], &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlUpsampleOps[1]);
        inputIndex = 1 - inputIndex;

        uint32_t const filterSizes4[] = { 32, 64, 5, 5 };
        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes4, true, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[3]);
        CreateWeightTensors(weights, "conv_up1/conv/weights", "conv_up1/conv/BatchNorm/scale", "conv_up1/conv/BatchNorm/shift",
            filterSizes4, weightUploadBatch, &m_modelConvFilterWeights[3], &m_modelConvBiasWeights[3]);
        inputIndex = 1 - inputIndex;

        uint32_t const filterSizes5[] = { 32, 32, 3, 3 };
        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes5, true, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[4]);
        CreateWeightTensors(weights, "conv4/weights", "conv4/BatchNorm/scale", "conv4/BatchNorm/shift",
            filterSizes5, weightUploadBatch, &m_modelConvFilterWeights[4], &m_modelConvBiasWeights[4]);
        inputIndex = 1 - inputIndex;

        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes5, true, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[5]);
        CreateWeightTensors(weights, "conv5/weights", "conv5/BatchNorm/scale", "conv5/BatchNorm/shift",
            filterSizes5, weightUploadBatch, &m_modelConvFilterWeights[5], &m_modelConvBiasWeights[5]);
        inputIndex = 1 - inputIndex;

        uint32_t const filterSizes6[] = { 3, 32, 3, 3 };
        CreateConvolutionLayer(intermediateInputSizes[inputIndex], filterSizes6, false, &intermediateBufferMaxSize[inputIndex],
            &intermediateBufferMaxSize[1 - inputIndex], intermediateInputSizes[1 - inputIndex], &m_dmlConvOps[6]);
        CreateWeightTensors(weights, "conv6/weights", nullptr, nullptr, filterSizes6, weightUploadBatch,
            &m_modelConvFilterWeights[6], nullptr);
        inputIndex = 1 - inputIndex;

        // Finally add the residual to the original upsampled image.
        // BUGFIX: the shape arrays are uint32_t[4] (16 bytes); the previous
        // comparison used 4 * sizeof(uint16_t) (8 bytes), so only the N and C
        // dimensions were actually checked. Compare the full array instead.
        assert(memcmp(upscaledInputSizes, intermediateInputSizes[inputIndex], sizeof(upscaledInputSizes)) == 0);

        CreateAdditionLayer(upscaledInputSizes, &m_dmlAddResidualOp);

        weightUploadBatch.End(m_deviceResources->GetCommandQueue());
    }

    // Buffers for DML inputs and outputs
    {
        // Resource for input tensor
        D3D12_RESOURCE_DESC resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(modelInputBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
        DX::ThrowIfFailed(device->CreateCommittedResource(
            &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
            D3D12_HEAP_FLAG_NONE,
            &resourceDesc,
            D3D12_RESOURCE_STATE_COMMON,
            nullptr,
            IID_PPV_ARGS(&m_modelInput)
        ));

        // Describe and create a UAV for the original input tensor.
        D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
        uavDesc.Format = DXGI_FORMAT_R16_FLOAT;
        uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
        uavDesc.Buffer.FirstElement = 0;
        uavDesc.Buffer.NumElements = static_cast<UINT>(modelInputBufferSize / sizeof(uint16_t));
        uavDesc.Buffer.StructureByteStride = 0;
        uavDesc.Buffer.CounterOffsetInBytes = 0;
        uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
        device->CreateUnorderedAccessView(m_modelInput.Get(), nullptr, &uavDesc, m_SRVDescriptorHeap->GetCpuHandle(e_descModelInput));

        // Model result tensor is 2x larger in both dimensions
        resourceDesc.Width = modelOutputBufferSize;
        DX::ThrowIfFailed(device->CreateCommittedResource(
            &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
            D3D12_HEAP_FLAG_NONE,
            &resourceDesc,
            D3D12_RESOURCE_STATE_COMMON,
            nullptr,
            IID_PPV_ARGS(&m_modelOutput)
        ));

        // Describe and create a SRV for the final result tensor.
        D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
        srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
        srvDesc.Format = DXGI_FORMAT_R16_FLOAT;
        srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
        srvDesc.Buffer.FirstElement = 0;
        srvDesc.Buffer.NumElements = static_cast<UINT>(modelOutputBufferSize / sizeof(uint16_t));
        srvDesc.Buffer.StructureByteStride = 0;
        srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE;
        device->CreateShaderResourceView(m_modelOutput.Get(), &srvDesc, m_SRVDescriptorHeap->GetCpuHandle(e_descModelOutput));

        // Create two resources for intermediate layer results. Each layer will ping-pong between these. They're each large
        // enough to hold the largest intermediate result required.
        for (int i = 0; i < 2; i++)
        {
            resourceDesc.Width = intermediateBufferMaxSize[i];
            DX::ThrowIfFailed(device->CreateCommittedResource(
                &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
                D3D12_HEAP_FLAG_NONE,
                &resourceDesc,
                D3D12_RESOURCE_STATE_COMMON,
                nullptr,
                IID_PPV_ARGS(&m_modelIntermediateResult[i])
            ));
        }
    }

    // Wait until assets have been uploaded to the GPU.
    m_deviceResources->WaitForGpu();
}
// Returns the largest RequiredDescriptorCount among the given operator
// initializer and all of the compiled operators it covers. The caller sizes a
// single descriptor range with this count so the same slots can be reused for
// both initialization and execution dispatches.
UINT GetDescriptorCount(size_t numOps, IDMLCompiledOperator** ops, IDMLOperatorInitializer* initializer)
{
    // Start with the initializer's requirement, then widen for each operator.
    UINT maxDescriptorCount = initializer->GetBindingProperties().RequiredDescriptorCount;

    for (size_t opIndex = 0; opIndex < numOps; ++opIndex)
    {
        const UINT opDescriptorCount = ops[opIndex]->GetBindingProperties().RequiredDescriptorCount;
        if (opDescriptorCount > maxDescriptorCount)
        {
            maxDescriptorCount = opDescriptorCount;
        }
    }

    return maxDescriptorCount;
}
// Records and runs the one-time DirectML operator initialization on the GPU,
// then builds the per-operator binding tables used every frame at execution.
// Steps: (1) create operator initializers and a shader-visible descriptor heap
// sized for both init and execution; (2) allocate persistent resources for any
// operator that requires one; (3) for each operator family (upsample, conv,
// addition), bind + dispatch its initializer, then create and populate its
// execution binding table; (4) execute the command list and wait for the GPU.
// Must be called after CreateDirectMLResources() has created the operators
// and tensor buffers.
void Sample::InitializeDirectMLResources()
{
    auto commandList = m_deviceResources->GetCommandList();
    commandList->Reset(m_deviceResources->GetCommandAllocator(), nullptr);

    // Create operator initializers and descriptor heap for binding
    size_t upsampleOpDescriptorCount, convOpDescriptorCount, additionOpDescriptorCount;
    size_t upsampleDescriptorsIdx, convDescriptorsIdx, additionDescriptorsIdx;
    {
        // The same descriptor heap will be used for both initializing and executing operators. These each happen
        // at different times, so we reuse the same descriptor slots. GetDescriptorCount() ensures there are enough
        // slots for both cases.
        DX::ThrowIfFailed(m_dmlDevice->CreateOperatorInitializer(c_numUpsampleLayers, m_dmlUpsampleOps[0].GetAddressOf(), IID_PPV_ARGS(m_dmlOpInitializers[e_opUpsample].GetAddressOf())));
        upsampleOpDescriptorCount = GetDescriptorCount(c_numUpsampleLayers, m_dmlUpsampleOps[0].GetAddressOf(), m_dmlOpInitializers[e_opUpsample].Get());

        DX::ThrowIfFailed(m_dmlDevice->CreateOperatorInitializer(c_numConvLayers, m_dmlConvOps[0].GetAddressOf(), IID_PPV_ARGS(m_dmlOpInitializers[e_opConv].GetAddressOf())));
        convOpDescriptorCount = GetDescriptorCount(c_numConvLayers, m_dmlConvOps[0].GetAddressOf(), m_dmlOpInitializers[e_opConv].Get());

        DX::ThrowIfFailed(m_dmlDevice->CreateOperatorInitializer(1, m_dmlAddResidualOp.GetAddressOf(), IID_PPV_ARGS(m_dmlOpInitializers[e_opAdd].GetAddressOf())));
        additionOpDescriptorCount = GetDescriptorCount(1, m_dmlAddResidualOp.GetAddressOf(), m_dmlOpInitializers[e_opAdd].Get());

        // Lay the three operator families out back-to-back in one heap; each
        // family reserves (per-op count) * (number of ops) slots.
        upsampleDescriptorsIdx = 0;
        convDescriptorsIdx = upsampleDescriptorsIdx + upsampleOpDescriptorCount * c_numUpsampleLayers;
        additionDescriptorsIdx = convDescriptorsIdx + convOpDescriptorCount * c_numConvLayers;
        size_t descriptorCount = additionDescriptorsIdx + additionOpDescriptorCount;

        m_dmlDescriptorHeap = std::make_unique<DirectX::DescriptorHeap>(m_deviceResources->GetD3DDevice(),
            D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
            D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE,
            descriptorCount);

        // Operator initialization dispatches will use this heap right away
        ID3D12DescriptorHeap* pHeaps[] = { m_dmlDescriptorHeap->Heap() };
        commandList->SetDescriptorHeaps(_countof(pHeaps), pHeaps);
    }

    // Create any persistent resources required for the operators.
    // Iterates the upsample ops, then the conv ops, then the addition op, and
    // allocates a default-heap buffer for any op that reports a nonzero
    // PersistentResourceSize.
    {
        for (int i = 0; i < c_numUpsampleLayers + c_numConvLayers + 1; i++)
        {
            IDMLCompiledOperator* currentOp;
            ID3D12Resource** persistentResource;
            if (i < c_numUpsampleLayers)
            {
                currentOp = m_dmlUpsampleOps[i].Get();
                persistentResource = m_modelUpsamplePersistentResources[i].ReleaseAndGetAddressOf();
            }
            else if (i < c_numUpsampleLayers + c_numConvLayers)
            {
                currentOp = m_dmlConvOps[i - c_numUpsampleLayers].Get();
                persistentResource = m_modelConvPersistentResources[i - c_numUpsampleLayers].ReleaseAndGetAddressOf();
            }
            else
            {
                currentOp = m_dmlAddResidualOp.Get();
                persistentResource = m_modelAddPersistentResource.ReleaseAndGetAddressOf();
            }

            auto bindingProps = currentOp->GetBindingProperties();

            if (bindingProps.PersistentResourceSize > 0)
            {
                D3D12_RESOURCE_DESC resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(bindingProps.PersistentResourceSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
                DX::ThrowIfFailed(m_deviceResources->GetD3DDevice()->CreateCommittedResource(
                    &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
                    D3D12_HEAP_FLAG_NONE,
                    &resourceDesc,
                    D3D12_RESOURCE_STATE_COMMON,
                    nullptr,
                    IID_PPV_ARGS(persistentResource)));
            }
        }
    }

    // When binding input and output resources, take note of which temp resource is used at the time:
    //   Layer       | Input                                       | Output
    //   Upsample[0] | m_modelInput                                | m_modelOutput
    //   Conv[0]     | m_modelInput                                | m_modelIntermediateResult[0]
    //   Conv[1]     | m_modelIntermediateResult[0]                | m_modelIntermediateResult[1]
    //   Conv[2]     | m_modelIntermediateResult[1]                | m_modelIntermediateResult[0]
    //   Upsample[1] | m_modelIntermediateResult[0]                | m_modelIntermediateResult[1]
    //   Conv[3]     | m_modelIntermediateResult[1]                | m_modelIntermediateResult[0]
    //   Conv[4]     | m_modelIntermediateResult[0]                | m_modelIntermediateResult[1]
    //   Conv[5]     | m_modelIntermediateResult[1]                | m_modelIntermediateResult[0]
    //   Conv[6]     | m_modelIntermediateResult[0]                | m_modelIntermediateResult[1]
    //   Addition    | m_modelIntermediateResult[1], m_modelOutput | m_modelOutput
    const DML_BUFFER_BINDING emptyBufferBinding = { nullptr, 0, 0 };
    const DML_BINDING_DESC emptyBindingDesc = { DML_BINDING_TYPE_NONE, nullptr };

    // Bind D3D resources. One binding table is reused (via Reset) for all
    // three initializer dispatches.
    Microsoft::WRL::ComPtr<IDMLBindingTable> initBindingTable;

    // Upsample layers
    {
        // Bind resources for initialization.
        auto bindingProps = m_dmlOpInitializers[e_opUpsample]->GetBindingProperties();
        // The DML API guarantees that initialization never uses a persistent resource.
        assert(bindingProps.PersistentResourceSize == 0);

        DML_BINDING_TABLE_DESC tableDesc = {
            m_dmlOpInitializers[e_opUpsample].Get(),
            m_dmlDescriptorHeap->GetCpuHandle(upsampleDescriptorsIdx),
            m_dmlDescriptorHeap->GetGpuHandle(upsampleDescriptorsIdx),
            bindingProps.RequiredDescriptorCount
        };
        DX::ThrowIfFailed(m_dmlDevice->CreateBindingTable(&tableDesc, IID_PPV_ARGS(&initBindingTable)));

        // If the operator requires a persistent resource, it must be bound as output for the initializer.
        DML_BUFFER_BINDING upsamplePersistentBuffers[c_numUpsampleLayers];
        DML_BINDING_DESC upsamplePersistentBindings[c_numUpsampleLayers];
        for (int i = 0; i < c_numUpsampleLayers; i++)
        {
            if (m_modelUpsamplePersistentResources[i].Get() != nullptr)
            {
                upsamplePersistentBuffers[i] = { m_modelUpsamplePersistentResources[i].Get(), 0, m_modelUpsamplePersistentResources[i]->GetDesc().Width };
                upsamplePersistentBindings[i] = { DML_BINDING_TYPE_BUFFER, &upsamplePersistentBuffers[i] };
            }
            else
                upsamplePersistentBindings[i] = emptyBindingDesc;
        }

        // The inputs will vary each frame, so don't bind inputs at initialization.
        initBindingTable->BindInputs(0, nullptr);
        initBindingTable->BindOutputs(c_numUpsampleLayers, upsamplePersistentBindings);
        BindTempResourceIfNeeded(bindingProps, initBindingTable.Get(), m_modelInitTemporaryResources[e_opUpsample].ReleaseAndGetAddressOf());

        // Run initialization
        m_dmlCommandRecorder->RecordDispatch(commandList, m_dmlOpInitializers[e_opUpsample].Get(), initBindingTable.Get());

        // Bind resources for execution
        for (int i = 0; i < c_numUpsampleLayers; i++)
        {
            bindingProps = m_dmlUpsampleOps[i]->GetBindingProperties();

            tableDesc = {
                m_dmlUpsampleOps[i].Get(),
                m_dmlDescriptorHeap->GetCpuHandle(upsampleDescriptorsIdx + i * upsampleOpDescriptorCount),
                m_dmlDescriptorHeap->GetGpuHandle(upsampleDescriptorsIdx + i * upsampleOpDescriptorCount),
                bindingProps.RequiredDescriptorCount
            };
            DX::ThrowIfFailed(m_dmlDevice->CreateBindingTable(&tableDesc, IID_PPV_ARGS(m_dmlUpsampleBindings[i].ReleaseAndGetAddressOf())));

            // See the resource-flow table above: Upsample[0] reads the model
            // input; Upsample[1] operates on the intermediate ping-pong buffers.
            auto inputResource = (i == 0) ? m_modelInput : m_modelIntermediateResult[0];
            auto outputResource = (i == 0) ? m_modelOutput : m_modelIntermediateResult[1];

            DML_BUFFER_BINDING inputBufferBinding = { inputResource.Get(), 0, inputResource->GetDesc().Width };
            DML_BINDING_DESC inputBinding = { DML_BINDING_TYPE_BUFFER, &inputBufferBinding };

            DML_BUFFER_BINDING outputBufferBinding = { outputResource.Get(), 0, outputResource->GetDesc().Width };
            DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBufferBinding };

            m_dmlUpsampleBindings[i]->BindInputs(1, &inputBinding);
            m_dmlUpsampleBindings[i]->BindOutputs(1, &outputBinding);
            BindTempResourceIfNeeded(bindingProps, m_dmlUpsampleBindings[i].Get(), m_modelUpsampleTemporaryResources[i].ReleaseAndGetAddressOf());

            if (m_modelUpsamplePersistentResources[i].Get() != nullptr)
                m_dmlUpsampleBindings[i]->BindPersistentResource(&upsamplePersistentBindings[i]);
        }
    }

    // Convolution layers
    {
        // Bind resources for initialization
        auto bindingProps = m_dmlOpInitializers[e_opConv]->GetBindingProperties();
        assert(bindingProps.PersistentResourceSize == 0);

        DML_BINDING_TABLE_DESC tableDesc = {
            m_dmlOpInitializers[e_opConv].Get(),
            m_dmlDescriptorHeap->GetCpuHandle(convDescriptorsIdx),
            m_dmlDescriptorHeap->GetGpuHandle(convDescriptorsIdx),
            bindingProps.RequiredDescriptorCount
        };
        DX::ThrowIfFailed(initBindingTable->Reset(&tableDesc));

#if DML_MANAGED_WEIGHTS
        // Bind the weight tensors at initialization instead of at execution. This lets DirectML reformat them
        // and improve performance on some hardware.
        // Each entry is { input, filter, bias }; the input slot is left empty
        // because inputs vary per frame.
        DML_BUFFER_BINDING convBufferBindings[][3] = {
            { emptyBufferBinding, { m_modelConvFilterWeights[0].Get(), 0, m_modelConvFilterWeights[0]->GetDesc().Width }, { m_modelConvBiasWeights[0].Get(), 0, m_modelConvBiasWeights[0]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[1].Get(), 0, m_modelConvFilterWeights[1]->GetDesc().Width }, { m_modelConvBiasWeights[1].Get(), 0, m_modelConvBiasWeights[1]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[2].Get(), 0, m_modelConvFilterWeights[2]->GetDesc().Width }, { m_modelConvBiasWeights[2].Get(), 0, m_modelConvBiasWeights[2]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[3].Get(), 0, m_modelConvFilterWeights[3]->GetDesc().Width }, { m_modelConvBiasWeights[3].Get(), 0, m_modelConvBiasWeights[3]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[4].Get(), 0, m_modelConvFilterWeights[4]->GetDesc().Width }, { m_modelConvBiasWeights[4].Get(), 0, m_modelConvBiasWeights[4]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[5].Get(), 0, m_modelConvFilterWeights[5]->GetDesc().Width }, { m_modelConvBiasWeights[5].Get(), 0, m_modelConvBiasWeights[5]->GetDesc().Width } },
            { emptyBufferBinding, { m_modelConvFilterWeights[6].Get(), 0, m_modelConvFilterWeights[6]->GetDesc().Width }, emptyBufferBinding } // last layer has no bias
        };

        DML_BUFFER_ARRAY_BINDING convBufferArrayBindings[] = {
            { 3, convBufferBindings[0] },
            { 3, convBufferBindings[1] },
            { 3, convBufferBindings[2] },
            { 3, convBufferBindings[3] },
            { 3, convBufferBindings[4] },
            { 3, convBufferBindings[5] },
            { 3, convBufferBindings[6] },
        };

        DML_BINDING_DESC convInBindings[] = {
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[0] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[1] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[2] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[3] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[4] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[5] },
            { DML_BINDING_TYPE_BUFFER_ARRAY, &convBufferArrayBindings[6] }
        };

        initBindingTable->BindInputs(c_numConvLayers, convInBindings);
#else
        initBindingTable->BindInputs(0, nullptr);
#endif

        // If the operator requires a persistent resource, it must be bound as output for the initializer.
        DML_BUFFER_BINDING convPersistentBuffers[c_numConvLayers];
        DML_BINDING_DESC convPersistentBindings[c_numConvLayers];
        for (int i = 0; i < c_numConvLayers; i++)
        {
            if (m_modelConvPersistentResources[i].Get() != nullptr)
            {
                convPersistentBuffers[i] = { m_modelConvPersistentResources[i].Get(), 0, m_modelConvPersistentResources[i]->GetDesc().Width };
                convPersistentBindings[i] = { DML_BINDING_TYPE_BUFFER, &convPersistentBuffers[i] };
            }
            else
                convPersistentBindings[i] = emptyBindingDesc;
        }

        initBindingTable->BindOutputs(c_numConvLayers, convPersistentBindings);
        BindTempResourceIfNeeded(bindingProps, initBindingTable.Get(), m_modelInitTemporaryResources[e_opConv].ReleaseAndGetAddressOf());

        // Run initialization
        m_dmlCommandRecorder->RecordDispatch(commandList, m_dmlOpInitializers[e_opConv].Get(), initBindingTable.Get());

        // Bind resources for execution
        for (int i = 0; i < c_numConvLayers; i++)
        {
            bindingProps = m_dmlConvOps[i]->GetBindingProperties();

            tableDesc = {
                m_dmlConvOps[i].Get(),
                m_dmlDescriptorHeap->GetCpuHandle(convDescriptorsIdx + i * convOpDescriptorCount),
                m_dmlDescriptorHeap->GetGpuHandle(convDescriptorsIdx + i * convOpDescriptorCount),
                bindingProps.RequiredDescriptorCount
            };
            DX::ThrowIfFailed(m_dmlDevice->CreateBindingTable(&tableDesc, IID_PPV_ARGS(m_dmlConvBindings[i].ReleaseAndGetAddressOf())));

            // See table at the beginning of the function for the mapping of ops to resources.
            auto inputResource = (i == 0) ? m_modelInput : ((i == 1 || i == 4 || i == 6) ? m_modelIntermediateResult[0] : m_modelIntermediateResult[1]);
            auto outputResource = (i == 1 || i == 4 || i == 6) ? m_modelIntermediateResult[1] : m_modelIntermediateResult[0];

            DML_BUFFER_BINDING inputBufferBinding = { inputResource.Get(), 0, inputResource->GetDesc().Width };
            DML_BINDING_DESC inputBinding = { DML_BINDING_TYPE_BUFFER, &inputBufferBinding };

            DML_BUFFER_BINDING outputBufferBinding = { outputResource.Get(), 0, outputResource->GetDesc().Width };
            DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBufferBinding };

#if DML_MANAGED_WEIGHTS
            // The weights are stored in the persistent resource and shouldn't be bound separately.
            DML_BINDING_DESC inputBindings[] = { inputBinding, emptyBindingDesc, emptyBindingDesc };
#else
            // Bind the weight resources
            DML_BUFFER_BINDING filterBufferBinding = { m_modelConvFilterWeights[i].Get(), 0, m_modelConvFilterWeights[i]->GetDesc().Width };
            DML_BINDING_DESC filterBinding = { DML_BINDING_TYPE_BUFFER, &filterBufferBinding };

            DML_BUFFER_BINDING biasBufferBinding;
            DML_BINDING_DESC biasBinding;
            if (i == 6)
            {
                biasBinding = emptyBindingDesc; // last layer has no bias
            }
            else
            {
                biasBufferBinding = { m_modelConvBiasWeights[i].Get(), 0, m_modelConvBiasWeights[i]->GetDesc().Width };
                biasBinding = { DML_BINDING_TYPE_BUFFER, &biasBufferBinding };
            }

            DML_BINDING_DESC inputBindings[] = { inputBinding, filterBinding, biasBinding };
#endif
            m_dmlConvBindings[i]->BindInputs(3, inputBindings);
            m_dmlConvBindings[i]->BindOutputs(1, &outputBinding);
            BindTempResourceIfNeeded(bindingProps, m_dmlConvBindings[i].Get(), m_modelConvTemporaryResources[i].ReleaseAndGetAddressOf());

            if (m_modelConvPersistentResources[i].Get() != nullptr)
                m_dmlConvBindings[i]->BindPersistentResource(&convPersistentBindings[i]);
        }
    }

    // Addition layer
    {
        // Bind resources for initialization.
        auto bindingProps = m_dmlOpInitializers[e_opAdd]->GetBindingProperties();
        assert(bindingProps.PersistentResourceSize == 0);

        DML_BINDING_TABLE_DESC tableDesc = {
            m_dmlOpInitializers[e_opAdd].Get(),
            m_dmlDescriptorHeap->GetCpuHandle(additionDescriptorsIdx),
            m_dmlDescriptorHeap->GetGpuHandle(additionDescriptorsIdx),
            bindingProps.RequiredDescriptorCount
        };
        DX::ThrowIfFailed(initBindingTable->Reset(&tableDesc));

        // If the operator requires a persistent resource, it must be bound as output for the initializer.
        DML_BUFFER_BINDING addPersistentBuffer;
        DML_BINDING_DESC addPersistentBinding;
        if (m_modelAddPersistentResource.Get() != nullptr)
        {
            addPersistentBuffer = { m_modelAddPersistentResource.Get(), 0, m_modelAddPersistentResource->GetDesc().Width };
            addPersistentBinding = { DML_BINDING_TYPE_BUFFER, &addPersistentBuffer };
        }
        else
            addPersistentBinding = emptyBindingDesc;

        initBindingTable->BindInputs(0, nullptr);
        initBindingTable->BindOutputs(1, &addPersistentBinding);
        BindTempResourceIfNeeded(bindingProps, initBindingTable.Get(), m_modelInitTemporaryResources[e_opAdd].ReleaseAndGetAddressOf());

        // Run initialization
        m_dmlCommandRecorder->RecordDispatch(commandList, m_dmlOpInitializers[e_opAdd].Get(), initBindingTable.Get());

        // Bind resources for execution
        // NOTE: the addition op reuses the initializer's descriptor slots
        // (additionDescriptorsIdx); that is safe because init and execution
        // never run concurrently.
        {
            bindingProps = m_dmlAddResidualOp->GetBindingProperties();

            tableDesc = {
                m_dmlAddResidualOp.Get(),
                m_dmlDescriptorHeap->GetCpuHandle(additionDescriptorsIdx),
                m_dmlDescriptorHeap->GetGpuHandle(additionDescriptorsIdx),
                bindingProps.RequiredDescriptorCount
            };
            DX::ThrowIfFailed(m_dmlDevice->CreateBindingTable(&tableDesc, IID_PPV_ARGS(m_dmlAddResidualBinding.ReleaseAndGetAddressOf())));

            // m_modelOutput will already hold the result of the first upsample operation. We add the result of
            // the last convolution (the residual) to it in-place to get the final result.
            DML_BUFFER_BINDING input0BufferBinding = { m_modelIntermediateResult[1].Get(), 0, m_modelIntermediateResult[1]->GetDesc().Width };
            DML_BINDING_DESC input0Binding = { DML_BINDING_TYPE_BUFFER, &input0BufferBinding };

            DML_BUFFER_BINDING input1BufferBinding = { m_modelOutput.Get(), 0, m_modelOutput->GetDesc().Width };
            DML_BINDING_DESC input1Binding = { DML_BINDING_TYPE_BUFFER, &input1BufferBinding };

            DML_BUFFER_BINDING outputBufferBinding = { m_modelOutput.Get(), 0, m_modelOutput->GetDesc().Width };
            DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBufferBinding };

            DML_BINDING_DESC inputBindings[] = { input0Binding, input1Binding };
            m_dmlAddResidualBinding->BindInputs(2, inputBindings);
            m_dmlAddResidualBinding->BindOutputs(1, &outputBinding);
            BindTempResourceIfNeeded(bindingProps, m_dmlAddResidualBinding.Get(), m_modelAddTemporaryResource.ReleaseAndGetAddressOf());

            if (m_modelAddPersistentResource.Get() != nullptr)
                m_dmlAddResidualBinding->BindPersistentResource(&addPersistentBinding);
        }
    }

    DX::ThrowIfFailed(commandList->Close());
    m_deviceResources->GetCommandQueue()->ExecuteCommandLists(1, CommandListCast(&commandList));

    // Wait until initialization has been finished on the GPU.
    m_deviceResources->WaitForGpu();

#if DML_MANAGED_WEIGHTS
    // These have been copied to DML-managed resources and are no longer needed.
    for (int i = 0; i < c_numConvLayers; i++)
    {
        m_modelConvFilterWeights[i].Reset();
        if (i < c_numConvLayers - 1) // Last layer has no bias
        {
            m_modelConvBiasWeights[i].Reset();
        }
    }
#endif
}
void Sample::CreateUpsampleLayer(
    _In_reads_(4) const uint32_t* inputSizes,
    _Inout_updates_(1) uint64_t* inputBufferRequiredSize,
    _Inout_updates_(1) uint64_t* outputBufferRequiredSize,
    _Out_writes_(4) uint32_t* outputSizesOut,
    _Out_writes_(1) IDMLCompiledOperator** compiledOpOut)
{
    // Builds and compiles a 2x nearest-neighbor upsample operator. Writes the
    // resulting tensor sizes to outputSizesOut and grows the running
    // input/output buffer-size maxima so shared tensor resources can be sized
    // to the largest tensor any layer requires.

    // --- Describe the input tensor ---
    uint32_t srcStrides[4];
    Sample::GetStrides(inputSizes, m_tensorLayout, srcStrides);

    const uint64_t srcBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, inputSizes, srcStrides);

    // Tensor storage resources are reused across layers, so track the largest
    // buffer size any requested tensor needs.
    *inputBufferRequiredSize = std::max(srcBufferSize, *inputBufferRequiredSize);

    DML_BUFFER_TENSOR_DESC srcBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, inputSizes, srcStrides, srcBufferSize, 0 };
    DML_TENSOR_DESC srcDesc = { DML_TENSOR_TYPE_BUFFER, &srcBufferDesc };

    // --- Describe the output tensor: height and width are doubled ---
    outputSizesOut[0] = inputSizes[0];
    outputSizesOut[1] = inputSizes[1];
    outputSizesOut[2] = inputSizes[2] * 2;
    outputSizesOut[3] = inputSizes[3] * 2;

    uint32_t dstStrides[4];
    Sample::GetStrides(outputSizesOut, m_tensorLayout, dstStrides);

    const uint64_t dstBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, outputSizesOut, dstStrides);
    *outputBufferRequiredSize = std::max(dstBufferSize, *outputBufferRequiredSize);

    DML_BUFFER_TENSOR_DESC dstBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, outputSizesOut, dstStrides, dstBufferSize, 0 };
    DML_TENSOR_DESC dstDesc = { DML_TENSOR_TYPE_BUFFER, &dstBufferDesc };

    // --- Describe, create, and compile the upsample operator (2x2 scale, nearest neighbor) ---
    DML_UPSAMPLE_2D_OPERATOR_DESC upsampleDesc = { &srcDesc, &dstDesc, {2, 2}, DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR };
    DML_OPERATOR_DESC opDesc = { DML_OPERATOR_UPSAMPLE_2D, &upsampleDesc };

    ComPtr<IDMLOperator> upsampleOp;
    DX::ThrowIfFailed(m_dmlDevice->CreateOperator(&opDesc, IID_PPV_ARGS(upsampleOp.ReleaseAndGetAddressOf())));
    DX::ThrowIfFailed(m_dmlDevice->CompileOperator(upsampleOp.Get(), DML_EXECUTION_FLAG_ALLOW_HALF_PRECISION_COMPUTATION, IID_PPV_ARGS(compiledOpOut)));
}
void Sample::CreateConvolutionLayer(
    _In_reads_(4) const uint32_t* inputSizes,
    _In_reads_(4) const uint32_t* filterSizes,
    bool useBiasAndActivation,
    _Inout_updates_(1) uint64_t* inputBufferRequiredSize,
    _Inout_updates_(1) uint64_t* outputBufferRequiredSize,
    _Out_writes_(4) uint32_t* outputSizesOut,
    _Out_writes_(1) IDMLCompiledOperator** compiledOpOut)
{
    // Builds and compiles a convolution operator whose output preserves the
    // input's spatial dimensions (stride 1, "same" padding). When
    // useBiasAndActivation is true, a per-output-channel bias and a fused RELU
    // are attached. Writes the output tensor sizes to outputSizesOut and grows
    // the running input/output buffer-size maxima used to size shared tensor
    // storage.

    // Describe input and output tensors
    uint32_t inputStrides[4];
    Sample::GetStrides(inputSizes, m_tensorLayout, inputStrides);
    uint64_t inputBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, inputSizes, inputStrides);
    // Tensor resources are reused across layers; keep the running maximum.
    *inputBufferRequiredSize = std::max(inputBufferSize, *inputBufferRequiredSize);
    DML_BUFFER_TENSOR_DESC inputBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, inputSizes, inputStrides, inputBufferSize, 0 };
    DML_TENSOR_DESC inputDesc = { DML_TENSOR_TYPE_BUFFER, &inputBufferDesc };

    // The output shape has as many channels as there are convolution filters.
    outputSizesOut[0] = inputSizes[0];
    outputSizesOut[1] = filterSizes[0];
    outputSizesOut[2] = inputSizes[2];
    outputSizesOut[3] = inputSizes[3];

    uint32_t outputStrides[4];
    Sample::GetStrides(outputSizesOut, m_tensorLayout, outputStrides);
    uint64_t outputBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, outputSizesOut, outputStrides);
    *outputBufferRequiredSize = std::max(outputBufferSize, *outputBufferRequiredSize);
    DML_BUFFER_TENSOR_DESC outputBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, outputSizesOut, outputStrides, outputBufferSize, 0 };
    DML_TENSOR_DESC outputDesc = { DML_TENSOR_TYPE_BUFFER, &outputBufferDesc };

    // Describe weight tensors. With DML_MANAGED_WEIGHTS the filter/bias data is
    // flagged DML_TENSOR_FLAG_OWNED_BY_DML so DirectML keeps its own copy
    // (the staging resources are released after initialization).
    uint32_t filterStrides[4];
    Sample::GetStrides(filterSizes, m_tensorLayout, filterStrides);
    uint64_t filterBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, filterSizes, filterStrides);
#if DML_MANAGED_WEIGHTS
    DML_BUFFER_TENSOR_DESC filterBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_OWNED_BY_DML, 4, filterSizes, filterStrides, filterBufferSize, 0 };
#else
    DML_BUFFER_TENSOR_DESC filterBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, filterSizes, filterStrides, filterBufferSize, 0 };
#endif
    DML_TENSOR_DESC filterDesc = { DML_TENSOR_TYPE_BUFFER, &filterBufferDesc };

    uint32_t biasSizes[] = { 1, filterSizes[0], 1, 1 }; // One bias per output channel
    uint32_t biasStrides[4];
    Sample::GetStrides(biasSizes, m_tensorLayout, biasStrides);
    uint64_t biasBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, biasSizes, biasStrides);
#if DML_MANAGED_WEIGHTS
    DML_BUFFER_TENSOR_DESC biasBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_OWNED_BY_DML, 4, biasSizes, biasStrides, biasBufferSize, 0 };
#else
    DML_BUFFER_TENSOR_DESC biasBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, biasSizes, biasStrides, biasBufferSize, 0 };
#endif
    DML_TENSOR_DESC biasDesc = { DML_TENSOR_TYPE_BUFFER, &biasBufferDesc };

    // Describe, create, and compile convolution operator

    // The output size of a convolution operation is given by:
    //  height = (inputHeight - filterHeight + 2*paddingHeight) / filterStride + 1
    //  width  = (inputWidth  - filterWidth  + 2*paddingWidth ) / filterStride + 1
    //
    // We want to preserve the height and width, so assuming stride is 1, we get:
    //  paddingHeight = (filterHeight - 1) / 2
    //  paddingWidth  = (filterWidth  - 1) / 2
    // If padding is fractional, we pad unevenly with the larger amount at the
    // start (top/left). This is plain integer arithmetic — no float ceil/floor
    // needed, since for any k >= 1:
    //  k / 2       == ceil ((k - 1) / 2)
    //  (k - 1) / 2 == floor((k - 1) / 2)
    const UINT paddingHeightTop = filterSizes[2] / 2;
    const UINT paddingHeightBottom = (filterSizes[2] - 1) / 2;
    const UINT paddingWidthLeft = filterSizes[3] / 2;
    const UINT paddingWidthRight = (filterSizes[3] - 1) / 2;

    UINT strides[] = { 1, 1 };
    UINT dilations[] = { 1, 1 };
    UINT startPadding[] = { paddingHeightTop, paddingWidthLeft };
    UINT endPadding[] = { paddingHeightBottom, paddingWidthRight };
    UINT outputPadding[] = { 0, 0 };

    // Fused activation descriptor: for fusion the RELU desc carries no tensors.
    DML_ACTIVATION_RELU_OPERATOR_DESC fusedReluDesc = { 0 };
    DML_OPERATOR_DESC activationDesc = { DML_OPERATOR_ACTIVATION_RELU, &fusedReluDesc };

    DML_CONVOLUTION_OPERATOR_DESC convDesc = {
        &inputDesc,
        &filterDesc,
        useBiasAndActivation ? &biasDesc : nullptr,
        &outputDesc,
        DML_CONVOLUTION_MODE_CROSS_CORRELATION,
        DML_CONVOLUTION_DIRECTION_FORWARD,
        2,                  // DimensionCount (spatial dimensions)
        strides,
        dilations,
        startPadding,
        endPadding,
        outputPadding,
        1,                  // GroupCount
        useBiasAndActivation ? &activationDesc : nullptr
    };
    DML_OPERATOR_DESC opDesc = { DML_OPERATOR_CONVOLUTION, &convDesc };

    ComPtr<IDMLOperator> op;
    DX::ThrowIfFailed(m_dmlDevice->CreateOperator(&opDesc, IID_PPV_ARGS(op.ReleaseAndGetAddressOf())));
    DX::ThrowIfFailed(m_dmlDevice->CompileOperator(op.Get(), DML_EXECUTION_FLAG_ALLOW_HALF_PRECISION_COMPUTATION, IID_PPV_ARGS(compiledOpOut)));
}
void Sample::CreateAdditionLayer(
    _In_reads_(4) const uint32_t* inputSizes,
    _Out_writes_(1) IDMLCompiledOperator** compiledOpOut)
{
    // Builds and compiles an element-wise addition operator. Both addends and
    // the sum share the same shape, so a single tensor descriptor serves all
    // three bindings.
    uint32_t tensorStrides[4];
    Sample::GetStrides(inputSizes, m_tensorLayout, tensorStrides);

    const uint64_t tensorBufferSize = DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE_FLOAT16, 4, inputSizes, tensorStrides);

    DML_BUFFER_TENSOR_DESC commonBufferDesc = { DML_TENSOR_DATA_TYPE_FLOAT16, DML_TENSOR_FLAG_NONE, 4, inputSizes, tensorStrides, tensorBufferSize, 0 };
    DML_TENSOR_DESC commonDesc = { DML_TENSOR_TYPE_BUFFER, &commonBufferDesc };

    // Describe, create, and compile the element-wise add: A + B -> Out.
    DML_ELEMENT_WISE_ADD_OPERATOR_DESC addDesc = { &commonDesc, &commonDesc, &commonDesc };
    DML_OPERATOR_DESC opDesc = { DML_OPERATOR_ELEMENT_WISE_ADD, &addDesc };

    ComPtr<IDMLOperator> addOp;
    DX::ThrowIfFailed(m_dmlDevice->CreateOperator(&opDesc, IID_PPV_ARGS(addOp.ReleaseAndGetAddressOf())));
    DX::ThrowIfFailed(m_dmlDevice->CompileOperator(addOp.Get(), DML_EXECUTION_FLAG_ALLOW_HALF_PRECISION_COMPUTATION, IID_PPV_ARGS(compiledOpOut)));
}