Use custom heaps by default for cache-coherent UMA devices (#580)
This commit is contained in:
Родитель
5897d34ae2
Коммит
6a5919ceca
|
@ -1,5 +1,5 @@
|
|||
cmake_minimum_required(VERSION 3.19)
|
||||
project(dxdispatch VERSION 0.17.1 LANGUAGES CXX)
|
||||
project(dxdispatch VERSION 0.17.2 LANGUAGES CXX)
|
||||
|
||||
# ==============================================================================
|
||||
# External Libraries/Helpers
|
||||
|
|
|
@ -68,6 +68,7 @@ std::string Adapter::GetDetailedDescription() const
|
|||
FormatBytes(m_dedicatedSystemMemory),
|
||||
FormatBytes(m_sharedSystemMemory),
|
||||
m_isSupported_D3D12_GRAPHICS,
|
||||
m_isSupported_CORE_COMPUTE,
|
||||
m_isSupported_GENERIC_ML
|
||||
);
|
||||
}
|
||||
|
|
|
@ -121,6 +121,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv)
|
|||
"Type of command queue/list to use ('compute' or 'direct')",
|
||||
cxxopts::value<std::string>()
|
||||
)
|
||||
(
|
||||
"disable_custom_heaps",
|
||||
"Always use default heaps for resources",
|
||||
cxxopts::value<bool>()
|
||||
)
|
||||
(
|
||||
"clear_shader_caches",
|
||||
"Clears D3D shader caches before running commands",
|
||||
|
@ -322,6 +327,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv)
|
|||
m_showDependencies = result["show_dependencies"].as<bool>();
|
||||
}
|
||||
|
||||
if (result.count("disable_custom_heaps"))
|
||||
{
|
||||
m_preferCustomHeaps = !result["disable_custom_heaps"].as<bool>();
|
||||
}
|
||||
|
||||
if (result.count("clear_shader_caches"))
|
||||
{
|
||||
m_clearShaderCaches = result["clear_shader_caches"].as<bool>();
|
||||
|
|
|
@ -28,6 +28,7 @@ public:
|
|||
bool EnableDred() const { return m_enableDred; }
|
||||
bool DisableBackgroundProcessing() const { return m_disableBackgroundProcessing; }
|
||||
bool SetStablePowerState() const { return m_setStablePowerState; }
|
||||
// Returns whether custom heaps should be preferred for resources. The backing flag defaults
// to true and is set to the negation of the "disable_custom_heaps" command-line option.
bool PreferCustomHeaps() const { return m_preferCustomHeaps; }
|
||||
bool DisableAgilitySDK() const { return m_disableAgilitySDK; }
|
||||
const std::string& AdapterSubstring() const { return m_adapterSubstring; }
|
||||
|
||||
|
@ -83,6 +84,7 @@ private:
|
|||
bool m_enableDred = false;
|
||||
bool m_disableBackgroundProcessing = false;
|
||||
bool m_setStablePowerState = false;
|
||||
bool m_preferCustomHeaps = true;
|
||||
bool m_disableAgilitySDK = false;
|
||||
bool m_uavBarrierAfterDispatch = true;
|
||||
bool m_aliasingBarrierAfterDispatch = false;
|
||||
|
|
|
@ -52,6 +52,7 @@ Device::Device(
|
|||
bool enableDred,
|
||||
bool disableBackgroundProcessing,
|
||||
bool setStablePowerState,
|
||||
bool preferCustomHeaps,
|
||||
uint32_t maxGpuTimeMeasurements,
|
||||
std::shared_ptr<PixCaptureHelper> pixCaptureHelper,
|
||||
std::shared_ptr<D3d12Module> d3dModule,
|
||||
|
@ -63,7 +64,8 @@ Device::Device(
|
|||
m_dispatchRepeat(dispatchRepeat),
|
||||
m_logger(logger),
|
||||
m_restoreBackgroundProcessing(disableBackgroundProcessing),
|
||||
m_restoreStablePowerState(setStablePowerState)
|
||||
m_restoreStablePowerState(setStablePowerState),
|
||||
m_useCustomHeaps(preferCustomHeaps)
|
||||
{
|
||||
DML_CREATE_DEVICE_FLAGS dmlCreateDeviceFlags = debugLayersEnabled ? DML_CREATE_DEVICE_FLAG_DEBUG : DML_CREATE_DEVICE_FLAG_NONE;
|
||||
|
||||
|
@ -167,6 +169,44 @@ Device::Device(
|
|||
}
|
||||
}
|
||||
|
||||
D3D12_FEATURE_DATA_ARCHITECTURE1 archData = {};
|
||||
if (SUCCEEDED(m_d3d->CheckFeatureSupport(D3D12_FEATURE_ARCHITECTURE1, &archData, sizeof(archData))))
|
||||
{
|
||||
m_architectureSupport = archData;
|
||||
}
|
||||
|
||||
// Custom heaps should only be used on UMA systems with cache-coherent memory.
|
||||
m_useCustomHeaps = m_useCustomHeaps && m_architectureSupport->UMA && m_architectureSupport->CacheCoherentUMA;
|
||||
|
||||
D3D_FEATURE_LEVEL featureLevelsList[] = {
|
||||
D3D_FEATURE_LEVEL_1_0_GENERIC,
|
||||
D3D_FEATURE_LEVEL_1_0_CORE,
|
||||
D3D_FEATURE_LEVEL_11_0,
|
||||
D3D_FEATURE_LEVEL_11_1,
|
||||
D3D_FEATURE_LEVEL_12_0,
|
||||
D3D_FEATURE_LEVEL_12_1
|
||||
};
|
||||
|
||||
D3D12_FEATURE_DATA_FEATURE_LEVELS featureLevels = {};
|
||||
featureLevels.NumFeatureLevels = _countof(featureLevelsList);
|
||||
featureLevels.pFeatureLevelsRequested = featureLevelsList;
|
||||
THROW_IF_FAILED(m_d3d->CheckFeatureSupport(
|
||||
D3D12_FEATURE_FEATURE_LEVELS,
|
||||
&featureLevels,
|
||||
sizeof(featureLevels)
|
||||
));
|
||||
|
||||
// Custom heaps are optional for MCDM devices, so we also need to check for support.
|
||||
if (featureLevels.MaxSupportedFeatureLevel == D3D_FEATURE_LEVEL_1_0_CORE ||
|
||||
featureLevels.MaxSupportedFeatureLevel == D3D_FEATURE_LEVEL_1_0_GENERIC)
|
||||
{
|
||||
D3D12_FEATURE_DATA_D3D12_OPTIONS19 options = {};
|
||||
if (SUCCEEDED(m_d3d->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS1, &options, sizeof(options))))
|
||||
{
|
||||
m_useCustomHeaps = m_useCustomHeaps && options.ComputeOnlyCustomHeapSupported;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !_GAMING_XBOX
|
||||
|
||||
THROW_IF_FAILED(m_d3d->CreateFence(
|
||||
|
@ -264,6 +304,37 @@ Device::~Device()
|
|||
}
|
||||
}
|
||||
|
||||
// Creates a buffer using the heap type selected for this device: a custom-heap buffer when
// m_useCustomHeaps is set, otherwise a default-heap buffer. All arguments are forwarded
// unchanged to the chosen creation function.
Microsoft::WRL::ComPtr<ID3D12Resource> Device::CreatePreferredDeviceMemoryBuffer(
    uint64_t sizeInBytes,
    D3D12_RESOURCE_FLAGS resourceFlags,
    uint64_t alignment,
    D3D12_HEAP_FLAGS heapFlags)
{
    if (m_useCustomHeaps)
    {
        return CreateCustomBuffer(sizeInBytes, resourceFlags, alignment, heapFlags);
    }

    return CreateDefaultBuffer(sizeInBytes, resourceFlags, alignment, heapFlags);
}
|
||||
|
||||
// Creates a committed buffer resource in a custom heap that uses write-combined CPU pages in
// memory pool L0. The resource is created in the COMMON state with no optimized clear value.
// Throws (via THROW_IF_FAILED) if resource creation fails.
Microsoft::WRL::ComPtr<ID3D12Resource> Device::CreateCustomBuffer(
    uint64_t sizeInBytes,
    D3D12_RESOURCE_FLAGS resourceFlags,
    uint64_t alignment,
    D3D12_HEAP_FLAGS heapFlags)
{
    // Custom heap description: write-combined CPU pages, pool L0, node masks of 0.
    const CD3DX12_HEAP_PROPERTIES customHeapProperties(
        D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE,
        D3D12_MEMORY_POOL_L0,
        0,
        0);

    const auto bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, resourceFlags, alignment);

    ComPtr<ID3D12Resource> customBuffer;
    THROW_IF_FAILED(m_d3d->CreateCommittedResource(
        &customHeapProperties,
        heapFlags,
        &bufferDesc,
        D3D12_RESOURCE_STATE_COMMON,
        nullptr,
        IID_GRAPHICS_PPV_ARGS(customBuffer.ReleaseAndGetAddressOf())));

    return customBuffer;
}
|
||||
|
||||
ComPtr<ID3D12Resource> Device::CreateDefaultBuffer(
|
||||
uint64_t sizeInBytes,
|
||||
D3D12_RESOURCE_FLAGS resourceFlags,
|
||||
|
@ -388,101 +459,105 @@ Microsoft::WRL::ComPtr<ID3D12Resource> Device::Upload(uint64_t totalSize, gsl::s
|
|||
throw std::invalid_argument("Attempting to upload more data than the size of the buffer");
|
||||
}
|
||||
|
||||
auto defaultBuffer = CreateDefaultBuffer(totalSize);
|
||||
ComPtr<ID3D12Resource> buffer;
|
||||
ComPtr<ID3D12Resource> uploadBuffer;
|
||||
ComPtr<ID3D12Resource> resourceToMap;
|
||||
|
||||
if (m_useCustomHeaps)
|
||||
{
|
||||
buffer = CreateCustomBuffer(totalSize);
|
||||
resourceToMap = data.empty() ? nullptr : buffer;
|
||||
}
|
||||
else
|
||||
{
|
||||
buffer = CreateDefaultBuffer(totalSize);
|
||||
uploadBuffer = data.empty() ? nullptr : CreateUploadBuffer(totalSize);
|
||||
uploadBuffer->SetName(L"Device::Upload");
|
||||
resourceToMap = uploadBuffer;
|
||||
}
|
||||
|
||||
if (!name.empty())
|
||||
{
|
||||
defaultBuffer->SetName(name.data());
|
||||
buffer->SetName(name.data());
|
||||
}
|
||||
|
||||
if (data.empty())
|
||||
if (resourceToMap)
|
||||
{
|
||||
// No need to create an upload resource if the source data is empty.
|
||||
return defaultBuffer;
|
||||
}
|
||||
void* mappedBufferData = nullptr;
|
||||
THROW_IF_FAILED(resourceToMap->Map(0, nullptr, &mappedBufferData));
|
||||
memcpy(mappedBufferData, data.data(), data.size());
|
||||
resourceToMap->Unmap(0, nullptr);
|
||||
|
||||
auto uploadBuffer = CreateUploadBuffer(totalSize);
|
||||
uploadBuffer->SetName(L"Device::Upload");
|
||||
{
|
||||
void* uploadBufferData = nullptr;
|
||||
THROW_IF_FAILED(uploadBuffer->Map(0, nullptr, &uploadBufferData));
|
||||
memcpy(uploadBufferData, data.data(), data.size());
|
||||
uploadBuffer->Unmap(0, nullptr);
|
||||
}
|
||||
|
||||
{
|
||||
D3D12_RESOURCE_BARRIER barriers[] =
|
||||
if (resourceToMap == uploadBuffer)
|
||||
{
|
||||
CD3DX12_RESOURCE_BARRIER::Transition(
|
||||
defaultBuffer.Get(),
|
||||
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
|
||||
D3D12_RESOURCE_STATE_COPY_DEST)
|
||||
};
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
D3D12_RESOURCE_BARRIER barriers[] =
|
||||
{
|
||||
CD3DX12_RESOURCE_BARRIER::Transition(
|
||||
buffer.Get(),
|
||||
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
|
||||
D3D12_RESOURCE_STATE_COPY_DEST)
|
||||
};
|
||||
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
m_commandList->CopyResource(buffer.Get(), uploadBuffer.Get());
|
||||
std::swap(barriers[0].Transition.StateBefore, barriers[0].Transition.StateAfter);
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
|
||||
m_temporaryResources.push_back(std::move(uploadBuffer));
|
||||
}
|
||||
}
|
||||
|
||||
m_commandList->CopyResource(defaultBuffer.Get(), uploadBuffer.Get());
|
||||
|
||||
{
|
||||
D3D12_RESOURCE_BARRIER barriers[] =
|
||||
{
|
||||
CD3DX12_RESOURCE_BARRIER::Transition(
|
||||
defaultBuffer.Get(),
|
||||
D3D12_RESOURCE_STATE_COPY_DEST,
|
||||
D3D12_RESOURCE_STATE_UNORDERED_ACCESS)
|
||||
};
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
}
|
||||
|
||||
m_temporaryResources.push_back(std::move(uploadBuffer));
|
||||
|
||||
return defaultBuffer;
|
||||
return buffer;
|
||||
}
|
||||
|
||||
std::vector<std::byte> Device::Download(Microsoft::WRL::ComPtr<ID3D12Resource> defaultBuffer)
|
||||
std::vector<std::byte> Device::Download(Microsoft::WRL::ComPtr<ID3D12Resource> buffer)
|
||||
{
|
||||
auto readbackBuffer = CreateReadbackBuffer(defaultBuffer->GetDesc().Width);
|
||||
readbackBuffer->SetName(L"Device::Download");
|
||||
|
||||
if (buffer->GetDesc().Width > std::numeric_limits<size_t>::max())
|
||||
{
|
||||
throw std::invalid_argument(fmt::format("Buffer width '{}' is too large.", buffer->GetDesc().Width));
|
||||
}
|
||||
|
||||
ComPtr<ID3D12Resource> resourceToMap;
|
||||
|
||||
// Can't assume the input buffer was created as a custom heap (e.g., ONNX dispatchable with a deferred
|
||||
// resource allocated by the DML EP), so check the heap properties.
|
||||
D3D12_HEAP_PROPERTIES heapProps = {};
|
||||
D3D12_HEAP_FLAGS heapFlags = {};
|
||||
|
||||
if (SUCCEEDED(buffer->GetHeapProperties(&heapProps, &heapFlags)) &&
|
||||
heapProps.MemoryPoolPreference == D3D12_MEMORY_POOL_L0 &&
|
||||
heapProps.CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE)
|
||||
{
|
||||
resourceToMap = buffer;
|
||||
}
|
||||
else
|
||||
{
|
||||
resourceToMap = CreateReadbackBuffer(buffer->GetDesc().Width);
|
||||
resourceToMap->SetName(L"Device::Download");
|
||||
|
||||
D3D12_RESOURCE_BARRIER barriers[] =
|
||||
{
|
||||
CD3DX12_RESOURCE_BARRIER::Transition(
|
||||
defaultBuffer.Get(),
|
||||
buffer.Get(),
|
||||
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
|
||||
D3D12_RESOURCE_STATE_COPY_SOURCE)
|
||||
};
|
||||
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
}
|
||||
|
||||
m_commandList->CopyResource(readbackBuffer.Get(), defaultBuffer.Get());
|
||||
|
||||
{
|
||||
D3D12_RESOURCE_BARRIER barriers[] =
|
||||
{
|
||||
CD3DX12_RESOURCE_BARRIER::Transition(
|
||||
defaultBuffer.Get(),
|
||||
D3D12_RESOURCE_STATE_COPY_SOURCE,
|
||||
D3D12_RESOURCE_STATE_UNORDERED_ACCESS)
|
||||
};
|
||||
m_commandList->CopyResource(resourceToMap.Get(), buffer.Get());
|
||||
std::swap(barriers[0].Transition.StateBefore, barriers[0].Transition.StateAfter);
|
||||
m_commandList->ResourceBarrier(_countof(barriers), barriers);
|
||||
ExecuteCommandListAndWait();
|
||||
}
|
||||
|
||||
ExecuteCommandListAndWait();
|
||||
if (defaultBuffer->GetDesc().Width > std::numeric_limits<size_t>::max())
|
||||
{
|
||||
throw std::invalid_argument(fmt::format("Buffer width '{}' is too large.", defaultBuffer->GetDesc().Width));
|
||||
}
|
||||
std::vector<std::byte> outputBuffer(static_cast<size_t>(defaultBuffer->GetDesc().Width));
|
||||
{
|
||||
size_t dataSize = gsl::narrow<size_t>(defaultBuffer->GetDesc().Width);
|
||||
CD3DX12_RANGE readRange(0, dataSize);
|
||||
void* readbackBufferData = nullptr;
|
||||
THROW_IF_FAILED(readbackBuffer->Map(0, &readRange, &readbackBufferData));
|
||||
memcpy(outputBuffer.data(), readbackBufferData, dataSize);
|
||||
readbackBuffer->Unmap(0, nullptr);
|
||||
}
|
||||
|
||||
m_temporaryResources.push_back(std::move(readbackBuffer));
|
||||
std::vector<std::byte> outputBuffer(static_cast<size_t>(buffer->GetDesc().Width));
|
||||
|
||||
size_t dataSize = gsl::narrow<size_t>(buffer->GetDesc().Width);
|
||||
CD3DX12_RANGE readRange(0, dataSize);
|
||||
void* mappedBufferData = nullptr;
|
||||
THROW_IF_FAILED(resourceToMap->Map(0, &readRange, &mappedBufferData));
|
||||
memcpy(outputBuffer.data(), mappedBufferData, dataSize);
|
||||
resourceToMap->Unmap(0, nullptr);
|
||||
|
||||
return outputBuffer;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ public:
|
|||
bool enableDred,
|
||||
bool disableBackgroundProcessing,
|
||||
bool setStablePowerState,
|
||||
bool preferCustomHeaps,
|
||||
uint32_t maxGpuTimeMeasurements,
|
||||
std::shared_ptr<PixCaptureHelper> pixCaptureHelper,
|
||||
std::shared_ptr<D3d12Module> d3dModule,
|
||||
|
@ -45,7 +46,19 @@ public:
|
|||
IDxcCompiler3* GetDxcCompiler();
|
||||
#endif
|
||||
|
||||
// TODO: test custom heap buffer with write combine for igpu?
|
||||
// Creates either a default buffer or custom buffer based on support for custom heaps
|
||||
// and whether or not they are allowed.
|
||||
Microsoft::WRL::ComPtr<ID3D12Resource> CreatePreferredDeviceMemoryBuffer(
|
||||
uint64_t sizeInBytes,
|
||||
D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
|
||||
uint64_t alignment = 0,
|
||||
D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE);
|
||||
|
||||
Microsoft::WRL::ComPtr<ID3D12Resource> CreateCustomBuffer(
|
||||
uint64_t sizeInBytes,
|
||||
D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
|
||||
uint64_t alignment = 0,
|
||||
D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE);
|
||||
|
||||
Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(
|
||||
uint64_t sizeInBytes,
|
||||
|
@ -137,6 +150,8 @@ private:
|
|||
Microsoft::WRL::ComPtr<IDxDispatchLogger> m_logger;
|
||||
bool m_restoreBackgroundProcessing = false;
|
||||
bool m_restoreStablePowerState = false;
|
||||
std::optional<D3D12_FEATURE_DATA_ARCHITECTURE1> m_architectureSupport;
|
||||
bool m_useCustomHeaps = false;
|
||||
|
||||
#ifndef DXCOMPILER_NONE
|
||||
Microsoft::WRL::ComPtr<IDxcUtils> m_dxcUtils;
|
||||
|
|
|
@ -166,7 +166,7 @@ void DmlDispatchable::Initialize()
|
|||
auto tempBufferSize = initializer->GetBindingProperties().TemporaryResourceSize;
|
||||
if (tempBufferSize > 0)
|
||||
{
|
||||
ComPtr<ID3D12Resource> tempBuffer = m_device->CreateDefaultBuffer(tempBufferSize);
|
||||
ComPtr<ID3D12Resource> tempBuffer = m_device->CreatePreferredDeviceMemoryBuffer(tempBufferSize);
|
||||
DML_BUFFER_BINDING bufferBinding = { tempBuffer.Get(), 0, tempBufferSize };
|
||||
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
|
||||
bindingTable->BindTemporaryResource(&bindingDesc);
|
||||
|
@ -177,7 +177,7 @@ void DmlDispatchable::Initialize()
|
|||
auto persistentBufferSize = m_operatorCompiled->GetBindingProperties().PersistentResourceSize;
|
||||
if (persistentBufferSize > 0)
|
||||
{
|
||||
m_persistentBuffer = m_device->CreateDefaultBuffer(persistentBufferSize);
|
||||
m_persistentBuffer = m_device->CreatePreferredDeviceMemoryBuffer(persistentBufferSize);
|
||||
DML_BUFFER_BINDING bufferBinding = { m_persistentBuffer.Get(), 0, persistentBufferSize };
|
||||
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
|
||||
bindingTable->BindOutputs(1, &bindingDesc);
|
||||
|
@ -227,7 +227,7 @@ void DmlDispatchable::Bind(const Bindings& bindings, uint32_t iteration)
|
|||
auto tempBufferSize = bindingProps.TemporaryResourceSize;
|
||||
if (tempBufferSize > 0)
|
||||
{
|
||||
tempBuffer = m_device->CreateDefaultBuffer(tempBufferSize);
|
||||
tempBuffer = m_device->CreatePreferredDeviceMemoryBuffer(tempBufferSize);
|
||||
|
||||
DML_BUFFER_BINDING bufferBinding = { tempBuffer.Get(), 0, tempBufferSize };
|
||||
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
|
||||
|
|
|
@ -419,7 +419,7 @@ void OnnxDispatchable::Bind(const Bindings& jsonBindings, uint32_t iteration)
|
|||
{
|
||||
throw std::invalid_argument(fmt::format("TensorShapeUint32 '{}' is too large.", tensorShapeUint32.size()));
|
||||
}
|
||||
binding.resource = m_device->CreateDefaultBuffer(DMLCalcBufferTensorSize(
|
||||
binding.resource = m_device->CreatePreferredDeviceMemoryBuffer(DMLCalcBufferTensorSize(
|
||||
dataTypeInfo.dmlDataType,
|
||||
static_cast<uint32_t>(tensorShapeUint32.size()),
|
||||
tensorShapeUint32.data(),
|
||||
|
|
|
@ -172,6 +172,7 @@ HRESULT DxDispatch::RuntimeClassInitialize(
|
|||
m_options->EnableDred(),
|
||||
m_options->DisableBackgroundProcessing(),
|
||||
m_options->SetStablePowerState(),
|
||||
m_options->PreferCustomHeaps(),
|
||||
m_options->MaxGpuTimeMeasurements(),
|
||||
m_pixCaptureHelper,
|
||||
m_d3dModule,
|
||||
|
|
Загрузка…
Ссылка в новой задаче