Use custom heaps by default for cache-coherent UMA devices (#580)

This commit is contained in:
Justin Stoecker 2024-04-30 23:20:46 -07:00 коммит произвёл GitHub
Родитель 5897d34ae2
Коммит 6a5919ceca
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
9 изменённых файлов: 183 добавлений и 79 удалений

Просмотреть файл

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.19)
project(dxdispatch VERSION 0.17.1 LANGUAGES CXX)
project(dxdispatch VERSION 0.17.2 LANGUAGES CXX)
# ==============================================================================
# External Libraries/Helpers

Просмотреть файл

@ -68,6 +68,7 @@ std::string Adapter::GetDetailedDescription() const
FormatBytes(m_dedicatedSystemMemory),
FormatBytes(m_sharedSystemMemory),
m_isSupported_D3D12_GRAPHICS,
m_isSupported_CORE_COMPUTE,
m_isSupported_GENERIC_ML
);
}

Просмотреть файл

@ -121,6 +121,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv)
"Type of command queue/list to use ('compute' or 'direct')",
cxxopts::value<std::string>()
)
(
"disable_custom_heaps",
"Always use default heaps for resources",
cxxopts::value<bool>()
)
(
"clear_shader_caches",
"Clears D3D shader caches before running commands",
@ -322,6 +327,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv)
m_showDependencies = result["show_dependencies"].as<bool>();
}
if (result.count("disable_custom_heaps"))
{
m_preferCustomHeaps = !result["disable_custom_heaps"].as<bool>();
}
if (result.count("clear_shader_caches"))
{
m_clearShaderCaches = result["clear_shader_caches"].as<bool>();

Просмотреть файл

@ -28,6 +28,7 @@ public:
bool EnableDred() const { return m_enableDred; }
bool DisableBackgroundProcessing() const { return m_disableBackgroundProcessing; }
bool SetStablePowerState() const { return m_setStablePowerState; }
bool PreferCustomHeaps() const { return m_preferCustomHeaps; }
bool DisableAgilitySDK() const { return m_disableAgilitySDK; }
const std::string& AdapterSubstring() const { return m_adapterSubstring; }
@ -83,6 +84,7 @@ private:
bool m_enableDred = false;
bool m_disableBackgroundProcessing = false;
bool m_setStablePowerState = false;
bool m_preferCustomHeaps = true;
bool m_disableAgilitySDK = false;
bool m_uavBarrierAfterDispatch = true;
bool m_aliasingBarrierAfterDispatch = false;

Просмотреть файл

@ -52,6 +52,7 @@ Device::Device(
bool enableDred,
bool disableBackgroundProcessing,
bool setStablePowerState,
bool preferCustomHeaps,
uint32_t maxGpuTimeMeasurements,
std::shared_ptr<PixCaptureHelper> pixCaptureHelper,
std::shared_ptr<D3d12Module> d3dModule,
@ -63,7 +64,8 @@ Device::Device(
m_dispatchRepeat(dispatchRepeat),
m_logger(logger),
m_restoreBackgroundProcessing(disableBackgroundProcessing),
m_restoreStablePowerState(setStablePowerState)
m_restoreStablePowerState(setStablePowerState),
m_useCustomHeaps(preferCustomHeaps)
{
DML_CREATE_DEVICE_FLAGS dmlCreateDeviceFlags = debugLayersEnabled ? DML_CREATE_DEVICE_FLAG_DEBUG : DML_CREATE_DEVICE_FLAG_NONE;
@ -167,6 +169,44 @@ Device::Device(
}
}
// Query the adapter's memory architecture. m_architectureSupport is left empty when the
// query fails, so every consumer must check it before reading any field.
D3D12_FEATURE_DATA_ARCHITECTURE1 archData = {};
if (SUCCEEDED(m_d3d->CheckFeatureSupport(D3D12_FEATURE_ARCHITECTURE1, &archData, sizeof(archData))))
{
    m_architectureSupport = archData;
}

// Custom heaps should only be used on UMA systems with cache-coherent memory. When the
// architecture query failed there is no evidence of either property, so fall back to default
// heaps instead of dereferencing an empty optional.
m_useCustomHeaps = m_useCustomHeaps &&
    m_architectureSupport.has_value() &&
    m_architectureSupport->UMA &&
    m_architectureSupport->CacheCoherentUMA;

D3D_FEATURE_LEVEL featureLevelsList[] = {
    D3D_FEATURE_LEVEL_1_0_GENERIC,
    D3D_FEATURE_LEVEL_1_0_CORE,
    D3D_FEATURE_LEVEL_11_0,
    D3D_FEATURE_LEVEL_11_1,
    D3D_FEATURE_LEVEL_12_0,
    D3D_FEATURE_LEVEL_12_1
};

D3D12_FEATURE_DATA_FEATURE_LEVELS featureLevels = {};
featureLevels.NumFeatureLevels = _countof(featureLevelsList);
featureLevels.pFeatureLevelsRequested = featureLevelsList;
THROW_IF_FAILED(m_d3d->CheckFeatureSupport(
    D3D12_FEATURE_FEATURE_LEVELS,
    &featureLevels,
    sizeof(featureLevels)
));

// Custom heaps are optional for MCDM devices, so we also need to check for support.
if (featureLevels.MaxSupportedFeatureLevel == D3D_FEATURE_LEVEL_1_0_CORE ||
    featureLevels.MaxSupportedFeatureLevel == D3D_FEATURE_LEVEL_1_0_GENERIC)
{
    // The feature enum must match the data struct: ComputeOnlyCustomHeapSupported lives in
    // D3D12_FEATURE_DATA_D3D12_OPTIONS19, which is queried with D3D12_FEATURE_D3D12_OPTIONS19.
    // Passing a different enum makes the call fail and silently skips this check.
    D3D12_FEATURE_DATA_D3D12_OPTIONS19 options = {};
    if (SUCCEEDED(m_d3d->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS19, &options, sizeof(options))))
    {
        m_useCustomHeaps = m_useCustomHeaps && options.ComputeOnlyCustomHeapSupported;
    }
    // NOTE(review): a failed query leaves m_useCustomHeaps unchanged, as before — confirm that
    // is the desired behavior on runtimes that do not recognize OPTIONS19.
}
#endif // !_GAMING_XBOX
THROW_IF_FAILED(m_d3d->CreateFence(
@ -264,6 +304,37 @@ Device::~Device()
}
}
// Allocates a buffer from whichever heap type this device settled on: a custom heap when
// m_useCustomHeaps is set, otherwise a default heap. All arguments are forwarded unchanged
// to the selected creation routine.
Microsoft::WRL::ComPtr<ID3D12Resource> Device::CreatePreferredDeviceMemoryBuffer(
    uint64_t sizeInBytes,
    D3D12_RESOURCE_FLAGS resourceFlags,
    uint64_t alignment,
    D3D12_HEAP_FLAGS heapFlags)
{
    if (m_useCustomHeaps)
    {
        return CreateCustomBuffer(sizeInBytes, resourceFlags, alignment, heapFlags);
    }

    return CreateDefaultBuffer(sizeInBytes, resourceFlags, alignment, heapFlags);
}
// Creates a committed buffer resource in a custom heap that uses write-combined CPU pages in
// memory pool L0. The resource is created in the COMMON state with no optimized clear value.
// Throws (via THROW_IF_FAILED) when the device cannot create the resource.
Microsoft::WRL::ComPtr<ID3D12Resource> Device::CreateCustomBuffer(
    uint64_t sizeInBytes,
    D3D12_RESOURCE_FLAGS resourceFlags,
    uint64_t alignment,
    D3D12_HEAP_FLAGS heapFlags)
{
    // Custom heap description; both node masks are passed as 0.
    const CD3DX12_HEAP_PROPERTIES customHeapProps(
        D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE,
        D3D12_MEMORY_POOL_L0,
        0,
        0);

    const CD3DX12_RESOURCE_DESC bufferDesc =
        CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, resourceFlags, alignment);

    ComPtr<ID3D12Resource> customBuffer;
    THROW_IF_FAILED(m_d3d->CreateCommittedResource(
        &customHeapProps,
        heapFlags,
        &bufferDesc,
        D3D12_RESOURCE_STATE_COMMON,
        nullptr,
        IID_GRAPHICS_PPV_ARGS(customBuffer.ReleaseAndGetAddressOf())));

    return customBuffer;
}
ComPtr<ID3D12Resource> Device::CreateDefaultBuffer(
uint64_t sizeInBytes,
D3D12_RESOURCE_FLAGS resourceFlags,
@ -388,101 +459,105 @@ Microsoft::WRL::ComPtr<ID3D12Resource> Device::Upload(uint64_t totalSize, gsl::s
throw std::invalid_argument("Attempting to upload more data than the size of the buffer");
}
auto defaultBuffer = CreateDefaultBuffer(totalSize);
ComPtr<ID3D12Resource> buffer;
ComPtr<ID3D12Resource> uploadBuffer;
ComPtr<ID3D12Resource> resourceToMap;
// Pick the destination buffer and, when there is data to copy, the resource the CPU will map.
// resourceToMap stays null for empty input so the later map/copy work is skipped.
if (m_useCustomHeaps)
{
    // Custom-heap path: the destination buffer itself is the resource that gets mapped.
    buffer = CreateCustomBuffer(totalSize);
    resourceToMap = data.empty() ? nullptr : buffer;
}
else
{
    buffer = CreateDefaultBuffer(totalSize);

    // Only create (and name) the staging upload buffer when there is data to copy. Calling
    // SetName unconditionally dereferenced a null ComPtr whenever the source data was empty.
    if (!data.empty())
    {
        uploadBuffer = CreateUploadBuffer(totalSize);
        uploadBuffer->SetName(L"Device::Upload");
        resourceToMap = uploadBuffer;
    }
}
if (!name.empty())
{
defaultBuffer->SetName(name.data());
buffer->SetName(name.data());
}
if (data.empty())
if (resourceToMap)
{
// No need to create an upload resource if the source data is empty.
return defaultBuffer;
}
void* mappedBufferData = nullptr;
THROW_IF_FAILED(resourceToMap->Map(0, nullptr, &mappedBufferData));
memcpy(mappedBufferData, data.data(), data.size());
resourceToMap->Unmap(0, nullptr);
auto uploadBuffer = CreateUploadBuffer(totalSize);
uploadBuffer->SetName(L"Device::Upload");
{
void* uploadBufferData = nullptr;
THROW_IF_FAILED(uploadBuffer->Map(0, nullptr, &uploadBufferData));
memcpy(uploadBufferData, data.data(), data.size());
uploadBuffer->Unmap(0, nullptr);
}
{
D3D12_RESOURCE_BARRIER barriers[] =
if (resourceToMap == uploadBuffer)
{
CD3DX12_RESOURCE_BARRIER::Transition(
defaultBuffer.Get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_COPY_DEST)
};
m_commandList->ResourceBarrier(_countof(barriers), barriers);
D3D12_RESOURCE_BARRIER barriers[] =
{
CD3DX12_RESOURCE_BARRIER::Transition(
buffer.Get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_COPY_DEST)
};
m_commandList->ResourceBarrier(_countof(barriers), barriers);
m_commandList->CopyResource(buffer.Get(), uploadBuffer.Get());
std::swap(barriers[0].Transition.StateBefore, barriers[0].Transition.StateAfter);
m_commandList->ResourceBarrier(_countof(barriers), barriers);
m_temporaryResources.push_back(std::move(uploadBuffer));
}
}
m_commandList->CopyResource(defaultBuffer.Get(), uploadBuffer.Get());
{
D3D12_RESOURCE_BARRIER barriers[] =
{
CD3DX12_RESOURCE_BARRIER::Transition(
defaultBuffer.Get(),
D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS)
};
m_commandList->ResourceBarrier(_countof(barriers), barriers);
}
m_temporaryResources.push_back(std::move(uploadBuffer));
return defaultBuffer;
return buffer;
}
std::vector<std::byte> Device::Download(Microsoft::WRL::ComPtr<ID3D12Resource> defaultBuffer)
std::vector<std::byte> Device::Download(Microsoft::WRL::ComPtr<ID3D12Resource> buffer)
{
auto readbackBuffer = CreateReadbackBuffer(defaultBuffer->GetDesc().Width);
readbackBuffer->SetName(L"Device::Download");
if (buffer->GetDesc().Width > std::numeric_limits<size_t>::max())
{
throw std::invalid_argument(fmt::format("Buffer width '{}' is too large.", buffer->GetDesc().Width));
}
ComPtr<ID3D12Resource> resourceToMap;
// Can't assume the input buffer was created as a custom heap (e.g., ONNX dispatchable with a deferred
// resource allocated by the DML EP), so check the heap properties.
D3D12_HEAP_PROPERTIES heapProps = {};
D3D12_HEAP_FLAGS heapFlags = {};
if (SUCCEEDED(buffer->GetHeapProperties(&heapProps, &heapFlags)) &&
heapProps.MemoryPoolPreference == D3D12_MEMORY_POOL_L0 &&
heapProps.CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE)
{
resourceToMap = buffer;
}
else
{
resourceToMap = CreateReadbackBuffer(buffer->GetDesc().Width);
resourceToMap->SetName(L"Device::Download");
D3D12_RESOURCE_BARRIER barriers[] =
{
CD3DX12_RESOURCE_BARRIER::Transition(
defaultBuffer.Get(),
buffer.Get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_COPY_SOURCE)
};
m_commandList->ResourceBarrier(_countof(barriers), barriers);
}
m_commandList->CopyResource(readbackBuffer.Get(), defaultBuffer.Get());
{
D3D12_RESOURCE_BARRIER barriers[] =
{
CD3DX12_RESOURCE_BARRIER::Transition(
defaultBuffer.Get(),
D3D12_RESOURCE_STATE_COPY_SOURCE,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS)
};
m_commandList->CopyResource(resourceToMap.Get(), buffer.Get());
std::swap(barriers[0].Transition.StateBefore, barriers[0].Transition.StateAfter);
m_commandList->ResourceBarrier(_countof(barriers), barriers);
ExecuteCommandListAndWait();
}
ExecuteCommandListAndWait();
if (defaultBuffer->GetDesc().Width > std::numeric_limits<size_t>::max())
{
throw std::invalid_argument(fmt::format("Buffer width '{}' is too large.", defaultBuffer->GetDesc().Width));
}
std::vector<std::byte> outputBuffer(static_cast<size_t>(defaultBuffer->GetDesc().Width));
{
size_t dataSize = gsl::narrow<size_t>(defaultBuffer->GetDesc().Width);
CD3DX12_RANGE readRange(0, dataSize);
void* readbackBufferData = nullptr;
THROW_IF_FAILED(readbackBuffer->Map(0, &readRange, &readbackBufferData));
memcpy(outputBuffer.data(), readbackBufferData, dataSize);
readbackBuffer->Unmap(0, nullptr);
}
m_temporaryResources.push_back(std::move(readbackBuffer));
std::vector<std::byte> outputBuffer(static_cast<size_t>(buffer->GetDesc().Width));
size_t dataSize = gsl::narrow<size_t>(buffer->GetDesc().Width);
CD3DX12_RANGE readRange(0, dataSize);
void* mappedBufferData = nullptr;
THROW_IF_FAILED(resourceToMap->Map(0, &readRange, &mappedBufferData));
memcpy(outputBuffer.data(), mappedBufferData, dataSize);
resourceToMap->Unmap(0, nullptr);
return outputBuffer;
}

Просмотреть файл

@ -22,6 +22,7 @@ public:
bool enableDred,
bool disableBackgroundProcessing,
bool setStablePowerState,
bool preferCustomHeaps,
uint32_t maxGpuTimeMeasurements,
std::shared_ptr<PixCaptureHelper> pixCaptureHelper,
std::shared_ptr<D3d12Module> d3dModule,
@ -45,7 +46,19 @@ public:
IDxcCompiler3* GetDxcCompiler();
#endif
// TODO: test custom heap buffer with write combine for igpu?
// Creates either a default buffer or custom buffer based on support for custom heaps
// and whether or not they are allowed.
Microsoft::WRL::ComPtr<ID3D12Resource> CreatePreferredDeviceMemoryBuffer(
uint64_t sizeInBytes,
D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
uint64_t alignment = 0,
D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE);
// Creates a buffer in a custom heap rather than a default heap. The definition in this change
// requests write-combined CPU pages in memory pool L0 and throws if resource creation fails.
// NOTE(review): callers that want automatic fallback to default heaps should presumably use
// CreatePreferredDeviceMemoryBuffer instead — this function does not check m_useCustomHeaps.
Microsoft::WRL::ComPtr<ID3D12Resource> CreateCustomBuffer(
uint64_t sizeInBytes,
D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
uint64_t alignment = 0,
D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE);
Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(
uint64_t sizeInBytes,
@ -137,6 +150,8 @@ private:
Microsoft::WRL::ComPtr<IDxDispatchLogger> m_logger;
bool m_restoreBackgroundProcessing = false;
bool m_restoreStablePowerState = false;
std::optional<D3D12_FEATURE_DATA_ARCHITECTURE1> m_architectureSupport;
bool m_useCustomHeaps = false;
#ifndef DXCOMPILER_NONE
Microsoft::WRL::ComPtr<IDxcUtils> m_dxcUtils;

Просмотреть файл

@ -166,7 +166,7 @@ void DmlDispatchable::Initialize()
auto tempBufferSize = initializer->GetBindingProperties().TemporaryResourceSize;
if (tempBufferSize > 0)
{
ComPtr<ID3D12Resource> tempBuffer = m_device->CreateDefaultBuffer(tempBufferSize);
ComPtr<ID3D12Resource> tempBuffer = m_device->CreatePreferredDeviceMemoryBuffer(tempBufferSize);
DML_BUFFER_BINDING bufferBinding = { tempBuffer.Get(), 0, tempBufferSize };
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
bindingTable->BindTemporaryResource(&bindingDesc);
@ -177,7 +177,7 @@ void DmlDispatchable::Initialize()
auto persistentBufferSize = m_operatorCompiled->GetBindingProperties().PersistentResourceSize;
if (persistentBufferSize > 0)
{
m_persistentBuffer = m_device->CreateDefaultBuffer(persistentBufferSize);
m_persistentBuffer = m_device->CreatePreferredDeviceMemoryBuffer(persistentBufferSize);
DML_BUFFER_BINDING bufferBinding = { m_persistentBuffer.Get(), 0, persistentBufferSize };
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
bindingTable->BindOutputs(1, &bindingDesc);
@ -227,7 +227,7 @@ void DmlDispatchable::Bind(const Bindings& bindings, uint32_t iteration)
auto tempBufferSize = bindingProps.TemporaryResourceSize;
if (tempBufferSize > 0)
{
tempBuffer = m_device->CreateDefaultBuffer(tempBufferSize);
tempBuffer = m_device->CreatePreferredDeviceMemoryBuffer(tempBufferSize);
DML_BUFFER_BINDING bufferBinding = { tempBuffer.Get(), 0, tempBufferSize };
DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };

Просмотреть файл

@ -419,7 +419,7 @@ void OnnxDispatchable::Bind(const Bindings& jsonBindings, uint32_t iteration)
{
throw std::invalid_argument(fmt::format("TensorShapeUint32 '{}' is too large.", tensorShapeUint32.size()));
}
binding.resource = m_device->CreateDefaultBuffer(DMLCalcBufferTensorSize(
binding.resource = m_device->CreatePreferredDeviceMemoryBuffer(DMLCalcBufferTensorSize(
dataTypeInfo.dmlDataType,
static_cast<uint32_t>(tensorShapeUint32.size()),
tensorShapeUint32.data(),

Просмотреть файл

@ -172,6 +172,7 @@ HRESULT DxDispatch::RuntimeClassInitialize(
m_options->EnableDred(),
m_options->DisableBackgroundProcessing(),
m_options->SetStablePowerState(),
m_options->PreferCustomHeaps(),
m_options->MaxGpuTimeMeasurements(),
m_pixCaptureHelper,
m_d3dModule,