Support the SPV_EXT_fragment_shader_interlock extension.

This was straightforward to implement in GLSL. The
`ShadingRateInterlockOrderedEXT` and `ShadingRateInterlockUnorderedEXT`
modes aren't implemented yet, because we don't support
`SPV_NV_shading_rate` or `SPV_EXT_fragment_invocation_density` yet.

HLSL and MSL were more interesting. They don't support this directly,
but they do support marking resources as "rasterizer ordered," which
does roughly the same thing. So this implementation scans all accesses
inside the critical section and marks all storage resources found
therein as rasterizer ordered. They also don't support the fine-grained
controls on pixel- vs. sample-level interlock and disabling ordering
guarantees that GLSL and SPIR-V do, but that's OK. "Unordered" here
merely means the order is undefined; that it just so happens to be the
same as rasterizer order is immaterial. As for pixel- vs. sample-level
interlock, Vulkan explicitly states:

> With sample shading enabled, [the `PixelInterlockOrderedEXT` and
> `PixelInterlockUnorderedEXT`] execution modes are treated like
> `SampleInterlockOrderedEXT` or `SampleInterlockUnorderedEXT`
> respectively.

and:

> If [the `SampleInterlockOrderedEXT` or `SampleInterlockUnorderedEXT`]
> execution modes are used in single-sample mode they are treated like
> `PixelInterlockOrderedEXT` or `PixelInterlockUnorderedEXT`
> respectively.

So this will do the right thing for MoltenVK and gfx-rs, at least.

MSL additionally supports multiple raster order groups; resources that
are not accessed together can be placed in different ROGs to allow them
to be synchronized separately. A more sophisticated analysis might be
able to place resources optimally, but that's outside the scope of this
change. For now, we assign all resources to group 0, which should do for
our purposes.

`glslang` doesn't support the `RasterizerOrdered` UAVs this
implementation produces for HLSL, so the test case needs `fxc.exe`.

`glslang` also insists on GLSL 4.50 for `GL_ARB_fragment_shader_interlock`,
even though the spec says it needs either 4.20 or
`GL_ARB_shader_image_load_store`; and it doesn't support the
`GL_NV_fragment_shader_interlock` extension at all. So I haven't been
able to test those code paths.

Fixes #1002.
This commit is contained in:
Chip Davis 2019-08-04 00:07:20 -05:00
Родитель a06997a6a4
Коммит 2eff420d9a
26 изменённых файлов: 934 добавлений и 9 удалений

Просмотреть файл

@ -0,0 +1,24 @@
RWByteAddressBuffer _9 : register(u6, space0);
globallycoherent RasterizerOrderedByteAddressBuffer _42 : register(u3, space0);
RasterizerOrderedByteAddressBuffer _52 : register(u4, space0);
RWTexture2D<unorm float4> img4 : register(u5, space0);
RasterizerOrderedTexture2D<unorm float4> img : register(u0, space0);
RasterizerOrderedTexture2D<unorm float4> img3 : register(u2, space0);
RasterizerOrderedTexture2D<uint> img2 : register(u1, space0);
void frag_main()
{
_9.Store(0, uint(0));
img4[int2(1, 1)] = float4(1.0f, 0.0f, 0.0f, 1.0f);
img[int2(0, 0)] = img3[int2(0, 0)];
uint _39;
InterlockedAdd(img2[int2(0, 0)], 1u, _39);
_42.Store(0, uint(int(_42.Load(0)) + 42));
uint _55;
_42.InterlockedAnd(4, _52.Load(0), _55);
}
void main()
{
frag_main();
}

Просмотреть файл

@ -0,0 +1,43 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Buffer3
{
int baz;
};
struct Buffer
{
int foo;
uint bar;
};
struct Buffer2
{
uint quux;
};
struct spvDescriptorSetBuffer0
{
device Buffer3* m_9 [[id(0)]];
texture2d<float, access::write> img4 [[id(1)]];
texture2d<float, access::write> img [[id(2), raster_order_group(0)]];
texture2d<float> img3 [[id(3), raster_order_group(0)]];
volatile device Buffer* m_34 [[id(4), raster_order_group(0)]];
device Buffer2* m_44 [[id(5), raster_order_group(0)]];
};
fragment void main0(constant spvDescriptorSetBuffer0& spvDescriptorSet0 [[buffer(0)]])
{
(*spvDescriptorSet0.m_9).baz = 0;
spvDescriptorSet0.img4.write(float4(1.0, 0.0, 0.0, 1.0), uint2(int2(1)));
spvDescriptorSet0.img.write(spvDescriptorSet0.img3.read(uint2(int2(0))), uint2(int2(0)));
(*spvDescriptorSet0.m_34).foo += 42;
uint _49 = atomic_fetch_and_explicit((volatile device atomic_uint*)&(*spvDescriptorSet0.m_34).bar, (*spvDescriptorSet0.m_44).quux, memory_order_relaxed);
}

Просмотреть файл

@ -0,0 +1,33 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Buffer3
{
int baz;
};
struct Buffer
{
int foo;
uint bar;
};
struct Buffer2
{
uint quux;
};
fragment void main0(device Buffer3& _9 [[buffer(0)]], volatile device Buffer& _34 [[buffer(1), raster_order_group(0)]], device Buffer2& _44 [[buffer(2), raster_order_group(0)]], texture2d<float, access::write> img4 [[texture(0)]], texture2d<float, access::write> img [[texture(1), raster_order_group(0)]], texture2d<float> img3 [[texture(2), raster_order_group(0)]])
{
_9.baz = 0;
img4.write(float4(1.0, 0.0, 0.0, 1.0), uint2(int2(1)));
img.write(img3.read(uint2(int2(0))), uint2(int2(0)));
_34.foo += 42;
uint _49 = atomic_fetch_and_explicit((volatile device atomic_uint*)&_34.bar, _44.quux, memory_order_relaxed);
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_unordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_ordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _47 = atomicAnd(_30.bar, uint(gl_SampleMaskIn[0]));
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_unordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,24 @@
RWByteAddressBuffer _9 : register(u6, space0);
globallycoherent RasterizerOrderedByteAddressBuffer _42 : register(u3, space0);
RasterizerOrderedByteAddressBuffer _52 : register(u4, space0);
RWTexture2D<unorm float4> img4 : register(u5, space0);
RasterizerOrderedTexture2D<unorm float4> img : register(u0, space0);
RasterizerOrderedTexture2D<unorm float4> img3 : register(u2, space0);
RasterizerOrderedTexture2D<uint> img2 : register(u1, space0);
void frag_main()
{
_9.Store(0, uint(0));
img4[int2(1, 1)] = float4(1.0f, 0.0f, 0.0f, 1.0f);
img[int2(0, 0)] = img3[int2(0, 0)];
uint _39;
InterlockedAdd(img2[int2(0, 0)], 1u, _39);
_42.Store(0, uint(int(_42.Load(0)) + 42));
uint _55;
_42.InterlockedAnd(4, _52.Load(0), _55);
}
void main()
{
frag_main();
}

Просмотреть файл

@ -0,0 +1,43 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Buffer3
{
int baz;
};
struct Buffer
{
int foo;
uint bar;
};
struct Buffer2
{
uint quux;
};
struct spvDescriptorSetBuffer0
{
device Buffer3* m_9 [[id(0)]];
texture2d<float, access::write> img4 [[id(1)]];
texture2d<float, access::write> img [[id(2), raster_order_group(0)]];
texture2d<float> img3 [[id(3), raster_order_group(0)]];
volatile device Buffer* m_34 [[id(4), raster_order_group(0)]];
device Buffer2* m_44 [[id(5), raster_order_group(0)]];
};
fragment void main0(constant spvDescriptorSetBuffer0& spvDescriptorSet0 [[buffer(0)]])
{
(*spvDescriptorSet0.m_9).baz = 0;
spvDescriptorSet0.img4.write(float4(1.0, 0.0, 0.0, 1.0), uint2(int2(1)));
spvDescriptorSet0.img.write(spvDescriptorSet0.img3.read(uint2(int2(0))), uint2(int2(0)));
(*spvDescriptorSet0.m_34).foo += 42;
uint _49 = atomic_fetch_and_explicit((volatile device atomic_uint*)&(*spvDescriptorSet0.m_34).bar, (*spvDescriptorSet0.m_44).quux, memory_order_relaxed);
}

Просмотреть файл

@ -0,0 +1,33 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Buffer3
{
int baz;
};
struct Buffer
{
int foo;
uint bar;
};
struct Buffer2
{
uint quux;
};
fragment void main0(device Buffer3& _9 [[buffer(0)]], volatile device Buffer& _34 [[buffer(1), raster_order_group(0)]], device Buffer2& _44 [[buffer(2), raster_order_group(0)]], texture2d<float, access::write> img4 [[texture(0)]], texture2d<float, access::write> img [[texture(1), raster_order_group(0)]], texture2d<float> img3 [[texture(2), raster_order_group(0)]])
{
_9.baz = 0;
img4.write(float4(1.0, 0.0, 0.0, 1.0), uint2(int2(1)));
img.write(img3.read(uint2(int2(0))), uint2(int2(0)));
_34.foo += 42;
uint _49 = atomic_fetch_and_explicit((volatile device atomic_uint*)&_34.bar, _44.quux, memory_order_relaxed);
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_unordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_ordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _47 = atomicAnd(_30.bar, uint(gl_SampleMaskIn[0]));
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,23 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_unordered) in;
layout(binding = 2, std430) coherent buffer Buffer
{
int foo;
uint bar;
} _30;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0), vec4(1.0, 0.0, 0.0, 1.0));
uint _27 = imageAtomicAdd(img2, ivec2(0), 1u);
_30.foo += 42;
uint _41 = atomicAnd(_30.bar, 255u);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,36 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2, rgba8) uniform readonly image2D img3;
layout(binding = 3) coherent buffer Buffer
{
int foo;
uint bar;
};
layout(binding = 4) buffer Buffer2
{
uint quux;
};
layout(binding = 5, rgba8) uniform writeonly image2D img4;
layout(binding = 6) buffer Buffer3
{
int baz;
};
void main()
{
// Deliberately outside the critical section to test usage tracking.
baz = 0;
imageStore(img4, ivec2(1, 1), vec4(1.0, 0.0, 0.0, 1.0));
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), imageLoad(img3, ivec2(0, 0)));
imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, quux);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,36 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
//layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2, rgba8) uniform readonly image2D img3;
layout(binding = 3) coherent buffer Buffer
{
int foo;
uint bar;
};
layout(binding = 4) buffer Buffer2
{
uint quux;
};
layout(binding = 5, rgba8) uniform writeonly image2D img4;
layout(binding = 6) buffer Buffer3
{
int baz;
};
void main()
{
// Deliberately outside the critical section to test usage tracking.
baz = 0;
imageStore(img4, ivec2(1, 1), vec4(1.0, 0.0, 0.0, 1.0));
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), imageLoad(img3, ivec2(0, 0)));
//imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, quux);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,36 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
//layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2, rgba8) uniform readonly image2D img3;
layout(binding = 3) coherent buffer Buffer
{
int foo;
uint bar;
};
layout(binding = 4) buffer Buffer2
{
uint quux;
};
layout(binding = 5, rgba8) uniform writeonly image2D img4;
layout(binding = 6) buffer Buffer3
{
int baz;
};
void main()
{
// Deliberately outside the critical section to test usage tracking.
baz = 0;
imageStore(img4, ivec2(1, 1), vec4(1.0, 0.0, 0.0, 1.0));
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), imageLoad(img3, ivec2(0, 0)));
//imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, quux);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,22 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_ordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2) coherent buffer Buffer
{
int foo;
uint bar;
};
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), vec4(1.0, 0.0, 0.0, 1.0));
imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, 0xff);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,22 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(pixel_interlock_unordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2) coherent buffer Buffer
{
int foo;
uint bar;
};
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), vec4(1.0, 0.0, 0.0, 1.0));
imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, 0xff);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,22 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_ordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2) coherent buffer Buffer
{
int foo;
uint bar;
};
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), vec4(1.0, 0.0, 0.0, 1.0));
imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, gl_SampleMaskIn[0]);
endInvocationInterlockARB();
}

Просмотреть файл

@ -0,0 +1,22 @@
#version 450
#extension GL_ARB_fragment_shader_interlock : require
layout(sample_interlock_unordered) in;
layout(binding = 0, rgba8) uniform writeonly image2D img;
layout(binding = 1, r32ui) uniform uimage2D img2;
layout(binding = 2) coherent buffer Buffer
{
int foo;
uint bar;
};
void main()
{
beginInvocationInterlockARB();
imageStore(img, ivec2(0, 0), vec4(1.0, 0.0, 0.0, 1.0));
imageAtomicAdd(img2, ivec2(0, 0), 1u);
foo += 42;
atomicAnd(bar, 0xff);
endInvocationInterlockARB();
}

Просмотреть файл

@ -4249,6 +4249,221 @@ void Compiler::analyze_non_block_pointer_types()
sort(begin(physical_storage_non_block_pointer_types), end(physical_storage_non_block_pointer_types));
}
// Visits one SPIR-V instruction and records, in compiler.interlocked_resources,
// the buffer and image variables that are accessed after an
// OpBeginInvocationInterlockEXT has been seen.
//
// opcode: the instruction's opcode.
// args:   the instruction's operand words.
// length: the number of words in args; each case checks it before indexing.
//
// Returns true for instructions that were processed (or ignored), and false
// when OpEndInvocationInterlockEXT is reached or an instruction is too short.
// NOTE(review): returning false appears to end the traversal; confirm against
// the OpcodeHandler contract.
bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_t *args, uint32_t length)
{
	if (opcode == OpBeginInvocationInterlockEXT)
	{
		// Everything visited from here on counts as inside the critical section.
		in_crit_sec = true;
		return true;
	}

	if (opcode == OpEndInvocationInterlockEXT)
	{
		// End critical section--nothing more to do.
		return false;
	}

	// We need to figure out where images and buffers are loaded from, so do only the bare bones compilation we need.
	switch (opcode)
	{
	case OpLoad:
	{
		if (length < 3)
			return false;

		uint32_t ptr = args[2];
		auto *var = compiler.maybe_get_backing_variable(ptr);

		// We're only concerned with buffer and image memory here.
		if (!var)
			break;

		switch (var->storage)
		{
		default:
			break;

		case StorageClassUniformConstant:
		{
			// Loading an image handle is not itself a memory access, so it is
			// tracked even outside the critical section: register the result so
			// that a later image operation on it can be mapped back to the variable.
			uint32_t result_type = args[0];
			uint32_t id = args[1];
			compiler.set<SPIRExpression>(id, "", result_type, true);
			compiler.register_read(id, ptr, true);
			break;
		}

		case StorageClassUniform:
			// Must have BufferBlock; we only care about SSBOs.
			if (!compiler.has_decoration(compiler.get<SPIRType>(var->basetype).self, DecorationBufferBlock))
				break;
			// fallthrough
		case StorageClassStorageBuffer:
			if (!in_crit_sec)
				break;

			compiler.interlocked_resources.insert(var->self);
			break;
		}
		break;
	}

	case OpInBoundsAccessChain:
	case OpAccessChain:
	case OpPtrAccessChain:
	{
		if (length < 3)
			return false;

		uint32_t result_type = args[0];

		auto &type = compiler.get<SPIRType>(result_type);
		if (type.storage == StorageClassUniform || type.storage == StorageClassUniformConstant ||
		    type.storage == StorageClassStorageBuffer)
		{
			// Register the chain result against its base pointer so later
			// accesses through it can be traced to the backing variable.
			uint32_t id = args[1];
			uint32_t ptr = args[2];
			compiler.set<SPIRExpression>(id, "", result_type, true);
			compiler.register_read(id, ptr, true);
		}
		break;
	}

	case OpImageTexelPointer:
	{
		if (length < 3)
			return false;

		uint32_t result_type = args[0];
		uint32_t id = args[1];
		uint32_t ptr = args[2];
		auto &e = compiler.set<SPIRExpression>(id, "", result_type, true);
		auto *var = compiler.maybe_get_backing_variable(ptr);
		if (var)
			e.loaded_from = var->self;
		// Do not fall through: the store-style opcodes below read the pointer
		// from args[0], which for this instruction is the result type id.
		break;
	}

	case OpStore:
	case OpImageWrite:
	case OpAtomicStore:
	{
		if (length < 1)
			return false;

		if (!in_crit_sec)
			break;

		// These instructions have no result, so the pointer/image is the first operand.
		uint32_t ptr = args[0];
		auto *var = compiler.maybe_get_backing_variable(ptr);
		if (var && (var->storage == StorageClassUniform || var->storage == StorageClassUniformConstant ||
		            var->storage == StorageClassStorageBuffer))
			compiler.interlocked_resources.insert(var->self);

		break;
	}

	case OpCopyMemory:
	{
		if (length < 2)
			return false;

		if (!in_crit_sec)
			break;

		uint32_t dst = args[0];
		uint32_t src = args[1];
		auto *dst_var = compiler.maybe_get_backing_variable(dst);
		auto *src_var = compiler.maybe_get_backing_variable(src);
		if (dst_var && (dst_var->storage == StorageClassUniform || dst_var->storage == StorageClassStorageBuffer))
			compiler.interlocked_resources.insert(dst_var->self);

		if (src_var)
		{
			// The source only counts when it is a storage buffer: either
			// StorageBuffer storage, or Uniform storage with BufferBlock.
			if (src_var->storage != StorageClassUniform && src_var->storage != StorageClassStorageBuffer)
				break;

			if (src_var->storage == StorageClassUniform &&
			    !compiler.has_decoration(compiler.get<SPIRType>(src_var->basetype).self, DecorationBufferBlock))
				break;

			compiler.interlocked_resources.insert(src_var->self);
		}
		break;
	}

	case OpImageRead:
	case OpAtomicLoad:
	{
		if (length < 3)
			return false;

		if (!in_crit_sec)
			break;

		uint32_t ptr = args[2];
		auto *var = compiler.maybe_get_backing_variable(ptr);

		// We're only concerned with buffer and image memory here.
		if (!var)
			break;

		switch (var->storage)
		{
		default:
			break;

		case StorageClassUniform:
			// Must have BufferBlock; we only care about SSBOs.
			if (!compiler.has_decoration(compiler.get<SPIRType>(var->basetype).self, DecorationBufferBlock))
				break;
			// fallthrough
		case StorageClassUniformConstant:
		case StorageClassStorageBuffer:
			compiler.interlocked_resources.insert(var->self);
			break;
		}
		break;
	}

	case OpAtomicExchange:
	case OpAtomicCompareExchange:
	case OpAtomicIIncrement:
	case OpAtomicIDecrement:
	case OpAtomicIAdd:
	case OpAtomicISub:
	case OpAtomicSMin:
	case OpAtomicUMin:
	case OpAtomicSMax:
	case OpAtomicUMax:
	case OpAtomicAnd:
	case OpAtomicOr:
	case OpAtomicXor:
	{
		if (length < 3)
			return false;

		if (!in_crit_sec)
			break;

		// Read-modify-write atomics: the pointer follows the result type and result id.
		uint32_t ptr = args[2];
		auto *var = compiler.maybe_get_backing_variable(ptr);
		if (var && (var->storage == StorageClassUniform || var->storage == StorageClassUniformConstant ||
		            var->storage == StorageClassStorageBuffer))
			compiler.interlocked_resources.insert(var->self);

		break;
	}

	default:
		break;
	}

	return true;
}
void Compiler::analyze_interlocked_resource_usage()
{
InterlockedResourceAccessHandler handler(*this);
traverse_all_reachable_opcodes(get<SPIRFunction>(ir.default_entry_point), handler);
}
bool Compiler::type_is_array_of_pointers(const SPIRType &type) const
{
if (!type.pointer)

Просмотреть файл

@ -945,6 +945,27 @@ protected:
bool single_function);
bool may_read_undefined_variable_in_block(const SPIRBlock &block, uint32_t var);
// Finds all buffer and image resources that are accessed (read or written) from
// inside the critical section, if present.
// The critical section is delimited by OpBeginInvocationInterlockEXT and
// OpEndInvocationInterlockEXT instructions. In MSL and HLSL, any resources accessed
// while inside the critical section must be placed in a raster order group.
// Results are stored in Compiler::interlocked_resources.
struct InterlockedResourceAccessHandler : OpcodeHandler
{
InterlockedResourceAccessHandler(Compiler &compiler_)
: compiler(compiler_)
{
}
// Called once per visited instruction; see the definition for the opcodes handled.
bool handle(spv::Op op, const uint32_t *args, uint32_t length) override;
// The compiler whose IR is inspected and whose interlocked_resources set is filled in.
Compiler &compiler;
// Set to true once OpBeginInvocationInterlockEXT has been visited.
bool in_crit_sec = false;
};
void analyze_interlocked_resource_usage();
// The set of all resources accessed (read or written) while inside the critical section, if present.
std::unordered_set<uint32_t> interlocked_resources;
void make_constant_null(uint32_t id, uint32_t type);
std::unordered_map<uint32_t, std::string> declared_block_names;

Просмотреть файл

@ -605,6 +605,26 @@ void CompilerGLSL::emit_header()
if (execution.flags.get(ExecutionModePostDepthCoverage))
require_extension_internal("GL_ARB_post_depth_coverage");
// Needed for: layout({pixel,sample}_interlock_[un]ordered) in;
if (execution.flags.get(ExecutionModePixelInterlockOrderedEXT) ||
execution.flags.get(ExecutionModePixelInterlockUnorderedEXT) ||
execution.flags.get(ExecutionModeSampleInterlockOrderedEXT) ||
execution.flags.get(ExecutionModeSampleInterlockUnorderedEXT))
{
if (options.es)
{
if (options.version < 310)
SPIRV_CROSS_THROW("At least ESSL 3.10 required for fragment shader interlock.");
require_extension_internal("GL_NV_fragment_shader_interlock");
}
else
{
if (options.version < 420)
require_extension_internal("GL_ARB_shader_image_load_store");
require_extension_internal("GL_ARB_fragment_shader_interlock");
}
}
for (auto &ext : forced_extensions)
{
if (ext == "GL_EXT_shader_explicit_arithmetic_types_float16")
@ -784,6 +804,15 @@ void CompilerGLSL::emit_header()
if (execution.flags.get(ExecutionModePostDepthCoverage))
inputs.push_back("post_depth_coverage");
if (execution.flags.get(ExecutionModePixelInterlockOrderedEXT))
inputs.push_back("pixel_interlock_ordered");
else if (execution.flags.get(ExecutionModePixelInterlockUnorderedEXT))
inputs.push_back("pixel_interlock_unordered");
else if (execution.flags.get(ExecutionModeSampleInterlockOrderedEXT))
inputs.push_back("sample_interlock_ordered");
else if (execution.flags.get(ExecutionModeSampleInterlockUnorderedEXT))
inputs.push_back("sample_interlock_unordered");
if (!options.es && execution.flags.get(ExecutionModeDepthGreater))
statement("layout(depth_greater) out float gl_FragDepth;");
else if (!options.es && execution.flags.get(ExecutionModeDepthLess))
@ -10109,6 +10138,32 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
emit_op(ops[0], ops[1], "helperInvocationEXT()", false);
break;
case OpBeginInvocationInterlockEXT:
if (options.es)
{
require_extension_internal("GL_NV_fragment_shader_interlock");
statement("beginInvocationInterlockNV();");
}
else
{
require_extension_internal("GL_ARB_fragment_shader_interlock");
statement("beginInvocationInterlockARB();");
}
break;
case OpEndInvocationInterlockEXT:
if (options.es)
{
require_extension_internal("GL_NV_fragment_shader_interlock");
statement("endInvocationInterlockNV();");
}
else
{
require_extension_internal("GL_ARB_fragment_shader_interlock");
statement("endInvocationInterlockARB();");
}
break;
default:
statement("// unimplemented op ", instruction.op);
break;

Просмотреть файл

@ -203,7 +203,7 @@ static string image_format_to_type(ImageFormat fmt, SPIRType::BaseType basetype)
}
}
string CompilerHLSL::image_type_hlsl_modern(const SPIRType &type, uint32_t)
string CompilerHLSL::image_type_hlsl_modern(const SPIRType &type, uint32_t id)
{
auto &imagetype = get<SPIRType>(type.image.type);
const char *dim = nullptr;
@ -235,7 +235,12 @@ string CompilerHLSL::image_type_hlsl_modern(const SPIRType &type, uint32_t)
if (type.image.sampled == 1)
return join("Buffer<", type_to_glsl(imagetype), components, ">");
else if (type.image.sampled == 2)
{
if (interlocked_resources.count(id))
return join("RasterizerOrderedBuffer<", image_format_to_type(type.image.format, imagetype.basetype),
">");
return join("RWBuffer<", image_format_to_type(type.image.format, imagetype.basetype), ">");
}
else
SPIRV_CROSS_THROW("Sampler buffers must be either sampled or unsampled. Cannot deduce in runtime.");
case DimSubpassData:
@ -248,6 +253,8 @@ string CompilerHLSL::image_type_hlsl_modern(const SPIRType &type, uint32_t)
const char *arrayed = type.image.arrayed ? "Array" : "";
const char *ms = type.image.ms ? "MS" : "";
const char *rw = typed_load ? "RW" : "";
if (typed_load && interlocked_resources.count(id))
rw = "RasterizerOrdered";
return join(rw, "Texture", dim, ms, arrayed, "<",
typed_load ? image_format_to_type(type.image.format, imagetype.basetype) :
join(type_to_glsl(imagetype), components),
@ -1848,9 +1855,13 @@ void CompilerHLSL::emit_buffer_block(const SPIRVariable &var)
Bitset flags = ir.get_buffer_block_flags(var);
bool is_readonly = flags.get(DecorationNonWritable);
bool is_coherent = flags.get(DecorationCoherent);
bool is_interlocked = interlocked_resources.count(var.self) > 0;
const char *type_name = "ByteAddressBuffer ";
if (!is_readonly)
type_name = is_interlocked ? "RasterizerOrderedByteAddressBuffer " : "RWByteAddressBuffer ";
add_resource_name(var.self);
statement(is_coherent ? "globallycoherent " : "", is_readonly ? "ByteAddressBuffer " : "RWByteAddressBuffer ",
to_name(var.self), type_to_array_glsl(type), to_resource_binding(var), ";");
statement(is_coherent ? "globallycoherent " : "", type_name, to_name(var.self), type_to_array_glsl(type),
to_resource_binding(var), ";");
}
else
{
@ -4673,6 +4684,12 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction)
case OpIsHelperInvocationEXT:
SPIRV_CROSS_THROW("helperInvocationEXT() is not supported in HLSL.");
case OpBeginInvocationInterlockEXT:
case OpEndInvocationInterlockEXT:
if (hlsl_options.shader_model < 51)
SPIRV_CROSS_THROW("Rasterizer order views require Shader Model 5.1.");
break; // Nothing to do in the body
default:
CompilerGLSL::emit_instruction(instruction);
break;
@ -4850,6 +4867,12 @@ string CompilerHLSL::compile()
validate_shader_model();
update_active_builtins();
analyze_image_and_sampler_usage();
if (get_execution_model() == ExecutionModelFragment &&
(get_entry_point().flags.get(ExecutionModePixelInterlockOrderedEXT) ||
get_entry_point().flags.get(ExecutionModePixelInterlockUnorderedEXT) ||
get_entry_point().flags.get(ExecutionModeSampleInterlockOrderedEXT) ||
get_entry_point().flags.get(ExecutionModeSampleInterlockUnorderedEXT)))
analyze_interlocked_resource_usage();
// Subpass input needs SV_Position.
if (need_subpass_input)

Просмотреть файл

@ -852,6 +852,12 @@ string CompilerMSL::compile()
update_active_builtins();
analyze_image_and_sampler_usage();
analyze_sampled_image_usage();
if (get_execution_model() == ExecutionModelFragment &&
(get_entry_point().flags.get(ExecutionModePixelInterlockOrderedEXT) ||
get_entry_point().flags.get(ExecutionModePixelInterlockUnorderedEXT) ||
get_entry_point().flags.get(ExecutionModeSampleInterlockOrderedEXT) ||
get_entry_point().flags.get(ExecutionModeSampleInterlockUnorderedEXT)))
analyze_interlocked_resource_usage();
preprocess_op_codes();
build_implicit_builtins();
@ -5541,6 +5547,12 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
emit_op(ops[0], ops[1], "simd_is_helper_thread()", false);
break;
case OpBeginInvocationInterlockEXT:
case OpEndInvocationInterlockEXT:
if (!msl_options.supports_msl_version(2, 0))
SPIRV_CROSS_THROW("Raster order groups require MSL 2.0.");
break; // Nothing to do in the body
default:
CompilerGLSL::emit_instruction(instruction);
break;
@ -7436,8 +7448,15 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in
bool is_builtin = is_member_builtin(type, index, &builtin);
if (has_extended_member_decoration(type.self, index, SPIRVCrossDecorationResourceIndexPrimary))
return join(" [[id(",
get_extended_member_decoration(type.self, index, SPIRVCrossDecorationResourceIndexPrimary), ")]]");
{
string quals = join(
" [[id(", get_extended_member_decoration(type.self, index, SPIRVCrossDecorationResourceIndexPrimary), ")");
if (interlocked_resources.count(
get_extended_member_decoration(type.self, index, SPIRVCrossDecorationInterfaceOrigID)))
quals += ", raster_order_group(0)";
quals += "]]";
return quals;
}
// Vertex function inputs
if (execution.model == ExecutionModelVertex && type.storage == StorageClassInput)
@ -8239,7 +8258,10 @@ void CompilerMSL::entry_point_args_discrete_descriptors(string &ep_args)
ep_args += ", ";
ep_args += get_argument_address_space(var) + " " + type_to_glsl(type) + "* " + to_restrict(var_id) +
r.name + "_" + convert_to_string(i);
ep_args += " [[buffer(" + convert_to_string(r.index + i) + ")]]";
ep_args += " [[buffer(" + convert_to_string(r.index + i) + ")";
if (interlocked_resources.count(var_id))
ep_args += ", raster_order_group(0)";
ep_args += "]]";
}
}
else
@ -8248,7 +8270,10 @@ void CompilerMSL::entry_point_args_discrete_descriptors(string &ep_args)
ep_args += ", ";
ep_args +=
get_argument_address_space(var) + " " + type_to_glsl(type) + "& " + to_restrict(var_id) + r.name;
ep_args += " [[buffer(" + convert_to_string(r.index) + ")]]";
ep_args += " [[buffer(" + convert_to_string(r.index) + ")";
if (interlocked_resources.count(var_id))
ep_args += ", raster_order_group(0)";
ep_args += "]]";
}
break;
}
@ -8264,7 +8289,10 @@ void CompilerMSL::entry_point_args_discrete_descriptors(string &ep_args)
ep_args += image_type_glsl(type, var_id) + " " + r.name;
if (r.plane > 0)
ep_args += join(plane_name_suffix, r.plane);
ep_args += " [[texture(" + convert_to_string(r.index) + ")]]";
ep_args += " [[texture(" + convert_to_string(r.index) + ")";
if (interlocked_resources.count(var_id))
ep_args += ", raster_order_group(0)";
ep_args += "]]";
break;
default:
if (!ep_args.empty())
@ -8274,7 +8302,10 @@ void CompilerMSL::entry_point_args_discrete_descriptors(string &ep_args)
type_to_glsl(type, var_id) + "& " + r.name;
else
ep_args += type_to_glsl(type, var_id) + " " + r.name;
ep_args += " [[buffer(" + convert_to_string(r.index) + ")]]";
ep_args += " [[buffer(" + convert_to_string(r.index) + ")";
if (interlocked_resources.count(var_id))
ep_args += ", raster_order_group(0)";
ep_args += "]]";
break;
}
}