Vulkan: Emulate subgroup ops in seamful cubemap emulation

Where subgroup ops are not available, they are emulated as follows:

Code with subgroup ops:

    float lH = subgroupQuadSwapHorizontal(layer);
    float lV = subgroupQuadSwapVertical(layer);
    float lD = subgroupQuadSwapDiagonal(layer);
    bool isHelperH = subgroupQuadSwapHorizontal(gl_HelperInvocation);
    bool isHelperV = subgroupQuadSwapVertical(gl_HelperInvocation);
    if (gl_HelperInvocation)
    {
        layer = !isHelperH ? lH : !isHelperV ? lV : lD;
    }

Emulated code:

    float nonHelperLayer = gl_HelperInvocation ? 0.0 : layer;
    float lH = abs(dFdxFine(nonHelperLayer));
    float lV = abs(dFdyFine(nonHelperLayer));
    float lD = abs(dFdxFine(lV));
    float isHelperDiffH = abs(dFdxFine(float(gl_HelperInvocation)));
    bool isNonHelperH = isHelperDiffH > 0.5;
    float isHelperDiffV = abs(dFdyFine(float(gl_HelperInvocation)));
    bool isNonHelperV = isHelperDiffV > 0.5;
    if (gl_HelperInvocation)
    {
        layer = isNonHelperH ? lH : isNonHelperV ? lV : lD;
    }

Both paths are supported because the emulated code misbehaves on Nvidia devices.  This change
therefore effectively only enables seamful cube map emulation on Android, where subgroup
operations are not supported.

Bug: angleproject:3243
Bug: angleproject:3732
Change-Id: I9664d9760756758748183eb121c626f176789f3a
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1742222
Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org>
Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>
2019-08-07 14:44:12 -04:00 · 2019-08-07 14:44:12 -04:00 · 5a2553a7b6
--- a/include/GLSLANG/ShaderLang.h
+++ b/include/GLSLANG/ShaderLang.h
@ -26,7 +26,7 @@

 // Version number for shader translation API.
 // It is incremented every time the API changes.
-#define ANGLE_SH_VERSION 211
+#define ANGLE_SH_VERSION 212

 enum ShShaderSpec
 {
@ -288,9 +288,14 @@ const ShCompileOptions SH_FORCE_ATOMIC_VALUE_RESOLUTION = UINT64_C(1) << 42;
 const ShCompileOptions SH_EMULATE_GL_BASE_VERTEX_BASE_INSTANCE = UINT64_C(1) << 43;

 // Emulate seamful cube map sampling for OpenGL ES2.0.  Currently only applies to the Vulkan
-// backend, as subgroup operations are used.  Once that dependency is broken, could be used with
+// backend, as it is done after samplers are moved out of structs.  Can likely be made to work on
 // the other backends as well.
+//
+// There are two variations of this.  One using subgroup operations where available, and another
+// that emulates those operations using dFdxFine and dFdyFine.  The latter is more universally
+// available, but is buggy on Nvidia.
 const ShCompileOptions SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING = UINT64_C(1) << 44;
+const ShCompileOptions SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING_WITH_SUBGROUP_OP = UINT64_C(1) << 45;

 // Defines alternate strategies for implementing array index clamping.
 enum ShArrayIndexClampingStrategy
--- a/src/compiler/translator/TranslatorVulkan.cpp
+++ b/src/compiler/translator/TranslatorVulkan.cpp
@ -645,7 +645,7 @@ void TranslatorVulkan::translate(TIntermBlock *root,

    sink << "#version 450 core\n";

-    if (compileOptions & SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING)
+    if (compileOptions & SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING_WITH_SUBGROUP_OP)
    {
        sink << "#extension GL_KHR_shader_subgroup_quad : require\n";
    }
@ -688,10 +688,12 @@ void TranslatorVulkan::translate(TIntermBlock *root,

    // Rewrite samplerCubes as sampler2DArrays.  This must be done after rewriting struct samplers
    // as it doesn't expect that.
-    if (compileOptions & SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING)
+    if (compileOptions & (SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING |
+                          SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING_WITH_SUBGROUP_OP))
    {
-        RewriteCubeMapSamplersAs2DArray(root, &getSymbolTable(),
-                                        getShaderType() == GL_FRAGMENT_SHADER);
+        RewriteCubeMapSamplersAs2DArray(
+            root, &getSymbolTable(), getShaderType() == GL_FRAGMENT_SHADER,
+            compileOptions & SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING_WITH_SUBGROUP_OP);
    }

    if (defaultUniformCount > 0)
--- a/src/compiler/translator/tree_ops/RewriteCubeMapSamplersAs2DArray.cpp
+++ b/src/compiler/translator/tree_ops/RewriteCubeMapSamplersAs2DArray.cpp
@ -39,9 +39,56 @@ TIntermSymbol *GetValueFromNeighbor(TSymbolTable *symbolTable,
    return neighbor;
 }

+// Computes abs(dFdxyFine(variable)), i.e. the absolute difference of a value with another
+// invocation in the quad, into a temp declared in |body|, and returns the temp's symbol.  Used to
+// emulate GetValueFromNeighbor where subgroup operations are not present.
+// See comment in declareCoordTranslationFunction.
+TIntermSymbol *GetDiffWithNeighbor(TSymbolTable *symbolTable,
+                                   TIntermBlock *body,
+                                   TFunction *dFdxyFine,
+                                   TIntermTyped *variable,
+                                   const TType *variableType)
+{
+    TIntermTyped *neighborValue =
+        TIntermAggregate::CreateRawFunctionCall(*dFdxyFine, new TIntermSequence({variable}));
+    TIntermTyped *absNeighborValue = new TIntermUnary(EOpAbs, neighborValue, nullptr);
+
+    TIntermSymbol *neighbor = new TIntermSymbol(CreateTempVariable(symbolTable, variableType));
+    body->appendStatement(CreateTempInitDeclarationNode(&neighbor->variable(), absNeighborValue));
+
+    return neighbor;
+}
+
+// Bool analog of GetDiffWithNeighbor: true iff gl_HelperInvocation differs from the neighbor's.
+TIntermSymbol *IsNeighborNonHelper(TSymbolTable *symbolTable,
+                                   TIntermBlock *body,
+                                   TFunction *dFdxyFine,
+                                   TIntermTyped *gl_HelperInvocation)
+{
+    const TType *boolType  = StaticType::GetBasic<EbtBool>();
+    const TType *floatType = StaticType::GetBasic<EbtFloat>();
+
+    TIntermTyped *gl_HelperInvocationAsFloat =
+        TIntermAggregate::CreateConstructor(*floatType, new TIntermSequence({gl_HelperInvocation}));
+    TIntermSymbol *diffWithNeighbor =
+        GetDiffWithNeighbor(symbolTable, body, dFdxyFine, gl_HelperInvocationAsFloat, floatType);
+
+    TIntermTyped *isNeighborNonHelperValue =
+        new TIntermBinary(EOpGreaterThan, diffWithNeighbor, CreateFloatNode(0.5f));
+    TIntermSymbol *isNeighborNonHelper =
+        new TIntermSymbol(CreateTempVariable(symbolTable, boolType));
+    body->appendStatement(
+        CreateTempInitDeclarationNode(&isNeighborNonHelper->variable(), isNeighborNonHelperValue));
+
+    return isNeighborNonHelper;
+}
+
 // If this is a helper invocation, retrieve the layer index (cube map face) from another invocation
 // in the quad that is not a helper.  See comment in declareCoordTranslationFunction.
-void GetLayerFromNonHelperInvocation(TSymbolTable *symbolTable, TIntermBlock *body, TIntermTyped *l)
+void GetLayerFromNonHelperInvocation(TSymbolTable *symbolTable,
+                                     TIntermBlock *body,
+                                     TIntermTyped *l,
+                                     bool useSubgroupOps)
 {
    TVariable *gl_HelperInvocationVar =
        new TVariable(symbolTable, ImmutableString("gl_HelperInvocation"),
@ -50,48 +97,97 @@ void GetLayerFromNonHelperInvocation(TSymbolTable *symbolTable, TIntermBlock *bo

    const TType *boolType  = StaticType::GetBasic<EbtBool>();
    const TType *floatType = StaticType::GetBasic<EbtFloat>();
-    TFunction *quadSwapHorizontalBool =
-        new TFunction(symbolTable, ImmutableString("subgroupQuadSwapHorizontal"),
-                      SymbolType::AngleInternal, boolType, true);
-    TFunction *quadSwapHorizontalFloat =
-        new TFunction(symbolTable, ImmutableString("subgroupQuadSwapHorizontal"),
-                      SymbolType::AngleInternal, floatType, true);
-    TFunction *quadSwapVerticalBool =
-        new TFunction(symbolTable, ImmutableString("subgroupQuadSwapVertical"),
-                      SymbolType::AngleInternal, boolType, true);
-    TFunction *quadSwapVerticalFloat =
-        new TFunction(symbolTable, ImmutableString("subgroupQuadSwapVertical"),
-                      SymbolType::AngleInternal, floatType, true);
-    TFunction *quadSwapDiagonalFloat =
-        new TFunction(symbolTable, ImmutableString("subgroupQuadSwapDiagonal"),
-                      SymbolType::AngleInternal, floatType, true);

-    quadSwapHorizontalBool->addParameter(CreateTempVariable(symbolTable, boolType));
-    quadSwapVerticalBool->addParameter(CreateTempVariable(symbolTable, boolType));
-    quadSwapHorizontalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
-    quadSwapVerticalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
-    quadSwapDiagonalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+    TIntermSymbol *lH;
+    TIntermSymbol *lV;
+    TIntermSymbol *lD;

-    // Get the layer from the horizontal, vertical and diagonal neighbor.  These should be done
-    // outside `if`s so the non-helper thread is not turned inactive.
-    TIntermSymbol *lH =
-        GetValueFromNeighbor(symbolTable, body, quadSwapHorizontalFloat, l, floatType);
-    TIntermSymbol *lV =
-        GetValueFromNeighbor(symbolTable, body, quadSwapVerticalFloat, l->deepCopy(), floatType);
-    TIntermSymbol *lD =
-        GetValueFromNeighbor(symbolTable, body, quadSwapDiagonalFloat, l->deepCopy(), floatType);
+    TIntermTyped *horizontalIsNonHelper;
+    TIntermTyped *verticalIsNonHelper;

-    // Get the value of gl_HelperInvocation from the neighbors too.
-    TIntermSymbol *horizontalIsHelper = GetValueFromNeighbor(
-        symbolTable, body, quadSwapHorizontalBool, gl_HelperInvocation->deepCopy(), boolType);
-    TIntermSymbol *verticalIsHelper = GetValueFromNeighbor(
-        symbolTable, body, quadSwapVerticalBool, gl_HelperInvocation->deepCopy(), boolType);
+    if (useSubgroupOps)
+    {
+        TFunction *quadSwapHorizontalBool =
+            new TFunction(symbolTable, ImmutableString("subgroupQuadSwapHorizontal"),
+                          SymbolType::AngleInternal, boolType, true);
+        TFunction *quadSwapHorizontalFloat =
+            new TFunction(symbolTable, ImmutableString("subgroupQuadSwapHorizontal"),
+                          SymbolType::AngleInternal, floatType, true);
+        TFunction *quadSwapVerticalBool =
+            new TFunction(symbolTable, ImmutableString("subgroupQuadSwapVertical"),
+                          SymbolType::AngleInternal, boolType, true);
+        TFunction *quadSwapVerticalFloat =
+            new TFunction(symbolTable, ImmutableString("subgroupQuadSwapVertical"),
+                          SymbolType::AngleInternal, floatType, true);
+        TFunction *quadSwapDiagonalFloat =
+            new TFunction(symbolTable, ImmutableString("subgroupQuadSwapDiagonal"),
+                          SymbolType::AngleInternal, floatType, true);

-    // Note(syoussefi): if the sampling is done inside an if with a non-uniform condition, it's not
-    // enough to test if the neighbor is not a helper, we should also check if it's active.
-    TIntermTyped *horizontalIsNonHelper =
-        new TIntermUnary(EOpLogicalNot, horizontalIsHelper, nullptr);
-    TIntermTyped *verticalIsNonHelper = new TIntermUnary(EOpLogicalNot, verticalIsHelper, nullptr);
+        quadSwapHorizontalBool->addParameter(CreateTempVariable(symbolTable, boolType));
+        quadSwapVerticalBool->addParameter(CreateTempVariable(symbolTable, boolType));
+        quadSwapHorizontalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+        quadSwapVerticalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+        quadSwapDiagonalFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+
+        // Get the layer from the horizontal, vertical and diagonal neighbor.  These should be done
+        // outside `if`s so the non-helper thread is not turned inactive.
+        lH = GetValueFromNeighbor(symbolTable, body, quadSwapHorizontalFloat, l, floatType);
+        lV = GetValueFromNeighbor(symbolTable, body, quadSwapVerticalFloat, l->deepCopy(),
+                                  floatType);
+        lD = GetValueFromNeighbor(symbolTable, body, quadSwapDiagonalFloat, l->deepCopy(),
+                                  floatType);
+
+        // Get the value of gl_HelperInvocation from the neighbors too.
+        TIntermSymbol *horizontalIsHelper = GetValueFromNeighbor(
+            symbolTable, body, quadSwapHorizontalBool, gl_HelperInvocation->deepCopy(), boolType);
+        TIntermSymbol *verticalIsHelper = GetValueFromNeighbor(
+            symbolTable, body, quadSwapVerticalBool, gl_HelperInvocation->deepCopy(), boolType);
+
+        // Note(syoussefi): if the sampling is done inside an if with a non-uniform condition, it's
+        // not enough to test if the neighbor is not a helper, we should also check if it's active.
+        horizontalIsNonHelper = new TIntermUnary(EOpLogicalNot, horizontalIsHelper, nullptr);
+        verticalIsNonHelper   = new TIntermUnary(EOpLogicalNot, verticalIsHelper, nullptr);
+    }
+    else
+    {
+        TFunction *dFdxFineBool  = new TFunction(symbolTable, ImmutableString("dFdxFine"),
+                                                SymbolType::AngleInternal, boolType, true);
+        TFunction *dFdxFineFloat = new TFunction(symbolTable, ImmutableString("dFdxFine"),
+                                                 SymbolType::AngleInternal, floatType, true);
+        TFunction *dFdyFineBool  = new TFunction(symbolTable, ImmutableString("dFdyFine"),
+                                                SymbolType::AngleInternal, boolType, true);
+        TFunction *dFdyFineFloat = new TFunction(symbolTable, ImmutableString("dFdyFine"),
+                                                 SymbolType::AngleInternal, floatType, true);
+
+        dFdxFineBool->addParameter(CreateTempVariable(symbolTable, boolType));
+        dFdyFineBool->addParameter(CreateTempVariable(symbolTable, boolType));
+        dFdxFineFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+        dFdyFineFloat->addParameter(CreateTempVariable(symbolTable, floatType));
+
+        // layerQuadSwapHelper = gl_HelperInvocation ? 0.0 : layer;
+        TIntermTyped *layerQuadSwapHelperValue =
+            new TIntermTernary(gl_HelperInvocation->deepCopy(), CreateZeroNode(*floatType), l);
+        TIntermSymbol *layerQuadSwapHelper =
+            new TIntermSymbol(CreateTempVariable(symbolTable, floatType));
+        body->appendStatement(CreateTempInitDeclarationNode(&layerQuadSwapHelper->variable(),
+                                                            layerQuadSwapHelperValue));
+
+        // Get the layer from the horizontal, vertical and diagonal neighbor.  These should be done
+        // outside `if`s so the non-helper thread is not turned inactive.
+        lH = GetDiffWithNeighbor(symbolTable, body, dFdxFineFloat, layerQuadSwapHelper, floatType);
+        lV = GetDiffWithNeighbor(symbolTable, body, dFdyFineFloat, layerQuadSwapHelper->deepCopy(),
+                                 floatType);
+        lD = GetDiffWithNeighbor(symbolTable, body, dFdxFineFloat, lV->deepCopy(), floatType);
+
+        // Get the value of gl_HelperInvocation from the neighbors too.
+        //
+        // Note(syoussefi): if the sampling is done inside an if with a non-uniform condition, it's
+        // not enough to test if the neighbor is not a helper, we should also check if it's active.
+        horizontalIsNonHelper =
+            IsNeighborNonHelper(symbolTable, body, dFdxFineBool, gl_HelperInvocation->deepCopy());
+        verticalIsNonHelper =
+            IsNeighborNonHelper(symbolTable, body, dFdyFineBool, gl_HelperInvocation->deepCopy());
+    }

    TIntermTyped *lVD  = new TIntermTernary(verticalIsNonHelper, lV, lD);
    TIntermTyped *lHVD = new TIntermTernary(horizontalIsNonHelper, lH, lVD);
@ -163,10 +259,13 @@ void TransformZMajor(TIntermBlock *block,
 class RewriteCubeMapSamplersAs2DArrayTraverser : public TIntermTraverser
 {
  public:
-    RewriteCubeMapSamplersAs2DArrayTraverser(TSymbolTable *symbolTable, bool isFragmentShader)
+    RewriteCubeMapSamplersAs2DArrayTraverser(TSymbolTable *symbolTable,
+                                             bool isFragmentShader,
+                                             bool useSubgroupOps)
        : TIntermTraverser(true, true, true, symbolTable),
          mCubeXYZToArrayUVL(nullptr),
          mIsFragmentShader(isFragmentShader),
+          mUseSubgroupOps(useSubgroupOps),
          mCoordTranslationFunctionDecl(nullptr)
    {}

@ -543,14 +642,15 @@ class RewriteCubeMapSamplersAs2DArrayTraverser : public TIntermTraverser
        // incorrect and the wrong mip would be selected.
        //
        // We therefore use gl_HelperInvocation to identify these invocations and subgroupQuadSwap*
-        // operations to retrieve the layer from a non-helper invocation.  As a result, the UVs
-        // calculated for the helper invocations correspond to the same face and end up outside the
-        // [0, 1] range, but result in correct derivatives.  Indeed, sampling from any other kind of
-        // texture using varyings that range from [0, 1] would follow the same behavior (where
-        // helper invocations generate UVs out of range).
+        // (where available) or dFdx/dFdy (emulating subgroupQuadSwap*) to retrieve the layer from a
+        // non-helper invocation.  As a result, the UVs calculated for the helper invocations
+        // correspond to the same face and end up outside the [0, 1] range, but result in correct
+        // derivatives.  Indeed, sampling from any other kind of texture using varyings that range
+        // from [0, 1] would follow the same behavior (where helper invocations generate UVs out of
+        // range).
        if (mIsFragmentShader)
        {
-            GetLayerFromNonHelperInvocation(mSymbolTable, body, l->deepCopy());
+            GetLayerFromNonHelperInvocation(mSymbolTable, body, l->deepCopy(), mUseSubgroupOps);
        }

        // layer < 1.5 (covering faces 0 and 1, corresponding to major axis being X) and layer < 3.5
@ -819,6 +919,7 @@ class RewriteCubeMapSamplersAs2DArrayTraverser : public TIntermTraverser
    TFunction *mCubeXYZToArrayUVL;

    bool mIsFragmentShader;
+    bool mUseSubgroupOps;

    // Stored to be put before the first function after the pass.
    TIntermFunctionDefinition *mCoordTranslationFunctionDecl;
@ -828,9 +929,11 @@ class RewriteCubeMapSamplersAs2DArrayTraverser : public TIntermTraverser

 void RewriteCubeMapSamplersAs2DArray(TIntermBlock *root,
                                     TSymbolTable *symbolTable,
-                                     bool isFragmentShader)
+                                     bool isFragmentShader,
+                                     bool useSubgroupOps)
 {
-    RewriteCubeMapSamplersAs2DArrayTraverser traverser(symbolTable, isFragmentShader);
+    RewriteCubeMapSamplersAs2DArrayTraverser traverser(symbolTable, isFragmentShader,
+                                                       useSubgroupOps);
    root->traverse(&traverser);
    traverser.updateTree();

--- a/src/compiler/translator/tree_ops/RewriteCubeMapSamplersAs2DArray.h
+++ b/src/compiler/translator/tree_ops/RewriteCubeMapSamplersAs2DArray.h
@ -19,7 +19,8 @@ class TSymbolTable;

 void RewriteCubeMapSamplersAs2DArray(TIntermBlock *root,
                                     TSymbolTable *symbolTable,
-                                     bool isFragmentShader);
+                                     bool isFragmentShader,
+                                     bool useSubgroupOps);
 }  // namespace sh

 #endif  // COMPILER_TRANSLATOR_TREEOPS_REWRITECUBEMAPSAMPLERSAS2DARRAY_H_
--- a/src/libANGLE/renderer/vulkan/ContextVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ContextVk.cpp
@ -238,6 +238,8 @@ ContextVk::ContextVk(const gl::State &state, gl::ErrorSet *errorSet, RendererVk
      mClearColorMask(kAllColorChannelsMask),
      mFlipYForCurrentSurface(false),
      mIsAnyHostVisibleBufferWritten(false),
+      mEmulateSeamfulCubeMapSampling(false),
+      mEmulateSeamfulCubeMapSamplingWithSubgroupOps(false),
      mLastCompletedQueueSerial(renderer->nextSerial()),
      mCurrentQueueSerial(renderer->nextSerial()),
      mPoolAllocator(kDefaultPoolAllocatorPageSize, 1),
@ -441,7 +443,8 @@ angle::Result ContextVk::initialize()
        ANGLE_TRY(synchronizeCpuGpuTime());
    }

-    mEmulateSeamfulCubeMapSampling = shouldEmulateSeamfulCubeMapSampling();
+    mEmulateSeamfulCubeMapSampling =
+        shouldEmulateSeamfulCubeMapSampling(&mEmulateSeamfulCubeMapSamplingWithSubgroupOps);

    return angle::Result::Continue;
 }
@ -2896,9 +2899,10 @@ vk::DescriptorSetLayoutDesc ContextVk::getDriverUniformsDescriptorSetDesc(
    return desc;
 }

-bool ContextVk::shouldEmulateSeamfulCubeMapSampling() const
+bool ContextVk::shouldEmulateSeamfulCubeMapSampling(bool *useSubgroupOpsOut) const
 {
-    if (mState.getClientMajorVersion() != 2)
+    // Only allow seamful cube map sampling in non-webgl ES2.
+    if (mState.getClientMajorVersion() != 2 || mState.isWebGL())
    {
        return false;
    }
@ -2908,17 +2912,15 @@ bool ContextVk::shouldEmulateSeamfulCubeMapSampling() const
        return false;
    }

+    // Use subgroup ops where available.
    constexpr VkSubgroupFeatureFlags kSeamfulCubeMapSubgroupOperations =
        VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT |
        VK_SUBGROUP_FEATURE_QUAD_BIT;
    const VkSubgroupFeatureFlags deviceSupportedOperations =
        mRenderer->getPhysicalDeviceSubgroupProperties().supportedOperations;
-    bool hasSeamfulCubeMapSubgroupOperations =
-        (deviceSupportedOperations & kSeamfulCubeMapSubgroupOperations) ==
-        kSeamfulCubeMapSubgroupOperations;
+    *useSubgroupOpsOut = (deviceSupportedOperations & kSeamfulCubeMapSubgroupOperations) ==
+                         kSeamfulCubeMapSubgroupOperations;

-    // Only enable seamful cube map emulation if the necessary subgroup operations are supported.
-    // Without them, we cannot remove derivative-related artifacts caused by helper invocations.
-    return hasSeamfulCubeMapSubgroupOperations;
+    return true;
 }
 }  // namespace rx
--- a/src/libANGLE/renderer/vulkan/ContextVk.h
+++ b/src/libANGLE/renderer/vulkan/ContextVk.h
@ -313,7 +313,11 @@ class ContextVk : public ContextImpl, public vk::Context, public vk::RenderPassO

    void updateScissor(const gl::State &glState);

-    bool emulateSeamfulCubeMapSampling() const { return mEmulateSeamfulCubeMapSampling; }
+    bool emulateSeamfulCubeMapSampling(bool *useSubgroupOpsOut) const
+    {
+        *useSubgroupOpsOut = mEmulateSeamfulCubeMapSamplingWithSubgroupOps;  // set unconditionally
+        return mEmulateSeamfulCubeMapSampling;
+    }

  private:
    // Dirty bits.
@ -472,7 +476,7 @@ class ContextVk : public ContextImpl, public vk::Context, public vk::RenderPassO

    void waitForSwapchainImageIfNecessary();

-    bool shouldEmulateSeamfulCubeMapSampling() const;
+    bool shouldEmulateSeamfulCubeMapSampling(bool *useSubgroupOpsOut) const;

    vk::PipelineHelper *mCurrentGraphicsPipeline;
    vk::PipelineAndSerial *mCurrentComputePipeline;
@ -535,8 +539,10 @@ class ContextVk : public ContextImpl, public vk::Context, public vk::RenderPassO
    // at the end of the command buffer to make that write available to the host.
    bool mIsAnyHostVisibleBufferWritten;

-    // Whether this context should do seamful cube map sampling emulation.
+    // Whether this context should do seamful cube map sampling emulation, and whether subgroup
+    // operations should be used.
    bool mEmulateSeamfulCubeMapSampling;
+    bool mEmulateSeamfulCubeMapSamplingWithSubgroupOps;

    struct DriverUniformsDescriptorSet
    {
--- a/src/libANGLE/renderer/vulkan/GlslangWrapper.cpp
+++ b/src/libANGLE/renderer/vulkan/GlslangWrapper.cpp
@ -936,7 +936,7 @@ void GlslangWrapper::GetShaderSource(const gl::ProgramState &programState,
 angle::Result GlslangWrapper::GetShaderCode(vk::Context *context,
                                            const gl::Caps &glCaps,
                                            bool enableLineRasterEmulation,
-                                            bool enableSeamfulCubeMapEmulation,
+                                            bool enableSubgroupOps,
                                            const gl::ShaderMap<std::string> &shaderSources,
                                            gl::ShaderMap<std::vector<uint32_t>> *shaderCodeOut)
 {
@ -956,20 +956,18 @@ angle::Result GlslangWrapper::GetShaderCode(vk::Context *context,
                                               kVersionDefine, kLineRasterDefine),
                       VK_ERROR_INVALID_SHADER_NV);

-        return GetShaderCodeImpl(context, glCaps, enableSeamfulCubeMapEmulation, patchedSources,
-                                 shaderCodeOut);
+        return GetShaderCodeImpl(context, glCaps, enableSubgroupOps, patchedSources, shaderCodeOut);
    }
    else
    {
-        return GetShaderCodeImpl(context, glCaps, enableSeamfulCubeMapEmulation, shaderSources,
-                                 shaderCodeOut);
+        return GetShaderCodeImpl(context, glCaps, enableSubgroupOps, shaderSources, shaderCodeOut);
    }
 }

 // static
 angle::Result GlslangWrapper::GetShaderCodeImpl(vk::Context *context,
                                                const gl::Caps &glCaps,
-                                                bool enableSeamfulCubeMapEmulation,
+                                                bool enableSubgroupOps,
                                                const gl::ShaderMap<std::string> &shaderSources,
                                                gl::ShaderMap<std::vector<uint32_t>> *shaderCodeOut)
 {
@ -1005,9 +1003,9 @@ angle::Result GlslangWrapper::GetShaderCodeImpl(vk::Context *context,
        glslang::TShader *shader = shaders[shaderType];
        shader->setStringsWithLengths(&shaderString, &shaderLength, 1);
        shader->setEntryPoint("main");
-        if (enableSeamfulCubeMapEmulation)
+        if (enableSubgroupOps)
        {
-            // Enable SPIR-V 1.3 if this workaround is used, as it uses subgroup operations.
+            // Enable SPIR-V 1.3 to be able to use subgroup operations.
            shader->setEnvTarget(glslang::EShTargetSpv, glslang::EShTargetSpv_1_3);
        }

--- a/src/libANGLE/renderer/vulkan/GlslangWrapper.h
+++ b/src/libANGLE/renderer/vulkan/GlslangWrapper.h
@ -29,14 +29,14 @@ class GlslangWrapper
    static angle::Result GetShaderCode(vk::Context *context,
                                       const gl::Caps &glCaps,
                                       bool enableLineRasterEmulation,
-                                       bool enableSeamfulCubeMapEmulation,
+                                       bool enableSubgroupOps,
                                       const gl::ShaderMap<std::string> &shaderSources,
                                       gl::ShaderMap<std::vector<uint32_t>> *shaderCodesOut);

  private:
    static angle::Result GetShaderCodeImpl(vk::Context *context,
                                           const gl::Caps &glCaps,
-                                           bool enableSeamfulCubeMapEmulation,
+                                           bool enableSubgroupOps,
                                           const gl::ShaderMap<std::string> &shaderSources,
                                           gl::ShaderMap<std::vector<uint32_t>> *shaderCodesOut);
 };
--- a/src/libANGLE/renderer/vulkan/ProgramVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ProgramVk.cpp
@ -304,10 +304,16 @@ angle::Result ProgramVk::ShaderInfo::initShaders(ContextVk *contextVk,
 {
    ASSERT(!valid());

+    bool useSubgroupOpsWithSeamfulCubeMapEmulation = false;
+    bool emulateSeamfulCubeMapSampling =
+        contextVk->emulateSeamfulCubeMapSampling(&useSubgroupOpsWithSeamfulCubeMapEmulation);
+    bool useSubgroupOps =
+        emulateSeamfulCubeMapSampling && useSubgroupOpsWithSeamfulCubeMapEmulation;
+
    gl::ShaderMap<std::vector<uint32_t>> shaderCodes;
-    ANGLE_TRY(GlslangWrapper::GetShaderCode(
-        contextVk, contextVk->getCaps(), enableLineRasterEmulation,
-        contextVk->emulateSeamfulCubeMapSampling(), shaderSources, &shaderCodes));
+    ANGLE_TRY(GlslangWrapper::GetShaderCode(contextVk, contextVk->getCaps(),
+                                            enableLineRasterEmulation, useSubgroupOps,
+                                            shaderSources, &shaderCodes));

    for (const gl::ShaderType shaderType : gl::AllShaderTypes())
    {
@ -1437,7 +1443,8 @@ angle::Result ProgramVk::updateTexturesDescriptorSet(ContextVk *contextVk)

    const gl::ActiveTextureArray<vk::TextureUnit> &activeTextures = contextVk->getActiveTextures();

-    bool emulateSeamfulCubeMapSampling = contextVk->emulateSeamfulCubeMapSampling();
+    bool useSubgroupOps                = false;
+    bool emulateSeamfulCubeMapSampling = contextVk->emulateSeamfulCubeMapSampling(&useSubgroupOps);

    for (uint32_t textureIndex = 0; textureIndex < mState.getSamplerBindings().size();
         ++textureIndex)
--- a/src/libANGLE/renderer/vulkan/RendererVk.cpp
+++ b/src/libANGLE/renderer/vulkan/RendererVk.cpp
@ -1279,6 +1279,7 @@ void RendererVk::initFeatures(const ExtensionNameList &deviceExtensionNames)

    if (IsWindows() && IsAMD(mPhysicalDeviceProperties.vendorID))
    {
+        // Disabled on AMD/windows due to buggy behavior.
        mFeatures.disallowSeamfulCubeMapEmulation.enabled = true;
    }

--- a/src/libANGLE/renderer/vulkan/ShaderVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ShaderVk.cpp
@ -40,9 +40,17 @@ std::shared_ptr<WaitableCompileEvent> ShaderVk::compile(const gl::Context *conte
        compileOptions |= SH_CLAMP_POINT_SIZE;
    }

-    if (contextVk->emulateSeamfulCubeMapSampling())
+    bool useSubgroupOps = false;
+    if (contextVk->emulateSeamfulCubeMapSampling(&useSubgroupOps))
    {
-        compileOptions |= SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING;
+        if (useSubgroupOps)
+        {
+            compileOptions |= SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING_WITH_SUBGROUP_OP;
+        }
+        else
+        {
+            compileOptions |= SH_EMULATE_SEAMFUL_CUBE_MAP_SAMPLING;
+        }
    }

    return compileImpl(context, compilerInstance, mData.getSource(), compileOptions | options);
--- a/src/tests/deqp_support/deqp_gles2_test_expectations.txt
+++ b/src/tests/deqp_support/deqp_gles2_test_expectations.txt
@ -329,25 +329,6 @@
 3306 VULKAN ANDROID : dEQP-GLES2.functional.polygon_offset.fixed16_factor_1_slope = FAIL
 3307 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.projected.nearest_linear = FAIL

-// Seamful cubemap sampling failures on Android (due to missing support subgroupQuad* operations).
-3243 VULKAN ANDROID : dEQP-GLES2.functional.shaders.texture_functions.vertex.texturecubelod = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_nearest = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_linear = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.projected.linear_nearest = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.projected.linear_linear = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.bias.linear_nearest = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.mipmap.cube.bias.linear_linear = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_nearest_clamp = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_nearest_mirror = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_linear_clamp = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_linear_mirror = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_clamp = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_repeat = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_mirror = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_clamp = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_repeat = FAIL
-3243 VULKAN ANDROID : dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_mirror = FAIL
-
 // These tests also fail on AMD windows driver as it is not allowed to use emulation due to errors.
 3243 VULKAN WIN AMD : dEQP-GLES2.functional.shaders.texture_functions.vertex.texturecubelod = FAIL
 3243 VULKAN WIN AMD : dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_nearest = FAIL