Bug 1674524 - implement KHR_blend_equation_advanced in SWGL. r=bradwerth

This patch has a few moving parts. First, we have to tell WR that it is actually allowed to use the extension when it detects it. We have to make the glsl-to-cxx translator eat the blend_support_all_equations layout qualifier. We have to enable generation of advanced-blend-equation variants in the SWGL build setup. Then we report the actual extension inside SWGL. Finally, we actually add all the necessary blend equation enums, hash them down to a blend key, and implement all the blend modes therein. Differential Revision: https://phabricator.services.mozilla.com/D103804
2021-02-02 21:43:12 +00:00 · 2021-02-02 21:43:12 +00:00 · 7b87580cdf
--- a/gfx/webrender_bindings/src/bindings.rs
+++ b/gfx/webrender_bindings/src/bindings.rs
@ -1660,6 +1660,9 @@ pub extern "C" fn wr_window_new(
        // SWGL doesn't support the GL_ALWAYS depth comparison function used by
        // `clear_caches_with_quads`, but scissored clears work well.
        clear_caches_with_quads: !software && !allow_scissored_cache_clears,
+        // SWGL supports KHR_blend_equation_advanced safely, but we haven't yet
+        // tested other HW platforms to determine if it is safe to allow them.
+        allow_advanced_blend_equation: software,
        start_debug_server,
        surface_origin_is_top_left,
        compositor_config,
--- a/gfx/wr/glsl-to-cxx/src/hir.rs
+++ b/gfx/wr/glsl-to-cxx/src/hir.rs
@ -1892,11 +1892,33 @@ fn translate_declaration(
        syntax::Declaration::FunctionPrototype(p) => {
            Declaration::FunctionPrototype(translate_function_prototype(state, p))
        }
-        syntax::Declaration::Global(_ty, _ids) => {
-            panic!();
-            // glsl non-es supports requalifying variables
-            // we don't right now
-            //Declaration::Global(..)
+        syntax::Declaration::Global(ty, ids) => {
+            // glsl non-es supports requalifying variables, but we don't yet.
+            // However, we still want to allow global layout qualifiers for
+            // KHR_blend_equation_advanced.
+            if !ids.is_empty() {
+                panic!();
+            }
+            let _ = for qual in &ty.qualifiers {
+                match qual {
+                    syntax::TypeQualifierSpec::Layout(l) => {
+                        for id in &l.ids {
+                            match id {
+                                syntax::LayoutQualifierSpec::Identifier(key, _) => {
+                                    match key.as_str() {
+                                        "blend_support_all_equations" => (),
+                                        _ => panic!(),
+                                    }
+                                }
+                                _ => panic!(),
+                            }
+                        }
+                    }
+                    syntax::TypeQualifierSpec::Storage(syntax::StorageQualifier::Out) => (),
+                    _ => panic!(),
+                }
+            };
+            Declaration::Global(lift_type_qualifier_for_declaration(state, &Some(ty.clone())).unwrap(), ids.clone())
        }
        syntax::Declaration::InitDeclaratorList(dl) => {
            translate_init_declarator_list(state, dl, default_run_class)
--- a/gfx/wr/glsl-to-cxx/src/lib.rs
+++ b/gfx/wr/glsl-to-cxx/src/lib.rs
@ -2313,19 +2313,22 @@ pub fn show_declaration(state: &mut OutputState, d: &hir::Declaration) {
            //state.write(";\n");
        }
        hir::Declaration::Global(ref qual, ref identifiers) => {
-            show_type_qualifier(state, &qual);
+            // We only want to output GLSL layout qualifiers if not C++
+            if !state.output_cxx {
+                show_type_qualifier(state, &qual);

-            if !identifiers.is_empty() {
-                let mut iter = identifiers.iter();
-                let first = iter.next().unwrap();
-                show_identifier(state, first);
+                if !identifiers.is_empty() {
+                    let mut iter = identifiers.iter();
+                    let first = iter.next().unwrap();
+                    show_identifier(state, first);

-                for identifier in iter {
-                    let _ = write!(state, ", {}", identifier);
+                    for identifier in iter {
+                        let _ = write!(state, ", {}", identifier);
+                    }
                }
-            }

-            state.write(";\n");
+                state.write(";\n");
+            }
        }
        hir::Declaration::StructDefinition(ref sym) => {
            show_sym_decl(state, sym);
--- a/gfx/wr/swgl/build.rs
+++ b/gfx/wr/swgl/build.rs
@ -108,6 +108,7 @@ fn main() {
    let shader_flags =
        ShaderFeatureFlags::GL |
        ShaderFeatureFlags::DUAL_SOURCE_BLENDING |
+        ShaderFeatureFlags::ADVANCED_BLEND_EQUATION |
        ShaderFeatureFlags::DEBUG;
    let mut shaders: Vec<String> = Vec::new();
    for (name, features) in get_shader_features(shader_flags) {
--- a/gfx/wr/swgl/src/composite.h
+++ b/gfx/wr/swgl/src/composite.h
@ -596,13 +596,13 @@ static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
-  auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
+  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
  buf += stridey;
  auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
-  auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
+  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
  abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
  return abcd0;
 }
--- a/gfx/wr/swgl/src/gl.cc
+++ b/gfx/wr/swgl/src/gl.cc
@ -762,10 +762,13 @@ struct Program {
 };

 // clang-format off
-// for GL defines to fully expand
+// Fully-expand GL defines while ignoring more than 4 suffixes
 #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w
-#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0)
-#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0)
+// Generate a blend key enum symbol
+#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0)
+// Generate a blend key symbol for a clip-mask variation
+#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0)
+// Utility macro to easily generate similar code for all implemented blend modes
 #define FOR_EACH_BLEND_KEY(macro)                                              \
  macro(GL_ONE, GL_ZERO, 0, 0)                                                 \
  macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA)  \
@ -778,7 +781,24 @@ struct Program {
  macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA)                        \
  macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE)                       \
  macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0)                       \
-  macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0)
+  macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0)                                 \
+  macro(GL_MIN, 0, 0, 0)                                                       \
+  macro(GL_MAX, 0, 0, 0)                                                       \
+  macro(GL_MULTIPLY_KHR, 0, 0, 0)                                              \
+  macro(GL_SCREEN_KHR, 0, 0, 0)                                                \
+  macro(GL_OVERLAY_KHR, 0, 0, 0)                                               \
+  macro(GL_DARKEN_KHR, 0, 0, 0)                                                \
+  macro(GL_LIGHTEN_KHR, 0, 0, 0)                                               \
+  macro(GL_COLORDODGE_KHR, 0, 0, 0)                                            \
+  macro(GL_COLORBURN_KHR, 0, 0, 0)                                             \
+  macro(GL_HARDLIGHT_KHR, 0, 0, 0)                                             \
+  macro(GL_SOFTLIGHT_KHR, 0, 0, 0)                                             \
+  macro(GL_DIFFERENCE_KHR, 0, 0, 0)                                            \
+  macro(GL_EXCLUSION_KHR, 0, 0, 0)                                             \
+  macro(GL_HSL_HUE_KHR, 0, 0, 0)                                               \
+  macro(GL_HSL_SATURATION_KHR, 0, 0, 0)                                        \
+  macro(GL_HSL_COLOR_KHR, 0, 0, 0)                                             \
+  macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0)

 #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__),
 #define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__),
@ -1299,6 +1319,8 @@ static const char* const extensions[] = {
    "GL_ARB_invalidate_subdata",
    "GL_ARB_texture_storage",
    "GL_EXT_timer_query",
+    "GL_KHR_blend_equation_advanced",
+    "GL_KHR_blend_equation_advanced_coherent",
    "GL_APPLE_rgb_422",
 };

@ -1437,6 +1459,37 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) {
  return a;
 }

+// Generate a hashed blend key based on blend func and equation state. This
+// allows all the blend state to be processed down to a blend key that can be
+// dealt with inside a single switch statement.
+static void hash_blend_key() {
+  GLenum srgb = ctx->blendfunc_srgb;
+  GLenum drgb = ctx->blendfunc_drgb;
+  GLenum sa = ctx->blendfunc_sa;
+  GLenum da = ctx->blendfunc_da;
+  GLenum equation = ctx->blend_equation;
+#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20))
+  // Basic non-separate blend funcs use the two-argument form
+  int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0);
+  // Separate alpha blend funcs use the 4 argument hash
+  if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da);
+  // Any other blend equation than the default func_add ignores the func and
+  // instead generates a one-argument hash based on the equation
+  if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0);
+  switch (hash) {
+#define MAP_BLEND_KEY(...)                   \
+  case HASH_BLEND_KEY(__VA_ARGS__):          \
+    ctx->blend_key = BLEND_KEY(__VA_ARGS__); \
+    break;
+    FOR_EACH_BLEND_KEY(MAP_BLEND_KEY)
+    default:
+      debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb,
+             sa, da, equation);
+      assert(false);
+      break;
+  }
+}
+
 void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) {
  ctx->blendfunc_srgb = srgb;
  ctx->blendfunc_drgb = drgb;
@ -1445,20 +1498,7 @@ void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) {
  ctx->blendfunc_sa = sa;
  ctx->blendfunc_da = da;

-#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20))
-  int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0);
-  if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da);
-  switch (hash) {
-#define MAP_BLEND_KEY(...)                   \
-  case HASH_BLEND_KEY(__VA_ARGS__):          \
-    ctx->blend_key = BLEND_KEY(__VA_ARGS__); \
-    break;
-    FOR_EACH_BLEND_KEY(MAP_BLEND_KEY)
-    default:
-      debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da);
-      assert(false);
-      break;
-  }
+  hash_blend_key();
 }

 void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
@ -1467,8 +1507,12 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
 }

 void BlendEquation(GLenum mode) {
-  assert(mode == GL_FUNC_ADD);
-  ctx->blend_equation = mode;
+  assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX ||
+         (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR));
+  if (mode != ctx->blend_equation) {
+    ctx->blend_equation = mode;
+    hash_blend_key();
+  }
 }

 void DepthMask(GLboolean flag) { ctx->depthmask = flag; }
@ -2990,8 +3034,9 @@ static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
 #endif
 }

-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v) {
-  ivec4 i = round_pixel(v);
+static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
+                                                 float maxval = 1.0f) {
+  ivec4 i = round_pixel(v, maxval);
  HalfRGBA8 xz = packRGBA8(i.z, i.x);
  HalfRGBA8 yw = packRGBA8(i.y, i.w);
  HalfRGBA8 xyzwl = zipLow(xz, yw);
@ -3011,6 +3056,12 @@ static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
  return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
 }

+static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
+                                                 float maxval = 1.0f) {
+  ivec4 i = round_pixel(bit_cast<vec4>(v), maxval);
+  return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
+}
+
 // Load a partial span > 0 and < 4 pixels.
 template <typename V, typename P>
 static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
@ -3079,10 +3130,128 @@ static ALWAYS_INLINE T addlow(T x, T y) {
  return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
 }

-static ALWAYS_INLINE WideRGBA8 alphas(WideRGBA8 c) {
+// Replace color components of each pixel with the pixel's alpha values.
+template <typename T>
+static ALWAYS_INLINE T alphas(T c) {
  return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
 }

+// Replace the alpha values of the first vector with alpha values from the
+// second, while leaving the color components unmodified.
+template <typename T>
+static ALWAYS_INLINE T set_alphas(T c, T a) {
+  return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
+}
+
+// Miscellaneous helper functions for working with packed RGBA8 data.
+static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
+                                            HalfRGBA8 e) {
+  return bit_cast<HalfRGBA8>((c & t) | (~c & e));
+}
+
+template <typename T, typename C, int N>
+static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
+                                                   VectorType<T, N> t,
+                                                   VectorType<T, N> e) {
+  return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
+                 if_then_else(highHalf(c), highHalf(t), highHalf(e)));
+}
+
+static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
+#if USE_SSE2
+  return bit_cast<HalfRGBA8>(
+      _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
+#elif USE_NEON
+  return vminq_u16(x, y);
+#else  // Portable fallback: per-lane select of the smaller operand.
+  return if_then_else(x < y, x, y);
+#endif
+}
+
+template <typename T, int N>
+static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
+                                          VectorType<T, N> y) {
+  return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
+}
+
+static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
+#if USE_SSE2
+  return bit_cast<HalfRGBA8>(
+      _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
+#elif USE_NEON
+  return vmaxq_u16(x, y);
+#else  // Portable fallback: per-lane select of the larger operand.
+  return if_then_else(x > y, x, y);
+#endif
+}
+
+template <typename T, int N>
+static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
+                                          VectorType<T, N> y) {
+  return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
+}
+
+template <typename T, int N>
+static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
+  return combine(recip(lowHalf(v)), recip(highHalf(v)));
+}
+
+// Helper to get the reciprocal if the value is non-zero, or otherwise default
+// to the supplied fallback value.
+template <typename V>
+static ALWAYS_INLINE V recip_or(V v, float f) {
+  return if_then_else(v != V(0.0f), recip(v), V(f));
+}
+
+template <typename T, int N>
+static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
+  return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
+}
+
+// Extract the alpha components so that we can cheaply calculate the reciprocal
+// on a single SIMD register. Then multiply the duplicated alpha reciprocal with
+// the pixel data. 0 alpha is treated as transparent black.
+static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
+  Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
+  return v * combine(a.xxxx, a.yyyy, a.zzzz, a.wwww);
+}
+
+// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
+// RGBA to unpack.
+static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
+  return bit_cast<vec4>(
+      SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
+}
+
+// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
+// specification but are rearranged to work on premultiplied data.
+static ALWAYS_INLINE Float lumv3(vec3 v) {
+  return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
+}
+
+static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }
+
+static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }
+
+static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
+  Float mincol = max(-minv3(v), lum);
+  Float maxcol = max(maxv3(v), alpha - lum);
+  return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
+}
+
+static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
+  return clip_color(base - lumv3(base), lumv3(ref), alpha);
+}
+
+static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
+  vec3 diff = base - minv3(base);
+  Float sbase = maxv3(diff);
+  Float ssat = maxv3(sref) - minv3(sref);
+  // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
+  // to black, as per specification.
+  return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
+}
+
 // A pointer into the color buffer for the start of the span.
 static void* swgl_SpanBuf = nullptr;
 // A pointer into the clip mask for the start of the span.
@ -3173,6 +3342,182 @@ static ALWAYS_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
      return muldiv255(src, mask) + dst -
             muldiv255(dst, muldiv255(secondary, mask));
    }
+    case BLEND_CASE(GL_MIN):
+      return min(src, dst);
+    case BLEND_CASE(GL_MAX):
+      return max(src, dst);
+
+      // clang-format off
+    // The KHR_blend_equation_advanced spec describes the blend equations such
+    // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
+    // the result:
+    //      Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
+    //      Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
+    // However, working with unpremultiplied values requires expensive math to
+    // unpremultiply and premultiply again during blending. We can use the fact
+    // that premultiplied value P = C*A and simplify the equations such that no
+    // unpremultiplied colors are necessary, allowing us to stay with integer
+    // math that avoids floating-point conversions in the common case. Some of
+    // the blend modes require division or sqrt, in which case we do convert
+    // to (possibly transposed/unpacked) floating-point to implement the mode.
+    // However, most common modes can still use cheaper premultiplied integer
+    // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
+    // to:
+    //     Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
+    //     .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
+    //     Ar = As*Ad + As - As*Ad + Ad - Ad*As
+    //     .. Ar = As + Ad - As*Ad
+    // Note that the alpha equation is the same for all blend equations, such
+    // that so long as the implementation results in As + Ad - As*Ad, we can
+    // avoid using separate instructions to compute the alpha result, which is
+    // dependent on the math used to implement each blend mode. The exact
+    // reductions used to get the final math for every blend mode are too
+    // involved to show here in comments, but mostly follow from replacing
+    // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
+    // as possible.
+      // clang-format on
+
+    case BLEND_CASE(GL_MULTIPLY_KHR): {
+      WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
+                                 alphas(dst) - (dst & RGB_MASK));
+      return src + dst + (diff & RGB_MASK) - alphas(diff);
+    }
+    case BLEND_CASE(GL_SCREEN_KHR):
+      return src + dst - muldiv255(src, dst);
+    case BLEND_CASE(GL_OVERLAY_KHR): {
+      WideRGBA8 srcA = alphas(src);
+      WideRGBA8 dstA = alphas(dst);
+      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
+      return src + dst +
+             if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
+                          -diff);
+    }
+    case BLEND_CASE(GL_DARKEN_KHR):
+      return src + dst -
+             max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
+    case BLEND_CASE(GL_LIGHTEN_KHR):
+      return src + dst -
+             min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
+
+    case BLEND_CASE(GL_COLORDODGE_KHR): {
+      // Color-dodge and color-burn require division, so we convert to FP math
+      // here, but avoid transposing to a vec4.
+      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
+      WideRGBA32F srcA = alphas(srcF);
+      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
+      WideRGBA32F dstA = alphas(dstF);
+      return pack_pixels_RGBA8(
+          srcA * set_alphas(
+                     min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
+                     dstF) +
+              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
+          255.0f * 255.0f);
+    }
+    case BLEND_CASE(GL_COLORBURN_KHR): {
+      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
+      WideRGBA32F srcA = alphas(srcF);
+      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
+      WideRGBA32F dstA = alphas(dstF);
+      return pack_pixels_RGBA8(
+          srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
+                                                  recip_or(srcF, 255.0f))),
+                            dstF) +
+              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
+          255.0f * 255.0f);
+    }
+    case BLEND_CASE(GL_HARDLIGHT_KHR): {
+      WideRGBA8 srcA = alphas(src);
+      WideRGBA8 dstA = alphas(dst);
+      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
+      return src + dst +
+             if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
+                          -diff);
+    }
+    case BLEND_CASE(GL_SOFTLIGHT_KHR): {
+      // Soft-light requires an unpremultiply that can't be factored out as
+      // well as a sqrt, so we convert to FP math here, but avoid transposing
+      // to a vec4.
+      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
+      WideRGBA32F srcA = alphas(srcF);
+      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
+      WideRGBA32F dstA = alphas(dstF);
+      WideRGBA32F dstU = unpremultiply(dstF);
+      WideRGBA32F scale = srcF + srcF - srcA;
+      return pack_pixels_RGBA8(
+          dstF * (255.0f +
+                  set_alphas(
+                      scale *
+                          if_then_else(scale < 0.0f, 1.0f - dstU,
+                                       min((16.0f * dstU - 12.0f) * dstU + 3.0f,
+                                           inversesqrt(dstU) - 1.0f)),
+                      WideRGBA32F(0.0f))) +
+              srcF * (255.0f - dstA),
+          255.0f * 255.0f);
+    }
+    case BLEND_CASE(GL_DIFFERENCE_KHR): {
+      WideRGBA8 diff =
+          min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
+      return src + dst - diff - (diff & RGB_MASK);
+    }
+    case BLEND_CASE(GL_EXCLUSION_KHR): {
+      WideRGBA8 diff = muldiv255(src, dst);
+      return src + dst - diff - (diff & RGB_MASK);
+    }
+    case BLEND_CASE(GL_HSL_HUE_KHR): {
+      // The HSL blend modes are non-separable and require complicated use of
+      // division. It is advantageous to convert to FP and transpose to vec4
+      // math to more easily manipulate the individual color components.
+      vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));
+      vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));
+      Float srcA = srcV.w * (1.0f / 255.0f);
+      Float dstA = dstV.w * (1.0f / 255.0f);
+      Float srcDstA = srcV.w * dstA;
+      vec3 srcC = vec3(srcV) * dstA;
+      vec3 dstC = vec3(dstV) * srcA;
+      return pack_pixels_RGBA8(vec4(set_lum_sat(srcC, dstC, dstC, srcDstA) +
+                                        vec3(srcV) - srcC + vec3(dstV) - dstC,
+                                    srcV.w + dstV.w - srcDstA),
+                               255.0f);
+    }
+    case BLEND_CASE(GL_HSL_SATURATION_KHR): {
+      vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));
+      vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));
+      Float srcA = srcV.w * (1.0f / 255.0f);
+      Float dstA = dstV.w * (1.0f / 255.0f);
+      Float srcDstA = srcV.w * dstA;
+      vec3 srcC = vec3(srcV) * dstA;
+      vec3 dstC = vec3(dstV) * srcA;
+      return pack_pixels_RGBA8(vec4(set_lum_sat(dstC, srcC, dstC, srcDstA) +
+                                        vec3(srcV) - srcC + vec3(dstV) - dstC,
+                                    srcV.w + dstV.w - srcDstA),
+                               255.0f);
+    }
+    case BLEND_CASE(GL_HSL_COLOR_KHR): {
+      vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));
+      vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));
+      Float srcA = srcV.w * (1.0f / 255.0f);
+      Float dstA = dstV.w * (1.0f / 255.0f);
+      Float srcDstA = srcV.w * dstA;
+      vec3 srcC = vec3(srcV) * dstA;
+      vec3 dstC = vec3(dstV) * srcA;
+      return pack_pixels_RGBA8(vec4(set_lum(srcC, dstC, srcDstA) + vec3(srcV) -
+                                        srcC + vec3(dstV) - dstC,
+                                    srcV.w + dstV.w - srcDstA),
+                               255.0f);
+    }
+    case BLEND_CASE(GL_HSL_LUMINOSITY_KHR): {
+      vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));
+      vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));
+      Float srcA = srcV.w * (1.0f / 255.0f);
+      Float dstA = dstV.w * (1.0f / 255.0f);
+      Float srcDstA = srcV.w * dstA;
+      vec3 srcC = vec3(srcV) * dstA;
+      vec3 dstC = vec3(dstV) * srcA;
+      return pack_pixels_RGBA8(vec4(set_lum(dstC, srcC, srcDstA) + vec3(srcV) -
+                                        srcC + vec3(dstV) - dstC,
+                                    srcV.w + dstV.w - srcDstA),
+                               255.0f);
+    }
    default:
      UNREACHABLE;
      // return src;
--- a/gfx/wr/swgl/src/gl_defs.h
+++ b/gfx/wr/swgl/src/gl_defs.h
@ -155,6 +155,8 @@ typedef intptr_t GLintptr;
 #define GL_ONE_MINUS_SRC1_ALPHA 0x88FB

 #define GL_FUNC_ADD 0x8006
+#define GL_MIN 0x8007
+#define GL_MAX 0x8008

 #define GL_NEVER 0x0200
 #define GL_LESS 0x0201
@ -192,3 +194,19 @@ typedef intptr_t GLintptr;
 #define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA
 #define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB
 #define GL_RGB_RAW_422_APPLE 0x8A51
+
+#define GL_MULTIPLY_KHR 0x9294
+#define GL_SCREEN_KHR 0x9295
+#define GL_OVERLAY_KHR 0x9296
+#define GL_DARKEN_KHR 0x9297
+#define GL_LIGHTEN_KHR 0x9298
+#define GL_COLORDODGE_KHR 0x9299
+#define GL_COLORBURN_KHR 0x929A
+#define GL_HARDLIGHT_KHR 0x929B
+#define GL_SOFTLIGHT_KHR 0x929C
+#define GL_DIFFERENCE_KHR 0x929E
+#define GL_EXCLUSION_KHR 0x92A0
+#define GL_HSL_HUE_KHR 0x92AD
+#define GL_HSL_SATURATION_KHR 0x92AE
+#define GL_HSL_COLOR_KHR 0x92AF
+#define GL_HSL_LUMINOSITY_KHR 0x92B0
--- a/gfx/wr/swgl/src/glsl.h
+++ b/gfx/wr/swgl/src/glsl.h
@ -215,6 +215,24 @@ SI Float sqrt(Float v) {
 #endif
 }

+SI float recip(float x) { return 1.0f / x; }
+
+// Use a fast vector reciprocal approximation when available. This should only
+// be used in cases where it is okay that the approximation is imprecise -
+// essentially visually correct but numerically wrong. Otherwise just rely on
+// however the compiler would implement slower division if the platform doesn't
+// provide a convenient intrinsic.
+SI Float recip(Float v) {
+#if USE_SSE2
+  return _mm_rcp_ps(v);
+#elif USE_NEON
+  Float e = vrecpeq_f32(v);
+  return vrecpsq_f32(v, e) * e;
+#else
+  return 1.0f / v;
+#endif
+}
+
 SI float inversesqrt(float x) { return 1.0f / sqrtf(x); }

 SI Float inversesqrt(Float v) {
@ -648,8 +666,8 @@ SI I32 roundfast(Float v, Float scale) {
 }

 template <typename T>
-SI auto round_pixel(T v) {
-  return roundfast(v, 255.0f);
+SI auto round_pixel(T v, float maxval = 1.0f) {
+  return roundfast(v, (255.0f / maxval));
 }

 #define round __glsl_round
@ -1335,6 +1353,7 @@ struct vec3 {
  IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {}
  constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {}
  vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {}
+  explicit vec3(vec4);
  IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {}
  constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3)
      : x(Float{s0.x, s1.x, s2.x, s3.x}),
@ -1828,6 +1847,8 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) {
  return vec4(x, y, z, w);
 }

+ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {}
+
 SI ivec4 roundfast(vec4 v, Float scale) {
  return ivec4(roundfast(v.x, scale), roundfast(v.y, scale),
               roundfast(v.z, scale), roundfast(v.w, scale));
--- a/gfx/wr/swgl/src/swgl_fns.rs
+++ b/gfx/wr/swgl/src/swgl_fns.rs
@ -2287,7 +2287,7 @@ impl Gl for Context {

    // GL_KHR_blend_equation_advanced
    fn blend_barrier_khr(&self) {
-        panic!();
+        // No barrier required, so nothing to do
    }

    // GL_CHROMIUM_copy_texture
--- a/gfx/wr/swgl/src/texture.h
+++ b/gfx/wr/swgl/src/texture.h
@ -589,13 +589,13 @@ static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i,
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);
-  auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
+  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);

  auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);
-  auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
+  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);

  abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;

@ -709,15 +709,13 @@ static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i,
  auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);
-  auto abcd0 =
-      CONVERT(combine(combine(a0, b0), combine(c0, d0)) >> 1, V8<int16_t>);
+  auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);
-  auto abcd1 =
-      CONVERT(combine(combine(a1, b1), combine(c1, d1)) >> 1, V8<int16_t>);
+  auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
  // they are multiplied together, the new scaled sample will fit in the high
@ -767,6 +765,9 @@ vec4 textureLinearR16(S sampler, vec2 P, int32_t zoffset = 0) {
  return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);
 }

+using PackedRGBA32F = V16<float>;
+using WideRGBA32F = V16<float>;
+
 template <typename S>
 vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) {
  assert(sampler->format == TextureFormat::RGBA32F);
--- a/gfx/wr/swgl/src/vector_type.h
+++ b/gfx/wr/swgl/src/vector_type.h
@ -315,6 +315,10 @@ struct VectorType {
    return VectorType<T, N * 2>::wrap(data, high.data);
  }

+#  define xxxx swizzle(0, 0, 0, 0)
+#  define yyyy swizzle(1, 1, 1, 1)
+#  define zzzz swizzle(2, 2, 2, 2)
+#  define wwww swizzle(3, 3, 3, 3)
 #  define xyxy swizzle(0, 1, 0, 1)
 #  define zwzw swizzle(2, 3, 2, 3)
 #  define zwxy swizzle(2, 3, 0, 1)
@ -388,6 +392,12 @@ SI VectorType<T, N * 2> expand(VectorType<T, N> a) {
 }
 #endif

+template <typename T, int N>
+SI VectorType<T, N * 4> combine(VectorType<T, N> a, VectorType<T, N> b,
+                                VectorType<T, N> c, VectorType<T, N> d) {
+  return combine(combine(a, b), combine(c, d));
+}
+
 template <typename T>
 SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
  return SHUFFLE(a, b, 0, 4, 1, 5);
--- a/layout/reftests/css-blending/reftest.list
+++ b/layout/reftests/css-blending/reftest.list
@ -20,14 +20,14 @@ fuzzy-if(azureSkiaGL,0-2,0-7174) == background-blending-image-gradient.html back
 == background-blending-color-burn.html background-blending-color-burn-ref.svg
 == background-blending-color-dodge.html background-blending-color-dodge-ref.svg
 # need to investigate why these tests are fuzzy - first suspect is a possible color space conversion on some platforms; same for mix-blend-mode tests
-fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(d2d,0-1,0-8000) fuzzy-if(swgl,1-1,9600-9600) == background-blending-color.html background-blending-color-ref.svg
+fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(d2d,0-1,0-8000) fuzzy-if(swgl,1-1,8000-9600) == background-blending-color.html background-blending-color-ref.svg
 == background-blending-darken.html background-blending-darken-ref.svg
 == background-blending-difference.html background-blending-difference-ref.svg
 fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||skiaContent,0-1,0-1600) == background-blending-exclusion.html background-blending-exclusion-ref.svg
-fuzzy-if(cocoaWidget||d2d,0-1,0-1600) == background-blending-hard-light.html background-blending-hard-light-ref.svg
+fuzzy-if(cocoaWidget||d2d||swgl,0-1,0-1600) == background-blending-hard-light.html background-blending-hard-light-ref.svg
 fuzzy-if(d2d,0-1,0-9600) fuzzy-if(azureSkia||gtkWidget,0-1,0-11200) fuzzy-if(webrender&&!geckoview,1-1,9600-11240) == background-blending-hue.html background-blending-hue-ref.svg
 == background-blending-lighten.html background-blending-lighten-ref.svg
-fuzzy-if(d2d,0-1,0-8000) fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(swgl,1-1,9600-9600) == background-blending-luminosity.html background-blending-luminosity-ref.svg
+fuzzy-if(d2d,0-1,0-8000) fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(swgl,1-1,8000-9600) == background-blending-luminosity.html background-blending-luminosity-ref.svg
 fuzzy-if(skiaContent,0-1,0-1600) == background-blending-multiply.html background-blending-multiply-ref.svg
 == background-blending-normal.html background-blending-normal-ref.svg
 fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||azureSkia||gtkWidget,0-1,0-1600) == background-blending-overlay.html background-blending-overlay-ref.svg
@ -41,7 +41,7 @@ fuzzy-if(azureSkia||d2d||gtkWidget,0-1,0-40000) == background-blending-image-col
 fuzzy(0-65,0-53) fuzzy-if(geckoview&&webrender&&device,63-64,163-328) == mix-blend-mode-952051.html mix-blend-mode-952051-ref.html

 fuzzy-if(d3d11,0-49,0-200) == mix-blend-mode-and-filter.html mix-blend-mode-and-filter-ref.html
-fuzzy-if(d3d11,0-1,0-6) == mix-blend-mode-and-filter.svg mix-blend-mode-and-filter-ref.svg
+fuzzy-if(d3d11,0-1,0-6) fuzzy-if(swgl,171-171,2980-2980) == mix-blend-mode-and-filter.svg mix-blend-mode-and-filter-ref.svg

 fuzzy(0-2,0-14400) fuzzy-if(geckoview&&webrender&&device,3-3,700-700) == mix-blend-mode-child-of-blended-has-opacity.html mix-blend-mode-child-of-blended-has-opacity-ref.html