From 73e9582ee44cbf4e5ebad2888c9fe2df7c8f1a68 Mon Sep 17 00:00:00 2001 From: Lee Salzman Date: Tue, 2 Feb 2021 22:18:21 +0000 Subject: [PATCH] Bug 1674524 - implement KHR_blend_equation_advanced in SWGL. r=bradwerth This patch has a few moving parts. We have to first tell WR that when it detects the extension that it is actually allowed to use it. We have to make the glsl-to-cxx translator eat the blend_support_all_equations layout qualifier. We have to enable generation of advanced-blend-equation variants in the SWGL build setup. Then we report the actual extension inside SWGL. Finally, we actually add all the necessary blend equation enums, hash them down to a blend key, and implement all the blend modes therein. Differential Revision: https://phabricator.services.mozilla.com/D103804 --- gfx/webrender_bindings/src/bindings.rs | 3 + gfx/wr/glsl-to-cxx/src/hir.rs | 32 +- gfx/wr/glsl-to-cxx/src/lib.rs | 21 +- gfx/wr/swgl/build.rs | 1 + gfx/wr/swgl/src/composite.h | 4 +- gfx/wr/swgl/src/gl.cc | 391 ++++++++++++++++++++-- gfx/wr/swgl/src/gl_defs.h | 18 + gfx/wr/swgl/src/glsl.h | 25 +- gfx/wr/swgl/src/swgl_fns.rs | 2 +- gfx/wr/swgl/src/texture.h | 13 +- gfx/wr/swgl/src/vector_type.h | 10 + layout/reftests/css-blending/reftest.list | 8 +- 12 files changed, 476 insertions(+), 52 deletions(-) diff --git a/gfx/webrender_bindings/src/bindings.rs b/gfx/webrender_bindings/src/bindings.rs index ef78776a40e1..f2679a58aeb3 100644 --- a/gfx/webrender_bindings/src/bindings.rs +++ b/gfx/webrender_bindings/src/bindings.rs @@ -1660,6 +1660,9 @@ pub extern "C" fn wr_window_new( // SWGL doesn't support the GL_ALWAYS depth comparison function used by // `clear_caches_with_quads`, but scissored clears work well. clear_caches_with_quads: !software && !allow_scissored_cache_clears, + // SWGL supports KHR_blend_equation_advanced safely, but we haven't yet + // tested other HW platforms to determine if it is safe to allow them. 
+ allow_advanced_blend_equation: software, start_debug_server, surface_origin_is_top_left, compositor_config, diff --git a/gfx/wr/glsl-to-cxx/src/hir.rs b/gfx/wr/glsl-to-cxx/src/hir.rs index 4610e0210ee8..6af8d4adb053 100644 --- a/gfx/wr/glsl-to-cxx/src/hir.rs +++ b/gfx/wr/glsl-to-cxx/src/hir.rs @@ -1892,11 +1892,33 @@ fn translate_declaration( syntax::Declaration::FunctionPrototype(p) => { Declaration::FunctionPrototype(translate_function_prototype(state, p)) } - syntax::Declaration::Global(_ty, _ids) => { - panic!(); - // glsl non-es supports requalifying variables - // we don't right now - //Declaration::Global(..) + syntax::Declaration::Global(ty, ids) => { + // glsl non-es supports requalifying variables, but we don't yet. + // However, we still want to allow global layout qualifiers for + // KHR_blend_equation_advanced. + if !ids.is_empty() { + panic!(); + } + let _ = for qual in &ty.qualifiers { + match qual { + syntax::TypeQualifierSpec::Layout(l) => { + for id in &l.ids { + match id { + syntax::LayoutQualifierSpec::Identifier(key, _) => { + match key.as_str() { + "blend_support_all_equations" => (), + _ => panic!(), + } + } + _ => panic!(), + } + } + } + syntax::TypeQualifierSpec::Storage(syntax::StorageQualifier::Out) => (), + _ => panic!(), + } + }; + Declaration::Global(lift_type_qualifier_for_declaration(state, &Some(ty.clone())).unwrap(), ids.clone()) } syntax::Declaration::InitDeclaratorList(dl) => { translate_init_declarator_list(state, dl, default_run_class) diff --git a/gfx/wr/glsl-to-cxx/src/lib.rs b/gfx/wr/glsl-to-cxx/src/lib.rs index 9b54fcc1ecf7..20c8c9286871 100644 --- a/gfx/wr/glsl-to-cxx/src/lib.rs +++ b/gfx/wr/glsl-to-cxx/src/lib.rs @@ -2313,19 +2313,22 @@ pub fn show_declaration(state: &mut OutputState, d: &hir::Declaration) { //state.write(";\n"); } hir::Declaration::Global(ref qual, ref identifiers) => { - show_type_qualifier(state, &qual); + // We only want to output GLSL layout qualifiers if not C++ + if !state.output_cxx { + 
show_type_qualifier(state, &qual); - if !identifiers.is_empty() { - let mut iter = identifiers.iter(); - let first = iter.next().unwrap(); - show_identifier(state, first); + if !identifiers.is_empty() { + let mut iter = identifiers.iter(); + let first = iter.next().unwrap(); + show_identifier(state, first); - for identifier in iter { - let _ = write!(state, ", {}", identifier); + for identifier in iter { + let _ = write!(state, ", {}", identifier); + } } - } - state.write(";\n"); + state.write(";\n"); + } } hir::Declaration::StructDefinition(ref sym) => { show_sym_decl(state, sym); diff --git a/gfx/wr/swgl/build.rs b/gfx/wr/swgl/build.rs index b20dacd59359..f41b5694d9dc 100644 --- a/gfx/wr/swgl/build.rs +++ b/gfx/wr/swgl/build.rs @@ -108,6 +108,7 @@ fn main() { let shader_flags = ShaderFeatureFlags::GL | ShaderFeatureFlags::DUAL_SOURCE_BLENDING | + ShaderFeatureFlags::ADVANCED_BLEND_EQUATION | ShaderFeatureFlags::DEBUG; let mut shaders: Vec = Vec::new(); for (name, features) in get_shader_features(shader_flags) { diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h index a5a4489e6dd8..2765a84fc7c0 100644 --- a/gfx/wr/swgl/src/composite.h +++ b/gfx/wr/swgl/src/composite.h @@ -596,13 +596,13 @@ static ALWAYS_INLINE V8 linearRowTapsR8(S sampler, I32 ix, auto b0 = unaligned_load>(&buf[ix.y]); auto c0 = unaligned_load>(&buf[ix.z]); auto d0 = unaligned_load>(&buf[ix.w]); - auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8); buf += stridey; auto a1 = unaligned_load>(&buf[ix.x]); auto b1 = unaligned_load>(&buf[ix.y]); auto c1 = unaligned_load>(&buf[ix.z]); auto d1 = unaligned_load>(&buf[ix.w]); - auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8); abcd0 += ((abcd1 - abcd0) * fracy) >> 7; return abcd0; } diff --git a/gfx/wr/swgl/src/gl.cc b/gfx/wr/swgl/src/gl.cc index 1165b15542cc..3a542d0f4cb0 100644 --- 
a/gfx/wr/swgl/src/gl.cc +++ b/gfx/wr/swgl/src/gl.cc @@ -762,10 +762,13 @@ struct Program { }; // clang-format off -// for GL defines to fully expand +// Fully-expand GL defines while ignoring more than 4 suffixes #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w -#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0) -#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0) +// Generate a blend key enum symbol +#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) +// Generate a blend key symbol for a clip-mask variation +#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) +// Utility macro to easily generate similar code for all implemented blend modes #define FOR_EACH_BLEND_KEY(macro) \ macro(GL_ONE, GL_ZERO, 0, 0) \ macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ @@ -778,7 +781,24 @@ struct Program { macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ + macro(GL_MIN, 0, 0, 0) \ + macro(GL_MAX, 0, 0, 0) \ + macro(GL_MULTIPLY_KHR, 0, 0, 0) \ + macro(GL_SCREEN_KHR, 0, 0, 0) \ + macro(GL_OVERLAY_KHR, 0, 0, 0) \ + macro(GL_DARKEN_KHR, 0, 0, 0) \ + macro(GL_LIGHTEN_KHR, 0, 0, 0) \ + macro(GL_COLORDODGE_KHR, 0, 0, 0) \ + macro(GL_COLORBURN_KHR, 0, 0, 0) \ + macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ + macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ + macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ + macro(GL_EXCLUSION_KHR, 0, 0, 0) \ + macro(GL_HSL_HUE_KHR, 0, 0, 0) \ + macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ + macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ + macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), #define DEFINE_MASK_BLEND_KEY(...) 
MASK_BLEND_KEY(__VA_ARGS__), @@ -1299,6 +1319,8 @@ static const char* const extensions[] = { "GL_ARB_invalidate_subdata", "GL_ARB_texture_storage", "GL_EXT_timer_query", + "GL_KHR_blend_equation_advanced", + "GL_KHR_blend_equation_advanced_coherent", "GL_APPLE_rgb_422", }; @@ -1437,6 +1459,37 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) { return a; } +// Generate a hashed blend key based on blend func and equation state. This +// allows all the blend state to be processed down to a blend key that can be +// dealt with inside a single switch statement. +static void hash_blend_key() { + GLenum srgb = ctx->blendfunc_srgb; + GLenum drgb = ctx->blendfunc_drgb; + GLenum sa = ctx->blendfunc_sa; + GLenum da = ctx->blendfunc_da; + GLenum equation = ctx->blend_equation; +#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) + // Basic non-separate blend funcs used the two argument form + int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); + // Separate alpha blend funcs use the 4 argument hash + if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); + // Any other blend equation than the default func_add ignores the func and + // instead generates a one-argument hash based on the equation + if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); + switch (hash) { +#define MAP_BLEND_KEY(...) 
\ + case HASH_BLEND_KEY(__VA_ARGS__): \ + ctx->blend_key = BLEND_KEY(__VA_ARGS__); \ + break; + FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) + default: + debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, + sa, da, equation); + assert(false); + break; + } +} + void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { ctx->blendfunc_srgb = srgb; ctx->blendfunc_drgb = drgb; @@ -1445,20 +1498,7 @@ void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { ctx->blendfunc_sa = sa; ctx->blendfunc_da = da; -#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) - int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); - if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); - switch (hash) { -#define MAP_BLEND_KEY(...) \ - case HASH_BLEND_KEY(__VA_ARGS__): \ - ctx->blend_key = BLEND_KEY(__VA_ARGS__); \ - break; - FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) - default: - debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da); - assert(false); - break; - } + hash_blend_key(); } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { @@ -1467,8 +1507,12 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void BlendEquation(GLenum mode) { - assert(mode == GL_FUNC_ADD); - ctx->blend_equation = mode; + assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || + (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); + if (mode != ctx->blend_equation) { + ctx->blend_equation = mode; + hash_blend_key(); + } } void DepthMask(GLboolean flag) { ctx->depthmask = flag; } @@ -2990,8 +3034,9 @@ static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) { #endif } -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v) { - ivec4 i = round_pixel(v); +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v, + float maxval = 1.0f) { + ivec4 i = round_pixel(v, maxval); HalfRGBA8 xz = packRGBA8(i.z, i.x); HalfRGBA8 yw = packRGBA8(i.y, i.w); HalfRGBA8 xyzwl = zipLow(xz, yw); @@ -3011,6 +3056,12 @@ 
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() { return pack_pixels_RGBA8(fragment_shader->gl_FragColor); } +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v, + float maxval = 1.0f) { + ivec4 i = round_pixel(bit_cast(v), maxval); + return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w)); +} + // Load a partial span > 0 and < 4 pixels. template static ALWAYS_INLINE V partial_load_span(const P* src, int span) { @@ -3079,10 +3130,128 @@ static ALWAYS_INLINE T addlow(T x, T y) { return bit_cast(bit_cast(x) + bit_cast(y)); } -static ALWAYS_INLINE WideRGBA8 alphas(WideRGBA8 c) { +// Replace color components of each pixel with the pixel's alpha values. +template +static ALWAYS_INLINE T alphas(T c) { return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); } +// Replace the alpha values of the first vector with alpha values from the +// second, while leaving the color components unmodified. +template +static ALWAYS_INLINE T set_alphas(T c, T a) { + return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31); +} + +// Miscellaneous helper functions for working with packed RGBA8 data. 
+static ALWAYS_INLINE HalfRGBA8 if_then_else(V8 c, HalfRGBA8 t, + HalfRGBA8 e) { + return bit_cast((c & t) | (~c & e)); +} + +template +static ALWAYS_INLINE VectorType if_then_else(VectorType c, + VectorType t, + VectorType e) { + return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)), + if_then_else(highHalf(c), highHalf(t), highHalf(e))); +} + +static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) { +#if USE_SSE2 + return bit_cast( + _mm_min_epi16(bit_cast>(x), bit_cast>(y))); +#elif USE_NEON + return vminq_u16(x, y); +#else + return if_then_else(x < y, x, y); +#endif +} + +template +static ALWAYS_INLINE VectorType min(VectorType x, + VectorType y) { + return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y))); +} + +static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) { +#if USE_SSE2 + return bit_cast( + _mm_max_epi16(bit_cast>(x), bit_cast>(y))); +#elif USE_NEON + return vmaxq_u16(x, y); +#else + return if_then_else(x > y, x, y); +#endif +} + +template +static ALWAYS_INLINE VectorType max(VectorType x, + VectorType y) { + return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y))); +} + +template +static ALWAYS_INLINE VectorType recip(VectorType v) { + return combine(recip(lowHalf(v)), recip(highHalf(v))); +} + +// Helper to get the reciprocal if the value is non-zero, or otherwise default +// to the supplied fallback value. +template +static ALWAYS_INLINE V recip_or(V v, float f) { + return if_then_else(v != V(0.0f), recip(v), V(f)); +} + +template +static ALWAYS_INLINE VectorType inversesqrt(VectorType v) { + return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v))); +} + +// Extract the alpha components so that we can cheaply calculate the reciprocal +// on a single SIMD register. Then multiply the duplicated alpha reciprocal with +// the pixel data. 0 alpha is treated as transparent black. 
+static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) { + Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f); + return v * combine(a.xxxx, a.yyyy, a.zzzz, a.wwww); +} + +// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to +// RGBA to unpack. +static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) { + return bit_cast( + SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15)); +} + +// The following lum/sat functions mostly follow the KHR_blend_equation_advanced +// specification but are rearranged to work on premultiplied data. +static ALWAYS_INLINE Float lumv3(vec3 v) { + return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f; +} + +static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); } + +static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); } + +static inline vec3 clip_color(vec3 v, Float lum, Float alpha) { + Float mincol = max(-minv3(v), lum); + Float maxcol = max(maxv3(v), alpha - lum); + return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f)); +} + +static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) { + return clip_color(base - lumv3(base), lumv3(ref), alpha); +} + +static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) { + vec3 diff = base - minv3(base); + Float sbase = maxv3(diff); + Float ssat = maxv3(sref) - minv3(sref); + // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale + // to black, as per specification. + return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha); +} + // A pointer into the color buffer for the start of the span. static void* swgl_SpanBuf = nullptr; // A pointer into the clip mask for the start of the span. 
@@ -3173,6 +3342,182 @@ static ALWAYS_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst, return muldiv255(src, mask) + dst - muldiv255(dst, muldiv255(secondary, mask)); } + case BLEND_CASE(GL_MIN): + return min(src, dst); + case BLEND_CASE(GL_MAX): + return max(src, dst); + + // clang-format off + // The KHR_blend_equation_advanced spec describes the blend equations such + // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to + // the result: + // Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As) + // Ar = As*Ad + As*(1-Ad) + Ad*(1-As) + // However, working with unpremultiplied values requires expensive math to + // unpremultiply and premultiply again during blending. We can use the fact + // that premultiplied value P = C*A and simplify the equations such that no + // unpremultiplied colors are necessary, allowing us to stay with integer + // math that avoids floating-point conversions in the common case. Some of + // the blend modes require division or sqrt, in which case we do convert + // to (possibly transposed/unpacked) floating-point to implement the mode. + // However, most common modes can still use cheaper premultiplied integer + // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified + // to: + // Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As) + // .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As + // Ar = As*Ad + As - As*Ad + Ad - Ad*As + // .. Ar = As + Ad - As*Ad + // Note that the alpha equation is the same for all blend equations, such + // that so long as the implementation results in As + Ad - As*Ad, we can + // avoid using separate instructions to compute the alpha result, which is + // dependent on the math used to implement each blend mode. The exact + // reductions used to get the final math for every blend mode are too + // involved to show here in comments, but mostly follow from replacing + // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms + // as possible. 
+ // clang-format on + + case BLEND_CASE(GL_MULTIPLY_KHR): { + WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK), + alphas(dst) - (dst & RGB_MASK)); + return src + dst + (diff & RGB_MASK) - alphas(diff); + } + case BLEND_CASE(GL_SCREEN_KHR): + return src + dst - muldiv255(src, dst); + case BLEND_CASE(GL_OVERLAY_KHR): { + WideRGBA8 srcA = alphas(src); + WideRGBA8 dstA = alphas(dst); + WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); + return src + dst + + if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff), + -diff); + } + case BLEND_CASE(GL_DARKEN_KHR): + return src + dst - + max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); + case BLEND_CASE(GL_LIGHTEN_KHR): + return src + dst - + min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); + + case BLEND_CASE(GL_COLORDODGE_KHR): { + // Color-dodge and color-burn require division, so we convert to FP math + // here, but avoid transposing to a vec4. + WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + return pack_pixels_RGBA8( + srcA * set_alphas( + min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)), + dstF) + + srcF * (255.0f - dstA) + dstF * (255.0f - srcA), + 255.0f * 255.0f); + } + case BLEND_CASE(GL_COLORBURN_KHR): { + WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + return pack_pixels_RGBA8( + srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA * + recip_or(srcF, 255.0f))), + dstF) + + srcF * (255.0f - dstA) + dstF * (255.0f - srcA), + 255.0f * 255.0f); + } + case BLEND_CASE(GL_HARDLIGHT_KHR): { + WideRGBA8 srcA = alphas(src); + WideRGBA8 dstA = alphas(dst); + WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); + return src + dst + + if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - 
alphas(diff), + -diff); + } + case BLEND_CASE(GL_SOFTLIGHT_KHR): { + // Soft-light requires an unpremultiply that can't be factored out as + // well as a sqrt, so we convert to FP math here, but avoid transposing + // to a vec4. + WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + WideRGBA32F dstU = unpremultiply(dstF); + WideRGBA32F scale = srcF + srcF - srcA; + return pack_pixels_RGBA8( + dstF * (255.0f + + set_alphas( + scale * + if_then_else(scale < 0.0f, 1.0f - dstU, + min((16.0f * dstU - 12.0f) * dstU + 3.0f, + inversesqrt(dstU) - 1.0f)), + WideRGBA32F(0.0f))) + + srcF * (255.0f - dstA), + 255.0f * 255.0f); + } + case BLEND_CASE(GL_DIFFERENCE_KHR): { + WideRGBA8 diff = + min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst))); + return src + dst - diff - (diff & RGB_MASK); + } + case BLEND_CASE(GL_EXCLUSION_KHR): { + WideRGBA8 diff = muldiv255(src, dst); + return src + dst - diff - (diff & RGB_MASK); + } + case BLEND_CASE(GL_HSL_HUE_KHR): { + // The HSL blend modes are non-separable and require complicated use of + // division. It is advantageous to convert to FP and transpose to vec4 + // math to more easily manipulate the individual color components. 
+ vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); + vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); + Float srcA = srcV.w * (1.0f / 255.0f); + Float dstA = dstV.w * (1.0f / 255.0f); + Float srcDstA = srcV.w * dstA; + vec3 srcC = vec3(srcV) * dstA; + vec3 dstC = vec3(dstV) * srcA; + return pack_pixels_RGBA8(vec4(set_lum_sat(srcC, dstC, dstC, srcDstA) + + vec3(srcV) - srcC + vec3(dstV) - dstC, + srcV.w + dstV.w - srcDstA), + 255.0f); + } + case BLEND_CASE(GL_HSL_SATURATION_KHR): { + vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); + vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); + Float srcA = srcV.w * (1.0f / 255.0f); + Float dstA = dstV.w * (1.0f / 255.0f); + Float srcDstA = srcV.w * dstA; + vec3 srcC = vec3(srcV) * dstA; + vec3 dstC = vec3(dstV) * srcA; + return pack_pixels_RGBA8(vec4(set_lum_sat(dstC, srcC, dstC, srcDstA) + + vec3(srcV) - srcC + vec3(dstV) - dstC, + srcV.w + dstV.w - srcDstA), + 255.0f); + } + case BLEND_CASE(GL_HSL_COLOR_KHR): { + vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); + vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); + Float srcA = srcV.w * (1.0f / 255.0f); + Float dstA = dstV.w * (1.0f / 255.0f); + Float srcDstA = srcV.w * dstA; + vec3 srcC = vec3(srcV) * dstA; + vec3 dstC = vec3(dstV) * srcA; + return pack_pixels_RGBA8(vec4(set_lum(srcC, dstC, srcDstA) + vec3(srcV) - + srcC + vec3(dstV) - dstC, + srcV.w + dstV.w - srcDstA), + 255.0f); + } + case BLEND_CASE(GL_HSL_LUMINOSITY_KHR): { + vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); + vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); + Float srcA = srcV.w * (1.0f / 255.0f); + Float dstA = dstV.w * (1.0f / 255.0f); + Float srcDstA = srcV.w * dstA; + vec3 srcC = vec3(srcV) * dstA; + vec3 dstC = vec3(dstV) * srcA; + return pack_pixels_RGBA8(vec4(set_lum(dstC, srcC, srcDstA) + vec3(srcV) - + srcC + vec3(dstV) - dstC, + srcV.w + dstV.w - srcDstA), + 255.0f); + } default: UNREACHABLE; // return src; diff --git a/gfx/wr/swgl/src/gl_defs.h b/gfx/wr/swgl/src/gl_defs.h index 
f3df7c80aa28..a68adbcef637 100644 --- a/gfx/wr/swgl/src/gl_defs.h +++ b/gfx/wr/swgl/src/gl_defs.h @@ -155,6 +155,8 @@ typedef intptr_t GLintptr; #define GL_ONE_MINUS_SRC1_ALPHA 0x88FB #define GL_FUNC_ADD 0x8006 +#define GL_MIN 0x8007 +#define GL_MAX 0x8008 #define GL_NEVER 0x0200 #define GL_LESS 0x0201 @@ -192,3 +194,19 @@ typedef intptr_t GLintptr; #define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA #define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB #define GL_RGB_RAW_422_APPLE 0x8A51 + +#define GL_MULTIPLY_KHR 0x9294 +#define GL_SCREEN_KHR 0x9295 +#define GL_OVERLAY_KHR 0x9296 +#define GL_DARKEN_KHR 0x9297 +#define GL_LIGHTEN_KHR 0x9298 +#define GL_COLORDODGE_KHR 0x9299 +#define GL_COLORBURN_KHR 0x929A +#define GL_HARDLIGHT_KHR 0x929B +#define GL_SOFTLIGHT_KHR 0x929C +#define GL_DIFFERENCE_KHR 0x929E +#define GL_EXCLUSION_KHR 0x92A0 +#define GL_HSL_HUE_KHR 0x92AD +#define GL_HSL_SATURATION_KHR 0x92AE +#define GL_HSL_COLOR_KHR 0x92AF +#define GL_HSL_LUMINOSITY_KHR 0x92B0 diff --git a/gfx/wr/swgl/src/glsl.h b/gfx/wr/swgl/src/glsl.h index 875561c8b370..d07caec8f468 100644 --- a/gfx/wr/swgl/src/glsl.h +++ b/gfx/wr/swgl/src/glsl.h @@ -215,6 +215,24 @@ SI Float sqrt(Float v) { #endif } +SI float recip(float x) { return 1.0f / x; } + +// Use a fast vector reciprocal approximation when available. This should only +// be used in cases where it is okay that the approximation is imprecise - +// essentially visually correct but numerically wrong. Otherwise just rely on +// however the compiler would implement slower division if the platform doesn't +// provide a convenient intrinsic. 
+SI Float recip(Float v) { +#if USE_SSE2 + return _mm_rcp_ps(v); +#elif USE_NEON + Float e = vrecpeq_f32(v); + return vrecpsq_f32(v, e) * e; +#else + return 1.0f / v; +#endif +} + SI float inversesqrt(float x) { return 1.0f / sqrtf(x); } SI Float inversesqrt(Float v) { @@ -648,8 +666,8 @@ SI I32 roundfast(Float v, Float scale) { } template -SI auto round_pixel(T v) { - return roundfast(v, 255.0f); +SI auto round_pixel(T v, float maxval = 1.0f) { + return roundfast(v, (255.0f / maxval)); } #define round __glsl_round @@ -1335,6 +1353,7 @@ struct vec3 { IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {} constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {} vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {} + explicit vec3(vec4); IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), @@ -1828,6 +1847,8 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) { return vec4(x, y, z, w); } +ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {} + SI ivec4 roundfast(vec4 v, Float scale) { return ivec4(roundfast(v.x, scale), roundfast(v.y, scale), roundfast(v.z, scale), roundfast(v.w, scale)); diff --git a/gfx/wr/swgl/src/swgl_fns.rs b/gfx/wr/swgl/src/swgl_fns.rs index f56b6c4c6b28..a8f5fd4b87ea 100644 --- a/gfx/wr/swgl/src/swgl_fns.rs +++ b/gfx/wr/swgl/src/swgl_fns.rs @@ -2287,7 +2287,7 @@ impl Gl for Context { // GL_KHR_blend_equation_advanced fn blend_barrier_khr(&self) { - panic!(); + // No barrier required, so nothing to do } // GL_CHROMIUM_copy_texture diff --git a/gfx/wr/swgl/src/texture.h b/gfx/wr/swgl/src/texture.h index 9d4138b8c519..b198d4cf25f6 100644 --- a/gfx/wr/swgl/src/texture.h +++ b/gfx/wr/swgl/src/texture.h @@ -589,13 +589,13 @@ static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i, auto b0 = unaligned_load>(&buf[row0.y]); auto c0 = unaligned_load>(&buf[row0.z]); auto d0 = 
unaligned_load>(&buf[row0.w]); - auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8); auto a1 = unaligned_load>(&buf[row1.x]); auto b1 = unaligned_load>(&buf[row1.y]); auto c1 = unaligned_load>(&buf[row1.z]); auto d1 = unaligned_load>(&buf[row1.w]); - auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8); abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; @@ -709,15 +709,13 @@ static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i, auto b0 = unaligned_load>(&buf[row0.y]); auto c0 = unaligned_load>(&buf[row0.z]); auto d0 = unaligned_load>(&buf[row0.w]); - auto abcd0 = - CONVERT(combine(combine(a0, b0), combine(c0, d0)) >> 1, V8); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8); auto a1 = unaligned_load>(&buf[row1.x]); auto b1 = unaligned_load>(&buf[row1.y]); auto c1 = unaligned_load>(&buf[row1.z]); auto d1 = unaligned_load>(&buf[row1.w]); - auto abcd1 = - CONVERT(combine(combine(a1, b1), combine(c1, d1)) >> 1, V8); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8); // The samples occupy 15 bits and the fraction occupies 15 bits, so that when // they are multiplied together, the new scaled sample will fit in the high @@ -767,6 +765,9 @@ vec4 textureLinearR16(S sampler, vec2 P, int32_t zoffset = 0) { return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f); } +using PackedRGBA32F = V16; +using WideRGBA32F = V16; + template vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) { assert(sampler->format == TextureFormat::RGBA32F); diff --git a/gfx/wr/swgl/src/vector_type.h b/gfx/wr/swgl/src/vector_type.h index 1d4fc8db1a0a..2edfa47afdc3 100644 --- a/gfx/wr/swgl/src/vector_type.h +++ b/gfx/wr/swgl/src/vector_type.h @@ -315,6 +315,10 @@ struct VectorType { return VectorType::wrap(data, high.data); } +# define xxxx swizzle(0, 0, 0, 0) +# define yyyy swizzle(1, 1, 1, 1) +# define zzzz swizzle(2, 2, 2, 2) +# 
define wwww swizzle(3, 3, 3, 3) # define xyxy swizzle(0, 1, 0, 1) # define zwzw swizzle(2, 3, 2, 3) # define zwxy swizzle(2, 3, 0, 1) @@ -388,6 +392,12 @@ SI VectorType expand(VectorType a) { } #endif +template +SI VectorType combine(VectorType a, VectorType b, + VectorType c, VectorType d) { + return combine(combine(a, b), combine(c, d)); +} + template SI VectorType zipLow(VectorType a, VectorType b) { return SHUFFLE(a, b, 0, 4, 1, 5); diff --git a/layout/reftests/css-blending/reftest.list b/layout/reftests/css-blending/reftest.list index 7990a619cf1f..658e26ab6a91 100644 --- a/layout/reftests/css-blending/reftest.list +++ b/layout/reftests/css-blending/reftest.list @@ -20,14 +20,14 @@ fuzzy-if(azureSkiaGL,0-2,0-7174) == background-blending-image-gradient.html back == background-blending-color-burn.html background-blending-color-burn-ref.svg == background-blending-color-dodge.html background-blending-color-dodge-ref.svg # need to investigate why these tests are fuzzy - first suspect is a possible color space conversion on some platforms; same for mix-blend-mode tests -fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(d2d,0-1,0-8000) fuzzy-if(swgl,1-1,9600-9600) == background-blending-color.html background-blending-color-ref.svg +fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(d2d,0-1,0-8000) fuzzy-if(swgl,1-1,8000-9600) == background-blending-color.html background-blending-color-ref.svg == background-blending-darken.html background-blending-darken-ref.svg == background-blending-difference.html background-blending-difference-ref.svg fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||skiaContent,0-1,0-1600) == background-blending-exclusion.html background-blending-exclusion-ref.svg -fuzzy-if(cocoaWidget||d2d,0-1,0-1600) == background-blending-hard-light.html background-blending-hard-light-ref.svg +fuzzy-if(cocoaWidget||d2d||swgl,0-1,0-1600) == background-blending-hard-light.html background-blending-hard-light-ref.svg fuzzy-if(d2d,0-1,0-9600) 
fuzzy-if(azureSkia||gtkWidget,0-1,0-11200) fuzzy-if(webrender&&!geckoview,1-1,9600-11240) == background-blending-hue.html background-blending-hue-ref.svg == background-blending-lighten.html background-blending-lighten-ref.svg -fuzzy-if(d2d,0-1,0-8000) fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(swgl,1-1,9600-9600) == background-blending-luminosity.html background-blending-luminosity-ref.svg +fuzzy-if(d2d,0-1,0-8000) fuzzy-if(azureSkia||gtkWidget,0-2,0-9600) fuzzy-if(swgl,1-1,8000-9600) == background-blending-luminosity.html background-blending-luminosity-ref.svg fuzzy-if(skiaContent,0-1,0-1600) == background-blending-multiply.html background-blending-multiply-ref.svg == background-blending-normal.html background-blending-normal-ref.svg fuzzy-if(/^Windows\x20NT\x2010\.0/.test(http.oscpu)||azureSkia||gtkWidget,0-1,0-1600) == background-blending-overlay.html background-blending-overlay-ref.svg @@ -41,7 +41,7 @@ fuzzy-if(azureSkia||d2d||gtkWidget,0-1,0-40000) == background-blending-image-col fuzzy(0-65,0-53) fuzzy-if(geckoview&&webrender&&device,63-64,163-328) == mix-blend-mode-952051.html mix-blend-mode-952051-ref.html fuzzy-if(d3d11,0-49,0-200) == mix-blend-mode-and-filter.html mix-blend-mode-and-filter-ref.html -fuzzy-if(d3d11,0-1,0-6) == mix-blend-mode-and-filter.svg mix-blend-mode-and-filter-ref.svg +fuzzy-if(d3d11,0-1,0-6) fuzzy-if(swgl,171-171,2980-2980) == mix-blend-mode-and-filter.svg mix-blend-mode-and-filter-ref.svg fuzzy(0-2,0-14400) fuzzy-if(geckoview&&webrender&&device,3-3,700-700) == mix-blend-mode-child-of-blended-has-opacity.html mix-blend-mode-child-of-blended-has-opacity-ref.html