From 4f480c9072123b94c0eed17f2055388ec498c176 Mon Sep 17 00:00:00 2001
From: Lee Salzman <lsalzman@mozilla.com>
Date: Mon, 14 Sep 2020 02:54:53 +0000
Subject: [PATCH] Bug 1664479 - optimize cs_blur shader for SWGL. r=gw

Differential Revision: https://phabricator.services.mozilla.com/D89929
---
 gfx/wr/swgl/src/glsl.h            | 16 +++++--
 gfx/wr/webrender/res/cs_blur.glsl | 77 +++++++++++++++++++------------
 2 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/gfx/wr/swgl/src/glsl.h b/gfx/wr/swgl/src/glsl.h
index 0864baa56c79..d0fd7e39d0dc 100644
--- a/gfx/wr/swgl/src/glsl.h
+++ b/gfx/wr/swgl/src/glsl.h
@@ -2,8 +2,6 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-// Some of this is copied from Skia and is governed by a BSD-style license
-// Every function in this file should be marked static and inline using SI.
 #define SI ALWAYS_INLINE static
 
 #include "vector_type.h"
@@ -612,6 +610,7 @@ Float approx_log2(Float x) {
   return e - 124.225514990f - 1.498030302f * m -
          1.725879990f / (0.3520887068f + m);
 }
+
 Float approx_pow2(Float x) {
   Float f = fract(x);
   return bit_cast<Float>(
@@ -619,14 +618,21 @@ Float approx_pow2(Float x) {
                                       27.728023300f / (4.84252568f - f)));
 }
 
-// From skia
+#define pow __glsl_pow  // NOTE(review): presumably avoids libm's pow - confirm
+
+SI float pow(float x, float y) { return powf(x, y); }  // scalar overload
+
 Float pow(Float x, Float y) {
   return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y));
 }
 
+#define exp __glsl_exp  // NOTE(review): presumably avoids libm's exp - confirm
+
+SI float exp(float x) { return expf(x); }  // scalar overload
+
 Float exp(Float y) {
-  float x = 2.718281828459045235360287471352;
-  return approx_pow2(log2f(x) * y);
+  float l2e = 1.4426950408889634074f;  // log2(e), replaces runtime log2f(e)
+  return approx_pow2(l2e * y);  // e^y == 2^(log2(e) * y)
 }
 
 struct ivec4;
diff --git a/gfx/wr/webrender/res/cs_blur.glsl b/gfx/wr/webrender/res/cs_blur.glsl
index a10fe00d7909..bfd5c906a8e3 100644
--- a/gfx/wr/webrender/res/cs_blur.glsl
+++ b/gfx/wr/webrender/res/cs_blur.glsl
@@ -4,12 +4,13 @@
 
 #include shared,prim_shared
 
-varying vec3 vUv;
+varying vec2 vUv;
+flat varying float vUvLayer;  // texture layer index of the source task
 flat varying vec4 vUvRect;
 flat varying vec2 vOffsetScale;
-flat varying float vSigma;
 // The number of pixels on each end that we apply the blur filter over.
 flat varying int vSupport;
+flat varying vec2 vGaussCoefficients;  // x: center weight / total, y: exp(-0.5 / sigma^2)
 
 #ifdef WR_VERTEX_SHADER
 // Applies a separable gaussian blur in one direction, as specified
@@ -40,6 +41,30 @@ BlurTask fetch_blur_task(int address) {
     return task;
 }
 
+void calculate_gauss_coefficients(float sigma) {
+    // Incremental Gaussian Coefficient Calculation (See GPU Gems 3 pp. 877 - 889)
+    vGaussCoefficients = vec2(1.0 / (sqrt(2.0 * 3.14159265) * sigma),
+                              exp(-0.5 / (sigma * sigma)));
+
+    // Pre-calculate the coefficient total in the vertex shader so that
+    // we can avoid having to do it per-fragment and also avoid division
+    // by zero in the degenerate case.
+    vec3 gauss_coefficient = vec3(vGaussCoefficients,
+                                  vGaussCoefficients.y * vGaussCoefficients.y);
+    float gauss_coefficient_total = gauss_coefficient.x;
+    for (int i = 1; i <= vSupport; i += 2) {  // two taps per iteration
+        gauss_coefficient.xy *= gauss_coefficient.yz;
+        float gauss_coefficient_subtotal = gauss_coefficient.x;
+        gauss_coefficient.xy *= gauss_coefficient.yz;
+        gauss_coefficient_subtotal += gauss_coefficient.x;
+        gauss_coefficient_total += 2.0 * gauss_coefficient_subtotal;
+    }
+
+    // Scale initial coefficient by total to avoid passing the total separately
+    // to the fragment shader.
+    vGaussCoefficients.x /= gauss_coefficient_total;
+}
+
 void main(void) {
     BlurTask blur_task = fetch_blur_task(aBlurRenderTaskAddress);
     RenderTaskCommonData src_task = fetch_render_task_common_data(aBlurSourceTaskAddress);
@@ -52,8 +77,7 @@ void main(void) {
 #else
     vec2 texture_size = vec2(textureSize(sPrevPassAlpha, 0).xy);
 #endif
-    vUv.z = src_task.texture_layer_index;
-    vSigma = blur_task.blur_radius;
+    vUvLayer = src_task.texture_layer_index;
 
     // Ensure that the support is an even number of pixels to simplify the
     // fragment shader logic.
@@ -62,6 +86,13 @@ void main(void) {
     // hardware for linear filtering.
     vSupport = int(ceil(1.5 * blur_task.blur_radius)) * 2;
 
+    if (vSupport > 0) {
+        calculate_gauss_coefficients(blur_task.blur_radius);
+    } else {
+        // The gauss function gets NaNs when blur radius is zero.
+        vGaussCoefficients = vec2(1.0, 1.0);
+    }
+
     switch (aBlurDirection) {
         case DIR_HORIZONTAL:
             vOffsetScale = vec2(1.0 / texture_size.x, 0.0);
@@ -81,7 +112,7 @@ void main(void) {
 
     vec2 uv0 = src_rect.p0 / texture_size;
     vec2 uv1 = (src_rect.p0 + src_rect.size) / texture_size;
-    vUv.xy = mix(uv0, uv1, aPosition.xy);
+    vUv = mix(uv0, uv1, aPosition.xy);
 
     gl_Position = uTransform * vec4(pos, 0.0, 1.0);
 }
@@ -91,10 +122,10 @@ void main(void) {
 
 #if defined WR_FEATURE_COLOR_TARGET
 #define SAMPLE_TYPE vec4
-#define SAMPLE_TEXTURE(uv)  texture(sPrevPassColor, uv)
+#define SAMPLE_TEXTURE(uv)  texture(sPrevPassColor, vec3(uv, vUvLayer))
 #else
 #define SAMPLE_TYPE float
-#define SAMPLE_TEXTURE(uv)  texture(sPrevPassAlpha, uv).r
+#define SAMPLE_TEXTURE(uv)  texture(sPrevPassAlpha, vec3(uv, vUvLayer)).r
 #endif
 
 // TODO(gw): Write a fast path blur that handles smaller blur radii
@@ -104,23 +135,11 @@ void main(void) {
 void main(void) {
     SAMPLE_TYPE original_color = SAMPLE_TEXTURE(vUv);
 
-    // TODO(gw): The gauss function gets NaNs when blur radius
-    //           is zero. In the future, detect this earlier
-    //           and skip the blur passes completely.
-    if (vSupport == 0) {
-        oFragColor = vec4(original_color);
-        return;
-    }
-
     // Incremental Gaussian Coefficent Calculation (See GPU Gems 3 pp. 877 - 889)
-    vec3 gauss_coefficient;
-    gauss_coefficient.x = 1.0 / (sqrt(2.0 * 3.14159265) * vSigma);
-    gauss_coefficient.y = exp(-0.5 / (vSigma * vSigma));
-    gauss_coefficient.z = gauss_coefficient.y * gauss_coefficient.y;
+    vec3 gauss_coefficient = vec3(vGaussCoefficients,
+                                  vGaussCoefficients.y * vGaussCoefficients.y);
 
-    float gauss_coefficient_total = gauss_coefficient.x;
     SAMPLE_TYPE avg_color = original_color * gauss_coefficient.x;
-    gauss_coefficient.xy *= gauss_coefficient.yz;
 
     // Evaluate two adjacent texels at a time. We can do this because, if c0
     // and c1 are colors of adjacent texels and k0 and k1 are arbitrary
@@ -142,6 +161,8 @@ void main(void) {
     // Equation 1 with a single texture lookup.
 
     for (int i = 1; i <= vSupport; i += 2) {
+        gauss_coefficient.xy *= gauss_coefficient.yz;
+
         float gauss_coefficient_subtotal = gauss_coefficient.x;
         gauss_coefficient.xy *= gauss_coefficient.yz;
         gauss_coefficient_subtotal += gauss_coefficient.x;
@@ -149,16 +170,12 @@ void main(void) {
 
         vec2 offset = vOffsetScale * (float(i) + gauss_ratio);
 
-        vec2 st0 = clamp(vUv.xy - offset, vUvRect.xy, vUvRect.zw);
-        avg_color += SAMPLE_TEXTURE(vec3(st0, vUv.z)) * gauss_coefficient_subtotal;
-
-        vec2 st1 = clamp(vUv.xy + offset, vUvRect.xy, vUvRect.zw);
-        avg_color += SAMPLE_TEXTURE(vec3(st1, vUv.z)) * gauss_coefficient_subtotal;
-
-        gauss_coefficient_total += 2.0 * gauss_coefficient_subtotal;
-        gauss_coefficient.xy *= gauss_coefficient.yz;
+        vec2 st0 = max(vUv - offset, vUvRect.xy);
+        vec2 st1 = min(vUv + offset, vUvRect.zw);
+        avg_color += (SAMPLE_TEXTURE(st0) + SAMPLE_TEXTURE(st1)) *
+                     gauss_coefficient_subtotal;
     }
 
-    oFragColor = vec4(avg_color) / gauss_coefficient_total;
+    oFragColor = vec4(avg_color);
 }
 #endif