Bug 1692731 - Accelerate linear filter upscaling in SWGL. r=jrmuizel

Often images are upscaled from a smaller resolution on a page, especially
when there is any amount of zoom being used, and especially at higher screen
resolutions. In this case, we don't really take advantage of the fact that all
the samples for a SIMD chunk can be loaded from memory in a single load, so
long as we're willing to shuffle them around. We also can take advantage of the
fact that most images are axis-aligned so that they have a constant filter
offset with the next row.

Also, we can easily fall off the fast path for blendTextureNearest if for some
reason there is a significant subpixel offset. In this case, we can still do
something way faster than a normal linear filter that optimizes for the fact
that both the X and Y steps are constant 1:1, but we need to interpolate with
neighboring samples.

Differential Revision: https://phabricator.services.mozilla.com/D105131
This commit is contained in:
Lee Salzman 2021-02-16 21:17:45 +00:00
Родитель 008596f701
Коммит 20f75fe5f0
5 изменённых файлов: 274 добавлений и 36 удалений

Просмотреть файл

@ -3231,7 +3231,7 @@ static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
// the pixel data. 0 alpha is treated as transparent black.
static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
return v * combine(a.xxxx, a.yyyy, a.zzzz, a.wwww);
return v * a.xxxxyyyyzzzzwwww;
}
// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
@ -4105,6 +4105,7 @@ static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z,
const ClipRect& clipRect) {
// Only triangles and convex quads supported.
assert(nump == 3 || nump == 4);
Point2D l0, r0, l1, r1;
int l0i, r0i, l1i, r1i;
{

Просмотреть файл

@ -125,6 +125,7 @@ struct vec4;
struct ivec2;
SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; }
SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; }
SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; }
@ -1189,6 +1190,8 @@ struct bvec4_scalar {
return z;
case W:
return w;
default:
UNREACHABLE;
}
}
bool sel(XYZW c1) { return select(c1); }
@ -1220,6 +1223,8 @@ struct bvec4 {
return z;
case W:
return w;
default:
UNREACHABLE;
}
}
Bool sel(XYZW c1) { return select(c1); }

Просмотреть файл

@ -302,6 +302,8 @@ static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
return swgl_isTextureR8(s);
}
// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
// for linear sampling.
#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv, \
uv_z, zoffset) \
uv = swgl_linearQuantize(sampler, uv); \
@ -313,22 +315,228 @@ static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); \
int zoffset = swgl_textureLayerOffset(sampler, uv_z);
// Implements the fallback linear filter that can deal with clamping and
// arbitrary scales.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureLinear(S sampler, vec2 uv, int span,
const vec4_scalar& uv_rect, C color, P* buf,
float z = 0) {
if (!matchTextureFormat(sampler, buf)) {
return 0;
}
LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv, z, zoffset);
P* end = buf + span;
for (; buf < end; buf += swgl_StepSize, uv += uv_step) {
static void blendTextureLinearFallback(S sampler, vec2 uv, int span,
vec2_scalar uv_step, vec2_scalar min_uv,
vec2_scalar max_uv, C color, P* buf,
int zoffset) {
for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
commit_blend_span<BLEND>(
buf,
applyColor(textureLinearUnpacked(
buf, sampler, ivec2(clamp(uv, min_uv, max_uv)), zoffset),
color));
}
}
// Reinterprets a chunk of unpacked 16-bit samples as a lane type where each
// lane covers one whole pixel, so that SHUFFLE can move entire pixels at once.
// The 16-lane variant (4 pixels x 4 components) becomes 64-bit lanes; the
// 4-lane variant (4 single-component pixels) stays as 16-bit lanes.
static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
return bit_cast<U64>(r);
}
static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
return bit_cast<U16>(r);
}
// Multiplies each pixel by its per-pixel X filter fraction. For the 16-lane
// (4-component) variant the fraction must be broadcast across the 4 components
// of each pixel; for the 4-lane variant the fractions already map 1:1.
static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
return r * fracx.xxxxyyyyzzzzwwww;
}
static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
return r * fracx;
}
// Implements a faster linear filter that works with axis-aligned constant Y but
// scales less than 1, i.e. upscaling. In this case we can optimize for the
// constant Y fraction as well as load all chunks from memory in a single tap
// for each row.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
vec2_scalar uv_step, vec2_scalar min_uv,
vec2_scalar max_uv, C color, P* buf,
int zoffset) {
typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
// UVs are in quantized 2^7 fixed-point: the low 7 bits are the filter
// fraction and the remaining bits are the integer texel coordinate.
ivec2 i(clamp(uv, min_uv, max_uv));
ivec2 frac = i;
i >>= 7;
P* row0 =
(P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x), zoffset);
P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
I16 fracx = computeFracX(sampler, i, frac);
// Row Y is constant across the span (axis-aligned), so the Y fraction is
// computed once and reused for every chunk.
int16_t fracy = computeFracY(frac).x;
// Load both rows' samples for the first chunk in a single tap each, then
// interpolate vertically up front so the loop only has to blend in X.
auto src0 =
CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
auto src1 =
CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
// We attempt to sample ahead by one chunk and interpolate it with the current
// one. However, due to the complication of upscaling, we may not necessarily
// shift in all the next set of samples.
for (P* end = buf + span; buf < end; buf += 4) {
uv.x += uv_step.x;
I32 ixn = cast(uv.x);
// NOTE(review): the X coordinate inside this span is assumed to already be
// clamped by the dispatcher, hence the unclamped fraction — confirm against
// blendTextureLinear's inside-span calculation.
I16 fracn = computeFracNoClamp(ixn);
ixn >>= 7;
auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
signed_unpacked_type);
auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
signed_unpacked_type);
auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
// Since we're upscaling, we know that a source pixel has a larger footprint
// than the destination pixel, and thus all the source pixels needed for
// this chunk will fall within a single chunk of texture data. However,
// since the source pixels don't map 1:1 with destination pixels, we need to
// shift the source pixels over based on their offset from the start of the
// chunk. This could conceivably be optimized better with usage of PSHUFB or
// VTBL instructions. However, since PSHUFB requires SSSE3, instead we
// resort to masking in the correct pixels to avoid having to index into
// memory. For the last sample to interpolate with, we need to potentially
// shift in a sample from the next chunk over in the case the samples fill
// out an entire chunk.
auto shuf = src;
auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
// Duplicate lanes wherever adjacent destination pixels fall on the same
// source texel, effectively replicating source pixels to match the
// upscaled destination positions.
if (i.x.y == i.x.x) {
shuf = shuf.xxyz;
shufn = shufn.xxyz;
}
if (i.x.z == i.x.y) {
shuf = shuf.xyyz;
shufn = shufn.xyyz;
}
if (i.x.w == i.x.z) {
shuf = shuf.xyzz;
shufn = shufn.xyzz;
}
// Convert back to a signed unpacked type so that we can interpolate the
// final result.
auto interp = bit_cast<signed_unpacked_type>(shuf);
auto interpn = bit_cast<signed_unpacked_type>(shufn);
interp += applyFracX(interpn - interp, fracx) >> 7;
commit_blend_span<BLEND>(
buf, applyColor(bit_cast<unpacked_type>(interp), color));
// Carry the look-ahead chunk over to the next iteration so it doesn't need
// to be reloaded.
i.x = ixn;
fracx = fracn;
src = srcn;
}
}
// This is the fastest variant of the linear filter that still provides
// filtering. In cases where there is no scaling required, but we have a
// subpixel offset that forces us to blend in neighboring pixels, we can
// optimize away most of the memory loads and shuffling that is required by the
// fallback filter.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearFast(S sampler, vec2 uv, int span,
vec2_scalar min_uv, vec2_scalar max_uv,
C color, P* buf, int zoffset) {
typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
// UVs are in quantized 2^7 fixed-point: the low 7 bits are the filter
// fraction and the remaining bits are the integer texel coordinate.
ivec2 i(clamp(uv, min_uv, max_uv));
ivec2 frac = i;
i >>= 7;
P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i), zoffset);
P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
// With a constant 1:1 step, both the X and Y fractions are invariant for the
// entire span, so a single scalar fraction suffices for each axis.
int16_t fracx = computeFracX(sampler, i, frac).x;
int16_t fracy = computeFracY(frac).x;
// Interpolate vertically between the two rows up front so the loop only has
// to blend in X.
auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
// Since there is no scaling, we sample ahead by one chunk and interpolate it
// with the current one. We can then reuse this value on the next iteration.
for (P* end = buf + span; buf < end; buf += 4) {
row0 += 4;
row1 += 4;
auto src0n =
CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
auto src1n =
CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
// For the last sample to interpolate with, we need to potentially shift in
// a sample from the next chunk over since the samples fill out an entire
// chunk.
auto interp = bit_cast<signed_unpacked_type>(src);
auto interpn =
bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
interp += ((interpn - interp) * fracx) >> 7;
commit_blend_span<BLEND>(
buf, applyColor(bit_cast<unpacked_type>(interp), color));
// Reuse the look-ahead chunk as the current chunk next iteration.
src = srcn;
}
}
// Selects which linear-filter implementation to dispatch to. NEAREST is
// deliberately 0 so the enum converts to false in a boolean context, letting
// callers branch directly on the result of filter selection.
enum LinearFilter {
// No linear filter is needed.
LINEAR_FILTER_NEAREST = 0,
// The most general linear filter that handles clamping and varying scales.
LINEAR_FILTER_FALLBACK,
// A linear filter optimized for axis-aligned upscaling.
LINEAR_FILTER_UPSCALE,
// A linear filter with no scaling but with subpixel offset.
LINEAR_FILTER_FAST
};
// Dispatches to an appropriate linear filter depending on the selected filter.
// The span is split into up to three segments: a clamped prefix handled by the
// fallback, an unclamped interior handled by the selected fast path, and a
// clamped remainder handled by the fallback again. Returns the number of
// pixels drawn (the full span, unless the texture format doesn't match).
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureLinear(S sampler, vec2 uv, int span,
const vec4_scalar& uv_rect, C color, P* buf,
LinearFilter filter, float z = 0) {
if (!matchTextureFormat(sampler, buf)) {
return 0;
}
LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv, z, zoffset);
P* end = buf + span;
if (filter != LINEAR_FILTER_FALLBACK) {
// If we're not using the fallback, then Y is constant across the entire
// row. We just need to ensure that we handle any samples that might pull
// data from before the start of the row and require clamping.
float beforeDist = max(0.0f, min_uv.x) - uv.x.x;
if (beforeDist > 0) {
// Round up to whole steps so the fast path starts on a chunk boundary,
// and clamp so we never run past the end of the span.
int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0,
int(end - buf));
blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step, min_uv,
max_uv, color, buf, zoffset);
buf += before;
uv.x += (before / swgl_StepSize) * uv_step.x;
}
// We need to check how many samples we can take from inside the row without
// requiring clamping. In case the filter oversamples the row by a step, we
// subtract off a step from the width to leave some room.
float insideDist =
min(max_uv.x, float((int(sampler->width) - swgl_StepSize) << 7)) -
uv.x.x;
if (insideDist >= uv_step.x) {
int inside =
clamp(int(insideDist / uv_step.x) * swgl_StepSize, 0, int(end - buf));
if (filter == LINEAR_FILTER_FAST) {
blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
color, buf, zoffset);
} else {
blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv,
max_uv, color, buf, zoffset);
}
buf += inside;
uv.x += (inside / swgl_StepSize) * uv_step.x;
}
}
// If the fallback filter was requested, or if there are any samples left that
// may be outside the row and require clamping, then handle that here.
if (buf < end) {
blendTextureLinearFallback<BLEND>(sampler, uv, int(end - buf), uv_step,
min_uv, max_uv, color, buf, zoffset);
}
return span;
}
@ -409,12 +617,12 @@ static int blendTextureNearest(S sampler, vec2 uv, int span,
}
// Helper function to decide whether we can safely apply 1:1 nearest filtering
// without diverging too much from the linear filter
// without diverging too much from the linear filter.
template <typename S, typename T>
static inline bool allowTextureNearest(S sampler, T P, int span) {
static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
// First verify if the row Y doesn't change across samples
if (P.y.x != P.y.y) {
return false;
return LINEAR_FILTER_FALLBACK;
}
P = samplerScale(sampler, P);
// We need to verify that the pixel step reasonably approximates stepping
@ -422,12 +630,26 @@ static inline bool allowTextureNearest(S sampler, T P, int span) {
// that the margin of error is no more than approximately 2^-7.
span &= ~(128 - 1);
span += 128;
return round((P.x.y - P.x.x) * span) == span &&
// Also verify that we're reasonably close to the center of a texel
// so that it doesn't look that much different than if a linear filter
// was used.
(int(P.x.x * 4.0f + 0.5f) & 3) == 2 &&
(int(P.y.x * 4.0f + 0.5f) & 3) == 2;
float dx = P.x.y - P.x.x;
if (round(dx * span) != span) {
// If the source region is smaller than the destination, then we can use the
// upscaling filter since row Y is constant.
return dx >= 0 && dx <= 1 ? LINEAR_FILTER_UPSCALE : LINEAR_FILTER_FALLBACK;
}
// Also verify that we're reasonably close to the center of a texel
// so that it doesn't look that much different than if a linear filter
// was used.
if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 ||
(int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
// The source and destination regions are the same, but there is a
// significant subpixel offset. We can use a faster linear filter to deal
// with the offset in this case.
return LINEAR_FILTER_FAST;
}
// Otherwise, we have a constant 1:1 step and we're stepping reasonably close
// to the center of each pixel, so it's safe to disable the linear filter and
// use nearest.
return LINEAR_FILTER_NEAREST;
}
// Commit a single chunk from a linear texture fetch
@ -435,24 +657,24 @@ static inline bool allowTextureNearest(S sampler, T P, int span) {
do { \
auto packed_color = packColor(swgl_Out##format, color); \
int drawn = 0; \
if (allowTextureNearest(s, p, swgl_SpanLength)) { \
if (LinearFilter filter = needsTextureLinear(s, p, swgl_SpanLength)) { \
if (blend_key) { \
drawn = blendTextureNearest<true>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
__VA_ARGS__); \
drawn = blendTextureLinear<true>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
filter, __VA_ARGS__); \
} else { \
drawn = blendTextureNearest<false>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
__VA_ARGS__); \
drawn = blendTextureLinear<false>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
filter, __VA_ARGS__); \
} \
} else if (blend_key) { \
drawn = blendTextureLinear<true>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
__VA_ARGS__); \
} else { \
drawn = blendTextureLinear<false>(s, p, swgl_SpanLength, uv_rect, \
drawn = blendTextureNearest<true>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
__VA_ARGS__); \
} else { \
drawn = blendTextureNearest<false>(s, p, swgl_SpanLength, uv_rect, \
packed_color, swgl_Out##format, \
__VA_ARGS__); \
} \
swgl_Out##format += drawn; \
swgl_SpanLength -= drawn; \

Просмотреть файл

@ -488,10 +488,10 @@ SI auto computeRow(S sampler, I i, int32_t zoffset, size_t margin = 1)
}
// Compute clamped offset of second row for linear interpolation from first row
template <typename S>
SI I32 computeNextRowOffset(S sampler, ivec2 i) {
return (i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
I32(sampler->stride);
template <typename S, typename I>
SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) {
return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1,
sampler->stride, 0);
}
// Convert X coordinate to a 2^7 scale fraction for interpolation
@ -502,7 +502,8 @@ SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {
}
// Convert Y coordinate to a 2^7 scale fraction for interpolation
SI I16 computeFracY(ivec2 frac) { return CONVERT(frac.y & 0x7F, I16); }
SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); }
SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); }
struct WidePlanarRGBA8 {
V8<uint16_t> rg;

Просмотреть файл

@ -348,6 +348,10 @@ struct VectorType {
VectorType<T, 8> XXYYZZWW() const {
return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3));
}
# define xxxxyyyyzzzzwwww XXXXYYYYZZZZWWWW()
VectorType<T, 16> XXXXYYYYZZZZWWWW() {
return XXXXYYYY().combine(ZZZZWWWW());
}
};
template <typename T>
@ -366,6 +370,11 @@ struct VectorType<T, 2> {
v.data = data;
return v;
}
VectorType operator&(VectorType x) const { return wrap(data & x.data); }
VectorType operator&(T x) const { return wrap(data & x); }
VectorType operator|(VectorType x) const { return wrap(data | x.data); }
VectorType operator|(T x) const { return wrap(data | x); }
};
# define CONVERT(vector, type) ((type)(vector))