Bug 1686244 - Accelerate linear gradients in SWGL. r=jrmuizel

For linear gradients, we are currently bottlenecked by looking up a gradient table entry, doing interpolation, and converting to pixel formats for every sample. We can accelerate this by instead looking for contiguous segments of gradient within the range of entries we need to sample and then interpolating these as a single gradient. This also enables us to convert to relevant pixel formats only when setting up this gradient, which greatly reduces the per-pixel processing down to essentially a shift and add. To enable this sort of crawling of the gradient table, the output gradients have been modified such that each entry's step value will equal an adjacent entry's step value if and only if they are from the same gradient. We can ensure this by, in the very rare case two segments of gradient have the same step, using the equivalent of nextafter() to imperceptibly alter the value so that the invariant is maintained. Differential Revision: https://phabricator.services.mozilla.com/D105716
2021-02-22 04:14:38 +00:00 · 2021-02-22 04:14:38 +00:00 · 6d46022aa7
--- a/gfx/wr/glsl-to-cxx/src/hir.rs
+++ b/gfx/wr/glsl-to-cxx/src/hir.rs
@ -3771,6 +3771,13 @@ pub fn ast_to_hir(state: &mut State, tu: &syntax::TranslationUnit) -> Translatio
        vec![Type::new(Sampler2D), Type::new(IVec2), Type::new(Int)],
        RunClass::Scalar,
    );
    declare_function(
        state,
        "swgl_commitLinearGradientRGBA8",
        None,
        Type::new(Void),
        vec![Type::new(Sampler2D), Type::new(Int), Type::new(Float), Type::new(Bool), Type::new(Float)],
    );
    declare_function(
        state,
        "swgl_commitGradientRGBA8",
--- a/gfx/wr/swgl/src/gl.cc
+++ b/gfx/wr/swgl/src/gl.cc
@ -3054,15 +3054,13 @@ static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
 static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
-  HalfRGBA8 c = packRGBA8(i, i);
+  return repeat2(packRGBA8(i, i));
  return combine(c, c);
 }
 UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
                                                        float scale = 255.0f) {
  I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
-  HalfRGBA8 c = packRGBA8(i, i);
+  return repeat2(packRGBA8(i, i));
  return combine(c, c);
 }
 static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
@ -3433,7 +3431,7 @@ static ALWAYS_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
    // src*dst = dst + src*(k - dst) use addlow
    // for signed overflow
    return addlow(
-        dst, muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst));
+        dst, muldiv255(src, repeat2(ctx->blendcolor) - dst));
  // We must explicitly handle the masked/anti-aliased secondary blend case.
  // The secondary color as well as the source must be multiplied by the
--- a/gfx/wr/swgl/src/swgl_ext.h
+++ b/gfx/wr/swgl/src/swgl_ext.h
@ -937,6 +937,32 @@ static void blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
 #define swgl_commitTextureLinearColorYUV(...) \
  swgl_commitTextureLinearYUV(__VA_ARGS__)
 // One entry of a gradient table: an RGBA32F start color followed by the
 // RGBA32F step that, added to the start color, yields the entry's end color.
 struct GradientStops {
  Float startColor;
  // The step shares storage between a SIMD view (used for arithmetic) and a
  // scalar vec4 view (used for whole-value comparison).
  union {
    Float stepColor;
    vec4_scalar stepData;
  };
  // Reports whether this entry and the adjacent entry `next` belong to one
  // larger gradient. The table is expected to hold equal steps in adjacent
  // entries if and only if those entries may be merged.
  bool can_merge(const GradientStops& next) const {
    const bool sameStep = stepData == next.stepData;
    return sameStep;
  }
  // Color at `offset` past the start of this entry, i.e. the start color
  // advanced by that fraction of the step.
  Float interpolate(float offset) const {
    return stepColor * offset + startColor;
  }
  // Color at which interpolation within this entry finishes (a full step past
  // the start color).
  Float end_color() const {
    return startColor + stepColor;
  }
 };
 // Checks if a gradient table of the specified size exists at the UV coords of
 // the address within an RGBA32F texture. If so, a linear address within the
 // texture is returned that may be used to sample the gradient table later. If
@ -947,16 +973,13 @@ static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
                 address.y < int(sampler->height) && address.x >= 0 &&
                 address.x < int(sampler->width) && entries > 0 &&
-                 address.x + 2 * entries <= int(sampler->width)
+                 address.x +
                         int(sizeof(GradientStops) / sizeof(Float)) * entries <=
                     int(sampler->width)
             ? address.y * sampler->stride + address.x * 4
             : -1;
 }
 // Converts an RGBA gradient result to BGRA ordering by exchanging lanes 0 and
 // 2 (and likewise lanes 4 and 6) while leaving the other lanes in place.
 static ALWAYS_INLINE HalfRGBA8 swizzleGradient(HalfRGBA8 v) {
  HalfRGBA8 bgra = SHUFFLE(v, v, 2, 1, 0, 3, 6, 5, 4, 7);
  return bgra;
 }
 static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
                                       Float entry) {
  assert(sampler->format == TextureFormat::RGBA32F);
@ -967,18 +990,18 @@ static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
  // entry colors.
  Float offset = entry - cast(index);
  // Every entry is a pair of colors blended by the fractional offset.
-  index *= 2;
+  assert(test_all(index >= 0 &&
-  assert(test_all(index >= 0 && index < int(sampler->width) - 1));
+                  index * int(sizeof(GradientStops) / sizeof(Float)) <
-  Float* buf = (Float*)&sampler->buf[address];
+                      int(sampler->width)));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Blend between the colors for each SIMD lane, then pack them to RGBA8
  // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
  // the gradient table has RGBA colors, swizzling is required.
-  return combine(swizzleGradient(packRGBA8(
+  return combine(
-                     round_pixel(buf[index.x] + buf[index.x + 1] * offset.x),
+      packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
-                     round_pixel(buf[index.y] + buf[index.y + 1] * offset.y))),
+                round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
-                 swizzleGradient(packRGBA8(
+      packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
-                     round_pixel(buf[index.z] + buf[index.z + 1] * offset.z),
+                round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
                     round_pixel(buf[index.w] + buf[index.w + 1] * offset.w))));
 }
 // Samples a gradient entry from the gradient at the provided linearized
@ -993,6 +1016,175 @@ static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
  swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \
                                     packColor(swgl_OutRGBA, color)))
 // Samples an entire span of a linear gradient by crawling the gradient table
 // and looking for consecutive stops that can be merged into a single larger
 // gradient, then interpolating between those larger gradients within the span.
 //
 // Parameters, as used below:
 //   BLEND   - forwarded to commit_blend_span<BLEND> for every committed chunk.
 //   sampler - texture holding the gradient table; asserted to be RGBA32F.
 //   address - linear index into sampler->buf where the table starts.
 //   size    - scale from a 0..1 offset to table entries; offset t maps to
 //             entry 1 + t * size, and entry 1 + size is used above the table.
 //   repeat  - when true, offsets are wrapped with fract() before use.
 //   offset  - per-sample gradient offsets for the first chunk; the per-chunk
 //             advance is derived from the difference of its first two lanes.
 //   buf     - output pointer, advanced 4 pixels per committed chunk.
 //   span    - number of output pixels remaining; consumed 4 at a time.
 template <bool BLEND>
 static void commitLinearGradient(sampler2D sampler, int address, float size,
                                 bool repeat, Float offset, uint32_t* buf,
                                 int span) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Get the chunk delta from the difference in offset steps. This represents
  // how far within the gradient table we advance for every step in output,
  // normalized to gradient table size.
  float delta = (offset.y - offset.x) * 4.0f;
  for (; span > 0;) {
    // If repeat is desired, we need to limit the offset to a fractional value.
    if (repeat) {
      offset = fract(offset);
    }
    // Try to process as many chunks as are within the span if possible.
    float chunks = 0.25f * span;
    // To properly handle both clamping and repeating of the table offset, we
    // need to ensure we don't run past the 0 and 1 points. Here we compute the
    // intercept points depending on whether advancing forwards or backwards in
    // the gradient table to ensure the chunk count is limited by the amount
    // before intersection. If there is no delta, then we compute no intercept.
    float startEntry;
    int minIndex, maxIndex;
    if (offset.x < 0) {
      // If we're below the gradient table, use the first color stop. We can
      // only intercept the table if walking forward.
      startEntry = 0;
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta > 0) {
        chunks = min(chunks, -offset.x / delta);
      }
    } else if (offset.x >= 1) {
      // If we're above the gradient table, use the last color stop. We can
      // only intercept the table if walking backward.
      startEntry = 1.0f + size;
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta < 0) {
        chunks = min(chunks, (1 - offset.x) / delta);
      }
    } else {
      // Otherwise, we're inside the gradient table. Depending on the direction
      // we're walking the table, we may intersect either the 0 or 1 offset.
      // Compute the start entry based on our initial offset, and compute the
      // end entry based on the available chunks limited by intercepts. Clamp
      // them into the valid range of the table.
      startEntry = 1.0f + offset.x * size;
      if (delta < 0) {
        chunks = min(chunks, -offset.x / delta);
      } else if (delta > 0) {
        chunks = min(chunks, (1 - offset.x) / delta);
      }
      float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size,
                             0.0f, 1.0f + size);
      // Now that we know the range of entries we need to sample, we want to
      // find the largest possible merged gradient within that range. Depending
      // on which direction we are advancing in the table, we either walk up or
      // down the table trying to merge the current entry with the adjacent
      // entry. We finally limit the chunks to only sample from this merged
      // gradient.
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta > 0) {
        while (maxIndex + 1 < endEntry &&
               stops[maxIndex].can_merge(stops[maxIndex + 1])) {
          maxIndex++;
        }
        chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size));
      } else if (delta < 0) {
        while (minIndex - 1 > endEntry &&
               stops[minIndex - 1].can_merge(stops[minIndex])) {
          minIndex--;
        }
        chunks = min(chunks, (minIndex - startEntry) / (delta * size));
      }
    }
    // If there are any amount of whole chunks of a merged gradient found,
    // then we want to process that as a single gradient span with the start
    // and end colors from the min and max entries.
    int inside = int(chunks);
    if (inside > 0) {
      // Sample the start color from the min entry and the end color from the
      // max entry of the merged gradient. These are scaled to a range of
      // 0..0xFF00, as that is the largest shifted value that can fit in a U16.
      // Since we are only doing addition with the step value, we can still
      // represent negative step values without having to use an explicit sign
      // bit, as the result will still come out the same, allowing us to gain an
      // extra bit of precision. We will later shift these into 8 bit output
      // range while committing the span, but stepping with higher precision to
      // avoid banding. We convert from RGBA to BGRA here to avoid doing this in
      // the inner loop.
      auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00);
      auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00);
      // Get the color range of the merged gradient, normalized to its size.
      auto colorRangeF =
          (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex));
      // Compute the actual starting color of the current start offset within
      // the merged gradient. The value 0.5 is added to the low bits (0x80) so
      // that the color will effectively round to the nearest increment below.
      auto colorF =
          minColorF + colorRangeF * (startEntry - minIndex) + float(0x80);
      // Compute the portion of the color range that we advance on each chunk.
      Float deltaColorF = colorRangeF * (delta * size);
      // Quantize the color delta and current color. These have already been
      // scaled to the 0..0xFF00 range, so we just need to round them to U16.
      // The four colors packed below are the four pixels of one chunk, spaced
      // a quarter of the per-chunk delta apart.
      auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
      auto color =
          combine(CONVERT(round_pixel(colorF, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
      // Finally, step the current color through the output chunks, shifting
      // it into 8 bit range and outputting as we go.
      for (auto* end = buf + inside * 4; buf < end; buf += 4) {
        commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
        color += deltaColor;
      }
      // Deduct the number of chunks inside the gradient from the remaining
      // overall span. If we exhausted the span, bail out.
      span -= inside * 4;
      if (span <= 0) {
        break;
      }
      // Otherwise, assume we're in a transitional section of the gradient that
      // will probably require per-sample table lookups, so fall through below.
      offset += inside * delta;
      if (repeat) {
        offset = fract(offset);
      }
    }
    // If we get here, there were no whole chunks of a merged gradient found
    // that we could process, but we still have a non-zero amount of span left.
    // That means we have segments of gradient that begin or end at the current
    // entry we're on. For this case, we just fall back to sampleGradient which
    // will calculate a table entry for each sample, assuming the samples may
    // have different table entries.
    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
    commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
    // Account for the single chunk (4 pixels) just committed per-sample.
    span -= 4;
    buf += 4;
    offset += delta;
  }
 }
 // Commits an entire span of a linear gradient, given the address of a table
 // previously resolved with swgl_validateGradient. The size of the inner portion
 // of the table is given, assuming the table start and ends with a single entry
 // each to deal with clamping. Repeating will be handled if necessary. The
 // initial offset within the table is used to designate where to start the span
 // and how to step through the gradient table.
 // The blending or non-blending instantiation of commitLinearGradient is chosen
 // from blend_key. Afterwards the output pointer is advanced past the whole
 // span and the remaining span length is set to zero, so one invocation
 // consumes the entire span.
 #define swgl_commitLinearGradientRGBA8(sampler, address, size, repeat, offset) \
  do {                                                                         \
    if (blend_key) {                                                           \
      commitLinearGradient<true>(sampler, address, size, repeat, offset,       \
                                 swgl_OutRGBA8, swgl_SpanLength);              \
    } else {                                                                   \
      commitLinearGradient<false>(sampler, address, size, repeat, offset,      \
                                  swgl_OutRGBA8, swgl_SpanLength);             \
    }                                                                          \
    swgl_OutRGBA8 += swgl_SpanLength;                                          \
    swgl_SpanLength = 0;                                                       \
  } while (0)
 // Extension to set a clip mask image to be sampled during blending. The offset
 // specifies the positioning of the clip mask image relative to the viewport
 // origin. The bounding box specifies the rectangle relative to the clip mask's
--- a/gfx/wr/swgl/src/vector_type.h
+++ b/gfx/wr/swgl/src/vector_type.h
@ -426,6 +426,16 @@ SI VectorType<T, N> combineHigh(VectorType<T, N> a, VectorType<T, N> b) {
  return combine(highHalf(a), highHalf(b));
 }
 // Builds a vector of twice the width by combining `a` with a copy of itself.
 template <typename T, int N>
 SI VectorType<T, N * 2> repeat2(VectorType<T, N> a) {
  VectorType<T, N * 2> doubled = combine(a, a);
  return doubled;
 }
 // Builds a vector of four times the width by combining four copies of `a`.
 template <typename T, int N>
 SI VectorType<T, N * 4> repeat4(VectorType<T, N> a) {
  VectorType<T, N * 4> quadrupled = combine(a, a, a, a);
  return quadrupled;
 }
 template <typename T>
 SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
  return SHUFFLE(a, b, 0, 4, 1, 5);
--- a/gfx/wr/webrender/res/brush_conic_gradient.glsl
+++ b/gfx/wr/webrender/res/brush_conic_gradient.glsl
@ -61,22 +61,19 @@ void brush_vs(
    v_center = gradient.center_point;
    v_angle = PI / 2.0 - gradient.angle;
-    v_start_offset = gradient.start_end_offset.x;
+
-    if (gradient.start_end_offset.x != gradient.start_end_offset.y) {
+    // Store 1/scale where scale = end_offset - start_offset
-      // Store 1/scale where scale = end_offset - start_offset
+    // If scale = 0, we can't get its reciprocal. Instead, just use a zero scale.
-      v_offset_scale = 1.0 / (gradient.start_end_offset.y - gradient.start_end_offset.x);
+    v_offset_scale =
-    } else {
+        gradient.start_end_offset.x != gradient.start_end_offset.y
-      // If scale = 0, we can't get its reciprocal. Instead, just use a zero scale.
+            ? 1.0 / (gradient.start_end_offset.y - gradient.start_end_offset.x)
-      v_offset_scale = 0.0;
+            : 0.0;
-    }
+    v_start_offset = gradient.start_end_offset.x * v_offset_scale;
 }
 #endif
 #ifdef WR_FRAGMENT_SHADER
-float get_gradient_offset() {
+float get_gradient_offset(vec2 pos) {
    // Get the brush position to solve for gradient offset.
    vec2 pos = compute_gradient_pos();
    // Rescale UV to actual repetition size. This can't be done in the vertex
    // shader due to the use of atan() below.
    pos *= v_repeated_size;
@ -84,11 +81,11 @@ float get_gradient_offset() {
    // Use inverse trig to find the angle offset from the relative position.
    vec2 current_dir = pos - v_center;
    float current_angle = atan(current_dir.y, current_dir.x) + v_angle;
-    return (fract(current_angle / (2.0 * PI)) - v_start_offset) * v_offset_scale;
+    return fract(current_angle / (2.0 * PI)) * v_offset_scale - v_start_offset;
 }
 Fragment brush_fs() {
-    vec4 color = sample_gradient(get_gradient_offset());
+    vec4 color = sample_gradient(get_gradient_offset(compute_repeated_pos()));
 #ifdef WR_FEATURE_ALPHA_PASS
    color *= antialias_brush();
--- a/gfx/wr/webrender/res/brush_linear_gradient.glsl
+++ b/gfx/wr/webrender/res/brush_linear_gradient.glsl
@ -6,7 +6,7 @@
 #include shared,prim_shared,brush,gradient_shared
-flat varying vec2 v_start_point;
+flat varying float v_start_offset;
 flat varying vec2 v_scale_dir;
 #ifdef WR_VERTEX_SHADER
@ -55,26 +55,21 @@ void brush_vs(
    vec2 end_point = gradient.start_end_point.zw;
    vec2 dir = end_point - start_point;
    v_start_point = start_point;
    v_scale_dir = dir / dot(dir, dir);
    // Normalize UV and offsets to 0..1 scale.
-    v_start_point /= v_repeated_size;
+    v_scale_dir = dir / dot(dir, dir);
    v_start_offset = dot(start_point, v_scale_dir);
    v_scale_dir *= v_repeated_size;
 }
 #endif
 #ifdef WR_FRAGMENT_SHADER
-float get_gradient_offset() {
+float get_gradient_offset(vec2 pos) {
    // Get the brush position to solve for gradient offset.
    vec2 pos = compute_gradient_pos();
    // Project position onto a direction vector to compute offset.
-    return dot(pos - v_start_point, v_scale_dir);
+    return dot(pos, v_scale_dir) - v_start_offset;
 }
 Fragment brush_fs() {
-    vec4 color = sample_gradient(get_gradient_offset());
+    vec4 color = sample_gradient(get_gradient_offset(compute_repeated_pos()));
 #ifdef WR_FEATURE_ALPHA_PASS
    color *= antialias_brush();
@ -89,21 +84,18 @@ void swgl_drawSpanRGBA8() {
    if (address < 0) {
        return;
    }
-    if (v_gradient_repeat != 0.0) {
+    #ifndef WR_FEATURE_ALPHA_PASS
-        // The gradient repeats, so use fract() on the offset.
+        swgl_commitLinearGradientRGBA8(sGpuCache, address, GRADIENT_ENTRIES, v_gradient_repeat != 0.0,
                                       get_gradient_offset(v_pos));
    #else
        while (swgl_SpanLength > 0) {
-            float entry = clamp_gradient_entry(fract(get_gradient_offset()));
+            float offset = get_gradient_offset(compute_repeated_pos());
            if (v_gradient_repeat != 0.0) offset = fract(offset);
            float entry = clamp_gradient_entry(offset);
            swgl_commitGradientRGBA8(sGpuCache, address, entry);
            v_pos += swgl_interpStep(v_pos);
        }
-    } else {
+    #endif
        // The gradient offset is only clamped.
        while (swgl_SpanLength > 0) {
            float entry = clamp_gradient_entry(get_gradient_offset());
            swgl_commitGradientRGBA8(sGpuCache, address, entry);
            v_pos += swgl_interpStep(v_pos);
        }
    }
 }
 #endif
--- a/gfx/wr/webrender/res/brush_radial_gradient.glsl
+++ b/gfx/wr/webrender/res/brush_radial_gradient.glsl
@ -8,7 +8,6 @@
 flat varying vec2 v_center;
 flat varying float v_start_radius;
 flat varying float v_radius_scale;
 #ifdef WR_VERTEX_SHADER
@ -54,39 +53,35 @@ void brush_vs(
        gradient.stretch_size
    );
-    v_center = gradient.center_start_end_radius.xy;
+    // Store 1/rd where rd = end_radius - start_radius
-    v_start_radius = gradient.center_start_end_radius.z;
+    // If rd = 0, we can't get its reciprocal. Instead, just use a zero scale.
-    if (gradient.center_start_end_radius.z != gradient.center_start_end_radius.w) {
+    float radius_scale =
-      // Store 1/rd where rd = end_radius - start_radius
+        gradient.center_start_end_radius.z != gradient.center_start_end_radius.w
-      v_radius_scale = 1.0 / (gradient.center_start_end_radius.w - gradient.center_start_end_radius.z);
+            ? 1.0 / (gradient.center_start_end_radius.w - gradient.center_start_end_radius.z)
-    } else {
+            : 0.0;
-      // If rd = 0, we can't get its reciprocal. Instead, just use a zero scale.
+    v_center = gradient.center_start_end_radius.xy * radius_scale;
-      v_radius_scale = 0.0;
+    v_start_radius = gradient.center_start_end_radius.z * radius_scale;
-    }
+    v_repeated_size *= radius_scale;
    // Transform all coordinates by the y scale so the
    // fragment shader can work with circles
    v_center.y *= gradient.ratio_xy;
-    v_repeated_size.y *=  gradient.ratio_xy;
+    v_repeated_size.y *= gradient.ratio_xy;
 }
 #endif
 #ifdef WR_FRAGMENT_SHADER
-float get_gradient_offset() {
+float get_gradient_offset(vec2 pos) {
    // Get the brush position to solve for gradient offset.
    vec2 pos = compute_gradient_pos();
    // Rescale UV to actual repetition size. This can't be done in the vertex
    // shader due to the use of length() below.
    pos *= v_repeated_size;
    // Solve for t in length(pd) = v_start_radius + t * rd
-    vec2 pd = pos - v_center;
+    return length(pos - v_center) - v_start_radius;
    return (length(pd) - v_start_radius) * v_radius_scale;
 }
 Fragment brush_fs() {
-    vec4 color = sample_gradient(get_gradient_offset());
+    vec4 color = sample_gradient(get_gradient_offset(compute_repeated_pos()));
 #ifdef WR_FEATURE_ALPHA_PASS
    color *= antialias_brush();
@ -101,20 +96,12 @@ void swgl_drawSpanRGBA8() {
    if (address < 0) {
        return;
    }
-    if (v_gradient_repeat != 0.0) {
+    while (swgl_SpanLength > 0) {
-        // The gradient repeats, so use fract() on the offset.
+        float offset = get_gradient_offset(compute_repeated_pos());
-        while (swgl_SpanLength > 0) {
+        if (v_gradient_repeat != 0.0) offset = fract(offset); 
-            float entry = clamp_gradient_entry(fract(get_gradient_offset()));
+        float entry = clamp_gradient_entry(offset);
-            swgl_commitGradientRGBA8(sGpuCache, address, entry);
+        swgl_commitGradientRGBA8(sGpuCache, address, entry);
-            v_pos += swgl_interpStep(v_pos);
+        v_pos += swgl_interpStep(v_pos);
        }
    } else {
        // The gradient offset is only clamped.
        while (swgl_SpanLength > 0) {
            float entry = clamp_gradient_entry(get_gradient_offset());
            swgl_commitGradientRGBA8(sGpuCache, address, entry);
            v_pos += swgl_interpStep(v_pos);
        }
    }
 }
 #endif
--- a/gfx/wr/webrender/res/gradient_shared.glsl
+++ b/gfx/wr/webrender/res/gradient_shared.glsl
@ -54,7 +54,7 @@ void write_gradient_vertex(
 #endif //WR_VERTEX_SHADER
 #ifdef WR_FRAGMENT_SHADER
-vec2 compute_gradient_pos() {
+vec2 compute_repeated_pos() {
 #ifdef WR_FEATURE_ALPHA_PASS
    // Handle top and left inflated edges (see brush_image).
    vec2 local_pos = max(v_pos, vec2(0.0));
--- a/gfx/wr/webrender/src/prim_store/gradient.rs
+++ b/gfx/wr/webrender/src/prim_store/gradient.rs
@ -848,15 +848,28 @@ impl GradientGpuBlockBuilder {
        start_color: &PremultipliedColorF,
        end_color: &PremultipliedColorF,
        entries: &mut [GradientDataEntry; GRADIENT_DATA_SIZE],
-    ) {
+        prev_step: &PremultipliedColorF,
    ) -> PremultipliedColorF {
        // Calculate the color difference for individual steps in the ramp.
        let inv_steps = 1.0 / (end_idx - start_idx) as f32;
-        let step = PremultipliedColorF {
+        let mut step = PremultipliedColorF {
            r: (end_color.r - start_color.r) * inv_steps,
            g: (end_color.g - start_color.g) * inv_steps,
            b: (end_color.b - start_color.b) * inv_steps,
            a: (end_color.a - start_color.a) * inv_steps,
        };
        // As a subtle form of compression, we ensure that the step values for
        // each stop range are the same if and only if they belong to the same
        // stop range. However, if two different stop ranges have the same step,
        // we need to modify the steps so they compare unequally between ranges.
        // This allows to quickly compare if two adjacent stops belong to the
        // same range by comparing their steps.
        if step == *prev_step {
            // Modify the step alpha value as if by nextafter(). The difference
            // here should be so small as to be unnoticeable, but yet allow it
            // to compare differently.
            step.a = f32::from_bits(if step.a == 0.0 { 1 } else { step.a.to_bits() + 1 });
        }
        let mut cur_color = *start_color;
@ -870,6 +883,8 @@ impl GradientGpuBlockBuilder {
            cur_color.a += step.a;
            entry.end_step = step;
        }
        step
    }
    /// Compute an index into the gradient entry table based on a gradient stop offset. This
@ -913,15 +928,16 @@ impl GradientGpuBlockBuilder {
        // range [0, 1]. The first and last entries hold the first and last color stop colors respectively,
        // while the entries in between hold the interpolated color stop values for the range [0, 1].
        let mut entries = [GradientDataEntry::white(); GRADIENT_DATA_SIZE];
-
+        let mut prev_step = cur_color;
        if reverse_stops {
            // Fill in the first entry (for reversed stops) with the first color stop
-            GradientGpuBlockBuilder::fill_colors(
+            prev_step = GradientGpuBlockBuilder::fill_colors(
                GRADIENT_DATA_LAST_STOP,
                GRADIENT_DATA_LAST_STOP + 1,
                &cur_color,
                &cur_color,
                &mut entries,
                &prev_step,
            );
            // Fill in the center of the gradient table, generating a color ramp between each consecutive pair
@ -933,12 +949,13 @@ impl GradientGpuBlockBuilder {
                let next_idx = Self::get_index(1.0 - next.offset);
                if next_idx < cur_idx {
-                    GradientGpuBlockBuilder::fill_colors(
+                    prev_step = GradientGpuBlockBuilder::fill_colors(
                        next_idx,
                        cur_idx,
                        &next_color,
                        &cur_color,
                        &mut entries,
                        &prev_step,
                    );
                    cur_idx = next_idx;
                }
@ -956,15 +973,17 @@ impl GradientGpuBlockBuilder {
                &cur_color,
                &cur_color,
                &mut entries,
                &prev_step,
            );
        } else {
            // Fill in the first entry with the first color stop
-            GradientGpuBlockBuilder::fill_colors(
+            prev_step = GradientGpuBlockBuilder::fill_colors(
                GRADIENT_DATA_FIRST_STOP,
                GRADIENT_DATA_FIRST_STOP + 1,
                &cur_color,
                &cur_color,
                &mut entries,
                &prev_step,
            );
            // Fill in the center of the gradient table, generating a color ramp between each consecutive pair
@ -976,12 +995,13 @@ impl GradientGpuBlockBuilder {
                let next_idx = Self::get_index(next.offset);
                if next_idx > cur_idx {
-                    GradientGpuBlockBuilder::fill_colors(
+                    prev_step = GradientGpuBlockBuilder::fill_colors(
                        cur_idx,
                        next_idx,
                        &cur_color,
                        &next_color,
                        &mut entries,
                        &prev_step,
                    );
                    cur_idx = next_idx;
                }
@ -999,6 +1019,7 @@ impl GradientGpuBlockBuilder {
                &cur_color,
                &cur_color,
                &mut entries,
                &prev_step,
            );
        }
--- a/layout/reftests/border-image/reftest.list
+++ b/layout/reftests/border-image/reftest.list
@ -43,17 +43,17 @@ fuzzy-if(asyncPan&&!layersGPUAccelerated,0-140,0-514) fuzzy-if(winWidget,0-144,0
 # border images with gradients
 fuzzy-if(webrender&&!geckoview,1-3,554-1804) == border-image-linear-gradient.html border-image-linear-gradient-ref.html
-fuzzy(0-1,0-98) fuzzy-if(skiaContent,0-1,0-350) fuzzy-if(webrender&&!geckoview,1-3,1086-37537) == border-image-linear-gradient-slice-1.html border-image-linear-gradient-slice-1-ref.html
+fuzzy(0-1,0-98) fuzzy-if(skiaContent,0-1,0-350) fuzzy-if(webrender&&!geckoview,1-3,995-37537) == border-image-linear-gradient-slice-1.html border-image-linear-gradient-slice-1-ref.html
 fuzzy(0-1,0-515) fuzzy-if(OSX,0-1,0-10595) fuzzy-if(webrender&&!geckoview,1-3,272-25136) == border-image-linear-gradient-slice-2.html border-image-linear-gradient-slice-2-ref.html
 fuzzy-if(skiaContent,0-1,0-2500) fuzzy-if(webrender&&!geckoview,1-3,2500-86037) == border-image-linear-gradient-slice-fill-1.html border-image-linear-gradient-slice-fill-1-ref.html
 fuzzy(0-1,0-649) fuzzy-if(OSX,0-1,0-25771) fuzzy-if(skiaContent&&!Android,0-1,0-546) fuzzy-if(Android,0-1,0-6093) fuzzy-if(webrender&&!geckoview,1-3,480-57480) == border-image-linear-gradient-slice-fill-2.html border-image-linear-gradient-slice-fill-2-ref.html
 fuzzy(0-1,0-134)  fuzzy-if(OSX,0-5,0-1676) fuzzy-if(webrender&&!geckoview,1-1,0-4537) == border-image-linear-gradient-width.html border-image-linear-gradient-width-ref.html
 fuzzy(0-2,0-60590) fuzzy-if(Android,0-4,0-18022) fuzzy-if(OSX,0-1,0-15000) fuzzy-if(webrender&&!geckoview,1-2,14300-60581) == border-image-linear-gradient-slice-width.html border-image-linear-gradient-slice-width-ref.html
 fuzzy(0-2,0-26758) fuzzy-if(OSX,0-1,0-6000) fuzzy-if(webrender&&!geckoview,1-3,3803-26758) == border-image-linear-gradient-outset.html border-image-linear-gradient-outset-ref.html
-fuzzy(0-1,0-12) fuzzy-if(skiaContent,0-1,0-400) fuzzy-if(webrender&&!geckoview,1-3,1397-26872) == border-image-linear-gradient-repeat-repeat-1.html border-image-linear-gradient-repeat-repeat-1-ref.html
+fuzzy(0-1,0-12) fuzzy-if(skiaContent,0-1,0-400) fuzzy-if(webrender&&!geckoview,1-3,1291-26872) == border-image-linear-gradient-repeat-repeat-1.html border-image-linear-gradient-repeat-repeat-1-ref.html
-fuzzy(0-1,0-13) fuzzy-if(skiaContent,0-1,0-300) fuzzy-if(webrender&&!geckoview,1-3,1400-27131) == border-image-linear-gradient-repeat-round-1.html border-image-linear-gradient-repeat-round-1-ref.html
+fuzzy(0-1,0-13) fuzzy-if(skiaContent,0-1,0-300) fuzzy-if(webrender&&!geckoview,1-3,1061-27131) == border-image-linear-gradient-repeat-round-1.html border-image-linear-gradient-repeat-round-1-ref.html
-fuzzy-if(Android,0-1,0-1894) fuzzy-if(webrender&&!geckoview,1-2,3163-67805) == border-image-linear-gradient-repeat-repeat-2.html border-image-linear-gradient-repeat-repeat-2-ref.html
+fuzzy-if(Android,0-1,0-1894) fuzzy-if(webrender&&!geckoview,1-2,2480-67805) == border-image-linear-gradient-repeat-repeat-2.html border-image-linear-gradient-repeat-repeat-2-ref.html
-fuzzy(0-1,0-2000) fuzzy-if(webrender&&!geckoview,1-2,3249-9500) == border-image-linear-gradient-repeat-round-2.html border-image-linear-gradient-repeat-round-2-ref.html
+fuzzy(0-1,0-2000) fuzzy-if(webrender&&!geckoview,1-2,2658-9500) == border-image-linear-gradient-repeat-round-2.html border-image-linear-gradient-repeat-round-2-ref.html
 fuzzy(0-1,0-8533) fuzzy-if(webrender&&!geckoview&&!swgl,1-3,2967-9500) == border-image-linear-gradient-repeat-repeat-3.html border-image-linear-gradient-repeat-repeat-3-ref.html
 fuzzy(0-3,0-107563) fuzzy-if(webrender&&!geckoview&&!swgl,1-3,42909-107563) == border-image-linear-gradient-repeat-round-3.html border-image-linear-gradient-repeat-round-3-ref.html