Bug 1682195 - Use scissoring to restrict cs_clip_image rather than discard. r=gw

cs_clip_image renders the entire target-space sub-rect, and then uses discard against the local-space bounds to ensure the primitive gets trimmed down to its actual footprint within the larger target-space sub-rect. This can be fairly wasteful and slow. This patch restructures things so that we essentially render with the actual primitive vertices, and then we either clamp (for axis-aligned) or scissor (for non-axis-aligned) to restrict the footprint to within the target-space sub-rect. This then allows the use of a SWGL fast-path for the axis-aligned case that can vastly speed up the common case.

Differential Revision: https://phabricator.services.mozilla.com/D111316
2021-04-09 22:55:33 +00:00 · 2021-04-09 22:55:33 +00:00 · b40978d057
--- a/gfx/wr/swgl/src/blend.h
+++ b/gfx/wr/swgl/src/blend.h
@ -770,6 +770,22 @@ static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
                      len);
 }

+static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {  // PackedR8 overload: forwards r to buf with no blending step.
+  unaligned_store(buf, r);  // Going by the helper's name, the store does not assume buf is aligned.
+}
+
+static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {  // Length-taking PackedR8 overload of commit_span.
+  partial_store_span(buf, r, len);  // NOTE(review): presumably writes only the first len elements of r; confirm against partial_store_span.
+}
+
+static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {  // PackedR8 adapter: delegates to another blend_span overload.
+  return pack(blend_span(buf, unpack(r)));  // Unpack r, blend through the overload taking the unpacked type (presumably WideR8), then re-pack.
+}
+
+static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {  // Length-taking PackedR8 adapter over blend_span.
+  return pack(blend_span(buf, unpack(r), len));  // Same unpack / blend / re-pack sequence, passing len through unchanged.
+}
+
 template <bool BLEND, typename P, typename R>
 static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
  if (BLEND) {
--- a/gfx/wr/swgl/src/swgl_ext.h
+++ b/gfx/wr/swgl/src/swgl_ext.h
@ -481,8 +481,9 @@ static int blendTextureNearestFast(S sampler, vec2 uv, int span,
  // Calculate the row pointer within the buffer, clamping to within valid row
  // bounds.
  P* row =
-      &sampler->buf[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) *
-                    sampler->stride];
+      &((P*)sampler
+            ->buf)[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) *
+                   sampler->stride];
  // Find clamped X bounds within the row.
  int minX = clamp(minUV.x, 0, sampler->width - 1);
  int maxX = clamp(maxUV.x, minX, sampler->width - 1);
@ -518,7 +519,8 @@ static int blendTextureNearestFast(S sampler, vec2 uv, int span,
  // If we still have samples left above the valid sample bounds, then we again
  // need to fill this section with a constant clamped sample.
  if (curX < endX) {
-    auto src = applyColor(unpack(bit_cast<packed_type>(U32(row[maxX]))), color);
+    auto src =
+        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color);
    commit_solid_span<BLEND>(buf, src, endX - curX);
  }
  return span;
--- a/gfx/wr/webrender/res/cs_clip_image.glsl
+++ b/gfx/wr/webrender/res/cs_clip_image.glsl
@ -4,10 +4,9 @@

 #include shared,clip_shared

-varying vec4 vLocalPos;
+varying vec2 vLocalPos;
 varying vec2 vClipMaskImageUv;

-flat varying vec4 vClipMaskUvRect;
 flat varying vec4 vClipMaskUvInnerRect;

 #ifdef WR_VERTEX_SHADER
@ -35,13 +34,48 @@ ClipMaskInstanceImage fetch_clip_item() {
    return cmi;
 }

+struct ClipImageVertexInfo {  // Pair of positions returned by write_clip_image_vertex.
+    vec2 local_pos;  // Vertex position in the primitive's local space, after clamping to the local clip rect.
+    vec4 world_pos;  // local_pos multiplied by the primitive transform matrix, before device-pixel scaling.
+};
+
+// This differs from write_clip_tile_vertex in that we forward transform the
+// primitive's local-space tile rect into the target space. We use scissoring
+// to ensure that the primitive does not draw outside the target bounds.
+ClipImageVertexInfo write_clip_image_vertex(RectWithSize tile_rect,  // Places the vertex from tile_rect, writes gl_Position, and returns the local and world positions.
+                                            RectWithSize local_clip_rect,  // Local-space rect the vertex position is clamped to.
+                                            Transform prim_transform,  // Supplies the matrix (.m) and the .is_axis_aligned flag read below.
+                                            Transform clip_transform,  // NOTE(review): not referenced anywhere in this function body.
+                                            RectWithSize sub_rect,  // NOTE(review): not referenced anywhere in this function body.
+                                            vec2 task_origin,
+                                            vec2 screen_origin,
+                                            float device_pixel_scale) {
+    vec2 local_pos = clamp_rect(tile_rect.p0 + aPosition.xy * tile_rect.size, local_clip_rect);  // aPosition.xy picks a point within tile_rect; the result is clamped to local_clip_rect.
+    vec4 world_pos = prim_transform.m * vec4(local_pos, 0.0, 1.0);  // Forward-transform the local position by the primitive matrix.
+    vec4 final_pos = vec4(
+        world_pos.xy * device_pixel_scale + (task_origin - screen_origin) * world_pos.w,  // The origin offset is multiplied by w; presumably so it survives the perspective divide (confirm).
+        0.0,
+        world_pos.w
+    );
+    gl_Position = uTransform * final_pos;
+
+    init_transform_vs(
+        prim_transform.is_axis_aligned
+            ? vec4(vec2(-1.0e16), vec2(1.0e16))  // Axis-aligned: pass +/-1e16 bounds rather than a real rect.
+            : vec4(local_clip_rect.p0, local_clip_rect.p0 + local_clip_rect.size));  // Otherwise: pass local_clip_rect as (min corner, max corner).
+
+    ClipImageVertexInfo vi = ClipImageVertexInfo(local_pos, world_pos);
+    return vi;
+}
+
 void main(void) {
    ClipMaskInstanceImage cmi = fetch_clip_item();
    Transform clip_transform = fetch_transform(cmi.base.clip_transform_id);
    Transform prim_transform = fetch_transform(cmi.base.prim_transform_id);
    ImageSource res = fetch_image_source_direct(cmi.resource_address);

-    ClipVertexInfo vi = write_clip_tile_vertex(
+    ClipImageVertexInfo vi = write_clip_image_vertex(
+        cmi.tile_rect,
        cmi.local_rect,
        prim_transform,
        clip_transform,
@ -51,33 +85,33 @@ void main(void) {
        cmi.base.device_pixel_scale
    );
    vLocalPos = vi.local_pos;
-    vClipMaskImageUv = (vi.local_pos.xy - cmi.tile_rect.p0 * vi.local_pos.w) / cmi.tile_rect.size;
+    vec2 uv = (vi.local_pos - cmi.tile_rect.p0) / cmi.tile_rect.size;

    vec2 texture_size = vec2(TEX_SIZE(sColor0));
-    vClipMaskUvRect = vec4(res.uv_rect.p0, res.uv_rect.p1 - res.uv_rect.p0) / texture_size.xyxy;
+    vec4 uv_rect = vec4(res.uv_rect.p0, res.uv_rect.p1);
+    vClipMaskImageUv = mix(uv_rect.xy, uv_rect.zw, uv) / texture_size;
+
    // applying a half-texel offset to the UV boundaries to prevent linear samples from the outside
-    vec4 inner_rect = vec4(res.uv_rect.p0, res.uv_rect.p1);
-    vClipMaskUvInnerRect = (inner_rect + vec4(0.5, 0.5, -0.5, -0.5)) / texture_size.xyxy;
+    vClipMaskUvInnerRect = (uv_rect + vec4(0.5, 0.5, -0.5, -0.5)) / texture_size.xyxy;
 }
 #endif

 #ifdef WR_FRAGMENT_SHADER
 void main(void) {
-    vec2 local_pos = vLocalPos.xy / vLocalPos.w;
-    float alpha = vLocalPos.w > 0.0 ? init_transform_fs(local_pos) : 0.0;
-
-    // TODO: Handle repeating masks?
-    vec2 clamped_mask_uv = clamp(vClipMaskImageUv, vec2(0.0, 0.0), vLocalPos.ww);
-
-    // Ensure we don't draw outside of our tile.
-    // FIXME(emilio): Can we do this earlier?
-    if (clamped_mask_uv != vClipMaskImageUv)
-        discard;
-
-    vec2 source_uv = clamp(
-        clamped_mask_uv / vLocalPos.w * vClipMaskUvRect.zw + vClipMaskUvRect.xy,
-        vClipMaskUvInnerRect.xy, vClipMaskUvInnerRect.zw);
+    float alpha = init_transform_fs(vLocalPos);  // vLocalPos is now a vec2 and is passed directly, with no divide by w.
+    vec2 source_uv = clamp(vClipMaskImageUv, vClipMaskUvInnerRect.xy, vClipMaskUvInnerRect.zw);  // Keep the interpolated UV inside the half-texel-inset rect computed in the vertex shader.
     float clip_alpha = texture(sColor0, source_uv).r; //careful: texture has type A8
     oFragColor = vec4(alpha * clip_alpha, 1.0, 1.0, 1.0);  // The mask value goes in the first channel; the remaining channels are set to 1.0.
 }
+
+#ifdef SWGL_DRAW_SPAN
+void swgl_drawSpanR8() {  // SWGL span entry point for this shader's R8 output.
+    if (has_valid_transform_bounds()) {  // Valid transform bounds: return without committing anything from this fast path.
+        return;  // NOTE(review): presumably SWGL then runs the regular fragment shader for the span; confirm against the draw-span contract.
+    }
+
+    swgl_commitTextureLinearR8(sColor0, vClipMaskImageUv, vClipMaskUvInnerRect);  // Otherwise commit sColor0 samples at vClipMaskImageUv, passing the inner UV rect as the bounds argument.
+}
+#endif
+
 #endif
--- a/gfx/wr/webrender/res/prim_shared.glsl
+++ b/gfx/wr/webrender/res/prim_shared.glsl
@ -17,10 +17,6 @@

 uniform sampler2D sClipMask;

-vec2 clamp_rect(vec2 pt, RectWithSize rect) {
-    return clamp(pt, rect.p0, rect.p0 + rect.size);
-}
-
 #ifndef SWGL_CLIP_MASK
 // TODO: convert back to RectWithEndPoint if driver issues are resolved, if ever.
 flat varying vec4 vClipMaskUvBounds;
--- a/gfx/wr/webrender/res/rect.glsl
+++ b/gfx/wr/webrender/res/rect.glsl
@ -50,3 +50,7 @@ float signed_distance_rect(vec2 pos, vec2 p0, vec2 p1) {
    return max(d.x, d.y);
 }

+vec2 clamp_rect(vec2 pt, RectWithSize rect) {  // Returns pt clamped component-wise into rect.
+    return clamp(pt, rect.p0, rect.p0 + rect.size);  // Lower bound is rect.p0; upper bound is rect.p0 + rect.size.
+}
+
--- a/gfx/wr/webrender/res/transform.glsl
+++ b/gfx/wr/webrender/res/transform.glsl
@ -93,6 +93,11 @@ vec4 get_node_pos(vec2 pos, Transform transform) {

 #ifdef WR_FRAGMENT_SHADER

+// Assume transform bounds are set to a large scale to signal they are invalid.
+bool has_valid_transform_bounds() {  // True when vTransformBounds.w is below the 1.0e15 threshold.
+    return vTransformBounds.w < 1.0e15;  // Only the .w component is tested; the other components are not examined.
+}
+
 float init_transform_fs(vec2 local_pos) {
    // Get signed distance from local rect bounds.
    float d = signed_distance_rect(
--- a/gfx/wr/webrender/src/batch.rs
+++ b/gfx/wr/webrender/src/batch.rs
@ -3383,7 +3383,7 @@ pub struct ClipBatchList {
    pub slow_rectangles: Vec<ClipMaskInstanceRect>,
    pub fast_rectangles: Vec<ClipMaskInstanceRect>,
    /// Image draws apply the image masking.
-    pub images: FastHashMap<TextureSource, Vec<ClipMaskInstanceImage>>,
+    pub images: FastHashMap<(TextureSource, Option<DeviceIntRect>), Vec<ClipMaskInstanceImage>>,
    pub box_shadows: FastHashMap<TextureSource, Vec<ClipMaskInstanceBoxShadow>>,
 }

@ -3586,11 +3586,26 @@ impl ClipBatcher {
                spatial_tree,
            );

-            let prim_transform_id = transforms.get_id(
-                root_spatial_node_index,
-                ROOT_SPATIAL_NODE_INDEX,
-                spatial_tree,
-            );
+            // For clip mask images, we need to map from the primitive's layout space to
+            // the target space, as the cs_clip_image shader needs to forward transform
+            // the local image bounds, rather than backwards transform the target bounds
+            // as is done in write_clip_tile_vertex.
+            let prim_transform_id = match clip_node.item.kind {
+                ClipItemKind::Image { .. } => {
+                    transforms.get_id(
+                        clip_instance.spatial_node_index,
+                        root_spatial_node_index,
+                        spatial_tree,
+                    )
+                }
+                _ => {
+                    transforms.get_id(
+                        root_spatial_node_index,
+                        ROOT_SPATIAL_NODE_INDEX,
+                        spatial_tree,
+                    )
+                }
+            };

            let common = ClipMaskInstanceCommon {
                sub_rect: DeviceRect::new(
@ -3612,7 +3627,14 @@ impl ClipBatcher {
                        tile: None,
                    };

-                    let mut add_image = |request: ImageRequest, local_tile_rect: LayoutRect, sub_rect: DeviceRect| {
+                    let map_local_to_world = SpaceMapper::new_with_target(
+                        ROOT_SPATIAL_NODE_INDEX,
+                        clip_instance.spatial_node_index,
+                        WorldRect::max_rect(),
+                        spatial_tree,
+                    );
+
+                    let mut add_image = |request: ImageRequest, tile_rect: LayoutRect, sub_rect: DeviceRect| {
                        let cache_item = match resource_cache.get_cached_image(request) {
                            Ok(item) => item,
                            Err(..) => {
@ -3622,9 +3644,33 @@ impl ClipBatcher {
                            }
                        };

+                        // If the primitive transform is axis aligned, we can skip any need for scissoring
+                        // by clipping the local clip rect with the backwards transformed target bounds.
+                        // If it is not axis-aligned, then we pass the local clip rect through unmodified
+                        // to the shader and also set up a scissor rect for the overall target bounds to
+                        // ensure nothing is drawn outside the target.
+                        let (local_rect, scissor_rect) =
+                            if prim_transform_id.transform_kind() == TransformedRectKind::AxisAligned {
+                                let world_rect =
+                                    sub_rect.translate(actual_rect.origin.to_vector()) / global_device_pixel_scale;
+                                (map_local_to_world
+                                    .unmap(&world_rect)
+                                    .expect("bug: should always map as axis-aligned")
+                                    .intersection(&rect)
+                                    .unwrap_or_default(),
+                                 None)
+                            } else {
+                                (rect,
+                                 Some(common.sub_rect
+                                    .translate(task_origin.to_vector())
+                                    .round_out()
+                                    .to_i32()))
+                            };
+
+
                        self.get_batch_list(is_first_clip)
                            .images
-                            .entry(cache_item.texture_id)
+                            .entry((cache_item.texture_id, scissor_rect))
                            .or_insert_with(Vec::new)
                            .push(ClipMaskInstanceImage {
                                common: ClipMaskInstanceCommon {
@ -3632,8 +3678,8 @@ impl ClipBatcher {
                                    ..common
                                },
                                resource_address: gpu_cache.get_address(&cache_item.uv_rect_handle),
-                                tile_rect: local_tile_rect,
-                                local_rect: rect,
+                                tile_rect,
+                                local_rect,
                            });
                    };

@ -3642,12 +3688,6 @@ impl ClipBatcher {
                            let clip_spatial_node = &spatial_tree.spatial_nodes[clip_instance.spatial_node_index.0 as usize];
                            let clip_is_axis_aligned = clip_spatial_node.coordinate_system_id == CoordinateSystemId::root();
                            let sub_rect_bounds = actual_rect.size.into();
-                            let map_local_to_world = SpaceMapper::new_with_target(
-                                ROOT_SPATIAL_NODE_INDEX,
-                                clip_instance.spatial_node_index,
-                                WorldRect::max_rect(),
-                                spatial_tree,
-                            );

                            for tile in tiles {
                                let tile_sub_rect = if clip_is_axis_aligned {
--- a/gfx/wr/webrender/src/renderer/mod.rs
+++ b/gfx/wr/webrender/src/renderer/mod.rs
@ -3749,6 +3749,7 @@ impl Renderer {
    fn draw_clip_batch_list(
        &mut self,
        list: &ClipBatchList,
+        draw_target: &DrawTarget,
        projection: &default::Transform3D<f32>,
        stats: &mut RendererStats,
    ) {
@ -3803,8 +3804,27 @@ impl Renderer {
        }

        // draw image masks
-        for (mask_texture_id, items) in list.images.iter() {
+        let mut using_scissor = false;
+        for ((mask_texture_id, clip_rect), items) in list.images.iter() {
            let _gm2 = self.gpu_profiler.start_marker("clip images");
+            // Some image masks may require scissoring to ensure they don't draw
+            // outside their task's target bounds. Axis-aligned primitives will
+            // be clamped inside the shader and should not require scissoring.
+            // TODO: We currently assume scissor state is off by default for
+            // alpha targets here, but in the future we may want to track the
+            // current scissor state so that this can be properly saved and
+            // restored here.
+            if let Some(clip_rect) = clip_rect {
+                if !using_scissor {
+                    self.device.enable_scissor();
+                    using_scissor = true;
+                }
+                let scissor_rect = draw_target.build_scissor_rect(Some(*clip_rect));
+                self.device.set_scissor_rect(scissor_rect);
+            } else if using_scissor {
+                self.device.disable_scissor();
+                using_scissor = false;
+            }
            let textures = BatchTextures::composite_rgb(*mask_texture_id);
            self.shaders.borrow_mut().cs_clip_image
                .bind(&mut self.device, projection, None, &mut self.renderer_errors);
@ -3815,6 +3835,9 @@ impl Renderer {
                stats,
            );
        }
+        if using_scissor {
+            self.device.disable_scissor();
+        }
    }

    fn draw_alpha_target(
@ -3958,6 +3981,7 @@ impl Renderer {
            self.set_blend(false, FramebufferKind::Other);
            self.draw_clip_batch_list(
                &target.clip_batcher.primary_clips,
+                &draw_target,
                projection,
                stats,
            );
@ -3968,6 +3992,7 @@ impl Renderer {
            self.set_blend_mode_multiply(FramebufferKind::Other);
            self.draw_clip_batch_list(
                &target.clip_batcher.secondary_clips,
+                &draw_target,
                projection,
                stats,
            );
--- a/gfx/wr/wrench/reftests/mask/mask-perspective.png
+++ b/gfx/wr/wrench/reftests/mask/mask-perspective.png
--- a/gfx/wr/wrench/reftests/mask/reftest.list
+++ b/gfx/wr/wrench/reftests/mask/reftest.list
@ -12,6 +12,6 @@ platform(linux,mac) fuzzy(1,17500) == mask-atomicity-tiling.yaml mask-atomicity-
 platform(linux,mac) == mask-perspective.yaml mask-perspective.png
 == fuzzy(1,11) mask-perspective-tiling.yaml mask-perspective.yaml
 platform(linux,mac) == checkerboard.yaml checkerboard.png
-skip_on(android,device) == checkerboard.yaml checkerboard-tiling.yaml  # Fails on a Pixel2
+skip_on(android,device) fuzzy(2,1900) == checkerboard.yaml checkerboard-tiling.yaml  # Fails on a Pixel2
 == missing-mask.yaml missing-mask-ref.yaml
 platform(linux) == scaled-filter-raster-root.yaml scaled-filter-raster-root.png