Bug 1684781 - Improve performance of mix-blend-mode. r=nical

This patch enables the faster mix-blend-mode path that allows using picture cache tiles as the backdrop source for blends where that is appropriate (most of the underlying work is in previous patches or the dependencies of this bug). In addition to avoiding an extra intermediate surface for blends that are on a picture cache surface, it also avoids constant invalidation of picture cache tiles due to the blend container not being part of the main content scroll root. As an example of the typical performance improvement, the GPU time on an AMD 5700 GPU at 4k, when browsing pages with the Firelux color temperature addon, drops from ~1.8 ms to ~0.3 ms. Differential Revision: https://phabricator.services.mozilla.com/D104491
2021-02-22 23:00:57 +00:00 · 2021-02-22 23:00:57 +00:00 · c9a99d4ee4
--- a/gfx/wr/webrender/src/device/gl.rs
+++ b/gfx/wr/webrender/src/device/gl.rs
@ -1063,7 +1063,7 @@ pub struct Device {
    bound_textures: [gl::GLuint; 16],
    bound_program: gl::GLuint,
    bound_vao: gl::GLuint,
-    bound_read_fbo: FBOId,
+    bound_read_fbo: (FBOId, DeviceIntPoint),
    bound_draw_fbo: FBOId,
    program_mode_id: UniformLocation,
    default_read_fbo: FBOId,
@ -1078,7 +1078,7 @@ pub struct Device {
    /// Whether to use draw calls instead of regular blitting commands.
    ///
    /// Note: this currently only applies to the batched texture uploads
-    /// path. 
+    /// path.
    use_draw_calls_for_texture_copy: bool,

    // HW or API capabilities
@ -1320,6 +1320,11 @@ pub enum ReadTarget {
    External {
        fbo: FBOId,
    },
+    /// An FBO bound to a native (OS compositor) surface
+    NativeSurface {
+        fbo_id: FBOId,
+        offset: DeviceIntPoint,
+    },
 }

 impl ReadTarget {
@ -1331,19 +1336,40 @@ impl ReadTarget {
            fbo_id: texture.fbos[layer],
        }
    }
+
+    fn offset(&self) -> DeviceIntPoint {
+        match *self {
+            ReadTarget::Default |
+            ReadTarget::Texture { .. } |
+            ReadTarget::External { .. } => {
+                DeviceIntPoint::zero()
+            }
+
+            ReadTarget::NativeSurface { offset, .. } => {
+                offset
+            }
+        }
+    }
 }

 impl From<DrawTarget> for ReadTarget {
    fn from(t: DrawTarget) -> Self {
        match t {
-            DrawTarget::Default { .. } => ReadTarget::Default,
-            DrawTarget::NativeSurface { .. } => {
-                unreachable!("bug: native surfaces cannot be read targets");
+            DrawTarget::Default { .. } => {
+                ReadTarget::Default
+            }
+            DrawTarget::NativeSurface { external_fbo_id, offset, .. } => {
+                ReadTarget::NativeSurface {
+                    fbo_id: FBOId(external_fbo_id),
+                    offset,
+                }
+            }
+            DrawTarget::Texture { fbo_id, .. } => {
+                ReadTarget::Texture { fbo_id }
+            }
+            DrawTarget::External { fbo, .. } => {
+                ReadTarget::External { fbo }
            }
-            DrawTarget::Texture { fbo_id, .. } =>
-                ReadTarget::Texture { fbo_id },
-            DrawTarget::External { fbo, .. } =>
-                ReadTarget::External { fbo },
        }
    }
 }
@ -1727,7 +1753,7 @@ impl Device {
            bound_textures: [0; 16],
            bound_program: 0,
            bound_vao: 0,
-            bound_read_fbo: FBOId(0),
+            bound_read_fbo: (FBOId(0), DeviceIntPoint::zero()),
            bound_draw_fbo: FBOId(0),
            program_mode_id: UniformLocation::INVALID,
            default_read_fbo: FBOId(0),
@ -1857,8 +1883,8 @@ impl Device {
        self.bound_vao = 0;
        self.gl.bind_vertex_array(0);

-        self.bound_read_fbo = self.default_read_fbo;
-        self.gl.bind_framebuffer(gl::READ_FRAMEBUFFER, self.bound_read_fbo.0);
+        self.bound_read_fbo = (self.default_read_fbo, DeviceIntPoint::zero());
+        self.gl.bind_framebuffer(gl::READ_FRAMEBUFFER, self.default_read_fbo.0);

        self.bound_draw_fbo = self.default_draw_fbo;
        self.gl.bind_framebuffer(gl::DRAW_FRAMEBUFFER, self.bound_draw_fbo.0);
@ -2032,13 +2058,18 @@ impl Device {
        self.bind_texture_impl(slot.into(), external_texture.id, external_texture.target, None);
    }

-    pub fn bind_read_target_impl(&mut self, fbo_id: FBOId) {
+    pub fn bind_read_target_impl(
+        &mut self,
+        fbo_id: FBOId,
+        offset: DeviceIntPoint,
+    ) {
        debug_assert!(self.inside_frame);

-        if self.bound_read_fbo != fbo_id {
-            self.bound_read_fbo = fbo_id;
+        if self.bound_read_fbo != (fbo_id, offset) {
            fbo_id.bind(self.gl(), FBOTarget::Read);
        }
+
+        self.bound_read_fbo = (fbo_id, offset);
    }

    pub fn bind_read_target(&mut self, target: ReadTarget) {
@ -2046,9 +2077,10 @@ impl Device {
            ReadTarget::Default => self.default_read_fbo,
            ReadTarget::Texture { fbo_id } => fbo_id,
            ReadTarget::External { fbo } => fbo,
+            ReadTarget::NativeSurface { fbo_id, .. } => fbo_id,
        };

-        self.bind_read_target_impl(fbo_id)
+        self.bind_read_target_impl(fbo_id, target.offset())
    }

    fn bind_draw_target_impl(&mut self, fbo_id: FBOId) {
@ -2062,7 +2094,7 @@ impl Device {

    pub fn reset_read_target(&mut self) {
        let fbo = self.default_read_fbo;
-        self.bind_read_target_impl(fbo);
+        self.bind_read_target_impl(fbo, DeviceIntPoint::zero());
    }


@ -2757,11 +2789,14 @@ impl Device {
            TextureFilter::Linear | TextureFilter::Trilinear => gl::LINEAR,
        };

+        let src_x0 = src_rect.origin.x + self.bound_read_fbo.1.x;
+        let src_y0 = src_rect.origin.y + self.bound_read_fbo.1.y;
+
        self.gl.blit_framebuffer(
-            src_rect.origin.x,
-            src_rect.origin.y,
-            src_rect.origin.x + src_rect.size.width,
-            src_rect.origin.y + src_rect.size.height,
+            src_x0,
+            src_y0,
+            src_x0 + src_rect.size.width,
+            src_y0 + src_rect.size.height,
            dest_rect.origin.x,
            dest_rect.origin.y,
            dest_rect.origin.x + dest_rect.size.width,
@ -2815,7 +2850,7 @@ impl Device {
                    ),
                ).intersection(&dimensions.into()).unwrap_or_else(DeviceIntRect::zero);

-                self.bind_read_target_impl(fbo);
+                self.bind_read_target_impl(fbo, DeviceIntPoint::zero());
                self.bind_texture_impl(
                    DEFAULT_TEXTURE,
                    id,
--- a/gfx/wr/webrender/src/picture.rs
+++ b/gfx/wr/webrender/src/picture.rs
@ -5057,7 +5057,7 @@ impl PicturePrimitive {
                frame_state.init_surface_tiled(
                    surface_index,
                    surface_tasks,
-                    device_clip_rect,
+                    device_clip_rect.translate(tile_cache.device_position.to_vector()),
                );
            }
            Some(ref mut raster_config) => {
--- a/gfx/wr/webrender/src/renderer/mod.rs
+++ b/gfx/wr/webrender/src/renderer/mod.rs
@ -5673,7 +5673,7 @@ impl Renderer {

        self.device.begin_frame();
        let _gm = self.gpu_profiler.start_marker("read GPU data");
-        self.device.bind_read_target_impl(self.read_fbo);
+        self.device.bind_read_target_impl(self.read_fbo, DeviceIntPoint::zero());

        if config.bits.contains(CaptureBits::EXTERNAL_RESOURCES) && !deferred_images.is_empty() {
            info!("saving external images");
--- a/gfx/wr/webrender/src/scene_building.rs
+++ b/gfx/wr/webrender/src/scene_building.rs
@ -1905,6 +1905,7 @@ impl<'a> SceneBuilder<'a> {
                context_3d,
                is_redundant,
                is_backdrop_root: flags.contains(StackingContextFlags::IS_BACKDROP_ROOT),
+                flags,
            });
        }

@ -1942,6 +1943,26 @@ impl<'a> SceneBuilder<'a> {

        let stacking_context = self.sc_stack.pop().unwrap();

+        // If the stacking context is a blend container, and if we're at the top level
+        // of the stacking context tree, we can make this blend container into a tile
+        // cache. This means that we get caching and correct scrolling invalidation for
+        // root level blend containers. For these cases, the readbacks of the backdrop
+        // are handled by doing partial reads of the picture cache tiles during rendering.
+        if stacking_context.flags.contains(StackingContextFlags::IS_BLEND_CONTAINER) &&
+           self.sc_stack.is_empty() &&
+           self.tile_cache_builder.can_add_container_tile_cache()
+        {
+            self.tile_cache_builder.add_tile_cache(
+                stacking_context.prim_list,
+                &self.spatial_tree,
+                &self.clip_store,
+                self.interners,
+                &self.config,
+            );
+
+            return;
+        }
+
        let parent_is_empty = match self.sc_stack.last() {
            Some(parent_sc) => {
                assert!(!stacking_context.is_redundant);
@ -3616,6 +3637,9 @@ struct FlattenedStackingContext {

    /// True if this stacking context is redundant (i.e. doesn't require a surface)
    is_redundant: bool,
+
+    /// Flags identifying the type of container (among other things) this stacking context is
+    flags: StackingContextFlags,
 }

 impl FlattenedStackingContext {
--- a/gfx/wr/webrender/src/tile_cache.rs
+++ b/gfx/wr/webrender/src/tile_cache.rs
@ -21,6 +21,15 @@ use crate::util::VecHelper;
 and into here.
 */

+// If the page would create too many slices (an arbitrary definition where
+// it's assumed the GPU memory + compositing overhead would be too high)
+// then create a single picture cache for the remaining content. This at
+// least means that we can cache small content changes efficiently when
+// scrolling isn't occurring. Scrolling regions will be handled reasonably
+// efficiently by the dirty rect tracking (since it's likely that if the
+// page has so many slices there isn't a single major scroll region).
+const MAX_CACHE_SLICES: usize = 12;
+
 /// Created during scene building, describes how to create a tile cache for a given slice.
 pub struct PendingTileCache {
    /// List of primitives that are part of this slice
@ -91,6 +100,124 @@ impl TileCacheBuilder {
        self.force_new_tile_cache = Some(slice_flags);
    }

+    /// Returns true if it's OK to add a container tile cache (will return false
+    /// if too many slices have been created).
+    pub fn can_add_container_tile_cache(&self) -> bool {
+        // See the logic and comments around MAX_CACHE_SLICES in add_prim
+        // to explain why < MAX_CACHE_SLICES-1 is used.
+        self.pending_tile_caches.len() < MAX_CACHE_SLICES-1
+    }
+
+    /// Create a new tile cache for an existing prim_list
+    pub fn add_tile_cache(
+        &mut self,
+        prim_list: PrimitiveList,
+        spatial_tree: &SpatialTree,
+        clip_store: &ClipStore,
+        interners: &Interners,
+        config: &FrameBuilderConfig,
+    ) {
+        assert!(self.can_add_container_tile_cache());
+
+        if prim_list.is_empty() {
+            return;
+        }
+
+        // Iterate the clusters and determine which is the most commonly occurring
+        // scroll root. This is a reasonable heuristic to decide which spatial node
+        // should be considered the scroll root of this tile cache, in order to
+        // minimize the invalidations that occur due to scrolling. It's often the
+        // case that a blend container will have only a single scroll root.
+        let mut found_scroll_roots = FastHashMap::default();
+
+        for cluster in &prim_list.clusters {
+            let scroll_root = self.find_scroll_root(
+                cluster.spatial_node_index,
+                spatial_tree,
+            );
+
+            *found_scroll_roots.entry(scroll_root).or_insert(0) += 1;
+        }
+
+        // Select the scroll root by finding the most commonly occurring one
+        let scroll_root = *found_scroll_roots
+            .iter()
+            .max_by_key(|entry | entry.1)
+            .unwrap()
+            .0;
+
+        let mut first = true;
+        let prim_clips_buffer = &mut self.prim_clips_buffer;
+        let mut shared_clips = Vec::new();
+
+        // Work out which clips are shared by all prim instances and can thus be applied
+        // at the tile cache level. In future, we aim to remove this limitation by knowing
+        // during initial scene build which are the relevant compositor clips, but for now
+        // this is unlikely to be a significant cost.
+        for cluster in &prim_list.clusters {
+            for prim_instance in &prim_list.prim_instances[cluster.prim_range()] {
+                if first {
+                    add_clips(
+                        scroll_root,
+                        prim_instance.clip_set.clip_chain_id,
+                        &mut shared_clips,
+                        clip_store,
+                        interners,
+                        spatial_tree,
+                    );
+
+                    self.last_checked_clip_chain = prim_instance.clip_set.clip_chain_id;
+                    first = false;
+                } else {
+                    if self.last_checked_clip_chain != prim_instance.clip_set.clip_chain_id {
+                        prim_clips_buffer.clear();
+
+                        add_clips(
+                            scroll_root,
+                            prim_instance.clip_set.clip_chain_id,
+                            prim_clips_buffer,
+                            clip_store,
+                            interners,
+                            spatial_tree,
+                        );
+
+                        shared_clips.retain(|h1: &ClipInstance| {
+                            let uid = h1.handle.uid();
+                            prim_clips_buffer.iter().any(|h2| {
+                                uid == h2.handle.uid() &&
+                                h1.spatial_node_index == h2.spatial_node_index
+                            })
+                        });
+
+                        self.last_checked_clip_chain = prim_instance.clip_set.clip_chain_id;
+                    }
+                }
+            }
+        }
+
+        // Construct the new tile cache and add to the list to be built
+        let slice = self.pending_tile_caches.len();
+
+        let params = TileCacheParams {
+            slice,
+            slice_flags: SliceFlags::empty(),
+            spatial_node_index: scroll_root,
+            background_color: None,
+            shared_clips,
+            shared_clip_chain: ClipChainId::NONE,
+            virtual_surface_size: config.compositor_kind.get_virtual_surface_size(),
+        };
+
+        self.pending_tile_caches.push(PendingTileCache {
+            prim_list,
+            params,
+        });
+
+        // Add a tile cache barrier so that the next prim definitely gets added to a
+        // new tile cache, even if it's otherwise compatible with the blend container.
+        self.force_new_tile_cache = Some(SliceFlags::empty());
+    }
+
    /// Add a primitive, either to the current tile cache, or a new one, depending on various conditions.
    pub fn add_prim(
        &mut self,
@ -193,14 +320,6 @@ impl TileCacheBuilder {
        }

        if want_new_tile_cache {
-            // If the page would create too many slices (an arbitrary definition where
-            // it's assumed the GPU memory + compositing overhead would be too high)
-            // then create a single picture cache for the remaining content. This at
-            // least means that we can cache small content changes efficiently when
-            // scrolling isn't occurring. Scrolling regions will be handled reasonably
-            // efficiently by the dirty rect tracking (since it's likely that if the
-            // page has so many slices there isn't a single major scroll region).
-            const MAX_CACHE_SLICES: usize = 12;
            let slice = self.pending_tile_caches.len();

            // If we have exceeded the maximum number of slices, skip creating a new
--- a/gfx/wr/wrench/reftests/blend/mix-blend-invalid-backdrop.yaml
+++ b/gfx/wr/wrench/reftests/blend/mix-blend-invalid-backdrop.yaml
@ -4,20 +4,26 @@
 ---
 root:
  items:
-    - type: clip
-      id: 2
-      bounds: [0, 0, 100, 100]
+    # Ensure a filter is placed here to force this mix-blend to isolate from
+    # the tile cache backdrop - otherwise the surface won't get clipped, which
+    # is what we're trying to test.
    - type: stacking-context
-      blend-container: true
-      clip-node: 2
+      filters: [identity]
      items:
-        - type: rect
-          bounds: [0, 0, 100, 100]
-          color: red
-        - type: stacking-context
-          bounds: [100, 0, 100, 100]
-          mix-blend-mode: multiply
-          items:
-            - type: rect
-              bounds: [0, 0, 100, 100]
-              color: green
+      - type: clip
+        id: 2
+        bounds: [0, 0, 100, 100]
+      - type: stacking-context
+        blend-container: true
+        clip-node: 2
+        items:
+          - type: rect
+            bounds: [0, 0, 100, 100]
+            color: red
+          - type: stacking-context
+            bounds: [100, 0, 100, 100]
+            mix-blend-mode: multiply
+            items:
+              - type: rect
+                bounds: [0, 0, 100, 100]
+                color: green
--- a/gfx/wr/wrench/reftests/blend/reftest.list
+++ b/gfx/wr/wrench/reftests/blend/reftest.list
@ -1,6 +1,6 @@
 == multiply.yaml multiply-ref.yaml
 fuzzy(1,32) == multiply-2.yaml multiply-2-ref.yaml
-fuzzy(1,32) == color_targets(4) alpha_targets(0) multiply-3.yaml multiply-2-ref.yaml
+fuzzy(1,32) == color_targets(3) alpha_targets(0) multiply-3.yaml multiply-2-ref.yaml
 == difference.yaml difference-ref.yaml
 fuzzy(1,30000) == difference-transparent.yaml difference-transparent-ref.yaml
 fuzzy-if(platform(swgl),1,10000) == darken.yaml darken-ref.yaml
--- a/gfx/wr/wrench/reftests/text/reftest.list
+++ b/gfx/wr/wrench/reftests/text/reftest.list
@ -81,4 +81,4 @@ fuzzy(1,15) platform(linux) force_subpixel_aa_where_possible(false) == text-fixe
 # most pixels are off by a small amount, but a few pixels on the edge vary by a lot, pushing up the fuzzy max-diff;
 # the main goal of the test is that everything is in the same place, at the same scale, clipped the same way,
 # despite 4x on-the-fly scale change.
-skip_on(android) fuzzy-range(<=3,*21700,<=20,*3500,<=115,*543) fuzzy-if(platform(swgl),103,24907) == raster_root_C_8192.yaml raster_root_C_ref.yaml
+skip_on(android) fuzzy-range(<=3,*21700,<=20,*3500,<=119,*590) fuzzy-if(platform(swgl),108,24907) == raster_root_C_8192.yaml raster_root_C_ref.yaml
--- a/testing/web-platform/meta/css/compositing/background-blending/background-blend-mode-gradient-image.html.ini
+++ b/testing/web-platform/meta/css/compositing/background-blending/background-blend-mode-gradient-image.html.ini
@ -1,3 +1,5 @@
 [background-blend-mode-gradient-image.html]
  expected:
    if (os == "android") and debug: [PASS, FAIL]
+  fuzzy:
+    if os == "win" and webrender: maxDifference=92;totalPixels=2135