Bug 1516676 - Update webrender to commit 8b8ca774f555aeb8ef99743196593a192298dd60 (WR PR #3452). r=kats

https://github.com/servo/webrender/pull/3452

Differential Revision: https://phabricator.services.mozilla.com/D15494

--HG--
extra : moz-landing-system : lando
WR Updater Bot 2018-12-29 21:29:46 +00:00
Parent 17389dd116
Commit fa50bcc2e7
12 changed files with 355 additions and 158 deletions

View file

@@ -1 +1 @@
b4dfe9c4f98fdeca3814976cd075bde8ed409123
8b8ca774f555aeb8ef99743196593a192298dd60

View file

@@ -31,13 +31,28 @@ use profiler::GpuCacheProfileCounters;
use render_backend::FrameId;
use renderer::MAX_VERTEX_TEXTURE_WIDTH;
use std::{mem, u16, u32};
use std::num::NonZeroU32;
use std::ops::Add;
use std::os::raw::c_void;
use std::time::{Duration, Instant};
pub const GPU_CACHE_INITIAL_HEIGHT: i32 = 512;
/// At the time of this writing, Firefox uses about 15 GPU cache rows on
/// startup, and then gradually works its way up to the mid-30s with normal
/// browsing.
pub const GPU_CACHE_INITIAL_HEIGHT: i32 = 20;
const NEW_ROWS_PER_RESIZE: i32 = 10;
/// The number of frames an entry can go unused before being evicted.
const FRAMES_BEFORE_EVICTION: usize = 10;
const NEW_ROWS_PER_RESIZE: i32 = 512;
/// The ratio of utilized blocks to total blocks for which we start the clock
/// on reclaiming memory.
const RECLAIM_THRESHOLD: f32 = 0.2;
/// The amount of time utilization must be below the above threshold before we
/// blow away the cache and rebuild it.
const RECLAIM_DELAY_S: u64 = 5;
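
The two constants above implement a simple hysteresis: reclamation only fires once utilization has stayed below RECLAIM_THRESHOLD continuously for RECLAIM_DELAY_S seconds, and any busy frame resets the clock. A minimal standalone sketch of that logic (ReclaimClock is a name invented here for illustration; the patch stores the Option<Instant> directly on Texture as reached_reclaim_threshold):

use std::time::{Duration, Instant};

const RECLAIM_THRESHOLD: f32 = 0.2;
const RECLAIM_DELAY_S: u64 = 5;

struct ReclaimClock {
    // When utilization first dropped below the threshold, if it has
    // stayed below it since; None otherwise.
    reached_reclaim_threshold: Option<Instant>,
}

impl ReclaimClock {
    // Called once per frame with the current utilization ratio.
    fn update(&mut self, utilization: f32) {
        if utilization < RECLAIM_THRESHOLD {
            // Start the clock on the first low-utilization frame only.
            self.reached_reclaim_threshold.get_or_insert_with(Instant::now);
        } else {
            // Any frame at or above the threshold resets the clock.
            self.reached_reclaim_threshold = None;
        }
    }

    fn should_reclaim(&self) -> bool {
        self.reached_reclaim_threshold
            .map_or(false, |t| t.elapsed() > Duration::from_secs(RECLAIM_DELAY_S))
    }
}

fn main() {
    let mut clock = ReclaimClock { reached_reclaim_threshold: None };
    clock.update(0.1);                // below threshold: the clock starts...
    assert!(!clock.should_reclaim()); // ...but the 5s delay has not elapsed
    clock.update(0.5);                // utilization recovered: the clock resets
    assert!(!clock.should_reclaim());
}
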
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[cfg_attr(feature = "capture", derive(Serialize))]
@@ -131,7 +146,7 @@ impl GpuCacheHandle {
// A unique address in the GPU cache. These are uploaded
// as part of the primitive instances, to allow the vertex
// shader to fetch the specific data.
#[derive(Copy, Debug, Clone)]
#[derive(Copy, Debug, Clone, Eq, PartialEq)]
#[cfg_attr(feature = "capture", derive(Serialize))]
#[cfg_attr(feature = "replay", derive(Deserialize))]
pub struct GpuCacheAddress {
@@ -173,31 +188,67 @@ impl Add<usize> for GpuCacheAddress {
struct Block {
// The location in the cache of this block.
address: GpuCacheAddress,
// The current epoch (generation) of this block.
epoch: Epoch,
// Index of the next free block in the list it
// belongs to (either a free-list or the
// occupied list).
next: Option<BlockIndex>,
// The current epoch (generation) of this block.
epoch: Epoch,
// The last frame this block was referenced.
last_access_time: FrameId,
}
impl Block {
fn new(address: GpuCacheAddress, next: Option<BlockIndex>, frame_id: FrameId) -> Self {
fn new(
address: GpuCacheAddress,
next: Option<BlockIndex>,
frame_id: FrameId,
epoch: Epoch,
) -> Self {
Block {
address,
next,
last_access_time: frame_id,
epoch: Epoch(0),
epoch,
}
}
fn advance_epoch(&mut self, max_epoch: &mut Epoch) {
self.epoch.next();
if max_epoch.0 < self.epoch.0 {
max_epoch.0 = self.epoch.0;
}
}
/// An invalid dummy block, used to reserve index zero.
pub const INVALID: Block = Block {
address: GpuCacheAddress { u: 0, v: 0 },
epoch: Epoch(0),
next: None,
last_access_time: FrameId::INVALID,
};
}
/// Represents the index of a Block in the block array. We only create such
/// structs for blocks that represent the start of a chunk.
///
/// Because we use Option<BlockIndex> in a lot of places, we use a NonZeroU32
/// here and avoid ever using the index zero.
#[derive(Debug, Copy, Clone)]
#[cfg_attr(feature = "capture", derive(Serialize))]
#[cfg_attr(feature = "replay", derive(Deserialize))]
struct BlockIndex(usize);
struct BlockIndex(NonZeroU32);
impl BlockIndex {
fn new(idx: usize) -> Self {
debug_assert!(idx <= u32::MAX as usize);
BlockIndex(NonZeroU32::new(idx as u32).expect("Index zero forbidden"))
}
fn get(&self) -> usize {
self.0.get() as usize
}
}
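
Switching BlockIndex from usize to NonZeroU32 halves it on 64-bit and, more importantly, lets Option<BlockIndex> use zero as its None representation (Rust's niche optimization), so the Option costs nothing extra. A standalone demonstration (the struct names here are hypothetical, not from the patch):

use std::mem::size_of;
use std::num::NonZeroU32;

#[allow(dead_code)]
struct PlainIndex(u32);
#[allow(dead_code)]
struct NicheIndex(NonZeroU32);

fn main() {
    // With a plain u32 the Option needs a separate discriminant,
    // padding the pair out to 8 bytes on typical layouts.
    assert_eq!(size_of::<Option<PlainIndex>>(), 8);
    // NonZeroU32 guarantees 0 never occurs, so the compiler encodes
    // None as 0 and the Option is exactly as big as the index.
    assert_eq!(size_of::<Option<NicheIndex>>(), 4);
}
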
// A row in the cache texture.
#[cfg_attr(feature = "capture", derive(Serialize))]
@@ -232,10 +283,19 @@ pub enum GpuCacheUpdate {
},
}
pub struct GpuDebugChunk {
/// Command to inform the debug display in the renderer when chunks are allocated
/// or freed.
pub enum GpuCacheDebugCmd {
/// Describes an allocated chunk.
Alloc(GpuCacheDebugChunk),
/// Describes a freed chunk.
Free(GpuCacheAddress),
}
#[derive(Clone)]
pub struct GpuCacheDebugChunk {
pub address: GpuCacheAddress,
pub tag: u8,
pub size: u16,
pub size: usize,
}
#[must_use]
@@ -254,7 +314,7 @@ pub struct GpuCacheUpdateList {
pub blocks: Vec<GpuBlockData>,
/// Whole state GPU block metadata for debugging.
#[cfg_attr(feature = "serde", serde(skip))]
pub debug_chunks: Vec<GpuDebugChunk>,
pub debug_commands: Vec<GpuCacheDebugCmd>,
}
// Holds the free lists of fixed size blocks. Mostly
@@ -270,7 +330,10 @@ struct FreeBlockLists {
free_list_32: Option<BlockIndex>,
free_list_64: Option<BlockIndex>,
free_list_128: Option<BlockIndex>,
free_list_large: Option<BlockIndex>,
free_list_256: Option<BlockIndex>,
free_list_341: Option<BlockIndex>,
free_list_512: Option<BlockIndex>,
free_list_1024: Option<BlockIndex>,
}
impl FreeBlockLists {
@@ -284,7 +347,10 @@ impl FreeBlockLists {
free_list_32: None,
free_list_64: None,
free_list_128: None,
free_list_large: None,
free_list_256: None,
free_list_341: None,
free_list_512: None,
free_list_1024: None,
}
}
@@ -292,8 +358,14 @@ impl FreeBlockLists {
&mut self,
block_count: usize,
) -> (usize, &mut Option<BlockIndex>) {
// Find the appropriate free list to use
// based on the block size.
// Find the appropriate free list to use based on the block size.
//
// Note that we cheat a bit with the 341 bucket, since it's not quite
// a divisor of 1024, because purecss-francine allocates many 260-block
// chunks, and there's no reason we shouldn't pack these three to a row.
// This means the allocation statistics will under-report by one block
// for each row using 341-block buckets, which is fine.
debug_assert_eq!(MAX_VERTEX_TEXTURE_WIDTH, 1024, "Need to update bucketing");
match block_count {
0 => panic!("Can't allocate zero sized blocks!"),
1 => (1, &mut self.free_list_1),
@@ -304,7 +376,10 @@ impl FreeBlockLists {
17...32 => (32, &mut self.free_list_32),
33...64 => (64, &mut self.free_list_64),
65...128 => (128, &mut self.free_list_128),
129...MAX_VERTEX_TEXTURE_WIDTH => (MAX_VERTEX_TEXTURE_WIDTH, &mut self.free_list_large),
129...256 => (256, &mut self.free_list_256),
257...341 => (341, &mut self.free_list_341),
342...512 => (512, &mut self.free_list_512),
513...1024 => (1024, &mut self.free_list_1024),
_ => panic!("Can't allocate > MAX_VERTEX_TEXTURE_WIDTH per resource!"),
}
}
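
The bucket sizes above can be sanity-checked with a little arithmetic (a standalone check, not part of the patch): every bucket except 341 divides the 1024-block row evenly, while the 341 bucket packs three items per row with a single block of slack, which is exactly the one-block under-report the comment mentions.

fn main() {
    const ROW_WIDTH: usize = 1024; // MAX_VERTEX_TEXTURE_WIDTH
    // All power-of-two buckets tile a row exactly.
    for &bucket in [1usize, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024].iter() {
        assert_eq!(ROW_WIDTH % bucket, 0);
    }
    // The 341 bucket is the deliberate exception: three items per row,
    // wasting one block (and fitting purecss-francine's 260-block chunks).
    assert_eq!(341 * 3, 1023);
    assert_eq!(ROW_WIDTH - 341 * 3, 1);
}
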
@@ -320,6 +395,12 @@ struct Texture {
blocks: Vec<Block>,
// Metadata about each allocated row.
rows: Vec<Row>,
// The base Epoch for this texture.
base_epoch: Epoch,
// The maximum epoch reached. We track this along with the above so
// that we can rebuild the Texture and avoid collisions with handles
// allocated for the old texture.
max_epoch: Epoch,
// Free lists of available blocks for each supported
// block size in the texture. These are intrusive
// linked lists.
@@ -335,19 +416,39 @@ struct Texture {
updates: Vec<GpuCacheUpdate>,
// Profile stats
allocated_block_count: usize,
// The stamp at which we first reached our threshold for reclaiming `GpuCache`
// memory, or `None` if the threshold hasn't been reached.
#[cfg_attr(feature = "serde", serde(skip))]
reached_reclaim_threshold: Option<Instant>,
// List of debug commands to be sent to the renderer when the GPU cache
// debug display is enabled.
#[cfg_attr(feature = "serde", serde(skip))]
debug_commands: Vec<GpuCacheDebugCmd>,
// The current debug flags for the system.
debug_flags: DebugFlags,
}
impl Texture {
fn new() -> Self {
fn new(base_epoch: Epoch, debug_flags: DebugFlags) -> Self {
// Pre-fill the block array with one invalid block so that we never use
// 0 for a BlockIndex. This lets us use NonZeroU32 for BlockIndex, which
// saves memory.
let blocks = vec![Block::INVALID];
Texture {
height: GPU_CACHE_INITIAL_HEIGHT,
blocks: Vec::new(),
blocks,
rows: Vec::new(),
base_epoch,
max_epoch: base_epoch,
free_lists: FreeBlockLists::new(),
pending_blocks: Vec::new(),
updates: Vec::new(),
occupied_list_head: None,
allocated_block_count: 0,
reached_reclaim_threshold: None,
debug_commands: Vec::new(),
debug_flags,
}
}
@@ -393,8 +494,8 @@ impl Texture {
let mut prev_block_index = None;
for i in 0 .. items_per_row {
let address = GpuCacheAddress::new(i * alloc_size, row_index);
let block_index = BlockIndex(self.blocks.len());
let block = Block::new(address, prev_block_index, frame_id);
let block_index = BlockIndex::new(self.blocks.len());
let block = Block::new(address, prev_block_index, frame_id, self.base_epoch);
self.blocks.push(block);
prev_block_index = Some(block_index);
}
@@ -406,7 +507,7 @@ impl Texture {
// available in the appropriate free-list. Pull a block from the
// head of the list.
let free_block_index = free_list.take().unwrap();
let block = &mut self.blocks[free_block_index.0 as usize];
let block = &mut self.blocks[free_block_index.get()];
*free_list = block.next;
// Add the block to the occupied linked list.
@@ -425,6 +526,18 @@ impl Texture {
});
}
// If we're using the debug display, communicate the allocation to the
// renderer thread. Note that we do this regardless of whether or not
// pending_block_index is None (if it is, the renderer thread will fill
// in the data via a deferred resolve, but the block is still considered
// allocated).
if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
self.debug_commands.push(GpuCacheDebugCmd::Alloc(GpuCacheDebugChunk {
address: block.address,
size: block_count,
}));
}
CacheLocation {
block_index: free_block_index,
epoch: block.epoch,
@@ -442,7 +555,7 @@ impl Texture {
while let Some(index) = current_block {
let (next_block, should_unlink) = {
let block = &mut self.blocks[index.0 as usize];
let block = &mut self.blocks[index.get()];
let next_block = block.next;
let mut should_unlink = false;
@@ -461,11 +574,16 @@ impl Texture {
let (_, free_list) = self.free_lists
.get_actual_block_count_and_free_list(row.block_count_per_item);
block.epoch.next();
block.advance_epoch(&mut self.max_epoch);
block.next = *free_list;
*free_list = Some(index);
self.allocated_block_count -= row.block_count_per_item;
if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
let cmd = GpuCacheDebugCmd::Free(block.address);
self.debug_commands.push(cmd);
}
};
(next_block, should_unlink)
@@ -476,7 +594,7 @@ impl Texture {
if should_unlink {
match prev_block {
Some(prev_block) => {
self.blocks[prev_block.0 as usize].next = next_block;
self.blocks[prev_block.get()].next = next_block;
}
None => {
self.occupied_list_head = next_block;
@@ -489,6 +607,15 @@ impl Texture {
current_block = next_block;
}
}
/// Returns the ratio of utilized blocks.
fn utilization(&self) -> f32 {
let total_blocks = self.rows.len() * MAX_VERTEX_TEXTURE_WIDTH;
debug_assert!(total_blocks > 0);
let ratio = self.allocated_block_count as f32 / total_blocks as f32;
debug_assert!(0.0 <= ratio && ratio <= 1.0, "Bad ratio: {}", ratio);
ratio
}
}
@@ -546,14 +673,25 @@ pub struct GpuCache {
impl GpuCache {
pub fn new() -> Self {
let debug_flags = DebugFlags::empty();
GpuCache {
frame_id: FrameId::INVALID,
texture: Texture::new(),
texture: Texture::new(Epoch(0), debug_flags),
saved_block_count: 0,
debug_flags: DebugFlags::empty(),
debug_flags,
}
}
/// Drops everything in the GPU cache. Paired by the caller with a message
/// to the renderer thread telling it to do the same.
pub fn clear(&mut self) {
assert!(self.texture.updates.is_empty(), "Clearing with pending updates");
let mut next_base_epoch = self.texture.max_epoch;
next_base_epoch.next();
self.texture = Texture::new(next_base_epoch, self.debug_flags);
self.saved_block_count = 0;
}
/// Begin a new frame.
pub fn begin_frame(&mut self, frame_id: FrameId) {
debug_assert!(self.texture.pending_blocks.is_empty());
@@ -567,29 +705,31 @@ impl GpuCache {
// will rebuild the data and upload it to the GPU.
pub fn invalidate(&mut self, handle: &GpuCacheHandle) {
if let Some(ref location) = handle.location {
let block = &mut self.texture.blocks[location.block_index.0];
// don't invalidate blocks that are already re-assigned
if block.epoch == location.epoch {
block.epoch.next();
if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) {
if block.epoch == location.epoch {
block.advance_epoch(&mut self.texture.max_epoch);
}
}
}
}
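
The epoch scheme that makes this work: a handle is valid only while its recorded epoch matches the block's current epoch, so bumping the block's epoch invalidates every outstanding handle at once. A simplified sketch (stand-in types, not the real structs) that also shows why the patch switches from direct indexing to get_mut(): after a cache rebuild, a stale handle may point past the end of the new, shorter block array.

#[derive(Clone, Copy, PartialEq, Eq)]
struct Epoch(u32);

struct Block {
    epoch: Epoch,
}

#[derive(Clone, Copy)]
struct Handle {
    block: usize,
    epoch: Epoch,
}

fn is_valid(blocks: &[Block], h: Handle) -> bool {
    // get() rather than indexing: a stale handle must not panic.
    blocks.get(h.block).map_or(false, |b| b.epoch == h.epoch)
}

fn main() {
    let mut blocks = vec![Block { epoch: Epoch(0) }];
    let handle = Handle { block: 0, epoch: blocks[0].epoch };
    assert!(is_valid(&blocks, handle));

    // Invalidation just bumps the epoch; the handle silently goes stale.
    blocks[0].epoch.0 += 1;
    assert!(!is_valid(&blocks, handle));

    // After a rebuild the array may be shorter than the handle's index.
    blocks.clear();
    assert!(!is_valid(&blocks, handle));
}
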
// Request a resource be added to the cache. If the resource
/// Request a resource be added to the cache. If the resource
/// is already in the cache, `None` will be returned.
pub fn request<'a>(&'a mut self, handle: &'a mut GpuCacheHandle) -> Option<GpuDataRequest<'a>> {
let mut max_block_count = MAX_VERTEX_TEXTURE_WIDTH;
// Check if the allocation for this handle is still valid.
if let Some(ref location) = handle.location {
let block = &mut self.texture.blocks[location.block_index.0];
max_block_count = self.texture.rows[block.address.v as usize].block_count_per_item;
if block.epoch == location.epoch {
if block.last_access_time != self.frame_id {
// Mark last access time to avoid evicting this block.
block.last_access_time = self.frame_id;
self.saved_block_count += max_block_count;
if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) {
if block.epoch == location.epoch {
max_block_count = self.texture.rows[block.address.v as usize].block_count_per_item;
if block.last_access_time != self.frame_id {
// Mark last access time to avoid evicting this block.
block.last_access_time = self.frame_id;
self.saved_block_count += max_block_count;
}
return None;
}
return None;
}
}
@@ -631,7 +771,7 @@ impl GpuCache {
/// End the frame. Return the list of updates to apply to the
/// device specific cache texture.
pub fn end_frame(
&self,
&mut self,
profile_counters: &mut GpuCacheProfileCounters,
) -> FrameId {
profile_counters
@@ -643,28 +783,32 @@ impl GpuCache {
profile_counters
.saved_blocks
.set(self.saved_block_count);
let reached_threshold =
self.texture.rows.len() > (GPU_CACHE_INITIAL_HEIGHT as usize) &&
self.texture.utilization() < RECLAIM_THRESHOLD;
if reached_threshold {
self.texture.reached_reclaim_threshold.get_or_insert_with(Instant::now);
} else {
self.texture.reached_reclaim_threshold = None;
}
self.frame_id
}
/// Returns true if utilization has been low enough for long enough that we
/// should blow the cache away and rebuild it.
pub fn should_reclaim_memory(&self) -> bool {
self.texture.reached_reclaim_threshold
.map_or(false, |t| t.elapsed() > Duration::from_secs(RECLAIM_DELAY_S))
}
/// Extract the pending updates from the cache.
pub fn extract_updates(&mut self) -> GpuCacheUpdateList {
GpuCacheUpdateList {
frame_id: self.frame_id,
height: self.texture.height,
debug_chunks: if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
self.texture.updates
.iter()
.map(|update| match *update {
GpuCacheUpdate::Copy { address, block_index: _, block_count } => GpuDebugChunk {
address,
tag: 0, //TODO
size: block_count.min(0xFFFF) as u16,
}
})
.collect()
} else {
Vec::new()
},
debug_commands: mem::replace(&mut self.texture.debug_commands, Vec::new()),
updates: mem::replace(&mut self.texture.updates, Vec::new()),
blocks: mem::replace(&mut self.texture.pending_blocks, Vec::new()),
}
@@ -673,6 +817,7 @@ impl GpuCache {
/// Sets the current debug flags for the system.
pub fn set_debug_flags(&mut self, flags: DebugFlags) {
self.debug_flags = flags;
self.texture.debug_flags = flags;
}
/// Get the actual GPU address in the texture for a given slot ID.
@@ -681,7 +826,7 @@ impl GpuCache {
/// freed or pending slot will panic!
pub fn get_address(&self, id: &GpuCacheHandle) -> GpuCacheAddress {
let location = id.location.expect("handle not requested or allocated!");
let block = &self.texture.blocks[location.block_index.0];
let block = &self.texture.blocks[location.block_index.get()];
debug_assert_eq!(block.epoch, location.epoch);
debug_assert_eq!(block.last_access_time, self.frame_id);
block.address
@@ -692,3 +837,12 @@ impl GpuCache {
self.texture.malloc_size_of(op)
}
}
#[test]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// We can end up with a lot of blocks stored in the global vec, and keeping
// them small helps reduce memory overhead.
assert_eq!(mem::size_of::<Block>(), 24, "Block size changed");
}

View file

@@ -295,6 +295,7 @@ pub enum ResultMsg {
DebugOutput(DebugOutput),
RefreshShader(PathBuf),
UpdateGpuCache(GpuCacheUpdateList),
ClearGpuCache,
UpdateResources {
updates: TextureUpdateList,
memory_pressure: bool,

View file

@@ -374,7 +374,7 @@ impl IsVisible for ImageBorder {
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -384,9 +384,9 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<NormalBorderPrim>(), 84, "NormalBorderPrim size changed");
assert_eq!(mem::size_of::<NormalBorderTemplate>(), 240, "NormalBorderTemplate size changed");
assert_eq!(mem::size_of::<NormalBorderTemplate>(), 224, "NormalBorderTemplate size changed");
assert_eq!(mem::size_of::<NormalBorderKey>(), 112, "NormalBorderKey size changed");
assert_eq!(mem::size_of::<ImageBorder>(), 92, "ImageBorder size changed");
assert_eq!(mem::size_of::<ImageBorderTemplate>(), 104, "ImageBorderTemplate size changed");
assert_eq!(mem::size_of::<ImageBorderTemplate>(), 88, "ImageBorderTemplate size changed");
assert_eq!(mem::size_of::<ImageBorderKey>(), 120, "ImageBorderKey size changed");
}

View file

@@ -714,7 +714,7 @@ impl GradientGpuBlockBuilder {
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -724,10 +724,10 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<LinearGradient>(), 72, "LinearGradient size changed");
assert_eq!(mem::size_of::<LinearGradientTemplate>(), 168, "LinearGradientTemplate size changed");
assert_eq!(mem::size_of::<LinearGradientTemplate>(), 128, "LinearGradientTemplate size changed");
assert_eq!(mem::size_of::<LinearGradientKey>(), 96, "LinearGradientKey size changed");
assert_eq!(mem::size_of::<RadialGradient>(), 72, "RadialGradient size changed");
assert_eq!(mem::size_of::<RadialGradientTemplate>(), 168, "RadialGradientTemplate size changed");
assert_eq!(mem::size_of::<RadialGradientTemplate>(), 136, "RadialGradientTemplate size changed");
assert_eq!(mem::size_of::<RadialGradientKey>(), 104, "RadialGradientKey size changed");
}

View file

@@ -549,7 +549,7 @@ impl IsVisible for YuvImage {
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -559,9 +559,9 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<Image>(), 56, "Image size changed");
assert_eq!(mem::size_of::<ImageTemplate>(), 144, "ImageTemplate size changed");
assert_eq!(mem::size_of::<ImageTemplate>(), 124, "ImageTemplate size changed");
assert_eq!(mem::size_of::<ImageKey>(), 84, "ImageKey size changed");
assert_eq!(mem::size_of::<YuvImage>(), 36, "YuvImage size changed");
assert_eq!(mem::size_of::<YuvImageTemplate>(), 96, "YuvImageTemplate size changed");
assert_eq!(mem::size_of::<YuvImageTemplate>(), 72, "YuvImageTemplate size changed");
assert_eq!(mem::size_of::<YuvImageKey>(), 64, "YuvImageKey size changed");
}

View file

@@ -180,7 +180,7 @@ impl IsVisible for LineDecoration {
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -190,6 +190,6 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<LineDecoration>(), 20, "LineDecoration size changed");
assert_eq!(mem::size_of::<LineDecorationTemplate>(), 88, "LineDecorationTemplate size changed");
assert_eq!(mem::size_of::<LineDecorationTemplate>(), 68, "LineDecorationTemplate size changed");
assert_eq!(mem::size_of::<LineDecorationKey>(), 48, "LineDecorationKey size changed");
}

View file

@@ -3311,7 +3311,7 @@ fn update_opacity_binding(
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -3322,7 +3322,7 @@ fn test_struct_sizes() {
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<PrimitiveInstance>(), 120, "PrimitiveInstance size changed");
assert_eq!(mem::size_of::<PrimitiveInstanceKind>(), 40, "PrimitiveInstanceKind size changed");
assert_eq!(mem::size_of::<PrimitiveTemplate>(), 80, "PrimitiveTemplate size changed");
assert_eq!(mem::size_of::<PrimitiveTemplate>(), 56, "PrimitiveTemplate size changed");
assert_eq!(mem::size_of::<PrimitiveTemplateKind>(), 20, "PrimitiveTemplateKind size changed");
assert_eq!(mem::size_of::<PrimitiveKey>(), 36, "PrimitiveKey size changed");
assert_eq!(mem::size_of::<PrimitiveKeyKind>(), 5, "PrimitiveKeyKind size changed");

View file

@@ -224,7 +224,7 @@ impl IsVisible for Picture {
}
#[test]
#[cfg(target_os = "linux")]
#[cfg(target_pointer_width = "64")]
fn test_struct_sizes() {
use std::mem;
// The sizes of these structures are critical for performance on a number of
@@ -234,6 +234,6 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<Picture>(), 84, "Picture size changed");
assert_eq!(mem::size_of::<PictureTemplate>(), 56, "PictureTemplate size changed");
assert_eq!(mem::size_of::<PictureTemplate>(), 36, "PictureTemplate size changed");
assert_eq!(mem::size_of::<PictureKey>(), 112, "PictureKey size changed");
}

View file

@@ -328,6 +328,7 @@ impl TextRunPrimitive {
}
}
/// These are linux only because FontInstancePlatformOptions varies in size by platform.
#[test]
#[cfg(target_os = "linux")]
fn test_struct_sizes() {
@@ -339,7 +340,7 @@ fn test_struct_sizes() {
// (b) You made a structure larger. This is not necessarily a problem, but should only
// be done with care, and after checking if talos performance regresses badly.
assert_eq!(mem::size_of::<TextRun>(), 112, "TextRun size changed");
assert_eq!(mem::size_of::<TextRunTemplate>(), 160, "TextRunTemplate size changed");
assert_eq!(mem::size_of::<TextRunTemplate>(), 144, "TextRunTemplate size changed");
assert_eq!(mem::size_of::<TextRunKey>(), 136, "TextRunKey size changed");
assert_eq!(mem::size_of::<TextRunPrimitive>(), 88, "TextRunPrimitive size changed");
}

View file

@@ -8,7 +8,7 @@
//! See the comment at the top of the `renderer` module for a description of
//! how these two pieces interact.
use api::{ApiMsg, BuiltDisplayList, ClearCache, DebugCommand};
use api::{ApiMsg, BuiltDisplayList, ClearCache, DebugCommand, DebugFlags};
#[cfg(feature = "debugger")]
use api::{BuiltDisplayListIter, SpecificDisplayItem};
use api::{DevicePixelScale, DeviceIntPoint, DeviceIntRect, DeviceIntSize};
@@ -659,6 +659,7 @@ pub struct RenderBackend {
recorder: Option<Box<ApiRecordingReceiver>>,
sampler: Option<Box<AsyncPropertySampler + Send>>,
size_of_op: Option<VoidPtrToSizeFn>,
debug_flags: DebugFlags,
namespace_alloc_by_client: bool,
}
@@ -677,6 +678,7 @@ impl RenderBackend {
recorder: Option<Box<ApiRecordingReceiver>>,
sampler: Option<Box<AsyncPropertySampler + Send>>,
size_of_op: Option<VoidPtrToSizeFn>,
debug_flags: DebugFlags,
namespace_alloc_by_client: bool,
) -> RenderBackend {
RenderBackend {
@@ -696,6 +698,7 @@ impl RenderBackend {
recorder,
sampler,
size_of_op,
debug_flags,
namespace_alloc_by_client,
}
}
@@ -1005,6 +1008,8 @@ impl RenderBackend {
// recently used resources.
self.resource_cache.clear(ClearCache::all());
self.clear_gpu_cache();
let pending_update = self.resource_cache.pending_updates();
let msg = ResultMsg::UpdateResources {
updates: pending_update,
@@ -1104,6 +1109,22 @@ impl RenderBackend {
DebugCommand::SetFlags(flags) => {
self.resource_cache.set_debug_flags(flags);
self.gpu_cache.set_debug_flags(flags);
// If we're toggling on the GPU cache debug display, we
// need to blow away the cache. This is because we only
// send allocation/free notifications to the renderer
// thread when the debug display is enabled, and thus
// enabling it when the cache is partially populated will
// give the renderer an incomplete view of the world.
// And since we might as well drop all the debugging state
// from the renderer when we disable the debug display,
// we just clear the cache on toggle.
let changed = self.debug_flags ^ flags;
if changed.contains(DebugFlags::GPU_CACHE_DBG) {
self.clear_gpu_cache();
}
self.debug_flags = flags;
ResultMsg::DebugCommand(option)
}
_ => ResultMsg::DebugCommand(option),
@@ -1156,6 +1177,13 @@ impl RenderBackend {
&mut profile_counters.resources,
);
// If utilization has stayed below the reclaim threshold for long enough,
// drop the GPU cache and rebuild it. This needs to be done before any
// updates for this frame are made.
if self.gpu_cache.should_reclaim_memory() {
self.clear_gpu_cache();
}
for scene_msg in transaction_msg.scene_ops.drain(..) {
let _timer = profile_counters.total_time.timer();
self.process_scene_msg(
@@ -1520,6 +1548,13 @@ impl RenderBackend {
// thread waiting on the request.
self.scene_tx.send(SceneBuilderRequest::ReportMemory(report, tx)).unwrap();
}
/// Drops everything in the GPU cache. Must not be called once gpu cache entries
/// for the next frame have already been requested.
fn clear_gpu_cache(&mut self) {
self.gpu_cache.clear();
self.result_tx.send(ResultMsg::ClearGpuCache).unwrap();
}
}
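
The clear protocol spans two threads: the backend drops its CPU-side GpuCache state and sends ResultMsg::ClearGpuCache, while the renderer merely records the request and rebuilds its GpuCacheTexture lazily the next time it prepares the GPU cache. A minimal sketch of that handshake (simplified types; the real messages and structs carry much more):

use std::sync::mpsc;

enum ResultMsg {
    ClearGpuCache,
}

struct Renderer {
    pending_gpu_cache_clear: bool,
}

impl Renderer {
    fn handle(&mut self, msg: ResultMsg) {
        match msg {
            // Only record the request here; the texture lives on the
            // render thread and is reallocated in prepare_gpu_cache().
            ResultMsg::ClearGpuCache => self.pending_gpu_cache_clear = true,
        }
    }
}

fn main() {
    let (tx, rx) = mpsc::channel();

    // Backend side: after GpuCache::clear(), notify the renderer.
    tx.send(ResultMsg::ClearGpuCache).unwrap();

    // Renderer side: drain messages now, act on the flag next frame.
    let mut renderer = Renderer { pending_gpu_cache_clear: false };
    while let Ok(msg) = rx.try_recv() {
        renderer.handle(msg);
    }
    assert!(renderer.pending_gpu_cache_clear);
}
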
fn get_blob_image_updates(updates: &[ResourceUpdate]) -> Vec<BlobImageKey> {

View file

@@ -49,7 +49,7 @@ use gleam::gl;
use glyph_rasterizer::{GlyphFormat, GlyphRasterizer};
use gpu_cache::{GpuBlockData, GpuCacheUpdate, GpuCacheUpdateList};
#[cfg(feature = "debug_renderer")]
use gpu_cache::GpuDebugChunk;
use gpu_cache::{GpuCacheDebugChunk, GpuCacheDebugCmd};
#[cfg(feature = "pathfinder")]
use gpu_glyph_renderer::GpuGlyphRenderer;
use gpu_types::ScalingInstance;
@@ -1017,14 +1017,21 @@ pub enum BlendMode {
SubpixelWithBgColor,
}
// Tracks the state of each row in the GPU cache texture.
/// Tracks the state of each row in the GPU cache texture.
struct CacheRow {
/// Mirrored block data on CPU for this row. We store a copy of
/// the data on the CPU side to improve upload batching.
cpu_blocks: Box<[GpuBlockData; MAX_VERTEX_TEXTURE_WIDTH]>,
/// True if this row is dirty.
is_dirty: bool,
}
impl CacheRow {
fn new() -> Self {
CacheRow { is_dirty: false }
CacheRow {
cpu_blocks: Box::new([GpuBlockData::EMPTY; MAX_VERTEX_TEXTURE_WIDTH]),
is_dirty: false,
}
}
}
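
Moving the CPU mirror from one flat Vec shared by all rows into a per-row Box<[GpuBlockData; MAX_VERTEX_TEXTURE_WIDTH]> keeps each row's blocks contiguous for upload while letting a cache clear return all of the mirror memory to the allocator (and letting the memory report account for it row by row). A simplified sketch of the layout, with stand-in types:

const WIDTH: usize = 1024; // stands in for MAX_VERTEX_TEXTURE_WIDTH

#[derive(Clone, Copy)]
struct BlockData([f32; 4]);

struct CacheRow {
    cpu_blocks: Box<[BlockData; WIDTH]>,
    is_dirty: bool,
}

impl CacheRow {
    fn new() -> Self {
        CacheRow {
            cpu_blocks: Box::new([BlockData([0.0; 4]); WIDTH]),
            is_dirty: false,
        }
    }
}

fn main() {
    let mut rows: Vec<CacheRow> = (0..4).map(|_| CacheRow::new()).collect();
    rows[2].is_dirty = true;
    // Flushing walks only dirty rows; each row's mirror is one
    // contiguous slice that can be handed to the GPU uploader.
    assert_eq!(rows.iter().filter(|r| r.is_dirty).count(), 1);
    rows.clear(); // a clear drops every row's mirror at once
}
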
@@ -1036,10 +1043,8 @@ enum GpuCacheBus {
PixelBuffer {
/// PBO used for transfers.
buffer: PBO,
/// Meta-data about the cached rows.
/// Per-row data.
rows: Vec<CacheRow>,
/// Mirrored block data on CPU.
cpu_blocks: Vec<GpuBlockData>,
},
/// Shader-based scattering updates. Currently rendered by a set
/// of points into the GPU texture, each carrying a `GpuBlockData`.
@@ -1057,16 +1062,6 @@ enum GpuCacheBus {
},
}
impl GpuCacheBus {
/// Returns true if this bus uses a render target for a texture.
fn uses_render_target(&self) -> bool {
match *self {
GpuCacheBus::Scatter { .. } => true,
GpuCacheBus::PixelBuffer { .. } => false,
}
}
}
/// The device-specific representation of the cache texture in gpu_cache.rs
struct GpuCacheTexture {
texture: Option<Texture>,
@@ -1077,38 +1072,22 @@ impl GpuCacheTexture {
/// Ensures that we have an appropriately-sized texture.
fn ensure_texture(&mut self, device: &mut Device, height: i32) -> bool {
fn ensure_texture(&mut self, device: &mut Device, height: i32) {
// If we already have a texture that works, we're done.
if self.texture.as_ref().map_or(false, |t| t.get_dimensions().height >= height) {
if GPU_CACHE_RESIZE_TEST && self.bus.uses_render_target() {
if GPU_CACHE_RESIZE_TEST {
// Special debug mode - resize the texture even though it's fine.
} else {
return false;
return;
}
}
// Compute a few parameters for the new texture. We round the height up to
// a multiple of 256 to avoid many small resizes.
let new_height = (height + 255) & !255;
let new_size = DeviceIntSize::new(MAX_VERTEX_TEXTURE_WIDTH as _, new_height);
let rt_info = if self.bus.uses_render_target() {
Some(RenderTargetInfo { has_depth: false })
} else {
None
};
// Take the old texture, if any, and deinitialize it unless we're going
// to blit its contents to the new one.
let mut blit_source = None;
if let Some(t) = self.texture.take() {
if rt_info.is_some() {
blit_source = Some(t);
} else {
device.delete_texture(t);
}
}
// Take the old texture, if any.
let blit_source = self.texture.take();
// Create the new texture.
let new_size = DeviceIntSize::new(MAX_VERTEX_TEXTURE_WIDTH as _, height);
let rt_info = Some(RenderTargetInfo { has_depth: false });
let mut texture = device.create_texture(
TextureTarget::Default,
ImageFormat::RGBAF32,
@@ -1126,7 +1105,6 @@ impl GpuCacheTexture {
}
self.texture = Some(texture);
true
}
fn new(device: &mut Device, use_scatter: bool) -> Result<Self, RendererError> {
@@ -1156,7 +1134,6 @@ impl GpuCacheTexture {
GpuCacheBus::PixelBuffer {
buffer,
rows: Vec::new(),
cpu_blocks: Vec::new(),
}
};
@@ -1193,18 +1170,9 @@ impl GpuCacheTexture {
total_block_count: usize,
max_height: i32,
) {
let allocated_new_texture = self.ensure_texture(device, max_height);
self.ensure_texture(device, max_height);
match self.bus {
GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
if allocated_new_texture {
// If we had to resize the texture, just mark all rows
// as dirty so they will be uploaded to the texture
// during the next flush.
for row in rows.iter_mut() {
row.is_dirty = true;
}
}
}
GpuCacheBus::PixelBuffer { .. } => {},
GpuCacheBus::Scatter {
ref mut buf_position,
ref mut buf_value,
@@ -1222,7 +1190,7 @@ impl GpuCacheTexture {
fn update(&mut self, device: &mut Device, updates: &GpuCacheUpdateList) {
match self.bus {
GpuCacheBus::PixelBuffer { ref mut rows, ref mut cpu_blocks, .. } => {
GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
for update in &updates.updates {
match *update {
GpuCacheUpdate::Copy {
@@ -1237,19 +1205,16 @@ impl GpuCacheTexture {
while rows.len() <= row {
// Add a new row.
rows.push(CacheRow::new());
// Add enough GPU blocks for this row.
cpu_blocks
.extend_from_slice(&[GpuBlockData::EMPTY; MAX_VERTEX_TEXTURE_WIDTH]);
}
// This row is dirty (needs to be updated in GPU texture).
rows[row].is_dirty = true;
// Copy the blocks from the patch array in the shadow CPU copy.
let block_offset = row * MAX_VERTEX_TEXTURE_WIDTH + address.u as usize;
let data = &mut cpu_blocks[block_offset .. (block_offset + block_count)];
let block_offset = address.u as usize;
let data = &mut rows[row].cpu_blocks;
for i in 0 .. block_count {
data[i] = updates.blocks[block_index + i];
data[block_offset + i] = updates.blocks[block_index + i];
}
}
}
@@ -1294,7 +1259,7 @@ impl GpuCacheTexture {
fn flush(&mut self, device: &mut Device) -> usize {
let texture = self.texture.as_ref().unwrap();
match self.bus {
GpuCacheBus::PixelBuffer { ref buffer, ref mut rows, ref cpu_blocks } => {
GpuCacheBus::PixelBuffer { ref buffer, ref mut rows } => {
let rows_dirty = rows
.iter()
.filter(|row| row.is_dirty)
@@ -1314,15 +1279,12 @@ impl GpuCacheTexture {
continue;
}
let block_index = row_index * MAX_VERTEX_TEXTURE_WIDTH;
let cpu_blocks =
&cpu_blocks[block_index .. (block_index + MAX_VERTEX_TEXTURE_WIDTH)];
let rect = DeviceIntRect::new(
DeviceIntPoint::new(0, row_index as i32),
DeviceIntSize::new(MAX_VERTEX_TEXTURE_WIDTH as i32, 1),
);
uploader.upload(rect, 0, None, cpu_blocks);
uploader.upload(rect, 0, None, &*row.cpu_blocks);
row.is_dirty = false;
}
@@ -1515,6 +1477,7 @@ pub struct Renderer {
pub device: Device,
pending_texture_updates: Vec<TextureUpdateList>,
pending_gpu_cache_updates: Vec<GpuCacheUpdateList>,
pending_gpu_cache_clear: bool,
pending_shader_updates: Vec<PathBuf>,
active_documents: Vec<(DocumentId, RenderedDocument)>,
@@ -1552,8 +1515,12 @@ pub struct Renderer {
transforms_texture: VertexDataTexture,
render_task_texture: VertexDataTexture,
gpu_cache_texture: GpuCacheTexture,
/// When the GPU cache debugger is enabled, we keep track of the live blocks
/// in the GPU cache so that we can use them for the debug display. This
/// member stores those live blocks, indexed by row.
#[cfg(feature = "debug_renderer")]
gpu_cache_debug_chunks: Vec<GpuDebugChunk>,
gpu_cache_debug_chunks: Vec<Vec<GpuCacheDebugChunk>>,
gpu_cache_frame_id: FrameId,
gpu_cache_overflow: bool,
@@ -1851,9 +1818,7 @@ impl Renderer {
};
let device_pixel_ratio = options.device_pixel_ratio;
// First set the flags to default and later call set_debug_flags to ensure any
// potential transition when enabling a flag is run.
let debug_flags = DebugFlags::default();
let debug_flags = options.debug_flags;
let payload_rx_for_backend = payload_rx.to_mpsc_receiver();
let recorder = options.recorder;
let thread_listener = Arc::new(options.thread_listener);
@@ -1973,6 +1938,7 @@ impl Renderer {
recorder,
sampler,
size_of_op,
debug_flags,
namespace_alloc_by_client,
);
backend.run(backend_profile_counters);
@@ -1993,11 +1959,12 @@ impl Renderer {
active_documents: Vec::new(),
pending_texture_updates: Vec::new(),
pending_gpu_cache_updates: Vec::new(),
pending_gpu_cache_clear: false,
pending_shader_updates: Vec::new(),
shaders,
#[cfg(feature = "debug_renderer")]
debug: LazyInitializedDebugRenderer::new(),
debug_flags,
debug_flags: DebugFlags::empty(),
backend_profile_counters: BackendProfileCounters::new(),
profile_counters: RendererProfileCounters::new(),
resource_upload_time: 0,
@@ -2053,7 +2020,9 @@ impl Renderer {
framebuffer_size: None,
};
renderer.set_debug_flags(options.debug_flags);
// We initially set the flags to default, and now call set_debug_flags
// to ensure any potential transition when enabling a flag is run.
renderer.set_debug_flags(debug_flags);
let sender = RenderApiSender::new(api_tx, payload_tx);
Ok((renderer, sender))
@@ -2144,10 +2113,33 @@ impl Renderer {
ResultMsg::UpdateGpuCache(mut list) => {
#[cfg(feature = "debug_renderer")]
{
self.gpu_cache_debug_chunks = mem::replace(&mut list.debug_chunks, Vec::new());
for cmd in mem::replace(&mut list.debug_commands, Vec::new()) {
match cmd {
GpuCacheDebugCmd::Alloc(chunk) => {
let row = chunk.address.v as usize;
if row >= self.gpu_cache_debug_chunks.len() {
self.gpu_cache_debug_chunks.resize(row + 1, Vec::new());
}
self.gpu_cache_debug_chunks[row].push(chunk);
},
GpuCacheDebugCmd::Free(address) => {
let chunks = &mut self.gpu_cache_debug_chunks[address.v as usize];
let pos = chunks.iter()
.position(|x| x.address == address).unwrap();
chunks.remove(pos);
},
}
}
}
self.pending_gpu_cache_updates.push(list);
}
ResultMsg::ClearGpuCache => {
#[cfg(feature = "debug_renderer")]
{
self.gpu_cache_debug_chunks = Vec::new();
}
self.pending_gpu_cache_clear = true;
}
ResultMsg::UpdateResources {
updates,
memory_pressure,
@@ -2743,7 +2735,7 @@ impl Renderer {
height: gpu_cache_height,
blocks: vec![[1f32; 4].into()],
updates: Vec::new(),
debug_chunks: Vec::new(),
debug_commands: Vec::new(),
});
}
@@ -2789,6 +2781,15 @@ impl Renderer {
}
fn prepare_gpu_cache(&mut self, frame: &Frame) {
if self.pending_gpu_cache_clear {
let use_scatter =
matches!(self.gpu_cache_texture.bus, GpuCacheBus::Scatter { .. });
let new_cache = GpuCacheTexture::new(&mut self.device, use_scatter).unwrap();
let old_cache = mem::replace(&mut self.gpu_cache_texture, new_cache);
old_cache.deinit(&mut self.device);
self.pending_gpu_cache_clear = false;
}
let deferred_update_list = self.update_deferred_resolves(&frame.deferred_resolves);
self.pending_gpu_cache_updates.extend(deferred_update_list);
@@ -3856,7 +3857,7 @@ impl Renderer {
height: self.gpu_cache_texture.get_height(),
blocks: Vec::new(),
updates: Vec::new(),
debug_chunks: Vec::new(),
debug_commands: Vec::new(),
};
for deferred_resolve in deferred_resolves {
@@ -4444,21 +4445,21 @@ impl Renderer {
};
let (x_off, y_off) = (30f32, 30f32);
//let x_end = framebuffer_size.width as f32 - x_off;
let y_end = framebuffer_size.height as f32 - y_off;
let height = self.gpu_cache_texture.texture
.as_ref().map_or(0, |t| t.get_dimensions().height)
.min(framebuffer_size.height - (y_off as i32) * 2) as usize;
debug_renderer.add_quad(
x_off,
y_off,
x_off + MAX_VERTEX_TEXTURE_WIDTH as f32,
y_end,
y_off + height as f32,
ColorU::new(80, 80, 80, 80),
ColorU::new(80, 80, 80, 80),
);
for chunk in &self.gpu_cache_debug_chunks {
let color = match chunk.tag {
_ => ColorU::new(250, 0, 0, 200),
};
let upper = self.gpu_cache_debug_chunks.len().min(height);
for chunk in self.gpu_cache_debug_chunks[0..upper].iter().flatten() {
let color = ColorU::new(250, 0, 0, 200);
debug_renderer.add_quad(
x_off + chunk.address.u as f32,
y_off + chunk.address.v as f32,
@@ -4548,8 +4549,10 @@ impl Renderer {
let mut report = MemoryReport::default();
// GPU cache CPU memory.
if let GpuCacheBus::PixelBuffer{ref cpu_blocks, ..} = self.gpu_cache_texture.bus {
report.gpu_cache_cpu_mirror += self.size_of(cpu_blocks.as_ptr());
if let GpuCacheBus::PixelBuffer{ref rows, ..} = self.gpu_cache_texture.bus {
for row in rows.iter() {
report.gpu_cache_cpu_mirror += self.size_of(&*row.cpu_blocks as *const _);
}
}
// GPU cache GPU memory.
@@ -5205,7 +5208,7 @@ impl Renderer {
);
self.gpu_cache_texture.texture = Some(t);
match self.gpu_cache_texture.bus {
GpuCacheBus::PixelBuffer { ref mut rows, ref mut cpu_blocks, .. } => {
GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
let dim = self.gpu_cache_texture.texture.as_ref().unwrap().get_dimensions();
let blocks = unsafe {
slice::from_raw_parts(
@@ -5215,9 +5218,12 @@
};
// fill up the CPU cache from the contents we just loaded
rows.clear();
cpu_blocks.clear();
rows.extend((0 .. dim.height).map(|_| CacheRow::new()));
cpu_blocks.extend_from_slice(blocks);
let chunks = blocks.chunks(MAX_VERTEX_TEXTURE_WIDTH);
debug_assert_eq!(chunks.len(), rows.len());
for (row, chunk) in rows.iter_mut().zip(chunks) {
row.cpu_blocks.copy_from_slice(chunk);
}
}
GpuCacheBus::Scatter { .. } => {}
}