Metal: Optimized BufferSubData per device

Adds a staging buffer path which means there are 4 paths
for bufferSubData.

1. direct copy

   * get a pointer to the buffer
   * copy the new data to the buffer
   * if the buffer is managed, tell metal which part was updated

2. use a shadow copy

   * copy the data to a shadow copy
   * copy the entire shadow to a new buffer
   * start using the new buffer

3. use a new buffer

   * get a new buffer (or unused)
   * put the new data in the new buffer
   * blit any unchanged data from the old buffer to the new buffer
   * start using the new buffer

4. use a staging buffer

   * get a staging buffer
   * put the new data in the staging buffer
   * blit from the staging buffer to the existing buffer.

Further, there are 3 types of memory storage modes:
Managed, Shared, Private.

Based on the GPU type, different storage modes and different
paths in different situations are more performant.

So, add feature flags to select paths by GPU.

Bug: angleproject:7544
Change-Id: I741dd1874201043416374194bd2001ded8dbd9b4
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/3842641
Reviewed-by: Kyle Piddington <kpiddington@apple.com>
Reviewed-by: Kenneth Russell <kbr@chromium.org>
Reviewed-by: Quyen Le <lehoangquyen@chromium.org>
Commit-Queue: Gregg Tavares <gman@chromium.org>
This commit is contained in:
Gregg Tavares 2022-08-19 12:11:23 -07:00 коммит произвёл Angle LUCI CQ
Родитель 0e0ea0265b
Коммит 968041b547
24 изменённых файлов: 900 добавлений и 162 удалений

Просмотреть файл

@ -232,6 +232,31 @@ struct FeaturesMtl : FeatureSetBase
"uploadDataToIosurfacesWithStagingBuffers", FeatureCategory::MetalWorkarounds,
"When uploading data to IOSurface-backed textures, use a staging buffer.", &members,
"http://anglebug.com/7573"};
FeatureInfo alwaysUseStagedBufferUpdates = {
"alwaysUseStagedBufferUpdates", FeatureCategory::MetalFeatures,
"Always update buffers by copying the data to a staging buffer and then blitting it to the "
"actual buffer",
&members, "http://anglebug.com/7544"};
FeatureInfo useShadowBuffersWhenAppropriate = {
"useShadowBuffersWhenAppropriate", FeatureCategory::MetalFeatures,
"On some architectures using a shadow buffer can be faster for certain size buffers",
&members, "http://anglebug.com/7544"};
FeatureInfo alwaysUseManagedStorageModeForBuffers = {
"alwaysUseManagedStorageModeForBuffers", FeatureCategory::MetalFeatures,
"Metal buffers can be managed, shared, or private. Sometimes managed is fastest", &members,
"http://anglebug.com/7544"};
FeatureInfo alwaysUseSharedStorageModeForBuffers = {
"alwaysUseSharedStorageModeForBuffers", FeatureCategory::MetalFeatures,
"Metal buffers can be managed, shared, or private. Sometimes shared is fastest", &members,
"http://anglebug.com/7544"};
FeatureInfo preferCpuForBuffersubdata = {
"preferCpuForBuffersubdata", FeatureCategory::MetalFeatures,
"Makes bufferSubData always update via CPU", &members, "http://anglebug.com/7544"};
};
inline FeaturesMtl::FeaturesMtl() = default;

Просмотреть файл

@ -243,6 +243,46 @@
"When uploading data to IOSurface-backed textures, use a staging buffer."
],
"issue": "http://anglebug.com/7573"
},
{
"name": "always_use_staged_buffer_updates",
"category": "Features",
"description": [
"Always update buffers by copying the data to a staging buffer and then blitting it to the actual buffer"
],
"issue": "http://anglebug.com/7544"
},
{
"name": "use_shadow_buffers_when_appropriate",
"category": "Features",
"description": [
"On some architectures using a shadow buffer can be faster for certain size buffers"
],
"issue": "http://anglebug.com/7544"
},
{
"name": "always_use_managed_storage_mode_for_buffers",
"category": "Features",
"description": [
"Metal buffers can be managed, shared, or private. Sometimes managed is fastest"
],
"issue": "http://anglebug.com/7544"
},
{
"name": "always_use_shared_storage_mode_for_buffers",
"category": "Features",
"description": [
"Metal buffers can be managed, shared, or private. Sometimes shared is fastest"
],
"issue": "http://anglebug.com/7544"
},
{
"name": "prefer_cpu_for_buffersubdata",
"category": "Features",
"description": [
"Makes bufferSubData always update via CPU"
],
"issue": "http://anglebug.com/7544"
}
]
}

Просмотреть файл

@ -4,7 +4,7 @@
"include/platform/FeaturesGL_autogen.h":
"c192145f3939d4d0bf85a39649e0c14e",
"include/platform/FeaturesMtl_autogen.h":
"80c0f3379882d1f67e523a3a1530cd79",
"c31c1c77040ef119dfaf882b5b5e65ab",
"include/platform/FeaturesVk_autogen.h":
"03f5b51f08b6cb4f831764aa4848f399",
"include/platform/FrontendFeatures_autogen.h":
@ -18,11 +18,11 @@
"include/platform/gl_features.json":
"a50e9bd2fa9eb0685d9b1c118a21ad2c",
"include/platform/mtl_features.json":
"9833c17145ba2223da2e607a9340afda",
"408385ed8fa29652e23a6338faec6d2f",
"include/platform/vk_features.json":
"a0dd571e23e0bd521eb42d72a0863297",
"util/angle_features_autogen.cpp":
"b6a2d2cac7d30b6c08d9398fed38a14c",
"d43086098956bfd4374284a05cfb884d",
"util/angle_features_autogen.h":
"1e4b7c6e89ee370d052fa7f0c48c11c6"
"80421f1223abdee293434a2c7f8ff3bc"
}

Просмотреть файл

@ -51,6 +51,8 @@ _metal_backend_sources = [
"TransformFeedbackMtl.mm",
"VertexArrayMtl.h",
"VertexArrayMtl.mm",
"mtl_buffer_manager.h",
"mtl_buffer_manager.mm",
"mtl_buffer_pool.h",
"mtl_buffer_pool.mm",
"mtl_command_buffer.h",

Просмотреть файл

@ -151,7 +151,8 @@ class BufferMtl : public BufferImpl, public BufferHolderMtl
size_t count,
std::pair<uint32_t, uint32_t> *outIndices);
const uint8_t *getClientShadowCopyData(ContextMtl *contextMtl);
const uint8_t *getBufferDataReadOnly(ContextMtl *contextMtl);
bool isSafeToReadFromBufferViaBlit(ContextMtl *contextMtl);
ConversionBufferMtl *getVertexConversionBuffer(ContextMtl *context,
angle::FormatID formatID,
@ -186,27 +187,36 @@ class BufferMtl : public BufferImpl, public BufferHolderMtl
size_t size,
size_t offset);
angle::Result commitShadowCopy(const gl::Context *context);
angle::Result commitShadowCopy(const gl::Context *context, size_t size);
angle::Result commitShadowCopy(ContextMtl *contextMtl);
angle::Result commitShadowCopy(ContextMtl *contextMtl, size_t size);
void markConversionBuffersDirty();
void clearConversionBuffers();
angle::Result putDataInNewBufferAndStartUsingNewBuffer(ContextMtl *contextMtl,
const uint8_t *srcPtr,
size_t sizeToCopy,
size_t offset);
angle::Result updateExistingBufferViaBlitFromStagingBuffer(ContextMtl *contextMtl,
const uint8_t *srcPtr,
size_t sizeToCopy,
size_t offset);
angle::Result copyDataToExistingBufferViaCPU(ContextMtl *contextMtl,
const uint8_t *srcPtr,
size_t sizeToCopy,
size_t offset);
angle::Result updateShadowCopyThenCopyShadowToNewBuffer(ContextMtl *contextMtl,
const uint8_t *srcPtr,
size_t sizeToCopy,
size_t offset);
bool clientShadowCopyDataNeedSync(ContextMtl *contextMtl);
void ensureShadowCopySyncedFromGPU(ContextMtl *contextMtl);
uint8_t *syncAndObtainShadowCopy(ContextMtl *contextMtl);
// Convenient method
const uint8_t *getClientShadowCopyData(const gl::Context *context)
{
return getClientShadowCopyData(mtl::GetImpl(context));
}
// Client side shadow buffer
// Optional client side shadow buffer
angle::MemoryBuffer mShadowCopy;
// GPU side buffers pool
mtl::BufferPool mBufferPool;
// A cache of converted vertex data.
std::vector<VertexConversionBufferMtl> mVertexConversionBuffers;
@ -224,6 +234,9 @@ class BufferMtl : public BufferImpl, public BufferHolderMtl
};
std::optional<RestartRangeCache> mRestartRangeCache;
std::vector<IndexRange> mRestartIndices;
size_t mGLSize = 0; // size GL asked for (vs size we actually allocated)
size_t mRevisionCount = 0; // for generating labels only
gl::BufferUsage mUsage;
};
class SimpleWeakBufferHolderMtl : public BufferHolderMtl

Просмотреть файл

@ -13,6 +13,7 @@
#include "common/utilities.h"
#include "libANGLE/renderer/metal/ContextMtl.h"
#include "libANGLE/renderer/metal/DisplayMtl.h"
#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
namespace rx
{
@ -39,6 +40,12 @@ angle::Result GetFirstLastIndices(const IndexType *indices,
return angle::Result::Continue;
}
bool isOffsetAndSizeMetalBlitCompatible(size_t offset, size_t size)
{
    // Metal blit copies require both the byte offset and the byte size to be
    // 4-byte aligned; check both alignments with a single mask test.
    constexpr size_t kBlitAlignmentMask = 3;
    return ((offset | size) & kBlitAlignmentMask) == 0;
}
} // namespace
// ConversionBufferMtl implementation.
@ -88,9 +95,7 @@ VertexConversionBufferMtl::VertexConversionBufferMtl(ContextMtl *context,
{}
// BufferMtl implementation
BufferMtl::BufferMtl(const gl::BufferState &state)
: BufferImpl(state), mBufferPool(/** alwaysAllocNewBuffer */ true)
{}
BufferMtl::BufferMtl(const gl::BufferState &state) : BufferImpl(state) {}
BufferMtl::~BufferMtl() {}
@ -98,8 +103,13 @@ void BufferMtl::destroy(const gl::Context *context)
{
ContextMtl *contextMtl = mtl::GetImpl(context);
mShadowCopy.clear();
mBufferPool.destroy(contextMtl);
mBuffer = nullptr;
// if there's a buffer, give it back to the buffer manager
if (mBuffer)
{
contextMtl->getBufferManager().returnBuffer(contextMtl, mBuffer);
mBuffer = nullptr;
}
clearConversionBuffers();
}
@ -136,19 +146,30 @@ angle::Result BufferMtl::copySubData(const gl::Context *context,
ContextMtl *contextMtl = mtl::GetImpl(context);
auto srcMtl = GetAs<BufferMtl>(source);
if (srcMtl->clientShadowCopyDataNeedSync(contextMtl) || mBuffer->isBeingUsedByGPU(contextMtl))
{
// If shadow copy requires a synchronization then use blit command instead.
// It might break a pending render pass, but still faster than synchronization with
// GPU.
mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset,
size);
markConversionBuffersDirty();
return angle::Result::Continue;
if (mShadowCopy.size() > 0)
{
if (srcMtl->clientShadowCopyDataNeedSync(contextMtl) ||
mBuffer->isBeingUsedByGPU(contextMtl))
{
// If shadow copy requires a synchronization then use blit command instead.
// It might break a pending render pass, but still faster than synchronization with
// GPU.
mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset,
size);
return angle::Result::Continue;
}
return setSubDataImpl(context, srcMtl->getBufferDataReadOnly(contextMtl) + sourceOffset,
size, destOffset);
}
return setSubDataImpl(context, srcMtl->getClientShadowCopyData(contextMtl) + sourceOffset, size,
destOffset);
mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset, size);
return angle::Result::Continue;
}
angle::Result BufferMtl::map(const gl::Context *context, GLenum access, void **mapPtr)
@ -176,7 +197,7 @@ angle::Result BufferMtl::mapRange(const gl::Context *context,
if (mapPtr)
{
ContextMtl *contextMtl = mtl::GetImpl(context);
if (mBufferPool.getMaxBuffers() == 1)
if (mShadowCopy.size() == 0)
{
*mapPtr = mBuffer->mapWithOpt(contextMtl, (access & GL_MAP_WRITE_BIT) == 0,
access & GL_MAP_UNSYNCHRONIZED_BIT) +
@ -199,7 +220,7 @@ angle::Result BufferMtl::unmap(const gl::Context *context, GLboolean *result)
markConversionBuffersDirty();
if (mBufferPool.getMaxBuffers() == 1)
if (mShadowCopy.size() == 0)
{
ASSERT(mBuffer);
if (mState.getAccessFlags() & GL_MAP_WRITE_BIT)
@ -215,8 +236,6 @@ angle::Result BufferMtl::unmap(const gl::Context *context, GLboolean *result)
}
else
{
ASSERT(mShadowCopy.size());
if (mState.getAccessFlags() & GL_MAP_UNSYNCHRONIZED_BIT)
{
// Copy the mapped region without synchronization with GPU
@ -228,7 +247,7 @@ angle::Result BufferMtl::unmap(const gl::Context *context, GLboolean *result)
else
{
// commit shadow copy data to GPU synchronously
ANGLE_TRY(commitShadowCopy(context));
ANGLE_TRY(commitShadowCopy(contextMtl));
}
}
@ -247,7 +266,7 @@ angle::Result BufferMtl::getIndexRange(const gl::Context *context,
bool primitiveRestartEnabled,
gl::IndexRange *outRange)
{
const uint8_t *indices = getClientShadowCopyData(mtl::GetImpl(context)) + offset;
const uint8_t *indices = getBufferDataReadOnly(mtl::GetImpl(context)) + offset;
*outRange = gl::ComputeIndexRange(type, indices, count, primitiveRestartEnabled);
@ -260,7 +279,7 @@ angle::Result BufferMtl::getFirstLastIndices(ContextMtl *contextMtl,
size_t count,
std::pair<uint32_t, uint32_t> *outIndices)
{
const uint8_t *indices = getClientShadowCopyData(contextMtl) + offset;
const uint8_t *indices = getBufferDataReadOnly(contextMtl) + offset;
switch (type)
{
@ -283,10 +302,9 @@ void BufferMtl::onDataChanged()
markConversionBuffersDirty();
}
/* public */
const uint8_t *BufferMtl::getClientShadowCopyData(ContextMtl *contextMtl)
const uint8_t *BufferMtl::getBufferDataReadOnly(ContextMtl *contextMtl)
{
if (mBufferPool.getMaxBuffers() == 1)
if (mShadowCopy.size() == 0)
{
// Don't need shadow copy in this case, use the buffer directly
return mBuffer->mapReadOnly(contextMtl);
@ -479,13 +497,45 @@ const std::vector<IndexRange> BufferMtl::getRestartIndicesFromClientData(
return restartIndices;
}
namespace
{
bool useSharedMemory(ContextMtl *contextMtl, gl::BufferUsage usage)
{
    // Feature-flag overrides take precedence over the usage heuristic.
    // Managed is checked first, so it wins if both flags are enabled.
    const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
    if (features.alwaysUseManagedStorageModeForBuffers.enabled)
    {
        return false;
    }
    if (features.alwaysUseSharedStorageModeForBuffers.enabled)
    {
        return true;
    }

    // Static and read-oriented buffers are rarely rewritten from the CPU, so
    // shared storage is preferred for them; everything else stays non-shared.
    const bool isStaticOrReadUsage =
        usage == gl::BufferUsage::StaticCopy || usage == gl::BufferUsage::StaticDraw ||
        usage == gl::BufferUsage::StaticRead || usage == gl::BufferUsage::DynamicRead ||
        usage == gl::BufferUsage::StreamRead;
    return isStaticOrReadUsage;
}
} // namespace
angle::Result BufferMtl::setDataImpl(const gl::Context *context,
gl::BufferBinding target,
const void *data,
size_t intendedSize,
gl::BufferUsage usage)
{
ContextMtl *contextMtl = mtl::GetImpl(context);
ContextMtl *contextMtl = mtl::GetImpl(context);
const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
// Invalidate conversion buffers
if (mState.getSize() != static_cast<GLint64>(intendedSize))
@ -497,80 +547,32 @@ angle::Result BufferMtl::setDataImpl(const gl::Context *context,
markConversionBuffersDirty();
}
mUsage = usage;
mGLSize = intendedSize;
size_t adjustedSize = std::max<size_t>(1, intendedSize);
// Ensures no validation layer issues in std140 with data types like vec3 being 12 bytes vs 16
// in MSL.
if (target == gl::BufferBinding::Uniform)
{
// This doesn't work! A buffer can be allocated on ARRAY_BUFFER and used in UNIFORM_BUFFER
// TODO(anglebug.com/7585)
adjustedSize = roundUpPow2(adjustedSize, (size_t)16);
}
size_t maxBuffers;
switch (usage)
{
case gl::BufferUsage::StaticCopy:
case gl::BufferUsage::StaticDraw:
case gl::BufferUsage::StaticRead:
case gl::BufferUsage::DynamicRead:
case gl::BufferUsage::StreamRead:
maxBuffers = 1; // static/read buffer doesn't need high speed data update
mBufferPool.setAlwaysUseGPUMem();
break;
default:
// dynamic buffer, allow up to 10 update per frame/encoding without
// waiting for GPU.
if (adjustedSize <= mtl::kSharedMemBufferMaxBufSizeHint)
{
maxBuffers = 10;
mBufferPool.setAlwaysUseSharedMem();
}
else
{
maxBuffers = 1;
mBufferPool.setAlwaysUseGPUMem();
}
break;
}
// Re-create the buffer
mBuffer = nullptr;
ANGLE_TRY(mBufferPool.reset(contextMtl, adjustedSize, 1, maxBuffers));
if (maxBuffers > 1)
mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
if (mBuffer)
{
// We use shadow copy to maintain consistent data between buffers in pool
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(adjustedSize), GL_OUT_OF_MEMORY);
if (data)
{
// Transfer data to shadow copy buffer
auto ptr = static_cast<const uint8_t *>(data);
std::copy(ptr, ptr + intendedSize, mShadowCopy.data());
// Transfer data from shadow copy buffer to GPU buffer.
ANGLE_TRY(commitShadowCopy(context, adjustedSize));
}
else
{
// This is needed so that first buffer pointer could be available
ANGLE_TRY(commitShadowCopy(context, 0));
}
// Return the current buffer to the buffer manager
// It will not be re-used until it's no longer in use.
bufferManager.returnBuffer(contextMtl, mBuffer);
mBuffer = nullptr;
}
else
{
// We don't need shadow copy if there will be only one buffer in the pool.
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(0), GL_OUT_OF_MEMORY);
// Allocate one buffer to use
ANGLE_TRY(
mBufferPool.allocate(contextMtl, adjustedSize, nullptr, &mBuffer, nullptr, nullptr));
if (data)
{
ANGLE_TRY(setSubDataImpl(context, data, intendedSize, 0));
}
}
// Get a new buffer
bool useSharedMem = useSharedMemory(contextMtl, usage);
ANGLE_TRY(bufferManager.getBuffer(contextMtl, adjustedSize, useSharedMem, mBuffer));
#ifndef NDEBUG
ANGLE_MTL_OBJC_SCOPE
@ -579,9 +581,137 @@ angle::Result BufferMtl::setDataImpl(const gl::Context *context,
}
#endif
// We may use shadow copy to maintain consistent data between buffers in pool
size_t shadowSize = (!features.preferCpuForBuffersubdata.enabled &&
features.useShadowBuffersWhenAppropriate.enabled &&
adjustedSize <= mtl::kSharedMemBufferMaxBufSizeHint)
? adjustedSize
: 0;
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(shadowSize), GL_OUT_OF_MEMORY);
if (data)
{
ANGLE_TRY(setSubDataImpl(context, data, intendedSize, 0));
}
return angle::Result::Continue;
}
// states:
// * The buffer is not use
//
// safe = true
//
// * The buffer has a pending blit
//
// In this case, as long as we are only reading from it
// via blit to a new buffer our blits will happen after existing
// blits
//
// safe = true
//
// * The buffer has pending writes in a commited render encoder
//
// In this case we're encoding commands that will happen after
// that encoder
//
// safe = true
//
// * The buffer has pending writes in the current render encoder
//
// in this case we have to split/end the render encoder
// before we can use the buffer.
//
// safe = false
bool BufferMtl::isSafeToReadFromBufferViaBlit(ContextMtl *contextMtl)
{
    // Reading this buffer with a blit is only unsafe when the still-open
    // render encoder is the one that last wrote to it (see state table above);
    // in that case the encoder would have to be split/ended first.
    const uint64_t lastWriterSerial = mBuffer->getLastWritingRenderEncoderSerial();
    return !contextMtl->isCurrentRenderEncoderSerial(lastWriterSerial);
}
// Copies the client data into a staging buffer and queues a GPU blit from the
// staging buffer into the existing buffer, without replacing mBuffer.
angle::Result BufferMtl::updateExistingBufferViaBlitFromStagingBuffer(ContextMtl *contextMtl,
                                                                      const uint8_t *srcPtr,
                                                                      size_t sizeToCopy,
                                                                      size_t offset)
{
    // Blit copies require 4-byte aligned offset and size.
    ASSERT(isOffsetAndSizeMetalBlitCompatible(offset, sizeToCopy));
    return contextMtl->getBufferManager().queueBlitCopyDataToBuffer(contextMtl, srcPtr, sizeToCopy,
                                                                    offset, mBuffer);
}
// * get a new or unused buffer
// * copy the new data to it
// * copy any old data not overwritten by the new data to the new buffer
// * start using the new buffer
angle::Result BufferMtl::putDataInNewBufferAndStartUsingNewBuffer(ContextMtl *contextMtl,
                                                                  const uint8_t *srcPtr,
                                                                  size_t sizeToCopy,
                                                                  size_t offset)
{
    // Caller guarantees 4-byte alignment so the blits below are legal.
    ASSERT(isOffsetAndSizeMetalBlitCompatible(offset, sizeToCopy));
    mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
    // Keep a reference to the old buffer; it is still the source for any
    // data outside the region being updated.
    mtl::BufferRef oldBuffer = mBuffer;
    bool useSharedMem = useSharedMemory(contextMtl, mUsage);
    ANGLE_TRY(bufferManager.getBuffer(contextMtl, mGLSize, useSharedMem, mBuffer));
    // Label the buffer for debugging; mRevisionCount exists only for this.
    mBuffer->get().label = [NSString stringWithFormat:@"BufferMtl=%p(%lu)", this, ++mRevisionCount];
    // Write the new data into the fresh buffer via the CPU. mapWithOpt's
    // no-sync option is safe here because the buffer is new/unused.
    uint8_t *ptr = mBuffer->mapWithOpt(contextMtl, false, true);
    std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
    mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
    // If the update did not cover the whole buffer, blit the untouched
    // prefix and/or suffix over from the old buffer.
    if (offset > 0 || offset + sizeToCopy < mGLSize)
    {
        mtl::BlitCommandEncoder *blitEncoder =
            contextMtl->getBlitCommandEncoderWithoutEndingRenderEncoder();
        if (offset > 0)
        {
            // copy old data before updated region
            blitEncoder->copyBuffer(oldBuffer, 0, mBuffer, 0, offset);
        }
        if (offset + sizeToCopy < mGLSize)
        {
            // copy old data after updated region
            const size_t endOffset = offset + sizeToCopy;
            const size_t endSizeToCopy = mGLSize - endOffset;
            blitEncoder->copyBuffer(oldBuffer, endOffset, mBuffer, endOffset, endSizeToCopy);
        }
    }
    // Return the old buffer to the manager; it will not be reused until the
    // GPU is done with it.
    bufferManager.returnBuffer(contextMtl, oldBuffer);
    return angle::Result::Continue;
}
// Writes the client data directly into the current buffer with the CPU:
// map, copy the region, then flush only the updated subset.
angle::Result BufferMtl::copyDataToExistingBufferViaCPU(ContextMtl *contextMtl,
                                                        const uint8_t *srcPtr,
                                                        size_t sizeToCopy,
                                                        size_t offset)
{
    uint8_t *dstPtr = mBuffer->map(contextMtl);
    std::copy(srcPtr, srcPtr + sizeToCopy, dstPtr + offset);
    mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
    return angle::Result::Continue;
}
// Updates the client-side shadow copy with the new data and then pushes the
// whole shadow copy into a (new) GPU buffer. The three steps below are
// order-critical: the shadow must be synced from the GPU before the client
// write, and committed only after it.
angle::Result BufferMtl::updateShadowCopyThenCopyShadowToNewBuffer(ContextMtl *contextMtl,
                                                                   const uint8_t *srcPtr,
                                                                   size_t sizeToCopy,
                                                                   size_t offset)
{
    // 1. Before copying data from client, we need to synchronize modified data from GPU to
    // shadow copy first.
    ensureShadowCopySyncedFromGPU(contextMtl);

    // 2. Copy data from client to shadow copy.
    std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);

    // 3. Copy data from shadow copy to GPU.
    return commitShadowCopy(contextMtl);
}
angle::Result BufferMtl::setSubDataImpl(const gl::Context *context,
const void *data,
size_t size,
@ -594,68 +724,72 @@ angle::Result BufferMtl::setSubDataImpl(const gl::Context *context,
ASSERT(mBuffer);
ContextMtl *contextMtl = mtl::GetImpl(context);
ContextMtl *contextMtl = mtl::GetImpl(context);
const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
ANGLE_MTL_TRY(contextMtl, offset <= mBuffer->size());
ANGLE_MTL_TRY(contextMtl, offset <= mGLSize);
auto srcPtr = static_cast<const uint8_t *>(data);
auto sizeToCopy = std::min<size_t>(size, mBuffer->size() - offset);
auto sizeToCopy = std::min<size_t>(size, mGLSize - offset);
markConversionBuffersDirty();
if (mBufferPool.getMaxBuffers() == 1)
if (features.preferCpuForBuffersubdata.enabled)
{
ASSERT(mBuffer);
uint8_t *ptr = mBuffer->map(contextMtl);
std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
return copyDataToExistingBufferViaCPU(contextMtl, srcPtr, sizeToCopy, offset);
}
if (mShadowCopy.size() > 0)
{
return updateShadowCopyThenCopyShadowToNewBuffer(contextMtl, srcPtr, sizeToCopy, offset);
}
else
{
ASSERT(mShadowCopy.size());
bool alwaysUseStagedBufferUpdates = features.alwaysUseStagedBufferUpdates.enabled;
// 1. Before copying data from client, we need to synchronize modified data from GPU to
// shadow copy first.
ensureShadowCopySyncedFromGPU(contextMtl);
// 2. Copy data from client to shadow copy.
std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);
// 3. Copy data from shadow copy to GPU.
ANGLE_TRY(commitShadowCopy(context));
if (isOffsetAndSizeMetalBlitCompatible(offset, size) &&
(alwaysUseStagedBufferUpdates || mBuffer->isBeingUsedByGPU(contextMtl)))
{
if (alwaysUseStagedBufferUpdates || !isSafeToReadFromBufferViaBlit(contextMtl))
{
// We can't use the buffer now so copy the data
// to a staging buffer and blit it in
return updateExistingBufferViaBlitFromStagingBuffer(contextMtl, srcPtr, sizeToCopy,
offset);
}
else
{
return putDataInNewBufferAndStartUsingNewBuffer(contextMtl, srcPtr, sizeToCopy,
offset);
}
}
else
{
return copyDataToExistingBufferViaCPU(contextMtl, srcPtr, sizeToCopy, offset);
}
}
return angle::Result::Continue;
}
angle::Result BufferMtl::commitShadowCopy(const gl::Context *context)
angle::Result BufferMtl::commitShadowCopy(ContextMtl *contextMtl)
{
return commitShadowCopy(context, size());
return commitShadowCopy(contextMtl, mGLSize);
}
angle::Result BufferMtl::commitShadowCopy(const gl::Context *context, size_t size)
angle::Result BufferMtl::commitShadowCopy(ContextMtl *contextMtl, size_t size)
{
ContextMtl *contextMtl = mtl::GetImpl(context);
mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
bool useSharedMem = useSharedMemory(contextMtl, mUsage);
if (!size)
{
// Skip mapping if size to commit is zero.
// zero size is passed to allocate buffer only.
ANGLE_TRY(mBufferPool.allocate(contextMtl, mShadowCopy.size(), nullptr, &mBuffer, nullptr,
nullptr));
}
else
{
uint8_t *ptr = nullptr;
mBufferPool.releaseInFlightBuffers(contextMtl);
ANGLE_TRY(
mBufferPool.allocate(contextMtl, mShadowCopy.size(), &ptr, &mBuffer, nullptr, nullptr));
bufferManager.returnBuffer(contextMtl, mBuffer);
ANGLE_TRY(bufferManager.getBuffer(contextMtl, mGLSize, useSharedMem, mBuffer));
if (size)
{
uint8_t *ptr = mBuffer->mapWithOpt(contextMtl, false, true);
std::copy(mShadowCopy.data(), mShadowCopy.data() + size, ptr);
mBuffer->unmapAndFlushSubset(contextMtl, 0, size);
}
ANGLE_TRY(mBufferPool.commit(contextMtl));
return angle::Result::Continue;
}

Просмотреть файл

@ -17,6 +17,7 @@
#include "libANGLE/Context.h"
#include "libANGLE/renderer/ContextImpl.h"
#include "libANGLE/renderer/metal/ProvokingVertexHelper.h"
#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
#include "libANGLE/renderer/metal/mtl_buffer_pool.h"
#include "libANGLE/renderer/metal/mtl_command_buffer.h"
#include "libANGLE/renderer/metal/mtl_context_device.h"
@ -369,6 +370,7 @@ class ContextMtl : public ContextImpl, public mtl::Context
// Will end current command encoder and start new blit command encoder. Unless a blit comamnd
// encoder is already started.
mtl::BlitCommandEncoder *getBlitCommandEncoder();
// Will end current command encoder and start new compute command encoder. Unless a compute
// command encoder is already started.
mtl::ComputeCommandEncoder *getComputeCommandEncoder();
@ -381,6 +383,8 @@ class ContextMtl : public ContextImpl, public mtl::Context
// Get the provoking vertex command encoder.
mtl::ComputeCommandEncoder *getIndexPreprocessingCommandEncoder();
bool isCurrentRenderEncoderSerial(uint64_t serial);
const mtl::ContextDevice &getMetalDevice() const { return mContextDevice; }
angle::Result copy2DTextureSlice0Level0ToWorkTexture(const mtl::TextureRef &srcTexture);
@ -390,6 +394,7 @@ class ContextMtl : public ContextImpl, public mtl::Context
const mtl::MipmapNativeLevel &mipNativeLevel,
uint32_t layerIndex);
const mtl::BufferRef &getWorkBuffer() const { return mWorkBuffer; }
mtl::BufferManager &getBufferManager() { return mBufferManager; }
private:
void ensureCommandBufferReady();
@ -600,6 +605,8 @@ class ContextMtl : public ContextImpl, public mtl::Context
MTLCullMode mCullMode;
bool mCullAllPolygons = false;
mtl::BufferManager mBufferManager;
// Lineloop and TriFan index buffer
mtl::BufferPool mLineLoopIndexBuffer;
mtl::BufferPool mLineLoopLastSegmentIndexBuffer;

Просмотреть файл

@ -1663,6 +1663,11 @@ void ContextMtl::endRenderEncoding(mtl::RenderCommandEncoder *encoder)
disableActiveOcclusionQueryInRenderPass();
}
if (mBlitEncoder.valid())
{
mBlitEncoder.endEncoding();
}
encoder->endEncoding();
// Resolve visibility results
@ -1755,6 +1760,16 @@ bool ContextMtl::hasStartedRenderPass(const mtl::RenderPassDesc &desc)
mRenderEncoder.renderPassDesc().equalIgnoreLoadStoreOptions(desc);
}
bool ContextMtl::isCurrentRenderEncoderSerial(uint64_t serial)
{
    // A serial can only match the "current" render encoder if one is open.
    return mRenderEncoder.valid() && serial == mRenderEncoder.getSerial();
}
// Get current render encoder
mtl::RenderCommandEncoder *ContextMtl::getRenderCommandEncoder()
{
@ -1856,6 +1871,11 @@ mtl::RenderCommandEncoder *ContextMtl::getRenderTargetCommandEncoder(
mtl::BlitCommandEncoder *ContextMtl::getBlitCommandEncoder()
{
if (mRenderEncoder.valid() || mComputeEncoder.valid())
{
endEncoding(true);
}
if (mBlitEncoder.valid())
{
return &mBlitEncoder;
@ -1882,6 +1902,11 @@ mtl::BlitCommandEncoder *ContextMtl::getBlitCommandEncoderWithoutEndingRenderEnc
mtl::ComputeCommandEncoder *ContextMtl::getComputeCommandEncoder()
{
if (mRenderEncoder.valid() || mBlitEncoder.valid())
{
endEncoding(true);
}
if (mComputeEncoder.valid())
{
return &mComputeEncoder;
@ -2725,7 +2750,8 @@ angle::Result ContextMtl::copyTextureSliceLevelToWorkBuffer(
// Expand the buffer if it is not big enough.
if (!mWorkBuffer || mWorkBuffer->size() < sizeInBytes)
{
ANGLE_TRY(mtl::Buffer::MakeBuffer(this, sizeInBytes, nullptr, &mWorkBuffer));
ANGLE_TRY(mtl::Buffer::MakeBufferWithSharedMemOpt(this, true, sizeInBytes, nullptr,
&mWorkBuffer));
}
gl::Rectangle region(0, 0, width, height);

Просмотреть файл

@ -1193,6 +1193,16 @@ void DisplayMtl::initializeFeatures()
ANGLE_FEATURE_CONDITION((&mFeatures), preemptivelyStartProvokingVertexCommandBuffer, isAMD());
ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseStagedBufferUpdates, isAMD());
ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseManagedStorageModeForBuffers, isAMD());
ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseSharedStorageModeForBuffers, isIntel());
ANGLE_FEATURE_CONDITION((&mFeatures), useShadowBuffersWhenAppropriate, isIntel());
// At least one of these must not be set.
ASSERT(!mFeatures.alwaysUseManagedStorageModeForBuffers.enabled ||
!mFeatures.alwaysUseSharedStorageModeForBuffers.enabled);
bool defaultDirectToMetal = true;
ANGLE_FEATURE_CONDITION((&mFeatures), directMetalGeneration, defaultDirectToMetal);

Просмотреть файл

@ -1611,6 +1611,7 @@ angle::Result FramebufferMtl::readPixelsImpl(const gl::Context *context,
return result;
}
if (texture->isBeingUsedByGPU(contextMtl))
{
contextMtl->flushCommandBuffer(mtl::WaitUntilFinished);

Просмотреть файл

@ -1460,7 +1460,7 @@ angle::Result ProgramMtl::legalizeUniformBufferOffsets(
// Has the content of the buffer has changed since last conversion?
if (conversion->dirty)
{
const uint8_t *srcBytes = bufferMtl->getClientShadowCopyData(context);
const uint8_t *srcBytes = bufferMtl->getBufferDataReadOnly(context);
srcBytes += offsetModulo;
size_t sizeToCopy = bufferMtl->size() - offsetModulo;
size_t bytesToAllocate = roundUp<size_t>(sizeToCopy, 16u);

Просмотреть файл

@ -1817,7 +1817,7 @@ angle::Result TextureMtl::setPerSliceSubImage(const gl::Context *context,
{
// NOTE(hqle): packed depth & stencil texture cannot copy from buffer directly, needs
// to split its depth & stencil data and copy separately.
const uint8_t *clientData = unpackBufferMtl->getClientShadowCopyData(contextMtl);
const uint8_t *clientData = unpackBufferMtl->getBufferDataReadOnly(contextMtl);
clientData += offset;
ANGLE_TRY(UploadTextureContents(context, mFormat.actualAngleFormat(), mtlArea,
mtl::kZeroNativeMipLevel, slice, clientData,
@ -1871,7 +1871,7 @@ angle::Result TextureMtl::convertAndSetPerSliceSubImage(const gl::Context *conte
mFormat.intendedAngleFormat().isBlock)
{
// Unsupported format, use CPU path.
const uint8_t *clientData = unpackBufferMtl->getClientShadowCopyData(contextMtl);
const uint8_t *clientData = unpackBufferMtl->getBufferDataReadOnly(contextMtl);
clientData += offset;
ANGLE_TRY(convertAndSetPerSliceSubImage(context, slice, mtlArea, internalFormat, type,
pixelsAngleFormat, pixelsRowPitch,

Просмотреть файл

@ -881,7 +881,7 @@ angle::Result VertexArrayMtl::convertIndexBuffer(const gl::Context *glContext,
{
// We shouldn't use GPU to convert when we are in a middle of a render pass.
ANGLE_TRY(StreamIndexData(contextMtl, &conversion->data,
idxBuffer->getClientShadowCopyData(contextMtl) + offsetModulo,
idxBuffer->getBufferDataReadOnly(contextMtl) + offsetModulo,
indexType, indexCount, glState.isPrimitiveRestartEnabled(),
&conversion->convertedBuffer, &conversion->convertedOffset));
}
@ -1061,7 +1061,7 @@ angle::Result VertexArrayMtl::convertVertexBufferCPU(ContextMtl *contextMtl,
ConversionBufferMtl *conversion)
{
const uint8_t *srcBytes = srcBuffer->getClientShadowCopyData(contextMtl);
const uint8_t *srcBytes = srcBuffer->getBufferDataReadOnly(contextMtl);
ANGLE_CHECK_GL_ALLOC(contextMtl, srcBytes);
VertexConversionBufferMtl *vertexConverison =
static_cast<VertexConversionBufferMtl *>(conversion);
@ -1151,4 +1151,4 @@ angle::Result VertexArrayMtl::convertVertexBufferGPU(const gl::Context *glContex
return angle::Result::Continue;
}
}
} // namespace rx

Просмотреть файл

@ -0,0 +1,94 @@
//
// Copyright 2022 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// mtl_buffer_manager.h:
// BufferManager manages buffers across all contexts for a single
// device.
//
#ifndef LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_
#define LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_
#include "common/FixedVector.h"
#include "libANGLE/renderer/metal/mtl_resources.h"
#include <vector>
namespace rx
{
class ContextMtl;
namespace mtl
{
// GL buffers are backed by Metal buffers. Which metal
// buffer is backing a particular GL buffer is fluid.
// The case being optimized is a loop of something like
//
// for 1..4
// glBufferSubData
// glDrawXXX
//
// You can't update a buffer in the middle of a render pass
// in Metal, so instead we'd end up using multiple buffers.
//
// Simple case, the call to `glBufferSubData` updates the
// entire buffer. In this case we'd end up with each call
// to `glBufferSubData` getting a new buffer from this
// BufferManager and copying the new data to it. We'd
// end up submitting this renderpass
//
// draw with buf1
// draw with buf2
// draw with buf3
// draw with buf4
//
// The GL buffer now references buf4. And buf1, buf2, buf3 and
// buf0 (the buffer that was previously referenced by the GL buffer)
// are all added to the in-use list.
//
// This macro enables showing the running totals of the various
// buckets of unused buffers.
// #define ANGLE_MTL_TRACK_BUFFER_MEM
class BufferManager
{
public:
BufferManager();
static constexpr size_t kMaxStagingBufferSize = 1024 * 1024;
static constexpr size_t kMaxSizePowerOf2 = 64;
angle::Result queueBlitCopyDataToBuffer(ContextMtl *contextMtl,
const void *srcPtr,
size_t sizeToCopy,
size_t offset,
mtl::BufferRef &dstMetalBuffer);
angle::Result getBuffer(ContextMtl *contextMtl,
size_t size,
bool useSharedMem,
mtl::BufferRef &bufferRef);
void returnBuffer(ContextMtl *contextMtl, mtl::BufferRef &bufferRef);
private:
typedef std::vector<mtl::BufferRef> BufferList;
void freeUnusedBuffers(ContextMtl *contextMtl);
void addBufferRefToFreeLists(mtl::BufferRef &bufferRef);
BufferList mInUseBuffers;
angle::FixedVector<BufferList, kMaxSizePowerOf2> mFreeBuffers[2];
#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
angle::FixedVector<size_t, kMaxSizePowerOf2> mAllocations;
size_t mTotalMem = 0;
#endif
};
} // namespace mtl
} // namespace rx
#endif /* LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_ */

Просмотреть файл

@ -0,0 +1,202 @@
//
// Copyright 2022 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// mtl_buffer_manager.mm:
// Implements the class methods for BufferManager.
//
#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
#include "libANGLE/renderer/metal/ContextMtl.h"
#include "libANGLE/renderer/metal/DisplayMtl.h"
namespace rx
{
namespace mtl
{
namespace
{
// Floor of log2(num); Log2(0) and Log2(1) are both 0.
constexpr size_t Log2(size_t num)
{
    size_t result = 0;
    while (num > 1)
    {
        num /= 2;
        ++result;
    }
    return result;
}

// Smallest p such that 2^p >= num (for num >= 1).
constexpr size_t Log2Ceil(size_t num)
{
    const size_t floorLog = Log2(num);
    return (size_t(1) << floorLog) == num ? floorLog : floorLog + 1;
}
#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
// Unit label matching memUnitValue(): bytes below 2^10, then k, M, G.
const char *memUnitSuffix(size_t powerOf2)
{
    if (powerOf2 >= 30)
    {
        return "G";
    }
    if (powerOf2 >= 20)
    {
        return "M";
    }
    return powerOf2 >= 10 ? "k" : "b";
}
// Returns 2^powerOf2 bytes expressed in the unit chosen by memUnitSuffix()
// (b/k/M/G). Shift a size_t, not an unsigned int: powerOf2 can be up to
// kMaxSizePowerOf2 - 1 == 63, and "1u << (powerOf2 - 30)" would shift a
// 32-bit value by >= 32 bits for buckets of 2^62 and up, which is undefined
// behavior.
size_t memUnitValue(size_t powerOf2)
{
    if (powerOf2 < 10)
    {
        return size_t(1) << powerOf2;
    }
    if (powerOf2 < 20)
    {
        return size_t(1) << (powerOf2 - 10);
    }
    if (powerOf2 < 30)
    {
        return size_t(1) << (powerOf2 - 20);
    }
    return size_t(1) << (powerOf2 - 30);
}
#endif // ANGLE_MTL_TRACK_BUFFER_MEM
// Index into the two-entry free-list array: 0 = managed/private, 1 = shared.
int sharedMemToIndex(bool useSharedMem)
{
    return static_cast<int>(useSharedMem);
}
} // namespace
// Default construction; the free/in-use lists start empty.
BufferManager::BufferManager()
#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
    // Zero-initialize one allocation counter per power-of-2 size bucket.
    : mAllocations(kMaxSizePowerOf2, 0)
#endif
{}
// Moves every buffer the GPU has finished with from mInUseBuffers to the
// free lists. Buffers after the first still-in-use entry are left alone
// (NOTE(review): this assumes entries earlier in the list retire no later
// than entries after them — confirm against returnBuffer's append order).
void BufferManager::freeUnusedBuffers(ContextMtl *contextMtl)
{
    // Scan for the first buffer still in use. Take each element by const
    // reference: BufferRef is ref-counted, and the original by-value `auto`
    // parameter copied it (an atomic inc/dec) on every probe.
    BufferList::iterator firstInUseIter =
        std::find_if(mInUseBuffers.begin(), mInUseBuffers.end(),
                     [contextMtl](const auto &ref) { return ref->isBeingUsedByGPU(contextMtl); });

    // Move unused buffers to the free lists
    for (BufferList::iterator it = mInUseBuffers.begin(); it != firstInUseIter; ++it)
    {
        addBufferRefToFreeLists(*it);
    }

    mInUseBuffers.erase(mInUseBuffers.begin(), firstInUseIter);
}
// Files bufferRef into the free-list bucket matching its size class and
// storage mode.
void BufferManager::addBufferRefToFreeLists(mtl::BufferRef &bufferRef)
{
    const size_t sizeBucket = Log2Ceil(bufferRef->size());
    ASSERT(sizeBucket < kMaxSizePowerOf2);
    const bool isShared = bufferRef->get().storageMode == MTLStorageModeShared;
    mFreeBuffers[sharedMemToIndex(isShared)][sizeBucket].push_back(bufferRef);
}
// Accepts a buffer back from a caller. Buffers the GPU has already retired
// go straight to the free lists; the rest wait on the in-use list.
void BufferManager::returnBuffer(ContextMtl *contextMtl, BufferRef &bufferRef)
{
    const bool gpuStillUsing = bufferRef->isBeingUsedByGPU(contextMtl);
    if (!gpuStillUsing)
    {
        addBufferRefToFreeLists(bufferRef);
        return;
    }
    mInUseBuffers.push_back(bufferRef);
}
// Returns (via bufferRef) a buffer of at least `size` bytes with the
// requested storage mode, preferring a recycled buffer from the matching
// size bucket over a fresh allocation.
angle::Result BufferManager::getBuffer(ContextMtl *contextMtl,
                                       size_t size,
                                       bool useSharedMem,
                                       BufferRef &bufferRef)
{
    // Reclaim anything the GPU has finished with before checking free lists.
    freeUnusedBuffers(contextMtl);

    const size_t bucketNdx = Log2Ceil(size);
    // Mirror the bounds check done in addBufferRefToFreeLists().
    ASSERT(bucketNdx < kMaxSizePowerOf2);
    const int sharedNdx      = sharedMemToIndex(useSharedMem);
    BufferList &freeBuffers  = mFreeBuffers[sharedNdx][bucketNdx];

    // If there are free buffers grab one
    if (!freeBuffers.empty())
    {
        bufferRef = freeBuffers.back();
        freeBuffers.pop_back();
        return angle::Result::Continue;
    }

    // Create a new one, rounded up to the bucket's power-of-2 size so it can
    // satisfy any later request that maps to the same bucket.
    mtl::BufferRef newBufferRef;
    size_t allocSize = size_t(1) << bucketNdx;
    ASSERT(allocSize >= size);
    ANGLE_TRY(mtl::Buffer::MakeBufferWithSharedMemOpt(contextMtl, useSharedMem, allocSize, nullptr,
                                                      &newBufferRef));

#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
    {
        mTotalMem += allocSize;
        mAllocations[bucketNdx]++;
        fprintf(stderr, "totalMem: %zu, ", mTotalMem);
        size_t numBuffers = 0;
        for (size_t i = 0; i < kMaxSizePowerOf2; ++i)
        {
            if (mAllocations[i])
            {
                numBuffers += mAllocations[i];
                fprintf(stderr, "%zu%s: %zu, ", memUnitValue(i), memUnitSuffix(i), mAllocations[i]);
            }
        }
        fprintf(stderr, " total: %zu\n", numBuffers);
    }
#endif

    bufferRef = std::move(newBufferRef);
    return angle::Result::Continue;
}
// Copies sizeToCopy bytes from srcPtr into dstMetalBuffer at `offset`
// without touching the destination from the CPU: data is written into
// temporary staging buffers and blitted to the destination on the GPU.
angle::Result BufferManager::queueBlitCopyDataToBuffer(ContextMtl *contextMtl,
                                                       const void *srcPtr,
                                                       size_t sizeToCopy,
                                                       size_t offset,
                                                       mtl::BufferRef &dstMetalBuffer)
{
    // Use the portable uint8_t (as the rest of this file does) rather than
    // the Apple MacTypes `uint8` typedef.
    const uint8_t *src = reinterpret_cast<const uint8_t *>(srcPtr);
    bool useShared =
        !contextMtl->getDisplay()->getFeatures().alwaysUseManagedStorageModeForBuffers.enabled;

    // Chunk the upload so one huge copy never demands a single huge staging
    // allocation.
    for (size_t srcOffset = 0; srcOffset < sizeToCopy; srcOffset += kMaxStagingBufferSize)
    {
        size_t subSizeToCopy = std::min(kMaxStagingBufferSize, sizeToCopy - srcOffset);

        mtl::BufferRef bufferRef;
        ANGLE_TRY(getBuffer(contextMtl, subSizeToCopy, useShared, bufferRef));

        // copy data to staging buffer
        uint8_t *ptr = bufferRef->mapWithOpt(contextMtl, false, true);
        std::copy(src + srcOffset, src + srcOffset + subSizeToCopy, ptr);
        bufferRef->unmapAndFlushSubset(contextMtl, 0, subSizeToCopy);

        // queue a GPU blit from the staging buffer into the destination
        mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
        blitEncoder->copyBuffer(bufferRef, 0, dstMetalBuffer, offset + srcOffset, subSizeToCopy);

        // The queued blit still references the staging buffer; returnBuffer()
        // keeps it on the in-use list until the GPU retires it.
        returnBuffer(contextMtl, bufferRef);
    }
    return angle::Result::Continue;
}
} // namespace mtl
} // namespace rx

Просмотреть файл

@ -80,6 +80,8 @@ class CommandQueue final : public WrappedObject<id<MTLCommandQueue>>, angle::Non
AutoObjCPtr<id<MTLCommandBuffer>> makeMetalCommandBuffer(uint64_t *queueSerialOut);
void onCommandBufferCommitted(id<MTLCommandBuffer> buf, uint64_t serial);
uint64_t getNextRenderEncoderSerial();
private:
void onCommandBufferCompleted(id<MTLCommandBuffer> buf, uint64_t serial);
using ParentClass = WrappedObject<id<MTLCommandQueue>>;
@ -94,6 +96,7 @@ class CommandQueue final : public WrappedObject<id<MTLCommandQueue>>, angle::Non
uint64_t mQueueSerialCounter = 1;
std::atomic<uint64_t> mCommittedBufferSerial{0};
std::atomic<uint64_t> mCompletedBufferSerial{0};
uint64_t mRenderEncoderCounter = 1;
mutable std::mutex mLock;
};
@ -497,6 +500,8 @@ class RenderCommandEncoder final : public CommandEncoder
const RenderPassDesc &renderPassDesc() const { return mRenderPassDesc; }
bool hasDrawCalls() const { return mHasDrawCalls; }
uint64_t getSerial() const { return mSerial; }
private:
// Override CommandEncoder
id<MTLRenderCommandEncoder> get()
@ -541,6 +546,7 @@ class RenderCommandEncoder final : public CommandEncoder
RenderCommandEncoderStates mStateCache = {};
bool mPipelineStateSet = false;
const uint64_t mSerial = 0;
};
class BlitCommandEncoder final : public CommandEncoder

Просмотреть файл

@ -11,6 +11,7 @@
#include "libANGLE/renderer/metal/mtl_command_buffer.h"
#include <cassert>
#include <cstdint>
#if ANGLE_MTL_SIMULATE_DISCARD_FRAMEBUFFER
# include <random>
#endif
@ -579,6 +580,11 @@ void CommandQueue::onCommandBufferCompleted(id<MTLCommandBuffer> buf, uint64_t s
std::memory_order_relaxed);
}
uint64_t CommandQueue::getNextRenderEncoderSerial()
{
return ++mRenderEncoderCounter;
}
// CommandBuffer implementation
CommandBuffer::CommandBuffer(CommandQueue *cmdQueue) : mCmdQueue(*cmdQueue) {}
@ -1065,7 +1071,9 @@ void RenderCommandEncoderStates::reset()
// RenderCommandEncoder implemtation
RenderCommandEncoder::RenderCommandEncoder(CommandBuffer *cmdBuffer,
const OcclusionQueryPool &queryPool)
: CommandEncoder(cmdBuffer, RENDER), mOcclusionQueryPool(queryPool)
: CommandEncoder(cmdBuffer, RENDER),
mOcclusionQueryPool(queryPool),
mSerial(cmdBuffer->cmdQueue().getNextRenderEncoderSerial())
{
ANGLE_MTL_OBJC_SCOPE
{
@ -1556,6 +1564,7 @@ RenderCommandEncoder &RenderCommandEncoder::setBufferForWrite(gl::ShaderType sha
return *this;
}
buffer->setLastWritingRenderEncoderSerial(mSerial);
cmdBuffer().setWriteDependency(buffer);
id<MTLBuffer> mtlBuffer = (buffer ? buffer->get() : nil);
@ -2183,10 +2192,14 @@ BlitCommandEncoder &BlitCommandEncoder::synchronizeResource(Buffer *buffer)
}
#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
// Only MacOS has separated storage for resource on CPU and GPU and needs explicit
// synchronization
cmdBuffer().setReadDependency(buffer);
[get() synchronizeResource:buffer->get()];
if (buffer->get().storageMode == MTLStorageModeManaged)
{
// Only MacOS has separated storage for resource on CPU and GPU and needs explicit
// synchronization
cmdBuffer().setReadDependency(buffer);
[get() synchronizeResource:buffer->get()];
}
#endif
return *this;
}

Просмотреть файл

@ -106,7 +106,7 @@ constexpr uint32_t kMaxShaderXFBs = gl::IMPLEMENTATION_MAX_TRANSFORM_FEEDBACK_SE
// The max size of a buffer that will be allocated in shared memory.
// NOTE(hqle): This is just a hint. There is no official document on what is the max allowed size
// for shared memory.
constexpr size_t kSharedMemBufferMaxBufSizeHint = 128 * 1024;
constexpr size_t kSharedMemBufferMaxBufSizeHint = 256 * 1024;
constexpr size_t kDefaultAttributeSize = 4 * sizeof(float);

Просмотреть файл

@ -2094,8 +2094,7 @@ angle::Result IndexGeneratorUtils::generateTriFanBufferFromElementsArray(
contextMtl->getRenderCommandEncoder()))
{
IndexGenerationParams cpuPathParams = params;
cpuPathParams.indices =
elementBufferMtl->getClientShadowCopyData(contextMtl) + srcOffset;
cpuPathParams.indices = elementBufferMtl->getBufferDataReadOnly(contextMtl) + srcOffset;
return generateTriFanBufferFromElementsArrayCPU(contextMtl, cpuPathParams,
indicesGenerated);
}
@ -2223,8 +2222,7 @@ angle::Result IndexGeneratorUtils::generateLineLoopBufferFromElementsArray(
contextMtl->getRenderCommandEncoder()))
{
IndexGenerationParams cpuPathParams = params;
cpuPathParams.indices =
elementBufferMtl->getClientShadowCopyData(contextMtl) + srcOffset;
cpuPathParams.indices = elementBufferMtl->getBufferDataReadOnly(contextMtl) + srcOffset;
return generateLineLoopBufferFromElementsArrayCPU(contextMtl, cpuPathParams,
indicesGenerated);
}

Просмотреть файл

@ -56,6 +56,7 @@ class Resource : angle::NonCopyable
bool hasPendingWorks(Context *context) const;
void setUsedByCommandBufferWithQueueSerial(uint64_t serial, bool writing);
void setWrittenToByRenderEncoder(uint64_t serial);
uint64_t getCommandBufferQueueSerial() const { return mUsageRef->cmdBufferQueueSerial; }
@ -71,6 +72,15 @@ class Resource : angle::NonCopyable
bool isCPUReadMemDirty() const { return mUsageRef->cpuReadMemDirty; }
void resetCPUReadMemDirty() { mUsageRef->cpuReadMemDirty = false; }
bool getLastWritingRenderEncoderSerial() const
{
return mUsageRef->lastWritingRenderEncoderSerial;
}
void setLastWritingRenderEncoderSerial(uint64_t serial) const
{
mUsageRef->lastWritingRenderEncoderSerial = serial;
}
virtual size_t estimatedByteSize() const = 0;
virtual id getID() const = 0;
@ -98,6 +108,9 @@ class Resource : angle::NonCopyable
// This flag is useful for BufferMtl to know whether it should update the shadow copy
bool cpuReadMemDirty = false;
// The id of the last render encoder to write to this resource
uint64_t lastWritingRenderEncoderSerial = 0;
};
// One resource object might just be a view of another resource. For example, a texture 2d

Просмотреть файл

@ -1054,8 +1054,9 @@ void Buffer::flush(ContextMtl *context, size_t offsetWritten, size_t sizeWritten
{
if (get().storageMode == MTLStorageModeManaged)
{
size_t startOffset = std::min(offsetWritten, size());
size_t endOffset = std::min(offsetWritten + sizeWritten, size());
size_t bufferSize = size();
size_t startOffset = std::min(offsetWritten, bufferSize);
size_t endOffset = std::min(offsetWritten + sizeWritten, bufferSize);
size_t clampedSize = endOffset - startOffset;
if (clampedSize > 0)
{

Просмотреть файл

@ -1055,6 +1055,149 @@ TEST_P(BufferDataTestES3, NoBufferInitDataCopyBug)
ASSERT_GL_NO_ERROR();
}
// This a shortened version of dEQP functional.buffer.copy.basic.array_copy_read. It provoked
// a bug in copyBufferSubData. The bug appeared to be that conversion buffers were not marked
// as dirty and therefore after copyBufferSubData the next draw call using the buffer that
// just had data copied to it was not re-converted. It's not clear to me how this ever worked
// or why changes to bufferSubData from
// https://chromium-review.googlesource.com/c/angle/angle/+/3842641 made this issue appear and
// why it wasn't already broken.
TEST_P(BufferDataTestES3, CopyBufferSubDataDraw)
{
    // Minimal pass-through program: positions from one attribute, color from another.
    const char simpleVertex[] = R"(attribute vec2 position;
attribute vec4 color;
varying vec4 vColor;
void main()
{
    gl_Position = vec4(position, 0, 1);
    vColor = color;
}
)";
    const char simpleFragment[] = R"(precision mediump float;
varying vec4 vColor;
void main()
{
    gl_FragColor = vColor;
}
)";
    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
    glUseProgram(program);
    GLint colorLoc = glGetAttribLocation(program, "color");
    ASSERT_NE(-1, colorLoc);
    GLint posLoc = glGetAttribLocation(program, "position");
    ASSERT_NE(-1, posLoc);
    glClearColor(0, 0, 0, 0);
    GLBuffer srcBuffer;  // green
    GLBuffer dstBuffer;  // red
    // NOTE(review): 399 (not a multiple of 4) comes from the reduced dEQP case.
    constexpr size_t numElements = 399;
    std::vector<GLColorRGB> reds(numElements, GLColorRGB::red);
    std::vector<GLColorRGB> greens(numElements, GLColorRGB::green);
    constexpr size_t sizeOfElem = sizeof(decltype(greens)::value_type);
    constexpr size_t sizeInBytes = numElements * sizeOfElem;
    // Fill srcBuffer with all-green colors and dstBuffer with all-red colors.
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glBufferData(GL_ARRAY_BUFFER, sizeInBytes, greens.data(), GL_STREAM_DRAW);
    glBindBuffer(GL_COPY_READ_BUFFER, dstBuffer);
    glBufferData(GL_COPY_READ_BUFFER, sizeInBytes, reds.data(), GL_STREAM_DRAW);
    ASSERT_GL_NO_ERROR();
    constexpr size_t numQuads = numElements / 4;
    // Generate quads that fill clip space to use all the vertex colors
    std::vector<float> positions(numQuads * 4 * 2);
    for (size_t quad = 0; quad < numQuads; ++quad)
    {
        size_t offset = quad * 4 * 2;
        // Each quad is a vertical strip; together the strips span x in [-1, 1].
        float x0 = float(quad + 0) / numQuads * 2.0f - 1.0f;
        float x1 = float(quad + 1) / numQuads * 2.0f - 1.0f;
        /*
           2--3
           |  |
           0--1
        */
        positions[offset + 0] = x0;
        positions[offset + 1] = -1;
        positions[offset + 2] = x1;
        positions[offset + 3] = -1;
        positions[offset + 4] = x0;
        positions[offset + 5] = 1;
        positions[offset + 6] = x1;
        positions[offset + 7] = 1;
    }
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glEnableVertexAttribArray(posLoc);
    // Positions come from client memory; only the colors read the GL buffers.
    glVertexAttribPointer(posLoc, 2, GL_FLOAT, GL_FALSE, 0, positions.data());
    ASSERT_GL_NO_ERROR();
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glEnableVertexAttribArray(colorLoc);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    ASSERT_GL_NO_ERROR();
    glClear(GL_COLOR_BUFFER_BIT);
    // Two triangles, (0,1,2) and (2,1,3), per quad.
    std::vector<GLushort> indices(numQuads * 6);
    for (size_t quad = 0; quad < numQuads; ++quad)
    {
        size_t ndx = quad * 4;
        size_t offset = quad * 6;
        indices[offset + 0] = ndx;
        indices[offset + 1] = ndx + 1;
        indices[offset + 2] = ndx + 2;
        indices[offset + 3] = ndx + 2;
        indices[offset + 4] = ndx + 1;
        indices[offset + 5] = ndx + 3;
    }
    GLBuffer indexBuffer;
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBuffer);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, indices.size() * sizeof(decltype(indices)::value_type),
                 indices.data(), GL_STATIC_DRAW);
    // Draw with srcBuffer (green)
    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
    ASSERT_GL_NO_ERROR();
    // Draw with dstBuffer (red)
    glBindBuffer(GL_ARRAY_BUFFER, dstBuffer);
    glEnableVertexAttribArray(colorLoc);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::red);
    ASSERT_GL_NO_ERROR();
    // Copy src to dst. Yes, we're using GL_COPY_READ_BUFFER as dest because that's what the dEQP
    // test was testing.
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glBindBuffer(GL_COPY_READ_BUFFER, dstBuffer);
    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_READ_BUFFER, 0, 0, sizeInBytes);
    ASSERT_GL_NO_ERROR();
    // Draw with srcBuffer. It should still be green.
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glEnableVertexAttribArray(colorLoc);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
    ASSERT_GL_NO_ERROR();
    // Draw with dstBuffer. It should now be green too.
    // Per the comment above the test, this is the draw that exposed the
    // stale-conversion-buffer bug after glCopyBufferSubData.
    glBindBuffer(GL_ARRAY_BUFFER, dstBuffer);
    glEnableVertexAttribArray(colorLoc);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
    ASSERT_GL_NO_ERROR();
}
// Ensures that calling glBufferData on a mapped buffer results in an unmapped buffer
TEST_P(BufferDataTestES3, BufferDataUnmap)
{

Просмотреть файл

@ -38,6 +38,9 @@ constexpr PackedEnumMap<Feature, const char *> kFeatureNames = {{
"allowTranslateUniformBlockToStructuredBuffer"},
{Feature::AlwaysCallUseProgramAfterLink, "alwaysCallUseProgramAfterLink"},
{Feature::AlwaysUnbindFramebufferTexture2D, "alwaysUnbindFramebufferTexture2D"},
{Feature::AlwaysUseManagedStorageModeForBuffers, "alwaysUseManagedStorageModeForBuffers"},
{Feature::AlwaysUseSharedStorageModeForBuffers, "alwaysUseSharedStorageModeForBuffers"},
{Feature::AlwaysUseStagedBufferUpdates, "alwaysUseStagedBufferUpdates"},
{Feature::AsyncCommandQueue, "asyncCommandQueue"},
{Feature::Avoid1BitAlphaTextureFormats, "avoid1BitAlphaTextureFormats"},
{Feature::BindTransformFeedbackBufferBeforeBindBufferRange,
@ -187,6 +190,7 @@ constexpr PackedEnumMap<Feature, const char *> kFeatureNames = {{
{Feature::PreemptivelyStartProvokingVertexCommandBuffer,
"preemptivelyStartProvokingVertexCommandBuffer"},
{Feature::PreferAggregateBarrierCalls, "preferAggregateBarrierCalls"},
{Feature::PreferCpuForBuffersubdata, "preferCpuForBuffersubdata"},
{Feature::PreferCPUForBufferSubData, "preferCPUForBufferSubData"},
{Feature::PreferDeviceLocalMemoryHostVisible, "preferDeviceLocalMemoryHostVisible"},
{Feature::PreferDrawClearOverVkCmdClearAttachments, "preferDrawClearOverVkCmdClearAttachments"},
@ -317,6 +321,7 @@ constexpr PackedEnumMap<Feature, const char *> kFeatureNames = {{
{Feature::UseInstancedPointSpriteEmulation, "useInstancedPointSpriteEmulation"},
{Feature::UseMultipleDescriptorsForExternalFormats, "useMultipleDescriptorsForExternalFormats"},
{Feature::UseNonZeroStencilWriteMaskStaticState, "useNonZeroStencilWriteMaskStaticState"},
{Feature::UseShadowBuffersWhenAppropriate, "useShadowBuffersWhenAppropriate"},
{Feature::UseSystemMemoryForConstantBuffers, "useSystemMemoryForConstantBuffers"},
{Feature::UseUnusedBlocksWithStandardOrSharedLayout,
"useUnusedBlocksWithStandardOrSharedLayout"},

Просмотреть файл

@ -37,6 +37,9 @@ enum class Feature
AllowTranslateUniformBlockToStructuredBuffer,
AlwaysCallUseProgramAfterLink,
AlwaysUnbindFramebufferTexture2D,
AlwaysUseManagedStorageModeForBuffers,
AlwaysUseSharedStorageModeForBuffers,
AlwaysUseStagedBufferUpdates,
AsyncCommandQueue,
Avoid1BitAlphaTextureFormats,
BindTransformFeedbackBufferBeforeBindBufferRange,
@ -176,6 +179,7 @@ enum class Feature
PrecisionSafeDivision,
PreemptivelyStartProvokingVertexCommandBuffer,
PreferAggregateBarrierCalls,
PreferCpuForBuffersubdata,
PreferCPUForBufferSubData,
PreferDeviceLocalMemoryHostVisible,
PreferDrawClearOverVkCmdClearAttachments,
@ -293,6 +297,7 @@ enum class Feature
UseInstancedPointSpriteEmulation,
UseMultipleDescriptorsForExternalFormats,
UseNonZeroStencilWriteMaskStaticState,
UseShadowBuffersWhenAppropriate,
UseSystemMemoryForConstantBuffers,
UseUnusedBlocksWithStandardOrSharedLayout,
VertexIDDoesNotIncludeBaseVertex,