From b8f55be64453ea77fc51bff6cd0d906d18ce1cd2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 11 Aug 2017 12:11:16 +0100 Subject: [PATCH] drm/i915: Split obj->cache_coherent to track r/w Another month, another story in the cache coherency saga. This time, we come to the realisation that i915_gem_object_is_coherent() has been reporting whether we can read from the target without requiring a cache invalidate; but we were using it in places for testing whether we could write into the object without requiring a cache flush. So split the tracking into two, one to decide before reads, one after writes. See commit e27ab73d17ef ("drm/i915: Mark CPU cache as dirty on every transition for CPU writes") for the previous entry in this saga. v2: Be verbose v3: Remove unused function (i915_gem_object_is_coherent) v4: Fix inverted coherency check prior to execbuf (from v2) v5: Add comment for nasty code where we are optimising on gcc's behalf. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101109 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101555 Testcase: igt/kms_mmap_write_crc Testcase: igt/kms_pwrite_crc Signed-off-by: Chris Wilson Cc: Maarten Lankhorst Cc: Dongwon Kim Cc: Matt Roper Cc: Joonas Lahtinen Cc: Mika Kuoppala Tested-by: Maarten Lankhorst Acked-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20170811111116.10373-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/i915_drv.h | 6 --- drivers/gpu/drm/i915/i915_gem.c | 25 +++++----- drivers/gpu/drm/i915/i915_gem_clflush.c | 3 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 14 +++++- drivers/gpu/drm/i915/i915_gem_internal.c | 7 +-- drivers/gpu/drm/i915/i915_gem_object.c | 48 +++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_object.h | 9 +++- drivers/gpu/drm/i915/i915_gem_stolen.c | 5 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 4 +- .../gpu/drm/i915/selftests/huge_gem_object.c | 6 +-- 11 files changed, 96 insertions(+), 32 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_gem_object.c diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index f8227318dcaf..892f52b53060 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -39,6 +39,7 @@ i915-y += i915_cmd_parser.o \ i915_gem_gtt.o \ i915_gem_internal.o \ i915_gem.o \ + i915_gem_object.o \ i915_gem_render_state.o \ i915_gem_request.o \ i915_gem_shrinker.o \ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 5aa214a05c51..9e5e3ca722cd 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -4322,10 +4322,4 @@ int remap_io_mapping(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, struct io_mapping *iomap); -static inline bool i915_gem_object_is_coherent(struct drm_i915_gem_object *obj) -{ - return (obj->cache_level != I915_CACHE_NONE || - HAS_LLC(to_i915(obj->base.dev))); -} - #endif diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 000a764ee8d9..887fff281f4e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -52,7 +52,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj) if (obj->cache_dirty) return false; - if (!obj->cache_coherent) + if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)) return true; return obj->pin_display; @@ -253,7 +253,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, if (needs_clflush && (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 && - !obj->cache_coherent) + !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)) drm_clflush_sg(pages); __start_cpu_write(obj); @@ -800,7 +800,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, if (ret) return ret; - if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) { + if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ || + !static_cpu_has(X86_FEATURE_CLFLUSH)) { ret = i915_gem_object_set_to_cpu_domain(obj, false); if (ret) goto err_unpin; @@ -852,7 +853,8 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, if (ret) return ret; - if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) { + if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE || + !static_cpu_has(X86_FEATURE_CLFLUSH)) { ret = i915_gem_object_set_to_cpu_domain(obj, true); if (ret) goto err_unpin; @@ -3673,8 +3675,7 @@ restart: list_for_each_entry(vma, &obj->vma_list, obj_link) vma->node.color = cache_level; - obj->cache_level = cache_level; - obj->cache_coherent = i915_gem_object_is_coherent(obj); + i915_gem_object_set_cache_coherency(obj, cache_level); obj->cache_dirty = true; /* Always invalidate stale cachelines */ return 0; @@ -4279,6 +4280,7 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size) { struct drm_i915_gem_object *obj; struct address_space *mapping; + unsigned int cache_level; gfp_t mask; int ret; @@ -4317,7 +4319,7 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size) obj->base.write_domain = I915_GEM_DOMAIN_CPU; obj->base.read_domains = I915_GEM_DOMAIN_CPU; - if (HAS_LLC(dev_priv)) { + if (HAS_LLC(dev_priv)) /* On some devices, we can have the GPU use the LLC (the CPU * cache) for about a 10% performance improvement * compared to uncached. Graphics requests other than @@ -4330,12 +4332,11 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size) * However, we maintain the display planes as UC, and so * need to rebind when first used as such. */ - obj->cache_level = I915_CACHE_LLC; - } else - obj->cache_level = I915_CACHE_NONE; + cache_level = I915_CACHE_LLC; + else + cache_level = I915_CACHE_NONE; - obj->cache_coherent = i915_gem_object_is_coherent(obj); - obj->cache_dirty = !obj->cache_coherent; + i915_gem_object_set_cache_coherency(obj, cache_level); trace_i915_gem_object_create(obj); diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c index 348b29a845c9..8a04d33055be 100644 --- a/drivers/gpu/drm/i915/i915_gem_clflush.c +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c @@ -139,7 +139,8 @@ bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, * snooping behaviour occurs naturally as the result of our domain * tracking. */ - if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent) + if (!(flags & I915_CLFLUSH_FORCE) && + obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) return false; trace_i915_gem_object_clflush(obj); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 5fa44767c29e..99520b2fac7c 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1842,7 +1842,19 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) eb->request->capture_list = capture; } - if (unlikely(obj->cache_dirty && !obj->cache_coherent)) { + /* + * If the GPU is not _reading_ through the CPU cache, we need + * to make sure that any writes (both previous GPU writes from + * before a change in snooping levels and normal CPU writes) + * caught in that cache are flushed to main memory. + * + * We want to say + * obj->cache_dirty && + * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) + * but gcc's optimiser doesn't handle that as well and emits + * two jumps instead of one. Maybe one day... + */ + if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { if (i915_gem_clflush_object(obj, 0)) entry->flags &= ~EXEC_OBJECT_ASYNC; } diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c index 568bf83af1f5..c1f64ddaf8aa 100644 --- a/drivers/gpu/drm/i915/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/i915_gem_internal.c @@ -174,6 +174,7 @@ i915_gem_object_create_internal(struct drm_i915_private *i915, phys_addr_t size) { struct drm_i915_gem_object *obj; + unsigned int cache_level; GEM_BUG_ON(!size); GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE)); @@ -190,9 +191,9 @@ i915_gem_object_create_internal(struct drm_i915_private *i915, obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; - obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; - obj->cache_coherent = i915_gem_object_is_coherent(obj); - obj->cache_dirty = !obj->cache_coherent; + + cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; + i915_gem_object_set_cache_coherency(obj, cache_level); return obj; } diff --git a/drivers/gpu/drm/i915/i915_gem_object.c b/drivers/gpu/drm/i915/i915_gem_object.c new file mode 100644 index 000000000000..aab8cdd80e6d --- /dev/null +++ b/drivers/gpu/drm/i915/i915_gem_object.c @@ -0,0 +1,48 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "i915_drv.h" +#include "i915_gem_object.h" + +/** + * Mark up the object's coherency levels for a given cache_level + * @obj: #drm_i915_gem_object + * @cache_level: cache level + */ +void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, + unsigned int cache_level) +{ + obj->cache_level = cache_level; + + if (cache_level != I915_CACHE_NONE) + obj->cache_coherent = (I915_BO_CACHE_COHERENT_FOR_READ | + I915_BO_CACHE_COHERENT_FOR_WRITE); + else if (HAS_LLC(to_i915(obj->base.dev))) + obj->cache_coherent = I915_BO_CACHE_COHERENT_FOR_READ; + else + obj->cache_coherent = 0; + + obj->cache_dirty = + !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE); +} diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index 5b19a4916a4d..3baa341432db 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -33,8 +33,11 @@ #include +#include "i915_gem_request.h" #include "i915_selftest.h" +struct drm_i915_gem_object; + struct drm_i915_gem_object_ops { unsigned int flags; #define I915_GEM_OBJECT_HAS_STRUCT_PAGE BIT(0) @@ -118,8 +121,10 @@ struct drm_i915_gem_object { */ unsigned long gt_ro:1; unsigned int cache_level:3; + unsigned int cache_coherent:2; +#define I915_BO_CACHE_COHERENT_FOR_READ BIT(0) +#define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1) unsigned int cache_dirty:1; - unsigned int cache_coherent:1; atomic_t frontbuffer_bits; unsigned int frontbuffer_ggtt_origin; /* write once */ @@ -391,6 +396,8 @@ i915_gem_object_last_write_engine(struct drm_i915_gem_object *obj) return engine; } +void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, + unsigned int cache_level); void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj); #endif diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c index c11c915382e7..507c9f0d8df1 100644 --- a/drivers/gpu/drm/i915/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c @@ -580,6 +580,7 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv, struct drm_mm_node *stolen) { struct drm_i915_gem_object *obj; + unsigned int cache_level; obj = i915_gem_object_alloc(dev_priv); if (obj == NULL) @@ -590,8 +591,8 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv, obj->stolen = stolen; obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT; - obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE; - obj->cache_coherent = true; /* assumptions! more like cache_oblivious */ + cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE; + i915_gem_object_set_cache_coherency(obj, cache_level); if (i915_gem_object_pin_pages(obj)) goto cleanup; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index ccd09e8419f5..f152a38d7079 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -804,9 +804,7 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file i915_gem_object_init(obj, &i915_gem_userptr_ops); obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; - obj->cache_level = I915_CACHE_LLC; - obj->cache_coherent = i915_gem_object_is_coherent(obj); - obj->cache_dirty = !obj->cache_coherent; + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); obj->userptr.ptr = args->user_ptr; obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY); diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c index caf76af36aba..c5c7e8efbdd3 100644 --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c @@ -111,6 +111,7 @@ huge_gem_object(struct drm_i915_private *i915, dma_addr_t dma_size) { struct drm_i915_gem_object *obj; + unsigned int cache_level; GEM_BUG_ON(!phys_size || phys_size > dma_size); GEM_BUG_ON(!IS_ALIGNED(phys_size, PAGE_SIZE)); @@ -128,9 +129,8 @@ huge_gem_object(struct drm_i915_private *i915, obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; - obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; - obj->cache_coherent = i915_gem_object_is_coherent(obj); - obj->cache_dirty = !obj->cache_coherent; + cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; + i915_gem_object_set_cache_coherency(obj, cache_level); obj->scratch = phys_size; return obj;