[Intel-gfx] [PATCH v4] drm/i915: Implement inter-engine read-read optimisations
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Mar 26 10:43:33 PDT 2015
Hi,
On 03/19/2015 08:59 AM, Chris Wilson wrote:
> Currently, we only track the last request globally across all engines.
> This prevents us from issuing concurrent read requests on e.g. the RCS
> and BCS engines (or more likely the render and media engines). Without
> semaphores, we incur costly stalls as we synchronise between rings -
> greatly impacting the current performance of Broadwell versus Haswell in
> certain workloads (like video decode). With the introduction of
> reference counted requests, it is much easier to track the last request
> per ring, as well as the last global write request so that we can
> optimise inter-engine read read requests (as well as better optimise
> certain CPU waits).
This is quite complex and big, so on the first pass there will be a lot of
questions below, especially since comments are sparse. Call it a
first-pass review if that's OK with you.
> v2: Fix inverted readonly condition for nonblocking waits.
> v3: Handle non-contiguous engine array after waits
> v4: Rebase, tidy, rewrite ring list debugging
>
> Benchmark: igt/gem_read_read_speed
> hsw:gt3e (with semaphores):
> Before: Time to read-read 1024k: 275.794µs
> After: Time to read-read 1024k: 123.260µs
>
> hsw:gt3e (w/o semaphores):
> Before: Time to read-read 1024k: 230.433µs
> After: Time to read-read 1024k: 124.593µs
>
> bdw-u (w/o semaphores): Before After
> Time to read-read 1x1: 26.274µs 10.350µs
> Time to read-read 128x128: 40.097µs 21.366µs
> Time to read-read 256x256: 77.087µs 42.608µs
> Time to read-read 512x512: 281.999µs 181.155µs
> Time to read-read 1024x1024: 1196.141µs 1118.223µs
> Time to read-read 2048x2048: 5639.072µs 5225.837µs
> Time to read-read 4096x4096: 22401.662µs 21137.067µs
> Time to read-read 8192x8192: 89617.735µs 85637.681µs
>
> Testcase: igt/gem_concurrent_blit (read-read and friends)
> Cc: Lionel Landwerlin <lionel.g.landwerlin at linux.intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 16 +-
> drivers/gpu/drm/i915/i915_drv.h | 11 +-
> drivers/gpu/drm/i915/i915_gem.c | 427 +++++++++++++++++++-------------
> drivers/gpu/drm/i915/i915_gem_context.c | 2 -
> drivers/gpu/drm/i915/i915_gem_debug.c | 100 +++-----
> drivers/gpu/drm/i915/i915_gpu_error.c | 19 +-
> drivers/gpu/drm/i915/intel_display.c | 4 +-
> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
> 8 files changed, 321 insertions(+), 259 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 7ef6295438e9..5cea9a9c1cb9 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -120,10 +120,13 @@ static inline const char *get_global_flag(struct drm_i915_gem_object *obj)
> static void
> describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
> {
> + struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> + struct intel_engine_cs *ring;
> struct i915_vma *vma;
> int pin_count = 0;
> + int i;
>
> - seq_printf(m, "%pK: %s%s%s%s %8zdKiB %02x %02x %x %x %x%s%s%s",
> + seq_printf(m, "%pK: %s%s%s%s %8zdKiB %02x %02x [ ",
> &obj->base,
> obj->active ? "*" : " ",
> get_pin_flag(obj),
> @@ -131,8 +134,11 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
> get_global_flag(obj),
> obj->base.size / 1024,
> obj->base.read_domains,
> - obj->base.write_domain,
> - i915_gem_request_get_seqno(obj->last_read_req),
> + obj->base.write_domain);
> + for_each_ring(ring, dev_priv, i)
> + seq_printf(m, "%x ",
> + i915_gem_request_get_seqno(obj->last_read_req[i]));
> + seq_printf(m, "] %x %x%s%s%s",
> i915_gem_request_get_seqno(obj->last_write_req),
> i915_gem_request_get_seqno(obj->last_fenced_req),
> i915_cache_level_str(to_i915(obj->base.dev), obj->cache_level),
> @@ -169,9 +175,9 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
> *t = '\0';
> seq_printf(m, " (%s mappable)", s);
> }
> - if (obj->last_read_req != NULL)
> + if (obj->last_write_req != NULL)
> seq_printf(m, " (%s)",
> - i915_gem_request_get_ring(obj->last_read_req)->name);
> + i915_gem_request_get_ring(obj->last_write_req)->name);
> if (obj->frontbuffer_bits)
> seq_printf(m, " (frontbuffer: 0x%03x)", obj->frontbuffer_bits);
> }
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 819503e6c3d4..4436c1ca247d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -498,7 +498,7 @@ struct drm_i915_error_state {
> struct drm_i915_error_buffer {
> u32 size;
> u32 name;
> - u32 rseqno, wseqno;
> + u32 rseqno[I915_NUM_RINGS], wseqno;
> u32 gtt_offset;
> u32 read_domains;
> u32 write_domain;
> @@ -1903,7 +1903,7 @@ struct drm_i915_gem_object {
> struct drm_mm_node *stolen;
> struct list_head global_list;
>
> - struct list_head ring_list;
> + struct list_head ring_list[I915_NUM_RINGS];
> /** Used in execbuf to temporarily hold a ref */
> struct list_head obj_exec_link;
>
> @@ -1914,7 +1914,7 @@ struct drm_i915_gem_object {
> * rendering and so a non-zero seqno), and is not set if it is on
> * inactive (ready to be unbound) list.
> */
> - unsigned int active:1;
> + unsigned int active:I915_MAX_RING_BIT;
>
> /**
> * This is set if the object has been written to since last bound
> @@ -1982,7 +1982,7 @@ struct drm_i915_gem_object {
> int vmapping_count;
>
> /** Breadcrumb of last rendering to the buffer. */
> - struct drm_i915_gem_request *last_read_req;
> + struct drm_i915_gem_request *last_read_req[I915_NUM_RINGS];
> struct drm_i915_gem_request *last_write_req;
> /** Breadcrumb of last fenced GPU access to the buffer. */
> struct drm_i915_gem_request *last_fenced_req;
> @@ -2120,10 +2120,11 @@ i915_gem_request_get_ring(struct drm_i915_gem_request *req)
> return req ? req->ring : NULL;
> }
>
> -static inline void
> +static inline struct drm_i915_gem_request *
> i915_gem_request_reference(struct drm_i915_gem_request *req)
> {
> kref_get(&req->ref);
> + return req;
> }
>
> static inline void
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 3d6d3cec5a06..bca433a279f6 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -40,12 +40,13 @@
>
> static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
> static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
> +static void
> +i915_gem_object_retire__write(struct drm_i915_gem_object *obj);
> +static void
> +i915_gem_object_retire__read(struct drm_i915_gem_object *obj, int ring);
> static __must_check int
> i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
> bool readonly);
> -static void
> -i915_gem_object_retire(struct drm_i915_gem_object *obj);
> -
> static void i915_gem_write_fence(struct drm_device *dev, int reg,
> struct drm_i915_gem_object *obj);
> static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj,
> @@ -518,8 +519,6 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> ret = i915_gem_object_wait_rendering(obj, true);
> if (ret)
> return ret;
> -
> - i915_gem_object_retire(obj);
> }
>
> ret = i915_gem_object_get_pages(obj);
> @@ -939,8 +938,6 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
> ret = i915_gem_object_wait_rendering(obj, false);
> if (ret)
> return ret;
> -
> - i915_gem_object_retire(obj);
> }
> /* Same trick applies to invalidate partially written cachelines read
> * before writing. */
> @@ -1372,29 +1369,8 @@ i915_wait_request(struct drm_i915_gem_request *req)
> return ret;
>
> reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
> - i915_gem_request_reference(req);
> - ret = __i915_wait_request(req, reset_counter,
> - interruptible, NULL, NULL);
> - i915_gem_request_unreference(req);
> - return ret;
> -}
> -
> -static int
> -i915_gem_object_wait_rendering__tail(struct drm_i915_gem_object *obj)
> -{
> - if (!obj->active)
> - return 0;
> -
> - /* Manually manage the write flush as we may have not yet
> - * retired the buffer.
> - *
> - * Note that the last_write_req is always the earlier of
> - * the two (read/write) requests, so if we haved successfully waited,
> - * we know we have passed the last write.
> - */
> - i915_gem_request_assign(&obj->last_write_req, NULL);
> -
> - return 0;
> + return __i915_wait_request(req, reset_counter,
> + interruptible, NULL, NULL);
> }
The net comment count (-/+) for this patch needs improvement. :) Above
you deleted a chunk of comments but added nothing below.
I think something high-level is needed to explain the active lists,
per-ring tracking and so on.
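Something along these lines is what I have in mind - a rough sketch
only, with my wording based on reading the patch, so treat the details
with suspicion:

	/*
	 * Objects are now tracked per ring: last_read_req[ring] holds the
	 * most recent read request on each ring and sits on that ring's
	 * active_list via ring_list[ring], while last_write_req tracks the
	 * single ring allowed to be writing to the object. obj->active is
	 * a count of rings with an outstanding read; the object keeps a
	 * reference while the count is non-zero and only moves back to the
	 * inactive lists when the last per-ring read request retires.
	 */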
> /**
> @@ -1405,18 +1381,39 @@ static __must_check int
> i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
> bool readonly)
> {
> - struct drm_i915_gem_request *req;
> - int ret;
> + int ret, i;
>
> - req = readonly ? obj->last_write_req : obj->last_read_req;
> - if (!req)
> + if (!obj->active)
> return 0;
>
> - ret = i915_wait_request(req);
> - if (ret)
> - return ret;
> + if (readonly) {
> + if (obj->last_write_req != NULL) {
> + ret = i915_wait_request(obj->last_write_req);
> + if (ret)
> + return ret;
> +
> + i = obj->last_write_req->ring->id;
> + if (obj->last_read_req[i] == obj->last_write_req)
> + i915_gem_object_retire__read(obj, i);
> + else
> + i915_gem_object_retire__write(obj);
The above-mentioned comments would especially help in understanding the
ordering rules here.
> + }
> + } else {
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + if (obj->last_read_req[i] == NULL)
> + continue;
> +
> + ret = i915_wait_request(obj->last_read_req[i]);
> + if (ret)
> + return ret;
> +
> + i915_gem_object_retire__read(obj, i);
> + }
I think with optimistic spinning this could end up doing num_rings spins
in sequence. Would it be worth smartening it up somehow?
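Not sure what shape that should take; purely to illustrate the idea, a
single combined spin over all the outstanding requests (helper name and
details made up) might look roughly like:

	static bool __i915_spin_requests(struct drm_i915_gem_request **rq,
					 int n)
	{
		unsigned long timeout = jiffies + 1;
		int i, done;

		do {
			/* Poll every request in the set in one window. */
			done = 0;
			for (i = 0; i < n; i++)
				if (i915_gem_request_completed(rq[i], true))
					done++;
			cpu_relax();
		} while (done < n && time_before(jiffies, timeout));

		return done == n;
	}

i.e. one busy-wait window shared between all rings instead of up to
num_rings of them back to back, falling back to the sleeping wait for
whatever is still outstanding.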
> + BUG_ON(obj->active);
Better to WARN and halt the driver indirectly if unavoidable.
> + }
> +
> + return 0;
>
> - return i915_gem_object_wait_rendering__tail(obj);
> }
>
> /* A nonblocking variant of the above wait. This is a highly dangerous routine
> @@ -1427,37 +1424,71 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
> struct drm_i915_file_private *file_priv,
> bool readonly)
> {
> - struct drm_i915_gem_request *req;
> struct drm_device *dev = obj->base.dev;
> struct drm_i915_private *dev_priv = dev->dev_private;
> + struct drm_i915_gem_request *requests[I915_NUM_RINGS];
> unsigned reset_counter;
> - int ret;
> + int ret, i, n = 0;
>
> BUG_ON(!mutex_is_locked(&dev->struct_mutex));
> BUG_ON(!dev_priv->mm.interruptible);
>
> - req = readonly ? obj->last_write_req : obj->last_read_req;
> - if (!req)
> + if (!obj->active)
> return 0;
>
> ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
> if (ret)
> return ret;
>
> - ret = i915_gem_check_olr(req);
> - if (ret)
> - return ret;
> -
> reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
> - i915_gem_request_reference(req);
Good place for a comment: build an array of requests to be waited upon, etc.
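Even something short would do, e.g.:

	/* Build an array of the outstanding requests (just the last write,
	 * or the last read from every ring), referenced so they can be
	 * waited upon after dropping struct_mutex.
	 */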
> +
> + if (readonly) {
> + struct drm_i915_gem_request *rq;
> +
> + rq = obj->last_write_req;
> + if (rq == NULL)
> + return 0;
> +
> + ret = i915_gem_check_olr(rq);
> + if (ret)
> + goto err;
> +
> + requests[n++] = i915_gem_request_reference(rq);
> + } else {
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + struct drm_i915_gem_request *rq;
> +
> + rq = obj->last_read_req[i];
> + if (rq == NULL)
> + continue;
> +
> + ret = i915_gem_check_olr(rq);
> + if (ret)
> + goto err;
> +
> + requests[n++] = i915_gem_request_reference(rq);
> + }
> + }
I wonder if merging the tracked requests into a single array, roughly
something like:

	obj->last_req[1 + NUM_RINGS]

and:

	LAST_WRITE_REQ = 0
	LAST_READ_REQ(ring) = 1 + ring

could make the above logic more compact, and how it would look elsewhere
in the code? It definitely looks like it would work for the above loop:

	for (i = LAST_WRITE_REQ; i <= LAST_TRACKED_REQUEST; i++) {
		rq = obj->last_req[i];
		if (rq == NULL && readonly)
			return 0;
		else if (rq == NULL)
			continue;
		...
		requests[n++] = ...
		if (readonly)
			break;
	}

Not sure, might not be that great an idea.
> +
> mutex_unlock(&dev->struct_mutex);
> - ret = __i915_wait_request(req, reset_counter, true, NULL, file_priv);
> + for (i = 0; ret == 0 && i < n; i++)
> + ret = __i915_wait_request(requests[i], reset_counter, true,
> + NULL, file_priv);
Another chance for more optimal "group" waiting.
> mutex_lock(&dev->struct_mutex);
> - i915_gem_request_unreference(req);
> - if (ret)
> - return ret;
>
> - return i915_gem_object_wait_rendering__tail(obj);
> +err:
> + for (i = 0; i < n; i++) {
> + if (ret == 0) {
> + int ring = requests[i]->ring->id;
> + if (obj->last_read_req[ring] == requests[i])
> + i915_gem_object_retire__read(obj, ring);
> + if (obj->last_write_req == requests[i])
> + i915_gem_object_retire__write(obj);
> + }
What if one wait succeeded and one failed - is there no need to update
anything for the successful one?
> + i915_gem_request_unreference(requests[i]);
> + }
> +
> + return ret;
> }
>
> /**
> @@ -2204,78 +2235,59 @@ i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
> return 0;
> }
>
> -static void
> -i915_gem_object_move_to_active(struct drm_i915_gem_object *obj,
> - struct intel_engine_cs *ring)
> +void i915_vma_move_to_active(struct i915_vma *vma,
> + struct intel_engine_cs *ring)
> {
> - struct drm_i915_gem_request *req;
> - struct intel_engine_cs *old_ring;
> -
> - BUG_ON(ring == NULL);
> -
> - req = intel_ring_get_request(ring);
> - old_ring = i915_gem_request_get_ring(obj->last_read_req);
> -
> - if (old_ring != ring && obj->last_write_req) {
> - /* Keep the request relative to the current ring */
> - i915_gem_request_assign(&obj->last_write_req, req);
> - }
> + struct drm_i915_gem_object *obj = vma->obj;
>
> /* Add a reference if we're newly entering the active list. */
> - if (!obj->active) {
> + if (obj->last_read_req[ring->id] == NULL && obj->active++ == 0)
> drm_gem_object_reference(&obj->base);
> - obj->active = 1;
> - }
> + BUG_ON(obj->active == 0 || obj->active > I915_NUM_RINGS);
Maybe we need an I915_WARN_AND_HALT() which would be a WARN() plus
putting the driver into a halted state?
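Completely made up, but reusing the existing I915_WEDGED terminal state
it could be something along the lines of:

	#define I915_WARN_AND_HALT(dev_priv, condition) ({ \
		bool __halted = WARN_ON(condition); \
		/* Mark the driver terminally wedged instead of BUG()ing. */ \
		if (unlikely(__halted)) \
			atomic_set_mask(I915_WEDGED, \
					&(dev_priv)->gpu_error.reset_counter); \
		__halted; \
	})

That way the BUG_ON()s sprinkled around could become recoverable for the
rest of the system while still making it obvious the driver state is
toast.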
>
> - list_move_tail(&obj->ring_list, &ring->active_list);
> + list_move_tail(&obj->ring_list[ring->id], &ring->active_list);
> + i915_gem_request_assign(&obj->last_read_req[ring->id],
> + intel_ring_get_request(ring));
>
> - i915_gem_request_assign(&obj->last_read_req, req);
> + list_move_tail(&vma->mm_list, &vma->vm->active_list);
> }
>
> -void i915_vma_move_to_active(struct i915_vma *vma,
> - struct intel_engine_cs *ring)
> +static void
> +i915_gem_object_retire__write(struct drm_i915_gem_object *obj)
> {
> - list_move_tail(&vma->mm_list, &vma->vm->active_list);
> - return i915_gem_object_move_to_active(vma->obj, ring);
> + BUG_ON(obj->last_write_req == NULL);
> + BUG_ON(!obj->active);
> +
> + i915_gem_request_assign(&obj->last_write_req, NULL);
> +
> + intel_fb_obj_flush(obj, true);
> }
>
> static void
> -i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj)
> +i915_gem_object_retire__read(struct drm_i915_gem_object *obj, int ring)
> {
> struct i915_vma *vma;
>
> - BUG_ON(obj->base.write_domain & ~I915_GEM_GPU_DOMAINS);
> + BUG_ON(obj->last_read_req[ring] == NULL);
> BUG_ON(!obj->active);
>
> + list_del_init(&obj->ring_list[ring]);
> + i915_gem_request_assign(&obj->last_read_req[ring], NULL);
> +
> + if (obj->last_write_req && obj->last_write_req->ring->id == ring)
> + i915_gem_object_retire__write(obj);
> +
> + if (--obj->active)
> + return;
> +
> list_for_each_entry(vma, &obj->vma_list, vma_link) {
> if (!list_empty(&vma->mm_list))
> list_move_tail(&vma->mm_list, &vma->vm->inactive_list);
> }
>
> - intel_fb_obj_flush(obj, true);
> -
> - list_del_init(&obj->ring_list);
> -
> - i915_gem_request_assign(&obj->last_read_req, NULL);
> - i915_gem_request_assign(&obj->last_write_req, NULL);
> - obj->base.write_domain = 0;
> -
> i915_gem_request_assign(&obj->last_fenced_req, NULL);
>
> - obj->active = 0;
> drm_gem_object_unreference(&obj->base);
> -
> - WARN_ON(i915_verify_lists(dev));
> -}
> -
> -static void
> -i915_gem_object_retire(struct drm_i915_gem_object *obj)
> -{
> - if (obj->last_read_req == NULL)
> - return;
> -
> - if (i915_gem_request_completed(obj->last_read_req, true))
> - i915_gem_object_move_to_inactive(obj);
> }
>
> static int
> @@ -2583,9 +2595,9 @@ static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
>
> obj = list_first_entry(&ring->active_list,
> struct drm_i915_gem_object,
> - ring_list);
> + ring_list[ring->id]);
>
> - i915_gem_object_move_to_inactive(obj);
> + i915_gem_object_retire__read(obj, ring->id);
> }
>
> /*
> @@ -2670,6 +2682,8 @@ void i915_gem_reset(struct drm_device *dev)
> i915_gem_context_reset(dev);
>
> i915_gem_restore_fences(dev);
> +
> + WARN_ON(i915_verify_lists(dev));
> }
>
> /**
> @@ -2678,11 +2692,11 @@ void i915_gem_reset(struct drm_device *dev)
> void
> i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
> {
> + WARN_ON(i915_verify_lists(ring->dev));
> +
> if (list_empty(&ring->request_list))
> return;
>
> - WARN_ON(i915_verify_lists(ring->dev));
> -
> /* Retire requests first as we use it above for the early return.
> * If we retire requests last, we may use a later seqno and so clear
> * the requests lists without clearing the active list, leading to
> @@ -2719,12 +2733,13 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>
> obj = list_first_entry(&ring->active_list,
> struct drm_i915_gem_object,
> - ring_list);
> + ring_list[ring->id]);
>
> - if (!i915_gem_request_completed(obj->last_read_req, true))
> + if (!i915_gem_request_completed(obj->last_read_req[ring->id],
> + true))
> break;
>
> - i915_gem_object_move_to_inactive(obj);
> + i915_gem_object_retire__read(obj, ring->id);
> }
>
> if (unlikely(ring->trace_irq_req &&
> @@ -2813,17 +2828,23 @@ i915_gem_idle_work_handler(struct work_struct *work)
> static int
> i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
> {
> - struct intel_engine_cs *ring;
> - int ret;
> + int ret, i;
>
> - if (obj->active) {
> - ring = i915_gem_request_get_ring(obj->last_read_req);
> + if (!obj->active)
> + return 0;
> +
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + struct drm_i915_gem_request *rq;
> +
> + rq = obj->last_read_req[i];
> + if (rq == NULL)
> + continue;
>
> - ret = i915_gem_check_olr(obj->last_read_req);
> + ret = i915_gem_check_olr(rq);
> if (ret)
> return ret;
>
> - i915_gem_retire_requests_ring(ring);
> + i915_gem_retire_requests_ring(rq->ring);
> }
>
> return 0;
> @@ -2857,9 +2878,10 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> struct drm_i915_private *dev_priv = dev->dev_private;
> struct drm_i915_gem_wait *args = data;
> struct drm_i915_gem_object *obj;
> - struct drm_i915_gem_request *req;
> + struct drm_i915_gem_request *req[I915_NUM_RINGS];
> unsigned reset_counter;
> - int ret = 0;
> + int i, n = 0;
> + int ret;
>
> if (args->flags != 0)
> return -EINVAL;
> @@ -2879,11 +2901,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> if (ret)
> goto out;
>
> - if (!obj->active || !obj->last_read_req)
> + if (!obj->active)
> goto out;
>
> - req = obj->last_read_req;
> -
> /* Do this after OLR check to make sure we make forward progress polling
> * on this IOCTL with a timeout == 0 (like busy ioctl)
> */
> @@ -2894,13 +2914,23 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>
> drm_gem_object_unreference(&obj->base);
> reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
> - i915_gem_request_reference(req);
> +
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + if (obj->last_read_req[i] == NULL)
> + continue;
> +
> + req[n++] = i915_gem_request_reference(obj->last_read_req[i]);
> + }
> +
> mutex_unlock(&dev->struct_mutex);
>
> - ret = __i915_wait_request(req, reset_counter, true,
> - args->timeout_ns > 0 ? &args->timeout_ns : NULL,
> - file->driver_priv);
> - i915_gem_request_unreference__unlocked(req);
> + for (i = 0; i < n; i++) {
> + if (ret == 0)
> + ret = __i915_wait_request(req[i], reset_counter, true,
> + args->timeout_ns > 0 ? &args->timeout_ns : NULL,
> + file->driver_priv);
> + i915_gem_request_unreference__unlocked(req[i]);
> + }
> return ret;
>
> out:
> @@ -2909,6 +2939,62 @@ out:
> return ret;
> }
>
> +static int
> +__i915_gem_object_sync(struct drm_i915_gem_object *obj,
> + struct intel_engine_cs *to,
> + struct drm_i915_gem_request *rq)
> +{
> + struct intel_engine_cs *from;
> + int ret;
> +
> + if (rq == NULL)
> + return 0;
> +
> + from = i915_gem_request_get_ring(rq);
> + if (to == from)
> + return 0;
> +
> + if (i915_gem_request_completed(rq, true))
> + return 0;
> +
> + ret = i915_gem_check_olr(rq);
> + if (ret)
> + return ret;
> +
> + if (!i915_semaphore_is_enabled(obj->base.dev)) {
> + ret = __i915_wait_request(rq,
> + atomic_read(&to_i915(obj->base.dev)->gpu_error.reset_counter),
> + to_i915(obj->base.dev)->mm.interruptible, NULL, NULL);
> + if (ret)
> + return ret;
> +
> + if (obj->last_read_req[from->id] == rq)
> + i915_gem_object_retire__read(obj, from->id);
> + if (obj->last_write_req == rq)
> + i915_gem_object_retire__write(obj);
Retiring reads implies retiring writes on a ring, so is this needed?
> + } else {
> + int idx = intel_ring_sync_index(from, to);
> + u32 seqno = i915_gem_request_get_seqno(rq);
> +
> + if (seqno <= from->semaphore.sync_seqno[idx])
> + return 0;
> +
> + trace_i915_gem_ring_sync_to(from, to, rq);
> + ret = to->semaphore.sync_to(to, from, seqno);
> + if (ret)
> + return ret;
> +
> + /* We use last_read_req because sync_to()
> + * might have just caused seqno wrap under
> + * the radar.
> + */
> + from->semaphore.sync_seqno[idx] =
> + i915_gem_request_get_seqno(obj->last_read_req[from->id]);
> + }
> +
> + return 0;
> +}
> +
> /**
> * i915_gem_object_sync - sync an object to a ring.
> *
> @@ -2917,7 +3003,17 @@ out:
> *
> * This code is meant to abstract object synchronization with the GPU.
> * Calling with NULL implies synchronizing the object with the CPU
> - * rather than a particular GPU ring.
> + * rather than a particular GPU ring. Conceptually we serialise writes
> + * between engines inside the GPU. We only allow one engine to write
> + * into a buffer at any time, but multiple readers. To ensure each has
> + * a coherent view of memory, we must:
> + *
> + * - If there is an outstanding write request to the object, the new
> + * request must wait for it to complete (either CPU or in hw, requests
> + * on the same ring will be naturally ordered).
> + *
> + * - If we are a write request (pending_write_domain is set), the new
> + * request must wait for outstanding read requests to complete.
> *
> * Returns 0 if successful, else propagates up the lower layer error.
> */
> @@ -2925,39 +3021,25 @@ int
> i915_gem_object_sync(struct drm_i915_gem_object *obj,
> struct intel_engine_cs *to)
> {
> - struct intel_engine_cs *from;
> - u32 seqno;
> - int ret, idx;
> -
> - from = i915_gem_request_get_ring(obj->last_read_req);
> -
> - if (from == NULL || to == from)
> - return 0;
> -
> - if (to == NULL || !i915_semaphore_is_enabled(obj->base.dev))
> - return i915_gem_object_wait_rendering(obj, false);
> -
> - idx = intel_ring_sync_index(from, to);
> + const bool readonly = obj->base.pending_write_domain == 0;
> + int ret, i;
>
> - seqno = i915_gem_request_get_seqno(obj->last_read_req);
> - /* Optimization: Avoid semaphore sync when we are sure we already
> - * waited for an object with higher seqno */
> - if (seqno <= from->semaphore.sync_seqno[idx])
> + if (!obj->active)
> return 0;
>
> - ret = i915_gem_check_olr(obj->last_read_req);
> - if (ret)
> - return ret;
> -
> - trace_i915_gem_ring_sync_to(from, to, obj->last_read_req);
> - ret = to->semaphore.sync_to(to, from, seqno);
> - if (!ret)
> - /* We use last_read_req because sync_to()
> - * might have just caused seqno wrap under
> - * the radar.
> - */
> - from->semaphore.sync_seqno[idx] =
> - i915_gem_request_get_seqno(obj->last_read_req);
> + if (to == NULL) {
> + ret = i915_gem_object_wait_rendering(obj, readonly);
> + } else if (readonly) {
> + ret = __i915_gem_object_sync(obj, to,
> + obj->last_write_req);
> + } else {
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + ret = __i915_gem_object_sync(obj, to,
> + obj->last_read_req[i]);
Here I think is another opportunity to wait for all of them at once, via
a __i915_gem_object_sync helper which would take an array or a
ring/request mask. Not sure if it would be worth it though.
> + if (ret)
> + break;
> + }
> + }
>
> return ret;
> }
> @@ -3044,10 +3126,6 @@ int i915_vma_unbind(struct i915_vma *vma)
> /* Since the unbound list is global, only move to that list if
> * no more VMAs exist. */
> if (list_empty(&obj->vma_list)) {
> - /* Throw away the active reference before
> - * moving to the unbound list. */
> - i915_gem_object_retire(obj);
> -
> i915_gem_gtt_finish_object(obj);
> list_move_tail(&obj->global_list, &dev_priv->mm.unbound_list);
> }
> @@ -3678,8 +3756,6 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
> if (ret)
> return ret;
>
> - i915_gem_object_retire(obj);
> -
> /* Flush and acquire obj->pages so that we are coherent through
> * direct access in memory with previous cached writes through
> * shmemfs and that our cache domain tracking remains valid.
> @@ -3904,11 +3980,9 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> bool was_pin_display;
> int ret;
>
> - if (pipelined != i915_gem_request_get_ring(obj->last_read_req)) {
> - ret = i915_gem_object_sync(obj, pipelined);
> - if (ret)
> - return ret;
> - }
> + ret = i915_gem_object_sync(obj, pipelined);
> + if (ret)
> + return ret;
>
> /* Mark the pin_display early so that we account for the
> * display coherency whilst setting up the cache domains.
> @@ -4004,7 +4078,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
> if (ret)
> return ret;
>
> - i915_gem_object_retire(obj);
> i915_gem_object_flush_gtt_write_domain(obj);
>
> old_write_domain = obj->base.write_domain;
> @@ -4297,15 +4370,24 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
> * necessary flushes here.
> */
> ret = i915_gem_object_flush_active(obj);
> + if (ret)
> + goto unref;
> +
> + args->busy = 0;
> + if (obj->active) {
> + int i;
>
> - args->busy = obj->active;
> - if (obj->last_read_req) {
> - struct intel_engine_cs *ring;
> BUILD_BUG_ON(I915_NUM_RINGS > 16);
> - ring = i915_gem_request_get_ring(obj->last_read_req);
> - args->busy |= intel_ring_flag(ring) << 16;
> +
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + if (obj->last_read_req[i] == NULL)
> + continue;
> +
> + args->busy |= 1 << (16 + i) | 1;
It doesn't look like the equivalent bits will be set? And what is the
"| 1" at the end for?
> + }
> }
>
> +unref:
> drm_gem_object_unreference(&obj->base);
> unlock:
> mutex_unlock(&dev->struct_mutex);
> @@ -4379,8 +4461,11 @@ unlock:
> void i915_gem_object_init(struct drm_i915_gem_object *obj,
> const struct drm_i915_gem_object_ops *ops)
> {
> + int i;
> +
> INIT_LIST_HEAD(&obj->global_list);
> - INIT_LIST_HEAD(&obj->ring_list);
> + for (i = 0; i < I915_NUM_RINGS; i++)
> + INIT_LIST_HEAD(&obj->ring_list[i]);
> INIT_LIST_HEAD(&obj->obj_exec_link);
> INIT_LIST_HEAD(&obj->vma_list);
> INIT_LIST_HEAD(&obj->batch_pool_link);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 70346b0028f9..31826e7e904c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -673,8 +673,6 @@ static int do_switch(struct intel_engine_cs *ring,
> * swapped, but there is no way to do that yet.
> */
> from->legacy_hw_ctx.rcs_state->dirty = 1;
> - BUG_ON(i915_gem_request_get_ring(
> - from->legacy_hw_ctx.rcs_state->last_read_req) != ring);
>
> /* obj is kept alive until the next request by its active ref */
> i915_gem_object_ggtt_unpin(from->legacy_hw_ctx.rcs_state);
> diff --git a/drivers/gpu/drm/i915/i915_gem_debug.c b/drivers/gpu/drm/i915/i915_gem_debug.c
> index f462d1b51d97..f1cdddada369 100644
> --- a/drivers/gpu/drm/i915/i915_gem_debug.c
> +++ b/drivers/gpu/drm/i915/i915_gem_debug.c
> @@ -34,82 +34,46 @@ int
> i915_verify_lists(struct drm_device *dev)
> {
> static int warned;
> - struct drm_i915_private *dev_priv = dev->dev_private;
> + struct drm_i915_private *dev_priv = to_i915(dev);
> struct drm_i915_gem_object *obj;
> + struct intel_engine_cs *ring;
> int err = 0;
> + int i;
>
> if (warned)
> return 0;
>
> - list_for_each_entry(obj, &dev_priv->render_ring.active_list, list) {
> - if (obj->base.dev != dev ||
> - !atomic_read(&obj->base.refcount.refcount)) {
> - DRM_ERROR("freed render active %p\n", obj);
> - err++;
> - break;
> - } else if (!obj->active ||
> - (obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0) {
> - DRM_ERROR("invalid render active %p (a %d r %x)\n",
> - obj,
> - obj->active,
> - obj->base.read_domains);
> - err++;
> - } else if (obj->base.write_domain && list_empty(&obj->gpu_write_list)) {
> - DRM_ERROR("invalid render active %p (w %x, gwl %d)\n",
> - obj,
> - obj->base.write_domain,
> - !list_empty(&obj->gpu_write_list));
> - err++;
> + for_each_ring(ring, dev_priv, i) {
> + if (list_empty(&ring->request_list)) {
> + if (!list_empty(&ring->active_list)) {
> + DRM_ERROR("%s: active list not empty, but no requests\n", ring->name);
> + err++;
> + }
> + continue;
> }
> - }
> -
> - list_for_each_entry(obj, &dev_priv->mm.flushing_list, list) {
> - if (obj->base.dev != dev ||
> - !atomic_read(&obj->base.refcount.refcount)) {
> - DRM_ERROR("freed flushing %p\n", obj);
> - err++;
> - break;
> - } else if (!obj->active ||
> - (obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0 ||
> - list_empty(&obj->gpu_write_list)) {
> - DRM_ERROR("invalid flushing %p (a %d w %x gwl %d)\n",
> - obj,
> - obj->active,
> - obj->base.write_domain,
> - !list_empty(&obj->gpu_write_list));
> - err++;
> - }
> - }
> -
> - list_for_each_entry(obj, &dev_priv->mm.gpu_write_list, gpu_write_list) {
> - if (obj->base.dev != dev ||
> - !atomic_read(&obj->base.refcount.refcount)) {
> - DRM_ERROR("freed gpu write %p\n", obj);
> - err++;
> - break;
> - } else if (!obj->active ||
> - (obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0) {
> - DRM_ERROR("invalid gpu write %p (a %d w %x)\n",
> - obj,
> - obj->active,
> - obj->base.write_domain);
> - err++;
> - }
> - }
>
> - list_for_each_entry(obj, &i915_gtt_vm->inactive_list, list) {
> - if (obj->base.dev != dev ||
> - !atomic_read(&obj->base.refcount.refcount)) {
> - DRM_ERROR("freed inactive %p\n", obj);
> - err++;
> - break;
> - } else if (obj->pin_count || obj->active ||
> - (obj->base.write_domain & I915_GEM_GPU_DOMAINS)) {
> - DRM_ERROR("invalid inactive %p (p %d a %d w %x)\n",
> - obj,
> - obj->pin_count, obj->active,
> - obj->base.write_domain);
> - err++;
> + list_for_each_entry(obj, &ring->active_list, ring_list[ring->id]) {
> + if (obj->base.dev != dev ||
> + !atomic_read(&obj->base.refcount.refcount)) {
> + DRM_ERROR("%s: freed active obj %p\n",
> + ring->name, obj);
> + err++;
> + break;
> + } else if (!obj->active ||
> + obj->last_read_req[ring->id] == NULL) {
> + DRM_ERROR("%s: invalid active obj %p\n",
> + ring->name, obj);
> + err++;
> + } else if (obj->base.write_domain) {
> + DRM_ERROR("%s: invalid write obj %p (w %x)\n",
> + ring->name,
> + obj, obj->base.write_domain);
> + err++;
> + } else if (list_empty(&obj->last_read_req[ring->id]->list)) {
> + DRM_ERROR("%s: invalid request for obj %p\n",
> + ring->name, obj);
> + err++;
> + }
> }
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a982849a5edd..fa5d53f1240f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -192,15 +192,20 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
> struct drm_i915_error_buffer *err,
> int count)
> {
> + int i;
> +
> err_printf(m, " %s [%d]:\n", name, count);
>
> while (count--) {
> - err_printf(m, " %08x %8u %02x %02x %x %x",
> + err_printf(m, " %08x %8u %02x %02x [ ",
> err->gtt_offset,
> err->size,
> err->read_domains,
> - err->write_domain,
> - err->rseqno, err->wseqno);
> + err->write_domain);
> + for (i = 0; i < I915_NUM_RINGS; i++)
> + err_printf(m, "%02x ", err->rseqno[i]);
> +
> + err_printf(m, "] %02x", err->wseqno);
> err_puts(m, pin_flag(err->pinned));
> err_puts(m, tiling_flag(err->tiling));
> err_puts(m, dirty_flag(err->dirty));
> @@ -667,10 +672,12 @@ static void capture_bo(struct drm_i915_error_buffer *err,
> struct i915_vma *vma)
> {
> struct drm_i915_gem_object *obj = vma->obj;
> + int i;
>
> err->size = obj->base.size;
> err->name = obj->base.name;
> - err->rseqno = i915_gem_request_get_seqno(obj->last_read_req);
> + for (i = 0; i < I915_NUM_RINGS; i++)
> + err->rseqno[i] = i915_gem_request_get_seqno(obj->last_read_req[i]);
> err->wseqno = i915_gem_request_get_seqno(obj->last_write_req);
> err->gtt_offset = vma->node.start;
> err->read_domains = obj->base.read_domains;
> @@ -683,8 +690,8 @@ static void capture_bo(struct drm_i915_error_buffer *err,
> err->dirty = obj->dirty;
> err->purgeable = obj->madv != I915_MADV_WILLNEED;
> err->userptr = obj->userptr.mm != NULL;
> - err->ring = obj->last_read_req ?
> - i915_gem_request_get_ring(obj->last_read_req)->id : -1;
> + err->ring = obj->last_write_req ?
> + i915_gem_request_get_ring(obj->last_write_req)->id : -1;
> err->cache_level = obj->cache_level;
> }
>
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 0efb19a9b9a5..1a3756e6bea4 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9674,7 +9674,7 @@ static bool use_mmio_flip(struct intel_engine_cs *ring,
> else if (i915.enable_execlists)
> return true;
> else
> - return ring != i915_gem_request_get_ring(obj->last_read_req);
> + return ring != i915_gem_request_get_ring(obj->last_write_req);
> }
Why this?
> static void skl_do_mmio_flip(struct intel_crtc *intel_crtc)
> @@ -9977,7 +9977,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> } else if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) {
> ring = &dev_priv->ring[BCS];
> } else if (INTEL_INFO(dev)->gen >= 7) {
> - ring = i915_gem_request_get_ring(obj->last_read_req);
> + ring = i915_gem_request_get_ring(obj->last_write_req);
> if (ring == NULL || ring->id != RCS)
> ring = &dev_priv->ring[BCS];
> } else {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 1d08d8f9149d..a4a25c932b0b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -129,6 +129,7 @@ struct intel_engine_cs {
> VCS2
> } id;
> #define I915_NUM_RINGS 5
> +#define I915_MAX_RING_BIT 3
> #define LAST_USER_RING (VECS + 1)
> u32 mmio_base;
> struct drm_device *dev;
Regards,
Tvrtko