[Intel-gfx] [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Jan 30 17:54:51 UTC 2019
On 30/01/2019 02:19, Chris Wilson wrote:
> In preparation for enabling HW semaphores, we need to keep in flight
> timeline HWSP alive until the entire system is idle, as any other
> timeline active on the GPU may still refer back to the already retired
> timeline. We both have to delay recycling available cachelines and
> unpinning old HWSP until the next idle point (i.e. on parking).
>
> That we have to keep the HWSP alive for external references on HW raises
> an interesting conundrum. On a busy system, we may never see a global
> idle point, essentially meaning the resource will be leaking until we
> are forced to sleep. What we need is a set of RCU primitives for the GPU!
> This should also help mitigate the resource starvation issues
> promulgating from keeping all logical state pinned until idle (instead
> of as currently handled until the next context switch).
>
> v2: Use idle barriers to free stale HWSP as soon as all current requests
> are idle, rather than rely on the system reaching a global idle point.
> (Tvrtko)
> v3: Replace the idle barrier with read locks.
I think it is time to change the patch title and even rewrite the commit
message.
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_request.c | 30 ++--
> drivers/gpu/drm/i915/i915_timeline.c | 229 +++++++++++++++++++++++++--
> drivers/gpu/drm/i915/i915_timeline.h | 9 +-
> 3 files changed, 237 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index a09f47ccc703..07e4c3c68ecd 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -326,11 +326,6 @@ void i915_request_retire_upto(struct i915_request *rq)
> } while (tmp != rq);
> }
>
> -static u32 timeline_get_seqno(struct i915_timeline *tl)
> -{
> - return tl->seqno += 1 + tl->has_initial_breadcrumb;
> -}
> -
> static void move_to_timeline(struct i915_request *request,
> struct i915_timeline *timeline)
> {
> @@ -539,8 +534,10 @@ struct i915_request *
> i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> {
> struct drm_i915_private *i915 = engine->i915;
> - struct i915_request *rq;
> struct intel_context *ce;
> + struct i915_timeline *tl;
> + struct i915_request *rq;
> + u32 seqno;
> int ret;
>
> lockdep_assert_held(&i915->drm.struct_mutex);
> @@ -615,24 +612,26 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> }
> }
>
> - rq->rcustate = get_state_synchronize_rcu();
> -
> INIT_LIST_HEAD(&rq->active_list);
> +
> + tl = ce->ring->timeline;
> + ret = i915_timeline_get_seqno(tl, rq, &seqno);
> + if (ret)
> + goto err_free;
> +
> rq->i915 = i915;
> rq->engine = engine;
> rq->gem_context = ctx;
> rq->hw_context = ce;
> rq->ring = ce->ring;
> - rq->timeline = ce->ring->timeline;
> + rq->timeline = tl;
> GEM_BUG_ON(rq->timeline == &engine->timeline);
> - rq->hwsp_seqno = rq->timeline->hwsp_seqno;
> + rq->hwsp_seqno = tl->hwsp_seqno;
> + rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
>
> spin_lock_init(&rq->lock);
> - dma_fence_init(&rq->fence,
> - &i915_fence_ops,
> - &rq->lock,
> - rq->timeline->fence_context,
> - timeline_get_seqno(rq->timeline));
> + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
> + tl->fence_context, seqno);
>
> /* We bump the ref for the fence chain */
> i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> @@ -693,6 +692,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
> GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
>
> +err_free:
> kmem_cache_free(i915->requests, rq);
> err_unreserve:
> unreserve_gt(i915);
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index b2202d2e58a2..fd1a92a3663d 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -6,19 +6,28 @@
>
> #include "i915_drv.h"
>
> -#include "i915_timeline.h"
> +#include "i915_active.h"
> #include "i915_syncmap.h"
> +#include "i915_timeline.h"
>
> struct i915_timeline_hwsp {
> - struct i915_vma *vma;
> + struct i915_gt_timelines *gt;
Coming back to a comment from one of the previous reviews: this is also
called gt but is a different thing altogether. I would really like us to
afford ourselves a few more characters so that the code is easier to
read.
> struct list_head free_link;
> + struct i915_vma *vma;
> u64 free_bitmap;
> };
>
> -static inline struct i915_timeline_hwsp *
> -i915_timeline_hwsp(const struct i915_timeline *tl)
> +struct i915_timeline_cacheline {
> + struct i915_active active;
> + struct i915_timeline_hwsp *hwsp;
> + unsigned int cacheline : 6;
> + unsigned int free : 1;
> +};
> +
> +static inline struct drm_i915_private *
> +hwsp_to_i915(struct i915_timeline_hwsp *hwsp)
> {
> - return tl->hwsp_ggtt->private;
> + return container_of(hwsp->gt, struct drm_i915_private, gt.timelines);
> }
>
> static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
> @@ -71,6 +80,7 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
> vma->private = hwsp;
> hwsp->vma = vma;
> hwsp->free_bitmap = ~0ull;
> + hwsp->gt = gt;
>
> spin_lock(>->hwsp_lock);
> list_add(&hwsp->free_link, >->hwsp_free_list);
> @@ -88,14 +98,9 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
> return hwsp->vma;
> }
>
> -static void hwsp_free(struct i915_timeline *timeline)
> +static void __idle_hwsp_free(struct i915_timeline_hwsp *hwsp, int cacheline)
> {
> - struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> - struct i915_timeline_hwsp *hwsp;
> -
> - hwsp = i915_timeline_hwsp(timeline);
> - if (!hwsp) /* leave global HWSP alone! */
> - return;
> + struct i915_gt_timelines *gt = hwsp->gt;
>
> spin_lock(>->hwsp_lock);
>
> @@ -103,7 +108,8 @@ static void hwsp_free(struct i915_timeline *timeline)
> if (!hwsp->free_bitmap)
> list_add_tail(&hwsp->free_link, >->hwsp_free_list);
>
> - hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
> + GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
> + hwsp->free_bitmap |= BIT_ULL(cacheline);
>
> /* And if no one is left using it, give the page back to the system */
> if (hwsp->free_bitmap == ~0ull) {
> @@ -115,6 +121,80 @@ static void hwsp_free(struct i915_timeline *timeline)
> spin_unlock(>->hwsp_lock);
> }
>
> +static void __idle_cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> + GEM_BUG_ON(!i915_active_is_idle(&cl->active));
> +
> + i915_vma_put(cl->hwsp->vma);
> + __idle_hwsp_free(cl->hwsp, cl->cacheline);
> +
> + i915_active_fini(&cl->active);
> + kfree(cl);
> +}
> +
> +static void __idle_cacheline_park(struct i915_timeline_cacheline *cl)
> +{
> + i915_active_fini(&cl->active);
> +}
> +
> +static void __cacheline_retire(struct i915_active *active)
> +{
> + struct i915_timeline_cacheline *cl =
> + container_of(active, typeof(*cl), active);
> +
> + i915_vma_unpin(cl->hwsp->vma);
> + if (!cl->free)
> + __idle_cacheline_park(cl);
> + else
> + __idle_cacheline_free(cl);
> +}
> +
> +static struct i915_timeline_cacheline *
> +cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
> +{
> + struct i915_timeline_cacheline *cl;
> +
> + GEM_BUG_ON(cacheline >= 64);
Maybe pull out CACHELINES_PER_PAGE as HWSP_CACHELINES_PER_PAGE or something?
> +
> + cl = kmalloc(sizeof(*cl), GFP_KERNEL);
> + if (!cl)
> + return ERR_PTR(-ENOMEM);
> +
> + i915_vma_get(hwsp->vma);
> + cl->hwsp = hwsp;
> + cl->cacheline = cacheline;
> + cl->free = false;
> +
> + i915_active_init(i915_gt_active(hwsp_to_i915(hwsp)),
> + &cl->active, __cacheline_retire);
> +
> + return cl;
> +}
> +
> +static void cacheline_acquire(struct i915_timeline_cacheline *cl)
> +{
> + if (cl && i915_active_acquire(&cl->active))
> + __i915_vma_pin(cl->hwsp->vma);
> +}
> +
> +static void cacheline_release(struct i915_timeline_cacheline *cl)
> +{
> + if (cl)
> + i915_active_release(&cl->active);
> +}
> +
> +static void cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> + if (!cl)
> + return;
> +
> + GEM_BUG_ON(cl->free);
> + cl->free = true;
> +
> + if (i915_active_is_idle(&cl->active))
> + __idle_cacheline_free(cl);
> +}
> +
> int i915_timeline_init(struct drm_i915_private *i915,
> struct i915_timeline *timeline,
> const char *name,
> @@ -136,22 +216,32 @@ int i915_timeline_init(struct drm_i915_private *i915,
> timeline->name = name;
> timeline->pin_count = 0;
> timeline->has_initial_breadcrumb = !hwsp;
> + timeline->hwsp_cacheline = NULL;
>
> timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
> if (!hwsp) {
> + struct i915_timeline_cacheline *cl;
> unsigned int cacheline;
>
> hwsp = hwsp_alloc(timeline, &cacheline);
> if (IS_ERR(hwsp))
> return PTR_ERR(hwsp);
>
> + cl = cacheline_alloc(hwsp->private, cacheline);
> + if (IS_ERR(cl)) {
> + __idle_hwsp_free(hwsp->private, cacheline);
> + return PTR_ERR(cl);
> + }
> +
> timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
> + timeline->hwsp_cacheline = cl;
> }
> timeline->hwsp_ggtt = i915_vma_get(hwsp);
> + GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
>
> vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
> if (IS_ERR(vaddr)) {
> - hwsp_free(timeline);
> + cacheline_free(timeline->hwsp_cacheline);
> i915_vma_put(hwsp);
> return PTR_ERR(vaddr);
> }
> @@ -239,7 +329,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
> GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
>
> i915_syncmap_free(&timeline->sync);
> - hwsp_free(timeline);
> + cacheline_free(timeline->hwsp_cacheline);
>
> i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
> i915_vma_put(timeline->hwsp_ggtt);
> @@ -284,6 +374,7 @@ int i915_timeline_pin(struct i915_timeline *tl)
> i915_ggtt_offset(tl->hwsp_ggtt) +
> offset_in_page(tl->hwsp_offset);
>
> + cacheline_acquire(tl->hwsp_cacheline);
> timeline_add_to_active(tl);
>
> return 0;
> @@ -293,6 +384,113 @@ int i915_timeline_pin(struct i915_timeline *tl)
> return err;
> }
>
> +static u32 timeline_advance(struct i915_timeline *tl)
> +{
> + GEM_BUG_ON(!tl->pin_count);
> + GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
> +
> + return tl->seqno += 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static void timeline_rollback(struct i915_timeline *tl)
> +{
> + tl->seqno -= 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static noinline int
> +__i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno)
> +{
> + struct i915_timeline_cacheline *cl;
> + struct i915_vma *vma;
> + unsigned int cacheline;
> + int err;
> +
> + /*
> + * If there is an outstanding GPU reference to this cacheline,
> + * such as it being sampled by a HW semaphore on another timeline,
> + * we cannot wraparound our seqno value (the HW semaphore does
> + * a strict greater-than-or-equals compare, not i915_seqno_passed).
> + * So if the cacheline is still busy, we must detached ourselves
> + * from it and leave it inflight alongside its users.
> + *
> + * However, if nobody is watching and we can guarantee that nobody
> + * will, we could simply reuse the same cacheline.
> + *
> + * // while locked
> + * if (i915_active_request_is_signaled(&tl->last_request) &&
> + * i915_active_is_signaled(&tl->hwsp_cacheline->active))
> + * return 0;
> + *
> + * That seems unlikely for a busy timeline that needed to wrap in
> + * the first place, so just replace the cacheline.
> + */
> +
> + vma = hwsp_alloc(tl, &cacheline);
> + if (IS_ERR(vma)) {
> + err = PTR_ERR(vma);
> + goto err_rollback;
> + }
> +
> + cl = cacheline_alloc(vma->private, cacheline);
> + if (IS_ERR(cl)) {
> + err = PTR_ERR(cl);
> + goto err_hwsp;
> + }
> +
> + /*
> + * Attach the old cacheline to the current request, so that we only
> + * free it after the current request is retired, which ensures that
> + * all writes into the cacheline from previous requests are complete.
> + */
> + err = i915_active_ref(&tl->hwsp_cacheline->active,
> + tl->fence_context, rq);
Right, this is the rq + 1 magic, akin to unpinning the previous context.
It confused me for a bit why we would be assigning the old cacheline to
the current rq.
> + if (err)
> + goto err_cacheline;
> +
> + tl->hwsp_ggtt = i915_vma_get(vma);
> + tl->hwsp_offset = cacheline * CACHELINE_BYTES;
> + __i915_vma_pin(tl->hwsp_ggtt);
> +
> + cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
> + cacheline_free(tl->hwsp_cacheline);
> +
> + cacheline_acquire(cl);
> + tl->hwsp_cacheline = cl;
> +
> + *seqno = timeline_advance(tl);
> + return 0;
> +
> +err_cacheline:
An i915_vma_put call looks to be missing here to fully unwind cacheline_alloc.
> + kfree(cl);
> +err_hwsp:
> + __idle_hwsp_free(vma->private, cacheline);
> +err_rollback:
> + timeline_rollback(tl);
> + return err;
> +}
> +
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno)
> +{
> + *seqno = timeline_advance(tl);
> +
> + /* Replace the HWSP on wraparound for HW semaphores */
> + if (unlikely(!*seqno && tl->hwsp_cacheline))
> + return __i915_timeline_get_seqno(tl, rq, seqno);
> +
> + return 0;
> +}
> +
> +int i915_timeline_read_lock(struct i915_timeline *tl, struct i915_request *rq)
> +{
> + GEM_BUG_ON(!tl->pin_count);
> + return i915_active_ref(&tl->hwsp_cacheline->active,
> + rq->fence.context, rq);
> +}
> +
> void i915_timeline_unpin(struct i915_timeline *tl)
> {
> GEM_BUG_ON(!tl->pin_count);
> @@ -300,6 +498,7 @@ void i915_timeline_unpin(struct i915_timeline *tl)
> return;
>
> timeline_remove_from_active(tl);
> + cacheline_release(tl->hwsp_cacheline);
>
> /*
> * Since this timeline is idle, all bariers upon which we were waiting
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 7bec7d2e45bf..d78ec6fbc000 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -34,7 +34,7 @@
> #include "i915_utils.h"
>
> struct i915_vma;
> -struct i915_timeline_hwsp;
> +struct i915_timeline_cacheline;
>
> struct i915_timeline {
> u64 fence_context;
> @@ -49,6 +49,8 @@ struct i915_timeline {
> struct i915_vma *hwsp_ggtt;
> u32 hwsp_offset;
>
> + struct i915_timeline_cacheline *hwsp_cacheline;
> +
> bool has_initial_breadcrumb;
>
> /**
> @@ -160,6 +162,11 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
> }
>
> int i915_timeline_pin(struct i915_timeline *tl);
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno);
> +int i915_timeline_read_lock(struct i915_timeline *tl,
> + struct i915_request *rq);
> void i915_timeline_unpin(struct i915_timeline *tl);
>
> void i915_timelines_init(struct drm_i915_private *i915);
>
I like it! Took me some time to figure it out but it looks good and is
definitely elegant.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list