[Intel-gfx] [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Jan 30 17:54:51 UTC 2019
On 30/01/2019 02:19, Chris Wilson wrote:
> In preparation for enabling HW semaphores, we need to keep in flight
> timeline HWSP alive until the entire system is idle, as any other
> timeline active on the GPU may still refer back to the already retired
> timeline. We both have to delay recycling available cachelines and
> unpinning old HWSP until the next idle point (i.e. on parking).
>
> That we have to keep the HWSP alive for external references on HW raises
> an interesting conundrum. On a busy system, we may never see a global
> idle point, essentially meaning the resource will be leaking until we
> are forced to sleep. What we need is a set of RCU primitives for the GPU!
> This should also help mitigate the resource starvation issues
> promulgating from keeping all logical state pinned until idle (instead
> of as currently handled until the next context switch).
>
> v2: Use idle barriers to free stale HWSP as soon as all current requests
> are idle, rather than rely on the system reaching a global idle point.
> (Tvrtko)
> v3: Replace the idle barrier with read locks.
I think it is time to change the patch title and even rewrite the commit
message.
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_request.c | 30 ++--
> drivers/gpu/drm/i915/i915_timeline.c | 229 +++++++++++++++++++++++++--
> drivers/gpu/drm/i915/i915_timeline.h | 9 +-
> 3 files changed, 237 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index a09f47ccc703..07e4c3c68ecd 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -326,11 +326,6 @@ void i915_request_retire_upto(struct i915_request *rq)
> } while (tmp != rq);
> }
>
> -static u32 timeline_get_seqno(struct i915_timeline *tl)
> -{
> - return tl->seqno += 1 + tl->has_initial_breadcrumb;
> -}
> -
> static void move_to_timeline(struct i915_request *request,
> struct i915_timeline *timeline)
> {
> @@ -539,8 +534,10 @@ struct i915_request *
> i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> {
> struct drm_i915_private *i915 = engine->i915;
> - struct i915_request *rq;
> struct intel_context *ce;
> + struct i915_timeline *tl;
> + struct i915_request *rq;
> + u32 seqno;
> int ret;
>
> lockdep_assert_held(&i915->drm.struct_mutex);
> @@ -615,24 +612,26 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> }
> }
>
> - rq->rcustate = get_state_synchronize_rcu();
> -
> INIT_LIST_HEAD(&rq->active_list);
> +
> + tl = ce->ring->timeline;
> + ret = i915_timeline_get_seqno(tl, rq, &seqno);
> + if (ret)
> + goto err_free;
> +
> rq->i915 = i915;
> rq->engine = engine;
> rq->gem_context = ctx;
> rq->hw_context = ce;
> rq->ring = ce->ring;
> - rq->timeline = ce->ring->timeline;
> + rq->timeline = tl;
> GEM_BUG_ON(rq->timeline == &engine->timeline);
> - rq->hwsp_seqno = rq->timeline->hwsp_seqno;
> + rq->hwsp_seqno = tl->hwsp_seqno;
> + rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
>
> spin_lock_init(&rq->lock);
> - dma_fence_init(&rq->fence,
> - &i915_fence_ops,
> - &rq->lock,
> - rq->timeline->fence_context,
> - timeline_get_seqno(rq->timeline));
> + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
> + tl->fence_context, seqno);
>
> /* We bump the ref for the fence chain */
> i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> @@ -693,6 +692,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
> GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
>
> +err_free:
> kmem_cache_free(i915->requests, rq);
> err_unreserve:
> unreserve_gt(i915);
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index b2202d2e58a2..fd1a92a3663d 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -6,19 +6,28 @@
>
> #include "i915_drv.h"
>
> -#include "i915_timeline.h"
> +#include "i915_active.h"
> #include "i915_syncmap.h"
> +#include "i915_timeline.h"
>
> struct i915_timeline_hwsp {
> - struct i915_vma *vma;
> + struct i915_gt_timelines *gt;
Coming back to a comment from one of the previous reviews: this is also
called gt but is a different thing altogether. I would really like us to
afford ourselves a few more characters so that the code is easier to
read.
> struct list_head free_link;
> + struct i915_vma *vma;
> u64 free_bitmap;
> };
>
> -static inline struct i915_timeline_hwsp *
> -i915_timeline_hwsp(const struct i915_timeline *tl)
> +struct i915_timeline_cacheline {
> + struct i915_active active;
> + struct i915_timeline_hwsp *hwsp;
> + unsigned int cacheline : 6;
> + unsigned int free : 1;
> +};
> +
> +static inline struct drm_i915_private *
> +hwsp_to_i915(struct i915_timeline_hwsp *hwsp)
> {
> - return tl->hwsp_ggtt->private;
> + return container_of(hwsp->gt, struct drm_i915_private, gt.timelines);
> }
>
> static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
> @@ -71,6 +80,7 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
> vma->private = hwsp;
> hwsp->vma = vma;
> hwsp->free_bitmap = ~0ull;
> + hwsp->gt = gt;
>
> spin_lock(>->hwsp_lock);
> list_add(&hwsp->free_link, >->hwsp_free_list);
> @@ -88,14 +98,9 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
> return hwsp->vma;
> }
>
> -static void hwsp_free(struct i915_timeline *timeline)
> +static void __idle_hwsp_free(struct i915_timeline_hwsp *hwsp, int cacheline)
> {
> - struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> - struct i915_timeline_hwsp *hwsp;
> -
> - hwsp = i915_timeline_hwsp(timeline);
> - if (!hwsp) /* leave global HWSP alone! */
> - return;
> + struct i915_gt_timelines *gt = hwsp->gt;
>
> spin_lock(>->hwsp_lock);
>
> @@ -103,7 +108,8 @@ static void hwsp_free(struct i915_timeline *timeline)
> if (!hwsp->free_bitmap)
> list_add_tail(&hwsp->free_link, >->hwsp_free_list);
>
> - hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
> + GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
> + hwsp->free_bitmap |= BIT_ULL(cacheline);
>
> /* And if no one is left using it, give the page back to the system */
> if (hwsp->free_bitmap == ~0ull) {
> @@ -115,6 +121,80 @@ static void hwsp_free(struct i915_timeline *timeline)
> spin_unlock(>->hwsp_lock);
> }
>
> +static void __idle_cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> + GEM_BUG_ON(!i915_active_is_idle(&cl->active));
> +
> + i915_vma_put(cl->hwsp->vma);
> + __idle_hwsp_free(cl->hwsp, cl->cacheline);
> +
> + i915_active_fini(&cl->active);
> + kfree(cl);
> +}
> +
> +static void __idle_cacheline_park(struct i915_timeline_cacheline *cl)
> +{
> + i915_active_fini(&cl->active);
> +}
> +
> +static void __cacheline_retire(struct i915_active *active)
> +{
> + struct i915_timeline_cacheline *cl =
> + container_of(active, typeof(*cl), active);
> +
> + i915_vma_unpin(cl->hwsp->vma);
> + if (!cl->free)
> + __idle_cacheline_park(cl);
> + else
> + __idle_cacheline_free(cl);
> +}
> +
> +static struct i915_timeline_cacheline *
> +cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
> +{
> + struct i915_timeline_cacheline *cl;
> +
> + GEM_BUG_ON(cacheline >= 64);
Maybe pull out CACHELINES_PER_PAGE as HWSP_CACHELINES_PER_PAGE or something?
> +
> + cl = kmalloc(sizeof(*cl), GFP_KERNEL);
> + if (!cl)
> + return ERR_PTR(-ENOMEM);
> +
> + i915_vma_get(hwsp->vma);
> + cl->hwsp = hwsp;
> + cl->cacheline = cacheline;
> + cl->free = false;
> +
> + i915_active_init(i915_gt_active(hwsp_to_i915(hwsp)),
> + &cl->active, __cacheline_retire);
> +
> + return cl;
> +}
> +
> +static void cacheline_acquire(struct i915_timeline_cacheline *cl)
> +{
> + if (cl && i915_active_acquire(&cl->active))
> + __i915_vma_pin(cl->hwsp->vma);
> +}
> +
> +static void cacheline_release(struct i915_timeline_cacheline *cl)
> +{
> + if (cl)
> + i915_active_release(&cl->active);
> +}
> +
> +static void cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> + if (!cl)
> + return;
> +
> + GEM_BUG_ON(cl->free);
> + cl->free = true;
> +
> + if (i915_active_is_idle(&cl->active))
> + __idle_cacheline_free(cl);
> +}
> +
> int i915_timeline_init(struct drm_i915_private *i915,
> struct i915_timeline *timeline,
> const char *name,
> @@ -136,22 +216,32 @@ int i915_timeline_init(struct drm_i915_private *i915,
> timeline->name = name;
> timeline->pin_count = 0;
> timeline->has_initial_breadcrumb = !hwsp;
> + timeline->hwsp_cacheline = NULL;
>
> timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
> if (!hwsp) {
> + struct i915_timeline_cacheline *cl;
> unsigned int cacheline;
>
> hwsp = hwsp_alloc(timeline, &cacheline);
> if (IS_ERR(hwsp))
> return PTR_ERR(hwsp);
>
> + cl = cacheline_alloc(hwsp->private, cacheline);
> + if (IS_ERR(cl)) {
> + __idle_hwsp_free(hwsp->private, cacheline);
> + return PTR_ERR(cl);
> + }
> +
> timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
> + timeline->hwsp_cacheline = cl;
> }
> timeline->hwsp_ggtt = i915_vma_get(hwsp);
> + GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
>
> vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
> if (IS_ERR(vaddr)) {
> - hwsp_free(timeline);
> + cacheline_free(timeline->hwsp_cacheline);
> i915_vma_put(hwsp);
> return PTR_ERR(vaddr);
> }
> @@ -239,7 +329,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
> GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
>
> i915_syncmap_free(&timeline->sync);
> - hwsp_free(timeline);
> + cacheline_free(timeline->hwsp_cacheline);
>
> i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
> i915_vma_put(timeline->hwsp_ggtt);
> @@ -284,6 +374,7 @@ int i915_timeline_pin(struct i915_timeline *tl)
> i915_ggtt_offset(tl->hwsp_ggtt) +
> offset_in_page(tl->hwsp_offset);
>
> + cacheline_acquire(tl->hwsp_cacheline);
> timeline_add_to_active(tl);
>
> return 0;
> @@ -293,6 +384,113 @@ int i915_timeline_pin(struct i915_timeline *tl)
> return err;
> }
>
> +static u32 timeline_advance(struct i915_timeline *tl)
> +{
> + GEM_BUG_ON(!tl->pin_count);
> + GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
> +
> + return tl->seqno += 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static void timeline_rollback(struct i915_timeline *tl)
> +{
> + tl->seqno -= 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static noinline int
> +__i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno)
> +{
> + struct i915_timeline_cacheline *cl;
> + struct i915_vma *vma;
> + unsigned int cacheline;
> + int err;
> +
> + /*
> + * If there is an outstanding GPU reference to this cacheline,
> + * such as it being sampled by a HW semaphore on another timeline,
> + * we cannot wraparound our seqno value (the HW semaphore does
> + * a strict greater-than-or-equals compare, not i915_seqno_passed).
> + * So if the cacheline is still busy, we must detached ourselves
> + * from it and leave it inflight alongside its users.
> + *
> + * However, if nobody is watching and we can guarantee that nobody
> + * will, we could simply reuse the same cacheline.
> + *
> + * // while locked
> + * if (i915_active_request_is_signaled(&tl->last_request) &&
> + * i915_active_is_signaled(&tl->hwsp_cacheline->active))
> + * return 0;
> + *
> + * That seems unlikely for a busy timeline that needed to wrap in
> + * the first place, so just replace the cacheline.
> + */
> +
> + vma = hwsp_alloc(tl, &cacheline);
> + if (IS_ERR(vma)) {
> + err = PTR_ERR(vma);
> + goto err_rollback;
> + }
> +
> + cl = cacheline_alloc(vma->private, cacheline);
> + if (IS_ERR(cl)) {
> + err = PTR_ERR(cl);
> + goto err_hwsp;
> + }
> +
> + /*
> + * Attach the old cacheline to the current request, so that we only
> + * free it after the current request is retired, which ensures that
> + * all writes into the cacheline from previous requests are complete.
> + */
> + err = i915_active_ref(&tl->hwsp_cacheline->active,
> + tl->fence_context, rq);
Right, this is the rq + 1 magic, akin to unpinning the previous context.
It confused me for a bit why we would be assigning the old cacheline to
the current rq.
> + if (err)
> + goto err_cacheline;
> +
> + tl->hwsp_ggtt = i915_vma_get(vma);
> + tl->hwsp_offset = cacheline * CACHELINE_BYTES;
> + __i915_vma_pin(tl->hwsp_ggtt);
> +
> + cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
> + cacheline_free(tl->hwsp_cacheline);
> +
> + cacheline_acquire(cl);
> + tl->hwsp_cacheline = cl;
> +
> + *seqno = timeline_advance(tl);
> + return 0;
> +
> +err_cacheline:
An i915_vma_put call looks to be missing here to fully unwind cacheline_alloc.
> + kfree(cl);
> +err_hwsp:
> + __idle_hwsp_free(vma->private, cacheline);
> +err_rollback:
> + timeline_rollback(tl);
> + return err;
> +}
> +
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno)
> +{
> + *seqno = timeline_advance(tl);
> +
> + /* Replace the HWSP on wraparound for HW semaphores */
> + if (unlikely(!*seqno && tl->hwsp_cacheline))
> + return __i915_timeline_get_seqno(tl, rq, seqno);
> +
> + return 0;
> +}
> +
> +int i915_timeline_read_lock(struct i915_timeline *tl, struct i915_request *rq)
> +{
> + GEM_BUG_ON(!tl->pin_count);
> + return i915_active_ref(&tl->hwsp_cacheline->active,
> + rq->fence.context, rq);
> +}
> +
> void i915_timeline_unpin(struct i915_timeline *tl)
> {
> GEM_BUG_ON(!tl->pin_count);
> @@ -300,6 +498,7 @@ void i915_timeline_unpin(struct i915_timeline *tl)
> return;
>
> timeline_remove_from_active(tl);
> + cacheline_release(tl->hwsp_cacheline);
>
> /*
> * Since this timeline is idle, all bariers upon which we were waiting
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 7bec7d2e45bf..d78ec6fbc000 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -34,7 +34,7 @@
> #include "i915_utils.h"
>
> struct i915_vma;
> -struct i915_timeline_hwsp;
> +struct i915_timeline_cacheline;
>
> struct i915_timeline {
> u64 fence_context;
> @@ -49,6 +49,8 @@ struct i915_timeline {
> struct i915_vma *hwsp_ggtt;
> u32 hwsp_offset;
>
> + struct i915_timeline_cacheline *hwsp_cacheline;
> +
> bool has_initial_breadcrumb;
>
> /**
> @@ -160,6 +162,11 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
> }
>
> int i915_timeline_pin(struct i915_timeline *tl);
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> + struct i915_request *rq,
> + u32 *seqno);
> +int i915_timeline_read_lock(struct i915_timeline *tl,
> + struct i915_request *rq);
> void i915_timeline_unpin(struct i915_timeline *tl);
>
> void i915_timelines_init(struct drm_i915_private *i915);
>
I like it! Took me some time to figure it out but it looks good and is
definitely elegant.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list