[Intel-gfx] [PATCH] drm/i915: Keep timeline HWSP allocated until idle across the system

Wed Feb 27 14:20:25 UTC 2019

On 27/02/2019 11:15, Chris Wilson wrote:
> In preparation for enabling HW semaphores, we need to keep in flight
> timeline HWSP alive until its use across entire system has completed,
> as any other timeline active on the GPU may still refer back to the
> already retired timeline. We both have to delay recycling available
> cachelines and unpinning old HWSP until the next idle point.
> 
> An easy option would be to simply keep all used HWSP until the system as
> a whole was idle, i.e. we could release them all at once on parking.
> However, on a busy system, we may never see a global idle point,
> essentially meaning the resource will be leaked until we are forced to
> do a GC pass. We already employ a fine-grained idle detection mechanism
> for vma, which we can reuse here so that each cacheline can be freed
> immediately after the last request using it is retired.
> 
> v3: Keep track of the activity of each cacheline.
> v4: cacheline_free() on canceling the seqno tracking
> v5: Finally with a testcase to exercise wraparound
> v6: Pack cacheline into empty bits of page-aligned vaddr
> v7: Use i915_utils to hide the pointer casting around bit manipulation
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com> #v5
> ---
>   drivers/gpu/drm/i915/i915_request.c           |  31 +-
>   drivers/gpu/drm/i915/i915_request.h           |  11 +
>   drivers/gpu/drm/i915/i915_timeline.c          | 290 ++++++++++++++++--
>   drivers/gpu/drm/i915/i915_timeline.h          |  11 +-
>   drivers/gpu/drm/i915/i915_utils.h             |   3 +
>   .../gpu/drm/i915/selftests/i915_timeline.c    | 113 +++++++
>   6 files changed, 420 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 719d1a5ab082..d354967d6ae8 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -325,11 +325,6 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> -static u32 timeline_get_seqno(struct i915_timeline *tl)
> -{
> -	return tl->seqno += 1 + tl->has_initial_breadcrumb;
> -}
> -
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -532,8 +527,10 @@ struct i915_request *
>   i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
> -	struct i915_request *rq;
>   	struct intel_context *ce;
> +	struct i915_timeline *tl;
> +	struct i915_request *rq;
> +	u32 seqno;
>   	int ret;
>   
>   	lockdep_assert_held(&i915->drm.struct_mutex);
> @@ -610,24 +607,27 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   		}
>   	}
>   
> -	rq->rcustate = get_state_synchronize_rcu();
> -
>   	INIT_LIST_HEAD(&rq->active_list);
> +
> +	tl = ce->ring->timeline;
> +	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> +	if (ret)
> +		goto err_free;
> +
>   	rq->i915 = i915;
>   	rq->engine = engine;
>   	rq->gem_context = ctx;
>   	rq->hw_context = ce;
>   	rq->ring = ce->ring;
> -	rq->timeline = ce->ring->timeline;
> +	rq->timeline = tl;
>   	GEM_BUG_ON(rq->timeline == &engine->timeline);
> -	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
> +	rq->hwsp_seqno = tl->hwsp_seqno;
> +	rq->hwsp_cacheline = tl->hwsp_cacheline;
> +	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
>   
>   	spin_lock_init(&rq->lock);
> -	dma_fence_init(&rq->fence,
> -		       &i915_fence_ops,
> -		       &rq->lock,
> -		       rq->timeline->fence_context,
> -		       timeline_get_seqno(rq->timeline));
> +	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
> +		       tl->fence_context, seqno);
>   
>   	/* We bump the ref for the fence chain */
>   	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> @@ -687,6 +687,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
>   	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
>   
> +err_free:
>   	kmem_cache_free(global.slab_requests, rq);
>   err_unreserve:
>   	mutex_unlock(&ce->ring->timeline->mutex);
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index be3ded6bcf56..ea1e6f0ade53 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -38,6 +38,7 @@ struct drm_file;
>   struct drm_i915_gem_object;
>   struct i915_request;
>   struct i915_timeline;
> +struct i915_timeline_cacheline;
>   
>   struct i915_capture_list {
>   	struct i915_capture_list *next;
> @@ -148,6 +149,16 @@ struct i915_request {
>   	 */
>   	const u32 *hwsp_seqno;
>   
> +	/*
> +	 * If we need to access the timeline's seqno for this request in
> +	 * another request, we need to keep a read reference to this associated
> +	 * cacheline, so that we do not free and recycle it before the foriegn
> +	 * observers have completed. Hence, we keep a pointer to the cacheline
> +	 * inside the timeline's HWSP vma, but it is only valid while this
> +	 * request has not completed and guarded by the timeline mutex.
> +	 */
> +	struct i915_timeline_cacheline *hwsp_cacheline;
> +
>   	/** Position in the ring of the start of the request */
>   	u32 head;
>   
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index 87a80558da28..e45e81b4cab6 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -6,19 +6,29 @@
>   
>   #include "i915_drv.h"
>   
> -#include "i915_timeline.h"
> +#include "i915_active.h"
>   #include "i915_syncmap.h"
> +#include "i915_timeline.h"
>   
>   struct i915_timeline_hwsp {
> -	struct i915_vma *vma;
> +	struct i915_gt_timelines *gt;
>   	struct list_head free_link;
> +	struct i915_vma *vma;
>   	u64 free_bitmap;
>   };
>   
> -static inline struct i915_timeline_hwsp *
> -i915_timeline_hwsp(const struct i915_timeline *tl)
> +struct i915_timeline_cacheline {
> +	struct i915_active active;
> +	struct i915_timeline_hwsp *hwsp;
> +	void *vaddr;
> +#define CACHELINE_BITS 6
> +#define CACHELINE_FREE CACHELINE_BITS
> +};
> +
> +static inline struct drm_i915_private *
> +hwsp_to_i915(struct i915_timeline_hwsp *hwsp)
>   {
> -	return tl->hwsp_ggtt->private;
> +	return container_of(hwsp->gt, struct drm_i915_private, gt.timelines);
>   }
>   
>   static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
> @@ -71,6 +81,7 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
>   		vma->private = hwsp;
>   		hwsp->vma = vma;
>   		hwsp->free_bitmap = ~0ull;
> +		hwsp->gt = gt;
>   
>   		spin_lock(&gt->hwsp_lock);
>   		list_add(&hwsp->free_link, &gt->hwsp_free_list);
> @@ -88,14 +99,9 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
>   	return hwsp->vma;
>   }
>   
> -static void hwsp_free(struct i915_timeline *timeline)
> +static void __idle_hwsp_free(struct i915_timeline_hwsp *hwsp, int cacheline)
>   {
> -	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> -	struct i915_timeline_hwsp *hwsp;
> -
> -	hwsp = i915_timeline_hwsp(timeline);
> -	if (!hwsp) /* leave global HWSP alone! */
> -		return;
> +	struct i915_gt_timelines *gt = hwsp->gt;
>   
>   	spin_lock(&gt->hwsp_lock);
>   
> @@ -103,7 +109,8 @@ static void hwsp_free(struct i915_timeline *timeline)
>   	if (!hwsp->free_bitmap)
>   		list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
>   
> -	hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
> +	GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
> +	hwsp->free_bitmap |= BIT_ULL(cacheline);
>   
>   	/* And if no one is left using it, give the page back to the system */
>   	if (hwsp->free_bitmap == ~0ull) {
> @@ -115,6 +122,76 @@ static void hwsp_free(struct i915_timeline *timeline)
>   	spin_unlock(&gt->hwsp_lock);
>   }
>   
> +static void __idle_cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> +	GEM_BUG_ON(!i915_active_is_idle(&cl->active));
> +
> +	i915_gem_object_unpin_map(cl->hwsp->vma->obj);
> +	i915_vma_put(cl->hwsp->vma);
> +	__idle_hwsp_free(cl->hwsp, ptr_unmask_bits(cl->vaddr, CACHELINE_BITS));
> +
> +	i915_active_fini(&cl->active);
> +	kfree(cl);
> +}
> +
> +static void __cacheline_retire(struct i915_active *active)
> +{
> +	struct i915_timeline_cacheline *cl =
> +		container_of(active, typeof(*cl), active);
> +
> +	i915_vma_unpin(cl->hwsp->vma);
> +	if (ptr_test_bit(cl->vaddr, CACHELINE_FREE))
> +		__idle_cacheline_free(cl);
> +}
> +
> +static struct i915_timeline_cacheline *
> +cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
> +{
> +	struct i915_timeline_cacheline *cl;
> +	void *vaddr;
> +
> +	GEM_BUG_ON(cacheline >= BIT(CACHELINE_BITS));
> +
> +	cl = kmalloc(sizeof(*cl), GFP_KERNEL);
> +	if (!cl)
> +		return ERR_PTR(-ENOMEM);
> +
> +	vaddr = i915_gem_object_pin_map(hwsp->vma->obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		kfree(cl);
> +		return ERR_CAST(vaddr);
> +	}
> +
> +	i915_vma_get(hwsp->vma);
> +	cl->hwsp = hwsp;
> +	cl->vaddr = page_pack_bits(vaddr, cacheline);
> +
> +	i915_active_init(hwsp_to_i915(hwsp), &cl->active, __cacheline_retire);
> +
> +	return cl;
> +}
> +
> +static void cacheline_acquire(struct i915_timeline_cacheline *cl)
> +{
> +	if (cl && i915_active_acquire(&cl->active))
> +		__i915_vma_pin(cl->hwsp->vma);
> +}
> +
> +static void cacheline_release(struct i915_timeline_cacheline *cl)
> +{
> +	if (cl)
> +		i915_active_release(&cl->active);
> +}
> +
> +static void cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> +	GEM_BUG_ON(ptr_test_bit(cl->vaddr, CACHELINE_FREE));
> +	cl->vaddr = ptr_set_bit(cl->vaddr, CACHELINE_FREE);
> +
> +	if (i915_active_is_idle(&cl->active))
> +		__idle_cacheline_free(cl);
> +}
> +
>   int i915_timeline_init(struct drm_i915_private *i915,
>   		       struct i915_timeline *timeline,
>   		       const char *name,
> @@ -136,29 +213,40 @@ int i915_timeline_init(struct drm_i915_private *i915,
>   	timeline->name = name;
>   	timeline->pin_count = 0;
>   	timeline->has_initial_breadcrumb = !hwsp;
> +	timeline->hwsp_cacheline = NULL;
>   
> -	timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>   	if (!hwsp) {
> +		struct i915_timeline_cacheline *cl;
>   		unsigned int cacheline;
>   
>   		hwsp = hwsp_alloc(timeline, &cacheline);
>   		if (IS_ERR(hwsp))
>   			return PTR_ERR(hwsp);
>   
> +		cl = cacheline_alloc(hwsp->private, cacheline);
> +		if (IS_ERR(cl)) {
> +			__idle_hwsp_free(hwsp->private, cacheline);
> +			return PTR_ERR(cl);
> +		}
> +
> +		timeline->hwsp_cacheline = cl;
>   		timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
> -	}
> -	timeline->hwsp_ggtt = i915_vma_get(hwsp);
>   
> -	vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
> -	if (IS_ERR(vaddr)) {
> -		hwsp_free(timeline);
> -		i915_vma_put(hwsp);
> -		return PTR_ERR(vaddr);
> +		vaddr = page_mask_bits(cl->vaddr);
> +	} else {
> +		timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
> +
> +		vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
> +		if (IS_ERR(vaddr))
> +			return PTR_ERR(vaddr);
>   	}
>   
>   	timeline->hwsp_seqno =
>   		memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES);
>   
> +	timeline->hwsp_ggtt = i915_vma_get(hwsp);
> +	GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
> +
>   	timeline->fence_context = dma_fence_context_alloc(1);
>   
>   	spin_lock_init(&timeline->lock);
> @@ -240,9 +328,12 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   	GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
>   
>   	i915_syncmap_free(&timeline->sync);
> -	hwsp_free(timeline);
>   
> -	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
> +	if (timeline->hwsp_cacheline)
> +		cacheline_free(timeline->hwsp_cacheline);
> +	else
> +		i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
> +
>   	i915_vma_put(timeline->hwsp_ggtt);
>   }
>   
> @@ -285,6 +376,7 @@ int i915_timeline_pin(struct i915_timeline *tl)
>   		i915_ggtt_offset(tl->hwsp_ggtt) +
>   		offset_in_page(tl->hwsp_offset);
>   
> +	cacheline_acquire(tl->hwsp_cacheline);
>   	timeline_add_to_active(tl);
>   
>   	return 0;
> @@ -294,6 +386,157 @@ int i915_timeline_pin(struct i915_timeline *tl)
>   	return err;
>   }
>   
> +static u32 timeline_advance(struct i915_timeline *tl)
> +{
> +	GEM_BUG_ON(!tl->pin_count);
> +	GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
> +
> +	return tl->seqno += 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static void timeline_rollback(struct i915_timeline *tl)
> +{
> +	tl->seqno -= 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static noinline int
> +__i915_timeline_get_seqno(struct i915_timeline *tl,
> +			  struct i915_request *rq,
> +			  u32 *seqno)
> +{
> +	struct i915_timeline_cacheline *cl;
> +	unsigned int cacheline;
> +	struct i915_vma *vma;
> +	void *vaddr;
> +	int err;
> +
> +	/*
> +	 * If there is an outstanding GPU reference to this cacheline,
> +	 * such as it being sampled by a HW semaphore on another timeline,
> +	 * we cannot wraparound our seqno value (the HW semaphore does
> +	 * a strict greater-than-or-equals compare, not i915_seqno_passed).
> +	 * So if the cacheline is still busy, we must detach ourselves
> +	 * from it and leave it inflight alongside its users.
> +	 *
> +	 * However, if nobody is watching and we can guarantee that nobody
> +	 * will, we could simply reuse the same cacheline.
> +	 *
> +	 * if (i915_active_request_is_signaled(&tl->last_request) &&
> +	 *     i915_active_is_signaled(&tl->hwsp_cacheline->active))
> +	 *	return 0;
> +	 *
> +	 * That seems unlikely for a busy timeline that needed to wrap in
> +	 * the first place, so just replace the cacheline.
> +	 */
> +
> +	vma = hwsp_alloc(tl, &cacheline);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto err_rollback;
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
> +	if (err) {
> +		__idle_hwsp_free(vma->private, cacheline);
> +		goto err_rollback;
> +	}
> +
> +	cl = cacheline_alloc(vma->private, cacheline);
> +	if (IS_ERR(cl)) {
> +		err = PTR_ERR(cl);
> +		__idle_hwsp_free(vma->private, cacheline);
> +		goto err_unpin;
> +	}
> +	GEM_BUG_ON(cl->hwsp->vma != vma);
> +
> +	/*
> +	 * Attach the old cacheline to the current request, so that we only
> +	 * free it after the current request is retired, which ensures that
> +	 * all writes into the cacheline from previous requests are complete.
> +	 */
> +	err = i915_active_ref(&tl->hwsp_cacheline->active,
> +			      tl->fence_context, rq);
> +	if (err)
> +		goto err_cacheline;
> +
> +	cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
> +	cacheline_free(tl->hwsp_cacheline);
> +
> +	i915_vma_unpin(tl->hwsp_ggtt); /* binding kept alive by old cacheline */
> +	i915_vma_put(tl->hwsp_ggtt);
> +
> +	tl->hwsp_ggtt = i915_vma_get(vma);
> +
> +	vaddr = page_mask_bits(cl->vaddr);
> +	tl->hwsp_offset = cacheline * CACHELINE_BYTES;
> +	tl->hwsp_seqno =
> +		memset(vaddr + tl->hwsp_offset, 0, CACHELINE_BYTES);
> +
> +	tl->hwsp_offset += i915_ggtt_offset(vma);
> +
> +	cacheline_acquire(cl);
> +	tl->hwsp_cacheline = cl;
> +
> +	*seqno = timeline_advance(tl);
> +	GEM_BUG_ON(i915_seqno_passed(*tl->hwsp_seqno, *seqno));
> +	return 0;
> +
> +err_cacheline:
> +	cacheline_free(cl);
> +err_unpin:
> +	i915_vma_unpin(vma);
> +err_rollback:
> +	timeline_rollback(tl);
> +	return err;
> +}
> +
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> +			    struct i915_request *rq,
> +			    u32 *seqno)
> +{
> +	*seqno = timeline_advance(tl);
> +
> +	/* Replace the HWSP on wraparound for HW semaphores */
> +	if (unlikely(!*seqno && tl->hwsp_cacheline))
> +		return __i915_timeline_get_seqno(tl, rq, seqno);
> +
> +	return 0;
> +}
> +
> +static int cacheline_ref(struct i915_timeline_cacheline *cl,
> +			 struct i915_request *rq)
> +{
> +	return i915_active_ref(&cl->active, rq->fence.context, rq);
> +}
> +
> +int i915_timeline_read_hwsp(struct i915_request *from,
> +			    struct i915_request *to,
> +			    u32 *hwsp)
> +{
> +	struct i915_timeline_cacheline *cl = from->hwsp_cacheline;
> +	struct i915_timeline *tl = from->timeline;
> +	int err;
> +
> +	GEM_BUG_ON(to->timeline == tl);
> +
> +	mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
> +	err = i915_request_completed(from);
> +	if (!err)
> +		err = cacheline_ref(cl, to);
> +	if (!err) {
> +		if (likely(cl == tl->hwsp_cacheline)) {
> +			*hwsp = tl->hwsp_offset;
> +		} else { /* across a seqno wrap, recover the original offset */
> +			*hwsp = i915_ggtt_offset(cl->hwsp->vma) +
> +				ptr_unmask_bits(cl->vaddr, CACHELINE_BITS) *
> +				CACHELINE_BYTES;
> +		}
> +	}
> +	mutex_unlock(&tl->mutex);
> +
> +	return err;
> +}
> +
>   void i915_timeline_unpin(struct i915_timeline *tl)
>   {
>   	GEM_BUG_ON(!tl->pin_count);
> @@ -301,6 +544,7 @@ void i915_timeline_unpin(struct i915_timeline *tl)
>   		return;
>   
>   	timeline_remove_from_active(tl);
> +	cacheline_release(tl->hwsp_cacheline);
>   
>   	/*
>   	 * Since this timeline is idle, all bariers upon which we were waiting
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 36c3849f7108..60b1dfad93ed 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -34,7 +34,7 @@
>   #include "i915_utils.h"
>   
>   struct i915_vma;
> -struct i915_timeline_hwsp;
> +struct i915_timeline_cacheline;
>   
>   struct i915_timeline {
>   	u64 fence_context;
> @@ -51,6 +51,8 @@ struct i915_timeline {
>   	struct i915_vma *hwsp_ggtt;
>   	u32 hwsp_offset;
>   
> +	struct i915_timeline_cacheline *hwsp_cacheline;
> +
>   	bool has_initial_breadcrumb;
>   
>   	/**
> @@ -162,8 +164,15 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
>   }
>   
>   int i915_timeline_pin(struct i915_timeline *tl);
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> +			    struct i915_request *rq,
> +			    u32 *seqno);
>   void i915_timeline_unpin(struct i915_timeline *tl);
>   
> +int i915_timeline_read_hwsp(struct i915_request *from,
> +			    struct i915_request *until,
> +			    u32 *hwsp_offset);
> +
>   void i915_timelines_init(struct drm_i915_private *i915);
>   void i915_timelines_park(struct drm_i915_private *i915);
>   void i915_timelines_fini(struct drm_i915_private *i915);
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index 9726df37c4c4..1fb9c8b2f5b3 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -92,6 +92,9 @@
>   	((typeof(ptr))((unsigned long)(ptr) | __bits));			\
>   })
>   
> +#define ptr_set_bit(ptr, bit) ((typeof(ptr))((unsigned long)(ptr) | BIT(bit)))
> +#define ptr_test_bit(ptr, bit) ((unsigned long)(ptr) & BIT(bit))
> +
>   #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
>   #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)
>   #define page_pack_bits(ptr, bits) ptr_pack_bits(ptr, bits, PAGE_SHIFT)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_timeline.c b/drivers/gpu/drm/i915/selftests/i915_timeline.c
> index 12ea69b1a1e5..844701759ffc 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_timeline.c
> @@ -641,6 +641,118 @@ static int live_hwsp_alternate(void *arg)
>   #undef NUM_TIMELINES
>   }
>   
> +static int live_hwsp_wrap(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	struct i915_timeline *tl;
> +	enum intel_engine_id id;
> +	intel_wakeref_t wakeref;
> +	int err = 0;
> +
> +	/*
> +	 * Across a seqno wrap, we need to keep the old cacheline alive for
> +	 * foreign GPU references.
> +	 */
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	tl = i915_timeline_create(i915, __func__, NULL);
> +	if (IS_ERR(tl)) {
> +		err = PTR_ERR(tl);
> +		goto out_rpm;
> +	}
> +	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
> +		goto out_free;
> +
> +	err = i915_timeline_pin(tl);
> +	if (err)
> +		goto out_free;
> +
> +	for_each_engine(engine, i915, id) {
> +		const u32 *hwsp_seqno[2];
> +		struct i915_request *rq;
> +		u32 seqno[2];
> +
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
> +
> +		rq = i915_request_alloc(engine, i915->kernel_context);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto out;
> +		}
> +
> +		tl->seqno = -4u;
> +
> +		err = i915_timeline_get_seqno(tl, rq, &seqno[0]);
> +		if (err) {
> +			i915_request_add(rq);
> +			goto out;
> +		}
> +		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
> +			 seqno[0], tl->hwsp_offset);
> +
> +		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
> +		if (err) {
> +			i915_request_add(rq);
> +			goto out;
> +		}
> +		hwsp_seqno[0] = tl->hwsp_seqno;
> +
> +		err = i915_timeline_get_seqno(tl, rq, &seqno[1]);
> +		if (err) {
> +			i915_request_add(rq);
> +			goto out;
> +		}
> +		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
> +			 seqno[1], tl->hwsp_offset);
> +
> +		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
> +		if (err) {
> +			i915_request_add(rq);
> +			goto out;
> +		}
> +		hwsp_seqno[1] = tl->hwsp_seqno;
> +
> +		/* With wrap should come a new hwsp */
> +		GEM_BUG_ON(seqno[1] >= seqno[0]);
> +		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);
> +
> +		i915_request_add(rq);
> +
> +		if (i915_request_wait(rq, I915_WAIT_LOCKED, HZ / 5) < 0) {
> +			pr_err("Wait for timeline writes timed out!\n");
> +			err = -EIO;
> +			goto out;
> +		}
> +
> +		if (*hwsp_seqno[0] != seqno[0] || *hwsp_seqno[1] != seqno[1]) {
> +			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
> +			       *hwsp_seqno[0], *hwsp_seqno[1],
> +			       seqno[0], seqno[1]);
> +			err = -EINVAL;
> +			goto out;
> +		}
> +
> +		i915_retire_requests(i915); /* recycle HWSP */
> +	}
> +
> +out:
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +
> +	i915_timeline_unpin(tl);
> +out_free:
> +	i915_timeline_put(tl);
> +out_rpm:
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	return err;
> +}
> +
>   static int live_hwsp_recycle(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> @@ -723,6 +835,7 @@ int i915_timeline_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_hwsp_recycle),
>   		SUBTEST(live_hwsp_engine),
>   		SUBTEST(live_hwsp_alternate),
> +		SUBTEST(live_hwsp_wrap),
>   	};
>   
>   	return i915_subtests(tests, i915);
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko