[Intel-gfx] [PATCH 17/23] drm/i915: Share per-timeline HWSP using a slab suballocator

Fri Jan 18 12:08:40 UTC 2019

On 17/01/2019 14:34, Chris Wilson wrote:
> If we restrict ourselves to only using a cacheline for each timeline's
> HWSP (we could go smaller, but want to avoid needless polluting
> cachelines on different engines between different contexts), then we can
> suballocate a single 4k page into 64 different timeline HWSP. By
> treating each fresh allocation as a slab of 64 entries, we can keep it
> around for the next 64 allocation attempts until we need to refresh the
> slab cache.
> 
> John Harrison noted the issue of fragmentation leading to the same worst
> case performance of one page per timeline as before, which can be
> mitigated by adopting a freelist.

Add "in a later patch."?

> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: John Harrison <John.C.Harrison at Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h      |  5 ++
>   drivers/gpu/drm/i915/i915_timeline.c | 80 ++++++++++++++++++++++++----
>   2 files changed, 74 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3913900600b7..d59228dabb6e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1978,6 +1978,11 @@ struct drm_i915_private {
>   		struct i915_gt_timelines {
>   			struct mutex mutex; /* protects list, tainted by GPU */
>   			struct list_head list;
> +
> +			/* Pack multiple timelines' seqnos into the same page */
> +			spinlock_t hwsp_lock;
> +			struct i915_vma *hwsp;
> +			u64 bitmap;

freemap?

>   		} timelines;
>   
>   		struct list_head active_rings;
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index 380f4d25fb89..e939a9e1a4ab 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -11,26 +11,73 @@
>   
>   static int hwsp_alloc(struct i915_timeline *timeline)
>   {
> +#define NBITS BITS_PER_TYPE(typeof(gt->bitmap))
>   	struct drm_i915_private *i915 = timeline->i915;
> -	struct drm_i915_gem_object *bo;
> +	struct i915_gt_timelines *gt = &i915->gt.timelines;
>   	struct i915_vma *vma;
> +	int offset;
> +
> +	spin_lock(&gt->hwsp_lock);
> +
> +restart:
> +	offset = find_first_bit((unsigned long *)&gt->bitmap, NBITS);
> +	if (offset == NBITS && gt->hwsp) {
> +		i915_vma_put(gt->hwsp);
> +		gt->hwsp = NULL;
> +	}
> +
> +	vma = gt->hwsp;
> +	if (!vma) {
> +		struct drm_i915_gem_object *bo;
> +
> +		spin_unlock(&gt->hwsp_lock);
>   
> -	bo = i915_gem_object_create_internal(i915, PAGE_SIZE);
> -	if (IS_ERR(bo))
> -		return PTR_ERR(bo);
> +		BUILD_BUG_ON(NBITS * CACHELINE_BYTES > PAGE_SIZE);
> +		bo = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +		if (IS_ERR(bo))
> +			return PTR_ERR(bo);
>   
> -	i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> +		i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
>   
> -	vma = i915_vma_instance(bo, &i915->ggtt.vm, NULL);
> -	if (IS_ERR(vma)) {
> -		i915_gem_object_put(bo);
> -		return PTR_ERR(vma);
> +		vma = i915_vma_instance(bo, &i915->ggtt.vm, NULL);
> +		if (IS_ERR(vma)) {
> +			i915_gem_object_put(bo);
> +			return PTR_ERR(vma);
> +		}
> +
> +		spin_lock(&gt->hwsp_lock);
> +		if (gt->hwsp) {
> +			i915_gem_object_put(bo);
> +			goto restart;
> +		}
> +
> +		gt->hwsp = vma;
> +		gt->bitmap = ~0ull;
> +		offset = 0;
>   	}
>   
> -	timeline->hwsp_ggtt = vma;
> -	timeline->hwsp_offset = 0;
> +	gt->bitmap &= ~BIT_ULL(offset);
> +
> +	spin_unlock(&gt->hwsp_lock);
> +
> +	timeline->hwsp_ggtt = i915_vma_get(vma);
> +	timeline->hwsp_offset = offset * CACHELINE_BYTES;
>   
>   	return 0;
> +#undef NBITS
> +}
> +
> +static void hwsp_free(struct i915_timeline *timeline)
> +{
> +	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> +
> +	if (timeline->hwsp_ggtt != gt->hwsp)
> +		return;
> +
> +	spin_lock(&gt->hwsp_lock);
> +	if (timeline->hwsp_ggtt == gt->hwsp)
> +		gt->bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
> +	spin_unlock(&gt->hwsp_lock);
>   }

There is some asymmetry in hwsp_alloc dealing with both backing store 
and bitmap management, while hwsp_free only deals with bitmap 
management. I'll have a look in the next patch if it all goes away.

>   
>   int i915_timeline_init(struct drm_i915_private *i915,
> @@ -65,6 +112,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
>   
>   	vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
>   	if (IS_ERR(vaddr)) {
> +		hwsp_free(timeline);
>   		i915_vma_put(timeline->hwsp_ggtt);
>   		return PTR_ERR(vaddr);
>   	}
> @@ -99,6 +147,8 @@ void i915_timelines_init(struct drm_i915_private *i915)
>   	mutex_init(&gt->mutex);
>   	INIT_LIST_HEAD(&gt->list);
>   
> +	spin_lock_init(&gt->hwsp_lock);
> +
>   	/* via i915_gem_wait_for_idle() */
>   	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
>   }
> @@ -144,6 +194,9 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   	list_del(&timeline->link);
>   	mutex_unlock(&gt->mutex);
>   
> +	i915_syncmap_free(&timeline->sync);
> +	hwsp_free(timeline);
> +
>   	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
>   	i915_vma_put(timeline->hwsp_ggtt);
>   }
> @@ -211,9 +264,14 @@ void __i915_timeline_free(struct kref *kref)
>   void i915_timelines_fini(struct drm_i915_private *i915)
>   {
>   	struct i915_gt_timelines *gt = &i915->gt.timelines;
> +	struct i915_vma *vma;
>   
>   	GEM_BUG_ON(!list_empty(&gt->list));
>   
> +	vma = fetch_and_zero(&i915->gt.timelines.hwsp);
> +	if (vma)
> +		i915_vma_put(vma);
> +
>   	mutex_destroy(&gt->mutex);
>   }
>   
> 

Looks okay to me. But best wait for John to give it a check over as well.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko