[PATCH 17/25] drm/i915: Share per-timeline HWSP using a slab suballocator

Thu Jan 17 21:09:29 UTC 2019

If we restrict ourselves to only using a cacheline for each timeline's
HWSP (we could go smaller, but want to avoid needlessly polluting
cachelines on different engines between different contexts), then we can
suballocate a single 4k page into 64 different timeline HWSPs. By
treating each fresh allocation as a slab of 64 entries, we can keep it
around for the next 64 allocation attempts until we need to refresh the
slab cache.

John Harrison noted the issue of fragmentation leading to the same
worst-case performance of one page per timeline as before, which can be
mitigated by adopting a freelist.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: John Harrison <John.C.Harrison at Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  5 ++
 drivers/gpu/drm/i915/i915_timeline.c | 80 ++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3913900600b7..d59228dabb6e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1978,6 +1978,11 @@ struct drm_i915_private {
 		struct i915_gt_timelines {
 			struct mutex mutex; /* protects list, tainted by GPU */
 			struct list_head list;
+
+			/* Pack multiple timelines' seqnos into the same page */
+			spinlock_t hwsp_lock;
+			struct i915_vma *hwsp;
+			u64 bitmap;
 		} timelines;
 
 		struct list_head active_rings;
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 380f4d25fb89..e939a9e1a4ab 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -11,26 +11,73 @@
 
 static int hwsp_alloc(struct i915_timeline *timeline)
 {
+#define NBITS BITS_PER_TYPE(typeof(gt->bitmap))
 	struct drm_i915_private *i915 = timeline->i915;
-	struct drm_i915_gem_object *bo;
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
 	struct i915_vma *vma;
+	int offset;
+
+	spin_lock(&gt->hwsp_lock);
+
+restart:
+	offset = find_first_bit((unsigned long *)&gt->bitmap, NBITS);
+	if (offset == NBITS && gt->hwsp) {
+		i915_vma_put(gt->hwsp);
+		gt->hwsp = NULL;
+	}
+
+	vma = gt->hwsp;
+	if (!vma) {
+		struct drm_i915_gem_object *bo;
+
+		spin_unlock(&gt->hwsp_lock);
 
-	bo = i915_gem_object_create_internal(i915, PAGE_SIZE);
-	if (IS_ERR(bo))
-		return PTR_ERR(bo);
+		BUILD_BUG_ON(NBITS * CACHELINE_BYTES > PAGE_SIZE);
+		bo = i915_gem_object_create_internal(i915, PAGE_SIZE);
+		if (IS_ERR(bo))
+			return PTR_ERR(bo);
 
-	i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
+		i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
 
-	vma = i915_vma_instance(bo, &i915->ggtt.vm, NULL);
-	if (IS_ERR(vma)) {
-		i915_gem_object_put(bo);
-		return PTR_ERR(vma);
+		vma = i915_vma_instance(bo, &i915->ggtt.vm, NULL);
+		if (IS_ERR(vma)) {
+			i915_gem_object_put(bo);
+			return PTR_ERR(vma);
+		}
+
+		spin_lock(&gt->hwsp_lock);
+		if (gt->hwsp) {
+			i915_gem_object_put(bo);
+			goto restart;
+		}
+
+		gt->hwsp = vma;
+		gt->bitmap = ~0ull;
+		offset = 0;
 	}
 
-	timeline->hwsp_ggtt = vma;
-	timeline->hwsp_offset = 0;
+	gt->bitmap &= ~BIT_ULL(offset);
+
+	spin_unlock(&gt->hwsp_lock);
+
+	timeline->hwsp_ggtt = i915_vma_get(vma);
+	timeline->hwsp_offset = offset * CACHELINE_BYTES;
 
 	return 0;
+#undef NBITS
+}
+
+static void hwsp_free(struct i915_timeline *timeline)
+{
+	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
+
+	if (timeline->hwsp_ggtt != gt->hwsp)
+		return;
+
+	spin_lock(&gt->hwsp_lock);
+	if (timeline->hwsp_ggtt == gt->hwsp)
+		gt->bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
+	spin_unlock(&gt->hwsp_lock);
 }
 
 int i915_timeline_init(struct drm_i915_private *i915,
@@ -65,6 +112,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
 
 	vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
 	if (IS_ERR(vaddr)) {
+		hwsp_free(timeline);
 		i915_vma_put(timeline->hwsp_ggtt);
 		return PTR_ERR(vaddr);
 	}
@@ -99,6 +147,8 @@ void i915_timelines_init(struct drm_i915_private *i915)
 	mutex_init(&gt->mutex);
 	INIT_LIST_HEAD(&gt->list);
 
+	spin_lock_init(&gt->hwsp_lock);
+
 	/* via i915_gem_wait_for_idle() */
 	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
 }
@@ -144,6 +194,9 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 	list_del(&timeline->link);
 	mutex_unlock(&gt->mutex);
 
+	i915_syncmap_free(&timeline->sync);
+	hwsp_free(timeline);
+
 	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
 	i915_vma_put(timeline->hwsp_ggtt);
 }
@@ -211,9 +264,14 @@ void __i915_timeline_free(struct kref *kref)
 void i915_timelines_fini(struct drm_i915_private *i915)
 {
 	struct i915_gt_timelines *gt = &i915->gt.timelines;
+	struct i915_vma *vma;
 
 	GEM_BUG_ON(!list_empty(&gt->list));
 
+	vma = fetch_and_zero(&i915->gt.timelines.hwsp);
+	if (vma)
+		i915_vma_put(vma);
+
 	mutex_destroy(&gt->mutex);
 }
 
-- 
2.20.1