[Intel-gfx] [RFC] drm/i915: Scratch page optimization for blanking buffer

akash.goel at intel.com
Mon May 5 13:43:18 CEST 2014


From: Akash Goel <akash.goel at intel.com>

There is a use case where user space (the display compositor) directly
flips a fb onto the primary plane without any prior rendering to it.
The backing pages of such an object then get allocated only at page
flip time, which takes time. Since this buffer is meant to serve as a
blanking buffer (black colored), we can instead set up all of its GTT
entries to point at the scratch page, which is already zeroed out.
This saves both the allocation of real backing physical space for the
blanking buffer and the flushing of the CPU cache.

Signed-off-by: Akash Goel <akash.goel at intel.com>
---
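Note (illustration only, not for merging): a minimal, standalone sketch
of the sentinel trick used throughout this patch. An object is marked
as "backed by the scratch page" by pointing its pages field at the
object itself, so no new flag field is needed in drm_i915_gem_object.
The struct below is a simplified stand-in for the real object:

#include <stdio.h>
#include <stdbool.h>

struct sg_table;                        /* opaque, as in the kernel */

struct fake_gem_object {
	struct sg_table *pages;         /* NULL, real table, or sentinel */
};

static void mark_scratch_backed(struct fake_gem_object *obj)
{
	/* Sentinel: reuse the object pointer itself as the marker. */
	obj->pages = (struct sg_table *)obj;
}

static bool is_scratch_backed(const struct fake_gem_object *obj)
{
	return (unsigned long)obj->pages == (unsigned long)obj;
}

int main(void)
{
	struct fake_gem_object obj = { .pages = NULL };

	mark_scratch_backed(&obj);
	printf("scratch backed: %d\n", is_scratch_backed(&obj));

	obj.pages = NULL;               /* association dropped again */
	printf("scratch backed: %d\n", is_scratch_backed(&obj));
	return 0;
}
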
 drivers/gpu/drm/i915/i915_gem.c      | 18 ++++++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c  |  8 ++++
 drivers/gpu/drm/i915/intel_display.c | 72 ++++++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b19ccb8..7c3963c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1781,6 +1781,17 @@ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 	 * lists early. */
 	list_del(&obj->global_list);
 
+	/*
+	 * If the object was so far backed by the scratch page, remove
+	 * that association and make it reusable as a normal GEM object.
+	 */
+	if ((unsigned long)obj->pages == (unsigned long)obj) {
+		obj->pages = NULL;
+		obj->base.read_domains = obj->base.write_domain =
+						I915_GEM_DOMAIN_CPU;
+		return 0;
+	}
+
 	ops->put_pages(obj);
 	obj->pages = NULL;
 
@@ -3772,7 +3783,12 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 	if (ret)
 		goto err_unpin_display;
 
-	i915_gem_object_flush_cpu_write_domain(obj, true);
+	/*
+	 * If the object is backed by the scratch page, a CPU cache
+	 * flush is not required, so skip it.
+	 */
+	if ((unsigned long)obj->pages != (unsigned long)obj)
+		i915_gem_object_flush_cpu_write_domain(obj, true);
 
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index f6354e0..fb3193a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1566,6 +1566,14 @@ static void ggtt_bind_vma(struct i915_vma *vma,
 	if (!dev_priv->mm.aliasing_ppgtt || flags & GLOBAL_BIND) {
 		if (!obj->has_global_gtt_mapping ||
 		    (cache_level != obj->cache_level)) {
+			if ((unsigned long)(obj->pages) == (unsigned long)obj) {
+				/* Leave the scratch page mapped into the
+				 * GTT entries of the object, as it really
+				 * is supposed to be backed only by the
+				 * scratch page. */
+				obj->has_global_gtt_mapping = 1;
+				return;
+			}
 			vma->vm->insert_entries(vma->vm, obj->pages,
 						vma->node.start,
 						cache_level);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 59303213..dff85e4 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -40,6 +40,7 @@
 #include <drm/drm_dp_helper.h>
 #include <drm/drm_crtc_helper.h>
 #include <linux/dma_remapping.h>
+#include <linux/shmem_fs.h>
 
 static void intel_increase_pllclock(struct drm_crtc *crtc);
 static void intel_crtc_update_cursor(struct drm_crtc *crtc, bool on);
@@ -8469,6 +8470,75 @@ static void intel_crtc_destroy(struct drm_crtc *crtc)
 	kfree(intel_crtc);
 }
 
+static inline void
+intel_use_scratch_page_for_fb(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+	int ret;
+
+	/* A fb that is flipped without any allocated backing physical
+	 * space (without any prior rendering) is most probably going to
+	 * be used as a blanking buffer (black colored). So instead of
+	 * allocating real backing physical space for it, we can back the
+	 * object with the already allocated scratch page. Thus, if no
+	 * shmem data pages have been allocated for the fb yet, we back
+	 * it with the scratch page, saving the allocation of backing
+	 * physical space & the subsequent CPU cache flush.
+	 */
+	if (obj->base.filp) {
+		struct inode *inode = file_inode(obj->base.filp);
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		ret = info->alloced;
+		spin_unlock(&info->lock);
+		if ((ret == 0) && (obj->pages == NULL)) {
+			/*
+			 * Set the 'pages' field to the object pointer
+			 * itself. This avoids the need for a new field in
+			 * the obj structure to identify an object backed by
+			 * the scratch page and also avoids the call to
+			 * 'get_pages', thus saving the time required for
+			 * the allocation of the 'scatterlist' structure.
+			 */
+			obj->pages = (struct sg_table *)(obj);
+
+			/*
+			 * Avoid the calls to gtt prepare & finish, as
+			 * those would dereference the 'pages' field.
+			 */
+			obj->has_dma_mapping = 1;
+			list_add_tail(&obj->global_list,
+					&dev_priv->mm.unbound_list);
+
+			trace_printk("Using Scratch page for obj %p\n", obj);
+		}
+	}
+}
+
+static inline void
+intel_drop_scratch_page_for_fb(struct drm_i915_gem_object *obj)
+{
+	int ret;
+	/*
+	 * Unmap the object backed by the scratch page, as it is no
+	 * longer being scanned out and can now be allowed to be
+	 * used as a normal object.
+	 * Assumption: user space will reuse the object for rendering
+	 * only once it is no longer being scanned out. This is a
+	 * valid assumption, as the driver does no such handling for
+	 * other regular fb objects either.
+	 */
+	if ((unsigned long)obj->pages ==
+				(unsigned long)obj) {
+		ret = i915_gem_object_ggtt_unbind(obj);
+		/* EBUSY is ok: this means that pin count is still not zero */
+		if (ret && ret != -EBUSY)
+			DRM_ERROR("unbind error %d\n", ret);
+		i915_gem_object_put_pages(obj);
+		obj->has_dma_mapping = 0;
+	}
+}
+
 static void intel_unpin_work_fn(struct work_struct *__work)
 {
 	struct intel_unpin_work *work =
@@ -8477,6 +8547,7 @@ static void intel_unpin_work_fn(struct work_struct *__work)
 
 	mutex_lock(&dev->struct_mutex);
 	intel_unpin_fb_obj(work->old_fb_obj);
+	intel_drop_scratch_page_for_fb(work->old_fb_obj);
 	drm_gem_object_unreference(&work->pending_flip_obj->base);
 	drm_gem_object_unreference(&work->old_fb_obj->base);
 
@@ -8770,6 +8841,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	if (IS_VALLEYVIEW(dev) || ring == NULL || ring->id != RCS)
 		ring = &dev_priv->ring[BCS];
 
+	intel_use_scratch_page_for_fb(obj);
 	ret = intel_pin_and_fence_fb_obj(dev, obj, ring);
 	if (ret)
 		goto err;
-- 
1.9.2
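
Follow-up note (illustration only, not part of the patch): the check in
intel_use_scratch_page_for_fb() probes shmem's bookkeeping to decide
whether user space has ever rendered to the fb. Factored out as a
helper, under the same assumptions the patch already relies on (that
shmem_inode_info keeps an 'alloced' page count guarded by 'lock'), it
would look roughly like this:

static bool fb_obj_has_no_backing_pages(struct drm_i915_gem_object *obj)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	unsigned long alloced;

	if (!obj->base.filp)            /* not a shmem-backed object */
		return false;

	inode = file_inode(obj->base.filp);
	info = SHMEM_I(inode);

	spin_lock(&info->lock);
	alloced = info->alloced;        /* data pages allocated so far */
	spin_unlock(&info->lock);

	/* never rendered to and no sg_table: safe to use scratch page */
	return alloced == 0 && obj->pages == NULL;
}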



