[Intel-gfx] [RFC] drm/i915 : Reduce the shmem page allocation time by using blitter engines for clearing pages.

sourab.gupta at intel.com sourab.gupta at intel.com
Tue May 6 13:10:58 CEST 2014


From: Sourab Gupta <sourab.gupta at intel.com>

This patch is a continuation of, and depends on, the earlier patch
series to 'reduce the time for which the device mutex is kept locked':
(http://lists.freedesktop.org/archives/intel-gfx/2014-May/044596.html)

This patch aims to reduce the time taken to allocate pages from shmem
by using the blitter engine to clear freshly allocated pages. This is
done only for fresh pages allocated by the shmem_preallocate routines
in the execbuffer and page-fault paths.

Even though the CPU memset routine is optimized, clearing the pages of
a large buffer can still take on the order of milliseconds. We make
this operation asynchronous by using the blitter engine, so that the
execbuffer ioctl processing time is unaffected by the size of the
buffer to be cleared. Using the blitter engine therefore reduces the
overall execution time of the 'execbuffer' ioctl for large buffers.
A condensed view of the new flow is sketched below.
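
For reference, the flow this patch introduces looks roughly as follows
(condensed from the diff below, using the same function names; pinning,
domain tracking, locking and error handling omitted):

	/* 1. Preallocation: take pages from shmem without the CPU clear
	 *    and remember that the object still needs clearing.
	 */
	obj->require_clear = 1;
	page = shmem_read_mapping_page_gfp_noclear(mapping, i, gfp);

	/* 2. Before first use (execbuffer or GTT fault), queue a blitter
	 *    fill with zeroes and track it through the normal request/seqno
	 *    machinery, so the clear runs asynchronously on the BCS ring.
	 */
	if (obj->require_clear) {
		i915_add_clear_obj_cmd(obj);
		obj->last_write_seqno = intel_ring_get_seqno(ring);
		i915_gem_object_move_to_active(obj, ring);
		i915_add_request(ring, NULL);	/* breadcrumb for the clear */
		obj->require_clear = 0;
	}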

There may be power implications of using the blitter engine, which we
still have to evaluate. As a next step, we can selectively enable this
HW-based memset only for large buffers, where the overhead of adding
commands to the blitter ring (which would otherwise be idle) and of
cross-ring synchronization is negligible compared to clearing the
buffer with the CPU. A rough sketch of such a size gate is shown below.
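
As an illustration of that next step (the threshold value and helper
below are hypothetical and not part of this patch):

	/* Hypothetical gate: fall back to the CPU clear for small objects,
	 * where touching the BCS ring and cross-ring synchronization cost
	 * more than the memset itself. The threshold needs to be tuned.
	 */
	#define I915_HW_CLEAR_MIN_SIZE	(512 * 1024)

	static bool i915_gem_object_use_hw_clear(struct drm_i915_gem_object *obj)
	{
		return obj->base.size >= I915_HW_CLEAR_MIN_SIZE;
	}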

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
Signed-off-by: Akash Goel <akash.goel at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  6 +++
 drivers/gpu/drm/i915/i915_gem.c            | 80 +++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 43 ++++++++++++++++
 include/linux/shmem_fs.h                   |  2 +
 mm/shmem.c                                 | 44 +++++++++++++++-
 5 files changed, 173 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6dc579a..c3844da 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1596,6 +1596,10 @@ struct drm_i915_gem_object {
 	unsigned int has_aliasing_ppgtt_mapping:1;
 	unsigned int has_global_gtt_mapping:1;
 	unsigned int has_dma_mapping:1;
+	/*
+	 * Whether the pages of the object need to be cleared after shmem allocation
+	 */
+	unsigned int require_clear:1;
 
 	struct sg_table *pages;
 	int pages_pin_count;
@@ -2120,6 +2124,8 @@ int i915_gem_mmap_gtt(struct drm_file *file_priv, struct drm_device *dev,
 		      uint32_t handle, uint64_t *offset);
 void
 i915_gem_object_shmem_preallocate(struct drm_i915_gem_object *obj);
+int i915_add_clear_obj_cmd(struct drm_i915_gem_object *obj);
+int i915_gem_memset_obj_hw(struct drm_i915_gem_object *obj);
 
 /**
  * Returns true if seq1 is later than seq2.
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 867da2d..972695a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1376,6 +1376,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+
 /**
  * i915_gem_fault - fault a page into the GTT
  * vma: VMA in question
@@ -1436,6 +1437,12 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto unlock;
 
+	if (obj->require_clear) {
+		i915_gem_object_flush_cpu_write_domain(obj, false);
+		i915_gem_memset_obj_hw(obj);
+		obj->require_clear = false;
+	}
+
 	ret = i915_gem_object_set_to_gtt_domain(obj, write);
 	if (ret)
 		goto unpin;
@@ -1927,12 +1934,13 @@ i915_gem_object_shmem_preallocate(struct drm_i915_gem_object *obj)
 	/* Get the list of pages out of our struct file
 	 * Fail silently without starting the shrinker
 	 */
+	obj->require_clear = 1;
 	mapping = file_inode(obj->base.filp)->i_mapping;
 	gfp = mapping_gfp_mask(mapping);
 	gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
 	gfp &= ~(__GFP_IO | __GFP_WAIT);
 	for (i = 0; i < page_count; i++) {
-		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
+		page = shmem_read_mapping_page_gfp_noclear(mapping, i, gfp);
 		if (IS_ERR(page)) {
 			DRM_DEBUG_DRIVER("Failure for obj(%p), size(%x) at page(%d)\n",
 						obj, obj->base.size, i);
@@ -2173,6 +2181,76 @@ i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj)
 	WARN_ON(i915_verify_lists(dev));
 }
 
+int i915_add_clear_obj_cmd(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+	struct intel_ring_buffer *ring = &dev_priv->ring[BCS];
+	u32 offset = i915_gem_obj_ggtt_offset(obj);
+	int ret;
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, (0x2 << 29) | (0x40 << 22) |
+			      XY_SRC_COPY_BLT_WRITE_ALPHA |
+			      XY_SRC_COPY_BLT_WRITE_RGB |
+			      0x3);
+	intel_ring_emit(ring, BLT_DEPTH_32 | (0xF0 << 16) | 4096);
+	intel_ring_emit(ring,
+			(DIV_ROUND_UP(obj->base.size, 4096) << 16) | 4096);
+	intel_ring_emit(ring, offset);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
+int i915_gem_memset_obj_hw(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+	struct intel_ring_buffer *ring = &dev_priv->ring[BCS];
+	u32 seqno;
+	int ret;
+
+	ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_MAPPABLE);
+	if (ret) {
+		DRM_ERROR("Mapping of User FB to GTT failed\n");
+		return ret;
+	}
+
+	/* Adding commands to the blitter ring to
+	 * clear out the contents of the buffer object
+	 */
+	ret = i915_add_clear_obj_cmd(obj);
+	if (ret) {
+		DRM_ERROR("couldn't add commands in blitter ring\n");
+		i915_gem_object_ggtt_unpin(obj);
+		return ret;
+	}
+
+	seqno = intel_ring_get_seqno(ring);
+
+	obj->base.read_domains = I915_GEM_DOMAIN_RENDER;
+	obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
+	i915_gem_object_move_to_active(obj, ring);
+
+	obj->dirty = 1;
+	obj->last_write_seqno = seqno;
+
+	/* Unconditionally force add_request to emit a full flush. */
+	ring->gpu_caches_dirty = true;
+
+	/* Add a breadcrumb for the completion of the batch buffer */
+	(void)i915_add_request(ring, NULL);
+
+	i915_gem_object_ggtt_unpin(obj);
+
+	return 0;
+}
+
 static void
 i915_gem_object_retire(struct drm_i915_gem_object *obj)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index cb9e143..47e6946 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -895,6 +895,45 @@ i915_gem_execbuffer_preallocate_objs(struct list_head *objects)
 	}
 }
 
+static int
+i915_gem_execbuffer_clear_objs(struct drm_device *dev, struct list_head *objects)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring = &dev_priv->ring[BCS];
+	struct drm_i915_gem_object *obj;
+	u32 seqno;
+	int ret;
+
+	list_for_each_entry(obj, objects, obj_exec_link) {
+		if (obj->require_clear) {
+			/* Flush the CPU write domain for the object if it's dirty. */
+			if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+				i915_gem_clflush_object(obj, false);
+
+			ret = i915_add_clear_obj_cmd(obj);
+			if (ret)
+				return ret;
+
+			seqno = intel_ring_get_seqno(ring);
+
+			obj->base.read_domains = I915_GEM_DOMAIN_RENDER;
+			obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+			i915_vma_move_to_active(i915_gem_obj_to_ggtt(obj), ring);
+
+			obj->dirty = 1;
+			obj->last_write_seqno = seqno;
+			obj->require_clear = false;
+		}
+	}
+	/* Unconditionally force add_request to emit a full flush. */
+	ring->gpu_caches_dirty = true;
+
+	/* Add a breadcrumb for the completion of the batch buffer */
+	(void)i915_add_request(ring, NULL);
+
+	return 0;
+}
+
 static bool
 i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 {
@@ -1286,6 +1325,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
+	ret = i915_gem_execbuffer_clear_objs(dev, &eb->objects);
+	if (ret)
+		goto err;
+
 	/* The objects are in their final locations, apply the relocations. */
 	if (need_relocs)
 		ret = i915_gem_execbuffer_relocate(eb);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d1771c..ac976c8 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -55,6 +55,8 @@ extern bool shmem_mapping(struct address_space *mapping);
 extern void shmem_unlock_mapping(struct address_space *mapping);
 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
+extern struct page *shmem_read_mapping_page_gfp_noclear(struct address_space *mapping,
+					pgoff_t index, gfp_t gfp_mask);
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02..66d3a61 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,6 +98,7 @@ enum sgp_type {
 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
 	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
+	SGP_CACHE_NOCLEAR,	/* like SGP_CACHE, but don't clear alloced pages */
 };
 
 #ifdef CONFIG_TMPFS
@@ -1169,7 +1170,8 @@ clear:
 		 * it now, lest undo on failure cancel our earlier guarantee.
 		 */
 		if (sgp != SGP_WRITE) {
-			clear_highpage(page);
+			if (sgp != SGP_CACHE_NOCLEAR)
+				clear_highpage(page);
 			flush_dcache_page(page);
 			SetPageUptodate(page);
 		}
@@ -2966,3 +2968,43 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 #endif
 }
 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
+
+
+/**
+ * shmem_read_mapping_page_gfp_noclear - read into page cache, using specified
+ * page allocation flags, without clearing newly allocated pages.
+ * It is the caller's responsibility to clear the pages returned by this function.
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @gfp:	the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ */
+struct page *shmem_read_mapping_page_gfp_noclear(struct address_space *mapping,
+					 pgoff_t index, gfp_t gfp)
+{
+#ifdef CONFIG_SHMEM
+	struct inode *inode = mapping->host;
+	struct page *page;
+	int error;
+
+	BUG_ON(mapping->a_ops != &shmem_aops);
+	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE_NOCLEAR, gfp, NULL);
+	if (error)
+		page = ERR_PTR(error);
+	else
+		unlock_page(page);
+	return page;
+#else
+	/*
+	 * The tiny !SHMEM case uses ramfs without swap
+	 */
+	return read_cache_page_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp_noclear);
-- 
1.8.5.1