[Intel-gfx] [RFC 2/4] drm/i915: Use batch pools with the command parser

bradley.d.volkin at intel.com
Wed Jun 18 18:36:14 CEST 2014


From: Brad Volkin <bradley.d.volkin at intel.com>

This patch sets up all of the tracking and copying necessary to
use batch pools with the command parser, but does not actually
dispatch the copied (shadow) batch to the hardware yet. We still
aren't quite ready to set the secure bit during dispatch.

Note that performance takes a hit from the copy in some cases
and will likely need some work. On a first pass, the memcpy
appears to be the bottleneck. Without having done a deeper
analysis, two ideas that come to mind are:
1) Copy sections of the batch at a time, as they are reached
   by parsing. Might improve cache locality.
2) Copy only up to the userspace-supplied batch length and
   memset the rest of the buffer. Reduces the number of reads;
   see the sketch below.
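
A rough sketch of idea 2, purely illustrative and not part of this
patch: it assumes a batch_len value plumbed through from the
execbuffer args, which copy_batch() does not currently take. min()
here is the usual kernel macro from linux/kernel.h.

	/*
	 * Hypothetical bounded copy: read only batch_len bytes from the
	 * source and clear the rest of the (possibly reused) pool buffer
	 * so stale contents can't be executed.
	 */
	static void copy_batch_bounded(u32 *dest_addr, const u32 *src_addr,
				       size_t dest_size, size_t batch_len)
	{
		size_t copy_len = min(batch_len, dest_size);

		memcpy(dest_addr, src_addr, copy_len);

		if (copy_len < dest_size)
			memset((char *)dest_addr + copy_len, 0,
			       dest_size - copy_len);
	}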

Signed-off-by: Brad Volkin <bradley.d.volkin at intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c       | 75 ++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_drv.h              |  7 ++-
 drivers/gpu/drm/i915/i915_gem.c              |  8 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 45 +++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c      | 12 +++++
 drivers/gpu/drm/i915/intel_ringbuffer.h      |  7 +++
 7 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index dea99d9..669afb0 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -831,6 +831,53 @@ finish:
 	return (u32*)addr;
 }
 
+/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
+static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
+		       struct drm_i915_gem_object *src_obj)
+{
+	int ret = 0;
+	int needs_clflush = 0;
+	u32 *src_addr, *dest_addr = NULL;
+
+	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
+		return ERR_PTR(ret);
+	}
+
+	src_addr = vmap_batch(src_obj);
+	if (!src_addr) {
+		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
+		ret = -ENOMEM;
+		goto unpin_src;
+	}
+
+	if (needs_clflush)
+		drm_clflush_virt_range((char *)src_addr, src_obj->base.size);
+
+	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: Failed to set batch CPU domain\n");
+		goto unmap_src;
+	}
+
+	dest_addr = vmap_batch(dest_obj);
+	if (!dest_addr) {
+		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
+		ret = -ENOMEM;
+		goto unmap_src;
+	}
+
+	memcpy(dest_addr, src_addr, src_obj->base.size);
+
+unmap_src:
+	vunmap(src_addr);
+unpin_src:
+	i915_gem_object_unpin_pages(src_obj);
+
+	return ret ? ERR_PTR(ret) : dest_addr;
+}
+
 /**
  * i915_needs_cmd_parser() - should a given ring use software command parsing?
  * @ring: the ring in question
@@ -952,6 +999,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
  * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
  * @ring: the ring on which the batch is to execute
  * @batch_obj: the batch buffer in question
+ * @shadow_batch_obj: copy of the batch buffer in question
  * @batch_start_offset: byte offset in the batch at which execution starts
  * @is_master: is the submitting process the drm master?
  *
@@ -962,31 +1010,21 @@ static bool check_cmd(const struct intel_engine_cs *ring,
  */
 int i915_parse_cmds(struct intel_engine_cs *ring,
 		    struct drm_i915_gem_object *batch_obj,
+		    struct drm_i915_gem_object *shadow_batch_obj,
 		    u32 batch_start_offset,
 		    bool is_master)
 {
 	int ret = 0;
 	u32 *cmd, *batch_base, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
-	int needs_clflush = 0;
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
 
-	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
-		return ret;
-	}
-
-	batch_base = vmap_batch(batch_obj);
-	if (!batch_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		i915_gem_object_unpin_pages(batch_obj);
-		return -ENOMEM;
+	batch_base = copy_batch(shadow_batch_obj, batch_obj);
+	if (IS_ERR(batch_base)) {
+		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
+		return PTR_ERR(batch_base);
 	}
 
-	if (needs_clflush)
-		drm_clflush_virt_range((char *)batch_base, batch_obj->base.size);
-
 	cmd = batch_base + (batch_start_offset / sizeof(*cmd));
 	batch_end = cmd + (batch_obj->base.size / sizeof(*batch_end));
 
@@ -1039,7 +1077,12 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
 
 	vunmap(batch_base);
 
-	i915_gem_object_unpin_pages(batch_obj);
+	if (!ret) {
+		ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj,
+							false);
+		if (ret)
+			DRM_DEBUG_DRIVER("CMD: Failed to set batch GTT domain\n");
+	}
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2a88b5e..cecf0f8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1766,6 +1766,9 @@ struct drm_i915_gem_request {
 	/** Batch buffer related to this request if any */
 	struct drm_i915_gem_object *batch_obj;
 
+	/** Shadow copy of the batch buffer, if any, for the command parser */
+	struct drm_i915_gem_object *shadow_batch_obj;
+
 	/** Time at which this request was emitted, in jiffies. */
 	unsigned long emitted_jiffies;
 
@@ -2297,9 +2300,10 @@ int __must_check i915_gem_suspend(struct drm_device *dev);
 int __i915_add_request(struct intel_engine_cs *ring,
 		       struct drm_file *file,
 		       struct drm_i915_gem_object *batch_obj,
+		       struct drm_i915_gem_object *shadow_batch_obj,
 		       u32 *seqno);
 #define i915_add_request(ring, seqno) \
-	__i915_add_request(ring, NULL, NULL, seqno)
+	__i915_add_request(ring, NULL, NULL, NULL, seqno)
 int __must_check i915_wait_seqno(struct intel_engine_cs *ring,
 				 uint32_t seqno);
 int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2534,6 +2538,7 @@ void i915_cmd_parser_fini_ring(struct intel_engine_cs *ring);
 bool i915_needs_cmd_parser(struct intel_engine_cs *ring);
 int i915_parse_cmds(struct intel_engine_cs *ring,
 		    struct drm_i915_gem_object *batch_obj,
+		    struct drm_i915_gem_object *shadow_batch_obj,
 		    u32 batch_start_offset,
 		    bool is_master);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d5e3001..a477818 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2325,6 +2325,7 @@ i915_gem_get_seqno(struct drm_device *dev, u32 *seqno)
 int __i915_add_request(struct intel_engine_cs *ring,
 		       struct drm_file *file,
 		       struct drm_i915_gem_object *obj,
+		       struct drm_i915_gem_object *shadow_obj,
 		       u32 *out_seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
@@ -2368,9 +2369,10 @@ int __i915_add_request(struct intel_engine_cs *ring,
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the batch_obj be moved onto the
 	 * inactive_list and lose its active reference. Hence we do not need
-	 * to explicitly hold another reference here.
+	 * to explicitly hold another reference here. Same for shadow batch.
 	 */
 	request->batch_obj = obj;
+	request->shadow_batch_obj = shadow_obj;
 
 	/* Hold a reference to the current context so that we can inspect
 	 * it later in case a hangcheck error event fires.
@@ -2478,6 +2480,10 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	if (request->ctx)
 		i915_gem_context_unreference(request->ctx);
 
+	if (request->shadow_batch_obj)
+		i915_gem_batch_pool_put(request->ring->batch_pool,
+					request->shadow_batch_obj);
+
 	kfree(request);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 93d7f72..0b263aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -992,13 +992,14 @@ static void
 i915_gem_execbuffer_retire_commands(struct drm_device *dev,
 				    struct drm_file *file,
 				    struct intel_engine_cs *ring,
-				    struct drm_i915_gem_object *obj)
+				    struct drm_i915_gem_object *obj,
+				    struct drm_i915_gem_object *shadow_obj)
 {
 	/* Unconditionally force add_request to emit a full flush. */
 	ring->gpu_caches_dirty = true;
 
 	/* Add a breadcrumb for the completion of the batch buffer */
-	(void)__i915_add_request(ring, file, obj, NULL);
+	(void)__i915_add_request(ring, file, obj, shadow_obj, NULL);
 }
 
 static int
@@ -1087,6 +1088,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct eb_vmas *eb;
 	struct drm_i915_gem_object *batch_obj;
+	struct drm_i915_gem_object *shadow_batch_obj = NULL;
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_engine_cs *ring;
 	struct intel_context *ctx;
@@ -1288,8 +1290,31 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 
 	if (i915_needs_cmd_parser(ring)) {
+		shadow_batch_obj =
+			i915_gem_batch_pool_get(ring->batch_pool,
+						batch_obj->base.size);
+		if (IS_ERR(shadow_batch_obj)) {
+			ret = PTR_ERR(shadow_batch_obj);
+			/* Don't try to clean up the obj in the error path */
+			shadow_batch_obj = NULL;
+
+			/*
+			 * If the pool is at capacity, retiring requests might
+			 * return some buffers.
+			 */
+			if (ret == -EAGAIN)
+				i915_gem_retire_requests_ring(ring);
+
+			goto err;
+		}
+
+		ret = i915_gem_obj_ggtt_pin(shadow_batch_obj, 4096, 0);
+		if (ret)
+			goto err;
+
 		ret = i915_parse_cmds(ring,
 				      batch_obj,
+				      shadow_batch_obj,
 				      args->batch_start_offset,
 				      file->is_master);
 		if (ret)
@@ -1377,13 +1402,27 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
 
 	i915_gem_execbuffer_move_to_active(&eb->vmas, ring);
-	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
+	if (shadow_batch_obj) {
+		struct i915_vma *vma = i915_gem_obj_to_ggtt(shadow_batch_obj);
+
+		i915_vma_move_to_active(vma, ring);
+		i915_gem_object_ggtt_unpin(shadow_batch_obj);
+	}
+	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj,
+					    shadow_batch_obj);
 
 err:
 	/* the request owns the ref now */
 	i915_gem_context_unreference(ctx);
 	eb_destroy(eb);
 
+	if (ret && ring && shadow_batch_obj) {
+		if (i915_gem_obj_is_pinned(shadow_batch_obj))
+			i915_gem_object_ggtt_unpin(shadow_batch_obj);
+
+		i915_gem_batch_pool_put(ring->batch_pool, shadow_batch_obj);
+	}
+
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c
index e60be3f..f614406 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.c
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
@@ -161,7 +161,7 @@ int i915_gem_render_state_init(struct intel_engine_cs *ring)
 
 	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
 
-	ret = __i915_add_request(ring, NULL, so.obj, NULL);
+	ret = __i915_add_request(ring, NULL, so.obj, NULL, NULL);
 	/* __i915_add_request moves object to inactive if it fails */
 out:
 	render_state_fini(&so);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b96edaf..4cd123a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1476,6 +1476,17 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	if (ret)
 		goto error;
 
+	/*
+	 * Benchmarking shows that the actual maximum number of buffers
+	 * in the pool varies a bit with workload, but 64 seems to cover
+	 * us fairly well.
+	 */
+	ring->batch_pool = i915_gem_batch_pool_alloc(dev, 64);
+	if (IS_ERR(ring->batch_pool)) {
+		ret = PTR_ERR(ring->batch_pool);
+		goto error;
+	}
+
 	ret = ring->init(ring);
 	if (ret)
 		goto error;
@@ -1513,6 +1524,7 @@ void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
 	cleanup_status_page(ring);
 
 	i915_cmd_parser_fini_ring(ring);
+	i915_gem_batch_pool_free(ring->batch_pool);
 
 	kfree(ringbuf);
 	ring->buffer = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e72017b..82aae29 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -188,6 +188,13 @@ struct  intel_engine_cs {
 	bool needs_cmd_parser;
 
 	/*
+	 * A pool of objects to use as shadow copies of client batch buffers
+	 * when the command parser is enabled. Prevents the client from
+	 * modifying the batch contents after software parsing.
+	 */
+	struct i915_gem_batch_pool *batch_pool;
+
+	/*
 	 * Table of commands the command parser needs to know about
 	 * for this ring.
 	 */
-- 
1.8.3.2
