[Intel-gfx] [PATCH 11/13 v4] drm/i915: Integrate GuC-based command submission

O'Rourke, Tom Tom.O'Rourke at intel.com
Mon Jul 27 08:57:49 PDT 2015


On Thu, Jul 09, 2015 at 07:29:12PM +0100, Dave Gordon wrote:
> From: Alex Dai <yu.dai at intel.com>
> 
> GuC-based submission is mostly the same as execlist mode, up to
> intel_logical_ring_advance_and_submit(), where the context being
> dispatched would be added to the execlist queue; at this point
> we submit the context to the GuC backend instead.
> 
> There are, however, a few other changes also required, notably:
> 1.  Contexts must be pinned at GGTT addresses accessible by the GuC
>     i.e. NOT in the range [0..WOPCM_SIZE), so we have to add the
>     PIN_OFFSET_BIAS flag to the relevant GGTT-pinning calls.
> 
> 2.  The GuC's TLB must be invalidated after a context is pinned at
>     a new GGTT address.
> 
> 3.  GuC firmware uses the one page before the Ring Context as shared
>     data. Therefore, whenever the driver wants to get the base address
>     of the LRC, we will offset one page for it. LRC_PPHWSP_PN is
>     defined as the page number of the LRCA.
> 
> 4.  In the work queue used to pass requests to the GuC, the GuC
>     firmware requires the ring-tail-offset to be represented as an
>     11-bit value, expressed in QWords. Therefore, the ringbuffer
>     size must be reduced to the representable range (4 pages).
> 
> v2:
>     Defer adding #defines until needed [Chris Wilson]
>     Rationalise type declarations [Chris Wilson]
> 
> v4:
>     Squashed kerneldoc patch into here [Daniel Vetter]
> 
> Issue: VIZ-4884
> Signed-off-by: Alex Dai <yu.dai at intel.com>
> Signed-off-by: Dave Gordon <david.s.gordon at intel.com>
> ---
>  Documentation/DocBook/drm.tmpl             | 14 ++++++++
>  drivers/gpu/drm/i915/i915_debugfs.c        |  2 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c | 52 +++++++++++++++++++++++++++---
>  drivers/gpu/drm/i915/intel_guc.h           |  1 +
>  drivers/gpu/drm/i915/intel_lrc.c           | 51 ++++++++++++++++++++---------
>  drivers/gpu/drm/i915/intel_lrc.h           |  6 ++++
>  6 files changed, 106 insertions(+), 20 deletions(-)
> 
> diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl
> index 596b11d..0ff5fd7 100644
> --- a/Documentation/DocBook/drm.tmpl
> +++ b/Documentation/DocBook/drm.tmpl
> @@ -4223,6 +4223,20 @@ int num_ioctls;</synopsis>
>        </sect2>
>      </sect1>
>      <sect1>
> +      <title>GuC-based Command Submission</title>
> +      <sect2>
> +        <title>GuC</title>
> +!Pdrivers/gpu/drm/i915/intel_guc_loader.c GuC-specific firmware loader
> +!Idrivers/gpu/drm/i915/intel_guc_loader.c
> +      </sect2>
> +      <sect2>
> +        <title>GuC Client</title>
> +!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submission
> +!Idrivers/gpu/drm/i915/i915_guc_submission.c
> +      </sect2>
> +    </sect1>
> +
> +    <sect1>
>        <title> Tracing </title>
>        <para>
>      This sections covers all things related to the tracepoints implemented in
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 13e37d1..d93732a 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1982,7 +1982,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
>  		return;
>  	}
>  
> -	page = i915_gem_object_get_page(ctx_obj, 1);
> +	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
>  	if (!WARN_ON(page == NULL)) {
>  		reg_state = kmap_atomic(page);
>  
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 25d8807..c5c9fbf 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -346,18 +346,58 @@ static void guc_init_proc_desc(struct intel_guc *guc,
>  static void guc_init_ctx_desc(struct intel_guc *guc,
>  			      struct i915_guc_client *client)
>  {
> +	struct intel_context *ctx = client->owner;
>  	struct guc_context_desc desc;
>  	struct sg_table *sg;
> +	int i;
>  
>  	memset(&desc, 0, sizeof(desc));
>  
>  	desc.attribute = GUC_CTX_DESC_ATTR_ACTIVE | GUC_CTX_DESC_ATTR_KERNEL;
>  	desc.context_id = client->ctx_index;
>  	desc.priority = client->priority;
> -	desc.engines_used = (1 << RCS) | (1 << VCS) | (1 << BCS) |
> -			    (1 << VECS) | (1 << VCS2); /* all engines */
>  	desc.db_id = client->doorbell_id;
>  
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		struct guc_execlist_context *lrc = &desc.lrc[i];
> +		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
> +		struct intel_engine_cs *ring;
> +		struct drm_i915_gem_object *obj;
> +		uint64_t ctx_desc;
> +
> +		/* TODO: We have a design issue to be solved here. Only when we
> +		 * receive the first batch, we know which engine is used by the
> +		 * user. But here GuC expects the lrc and ring to be pinned. It
> +		 * is not an issue for default context, which is the only one
> +		 * for now who owns a GuC client. But for future owner of GuC
> +		 * client, need to make sure lrc is pinned prior to enter here.
> +		 */
> +		obj = ctx->engine[i].state;
> +		if (!obj)
> +			break;
> +
> +		ring = ringbuf->ring;
> +		ctx_desc = intel_lr_context_descriptor(ctx, ring);
> +		lrc->context_desc = (u32)ctx_desc;
> +
> +		/* The state page is after PPHWSP */
> +		lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
> +				LRC_STATE_PN * PAGE_SIZE;
> +		lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) |
> +				(ring->id << GUC_ELC_ENGINE_OFFSET);
> +
> +		obj = ringbuf->obj;
> +
> +		lrc->ring_begin = i915_gem_obj_ggtt_offset(obj);
> +		lrc->ring_end = lrc->ring_begin + obj->base.size - 1;
> +		lrc->ring_next_free_location = lrc->ring_begin;
> +		lrc->ring_current_tail_pointer_value = 0;
> +
> +		desc.engines_used |= (1 << ring->id);
> +	}
> +
> +	WARN_ON(desc.engines_used == 0);
> +
>  	/*
>  	 * The CPU address is only needed at certain points, so kmap_atomic on
>  	 * demand instead of storing it in the ctx descriptor.
> @@ -622,11 +662,13 @@ static void guc_client_free(struct drm_device *dev,
>   * 		The kernel client to replace ExecList submission is created with
>   * 		NORMAL priority. Priority of a client for scheduler can be HIGH,
>   * 		while a preemption context can use CRITICAL.
> + * @ctx		the context to own the client (we use the default render context)
>   *
>   * Return:	An i915_guc_client object if success.
>   */
>  static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
> -						uint32_t priority)
> +						uint32_t priority,
> +						struct intel_context *ctx)
>  {
>  	struct i915_guc_client *client;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -639,6 +681,7 @@ static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
>  
>  	client->doorbell_id = GUC_INVALID_DOORBELL_ID;
>  	client->priority = priority;
> +	client->owner = ctx;
>  
>  	client->ctx_index = (uint32_t)ida_simple_get(&guc->ctx_ids, 0,
>  			GUC_MAX_GPU_CONTEXTS, GFP_KERNEL);
> @@ -772,10 +815,11 @@ int i915_guc_submission_enable(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct intel_guc *guc = &dev_priv->guc;
> +	struct intel_context *ctx = dev_priv->ring[RCS].default_context;
>  	struct i915_guc_client *client;
>  
>  	/* client for execbuf submission */
> -	client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL);
> +	client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL, ctx);
>  	if (!client) {
>  		DRM_ERROR("Failed to create execbuf guc_client\n");
>  		return -ENOMEM;
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index d249326..9571b56 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -29,6 +29,7 @@
>  
>  struct i915_guc_client {
>  	struct drm_i915_gem_object *client_obj;
> +	struct intel_context *owner;
>  	uint32_t priority;
>  	uint32_t ctx_index;
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9e121d3..8294462 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -254,7 +254,8 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
>   */
>  u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
>  {
> -	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj);
> +	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
> +			LRC_PPHWSP_PN * PAGE_SIZE;
>  
>  	/* LRCA is required to be 4K aligned so the more significant 20 bits
>  	 * are globally unique */
> @@ -267,7 +268,8 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>  	struct drm_device *dev = ring->dev;
>  	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>  	uint64_t desc;
> -	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
> +	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
> +			LRC_PPHWSP_PN * PAGE_SIZE;
>  
>  	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>  
> @@ -342,7 +344,7 @@ void intel_lr_context_update(struct drm_i915_gem_request *rq)
>  	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
>  	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
>  
> -	page = i915_gem_object_get_page(ctx_obj, 1);
> +	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
>  	reg_state = kmap_atomic(page);
>  
>  	reg_state[CTX_RING_TAIL+1] = rq->tail;
> @@ -687,13 +689,17 @@ static void
>  intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *ring = request->ring;
> +	struct drm_i915_private *dev_priv = request->i915;
>  
>  	intel_logical_ring_advance(request->ringbuf);
>  
>  	if (intel_ring_stopped(ring))
>  		return;
>  
> -	execlists_context_queue(request);
> +	if (dev_priv->guc.execbuf_client)
> +		i915_guc_submit(dev_priv->guc.execbuf_client, request);
> +	else
> +		execlists_context_queue(request);
>  }
>  
>  static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
> @@ -984,6 +990,7 @@ int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
>  
>  static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
>  {
> +	struct drm_i915_private *dev_priv = rq->i915;
>  	struct intel_engine_cs *ring = rq->ring;
>  	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
>  	struct intel_ringbuffer *ringbuf = rq->ringbuf;
> @@ -991,14 +998,18 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
>  
>  	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
>  	if (rq->ctx->engine[ring->id].pin_count++ == 0) {
> -		ret = i915_gem_obj_ggtt_pin(ctx_obj,
> -				GEN8_LR_CONTEXT_ALIGN, 0);
> +		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
> +				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
>  		if (ret)
>  			goto reset_pin_count;
>  
>  		ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
>  		if (ret)
>  			goto unpin_ctx_obj;
> +
> +		/* Invalidate GuC TLB. */
> +		if (i915.enable_guc_submission)
> +			I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
>  	}
>  
>  	return ret;
> @@ -1668,8 +1679,13 @@ out:
>  
>  static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
>  {
> +	struct drm_i915_private *dev_priv = req->i915;
>  	int ret;
>  
> +	/* Invalidate GuC TLB. */
[TOR:] This invalidation is in the init_context for the render
ring but not the other rings.  Is this needed for the other
rings?  Or, should this invalidation happen at a different
level?  It seems this may depend on the render ring being
initialized first.

Thanks,
Tom

> +	if (i915.enable_guc_submission)
> +		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
> +
>  	ret = intel_logical_ring_workarounds_emit(req);
>  	if (ret)
>  		return ret;
> @@ -2026,7 +2042,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>  
>  	/* The second page of the context object contains some fields which must
>  	 * be set up prior to the first execution. */
> -	page = i915_gem_object_get_page(ctx_obj, 1);
> +	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
>  	reg_state = kmap_atomic(page);
>  
>  	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
> @@ -2185,12 +2201,13 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
>  		struct drm_i915_gem_object *default_ctx_obj)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct page *page;
>  
> -	/* The status page is offset 0 from the default context object
> -	 * in LRC mode. */
> -	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj);
> -	ring->status_page.page_addr =
> -			kmap(sg_page(default_ctx_obj->pages->sgl));
> +	/* The HWSP is part of the default context object in LRC mode. */
> +	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
> +			+ LRC_PPHWSP_PN * PAGE_SIZE;
> +	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
> +	ring->status_page.page_addr = kmap(page);
>  	ring->status_page.obj = default_ctx_obj;
>  
>  	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
> @@ -2226,6 +2243,9 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>  	context_size = round_up(get_lr_context_size(ring), 4096);
>  
> +	/* One extra page as the sharing data between driver and GuC */
> +	context_size += PAGE_SIZE * LRC_PPHWSP_PN;
> +
>  	ctx_obj = i915_gem_alloc_object(dev, context_size);
>  	if (!ctx_obj) {
>  		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
> @@ -2233,7 +2253,8 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  	}
>  
>  	if (is_global_default_ctx) {
> -		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0);
> +		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
> +				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
>  		if (ret) {
>  			DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n",
>  					ret);
> @@ -2252,7 +2273,7 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>  	ringbuf->ring = ring;
>  
> -	ringbuf->size = 32 * PAGE_SIZE;
> +	ringbuf->size = 4 * PAGE_SIZE;
>  	ringbuf->effective_size = ringbuf->size;
>  	ringbuf->head = 0;
>  	ringbuf->tail = 0;
> @@ -2352,7 +2373,7 @@ void intel_lr_context_reset(struct drm_device *dev,
>  			WARN(1, "Failed get_pages for context obj\n");
>  			continue;
>  		}
> -		page = i915_gem_object_get_page(ctx_obj, 1);
> +		page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
>  		reg_state = kmap_atomic(page);
>  
>  		reg_state[CTX_RING_HEAD+1] = 0;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 6ecc0b3..e04b5c2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -67,6 +67,12 @@ static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf,
>  }
>  
>  /* Logical Ring Contexts */
> +
> +/* One extra page is added before LRC for GuC as shared data */
> +#define LRC_GUCSHR_PN	(0)
> +#define LRC_PPHWSP_PN	(LRC_GUCSHR_PN + 1)
> +#define LRC_STATE_PN	(LRC_PPHWSP_PN + 1)
> +
>  void intel_lr_context_free(struct intel_context *ctx);
>  int intel_lr_context_deferred_create(struct intel_context *ctx,
>  				     struct intel_engine_cs *ring);
> -- 
> 1.9.1
> 


More information about the Intel-gfx mailing list