[Intel-gfx] [PATCH 11/13 v4] drm/i915: Integrate GuC-based command submission
O'Rourke, Tom
Tom.O'Rourke at intel.com
Mon Jul 27 08:57:49 PDT 2015
On Thu, Jul 09, 2015 at 07:29:12PM +0100, Dave Gordon wrote:
> From: Alex Dai <yu.dai at intel.com>
>
> GuC-based submission is mostly the same as execlist mode, up to
> intel_logical_ring_advance_and_submit(), where the context being
> dispatched would be added to the execlist queue; at this point
> we submit the context to the GuC backend instead.
>
> There are, however, a few other changes also required, notably:
> 1. Contexts must be pinned at GGTT addresses accessible by the GuC
> i.e. NOT in the range [0..WOPCM_SIZE), so we have to add the
> PIN_OFFSET_BIAS flag to the relevant GGTT-pinning calls.
>
> 2. The GuC's TLB must be invalidated after a context is pinned at
> a new GGTT address.
>
> 3. GuC firmware uses the one page before Ring Context as shared data.
> Therefore, whenever driver wants to get base address of LRC, we
> will offset one page for it. LRC_PPHWSP_PN is defined as the page
> number of LRCA.
>
> 4. In the work queue used to pass requests to the GuC, the GuC
> firmware requires the ring-tail-offset to be represented as an
> 11-bit value, expressed in QWords. Therefore, the ringbuffer
> size must be reduced to the representable range (4 pages).
>
> v2:
> Defer adding #defines until needed [Chris Wilson]
> Rationalise type declarations [Chris Wilson]
>
> v4:
> Squashed kerneldoc patch into here [Daniel Vetter]
>
> Issue: VIZ-4884
> Signed-off-by: Alex Dai <yu.dai at intel.com>
> Signed-off-by: Dave Gordon <david.s.gordon at intel.com>
> ---
> Documentation/DocBook/drm.tmpl | 14 ++++++++
> drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
> drivers/gpu/drm/i915/i915_guc_submission.c | 52 +++++++++++++++++++++++++++---
> drivers/gpu/drm/i915/intel_guc.h | 1 +
> drivers/gpu/drm/i915/intel_lrc.c | 51 ++++++++++++++++++++---------
> drivers/gpu/drm/i915/intel_lrc.h | 6 ++++
> 6 files changed, 106 insertions(+), 20 deletions(-)
>
> diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl
> index 596b11d..0ff5fd7 100644
> --- a/Documentation/DocBook/drm.tmpl
> +++ b/Documentation/DocBook/drm.tmpl
> @@ -4223,6 +4223,20 @@ int num_ioctls;</synopsis>
> </sect2>
> </sect1>
> <sect1>
> + <title>GuC-based Command Submission</title>
> + <sect2>
> + <title>GuC</title>
> +!Pdrivers/gpu/drm/i915/intel_guc_loader.c GuC-specific firmware loader
> +!Idrivers/gpu/drm/i915/intel_guc_loader.c
> + </sect2>
> + <sect2>
> + <title>GuC Client</title>
> +!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submission
> +!Idrivers/gpu/drm/i915/i915_guc_submission.c
> + </sect2>
> + </sect1>
> +
> + <sect1>
> <title> Tracing </title>
> <para>
> This sections covers all things related to the tracepoints implemented in
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 13e37d1..d93732a 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1982,7 +1982,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
> return;
> }
>
> - page = i915_gem_object_get_page(ctx_obj, 1);
> + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
> if (!WARN_ON(page == NULL)) {
> reg_state = kmap_atomic(page);
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 25d8807..c5c9fbf 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -346,18 +346,58 @@ static void guc_init_proc_desc(struct intel_guc *guc,
> static void guc_init_ctx_desc(struct intel_guc *guc,
> struct i915_guc_client *client)
> {
> + struct intel_context *ctx = client->owner;
> struct guc_context_desc desc;
> struct sg_table *sg;
> + int i;
>
> memset(&desc, 0, sizeof(desc));
>
> desc.attribute = GUC_CTX_DESC_ATTR_ACTIVE | GUC_CTX_DESC_ATTR_KERNEL;
> desc.context_id = client->ctx_index;
> desc.priority = client->priority;
> - desc.engines_used = (1 << RCS) | (1 << VCS) | (1 << BCS) |
> - (1 << VECS) | (1 << VCS2); /* all engines */
> desc.db_id = client->doorbell_id;
>
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + struct guc_execlist_context *lrc = &desc.lrc[i];
> + struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
> + struct intel_engine_cs *ring;
> + struct drm_i915_gem_object *obj;
> + uint64_t ctx_desc;
> +
> + /* TODO: We have a design issue to be solved here. Only when we
> + * receive the first batch, we know which engine is used by the
> + * user. But here GuC expects the lrc and ring to be pinned. It
> + * is not an issue for default context, which is the only one
> + * for now who owns a GuC client. But for future owner of GuC
> + * client, need to make sure lrc is pinned prior to enter here.
> + */
> + obj = ctx->engine[i].state;
> + if (!obj)
> + break;
> +
> + ring = ringbuf->ring;
> + ctx_desc = intel_lr_context_descriptor(ctx, ring);
> + lrc->context_desc = (u32)ctx_desc;
> +
> + /* The state page is after PPHWSP */
> + lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
> + LRC_STATE_PN * PAGE_SIZE;
> + lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) |
> + (ring->id << GUC_ELC_ENGINE_OFFSET);
> +
> + obj = ringbuf->obj;
> +
> + lrc->ring_begin = i915_gem_obj_ggtt_offset(obj);
> + lrc->ring_end = lrc->ring_begin + obj->base.size - 1;
> + lrc->ring_next_free_location = lrc->ring_begin;
> + lrc->ring_current_tail_pointer_value = 0;
> +
> + desc.engines_used |= (1 << ring->id);
> + }
> +
> + WARN_ON(desc.engines_used == 0);
> +
> /*
> * The CPU address is only needed at certain points, so kmap_atomic on
> * demand instead of storing it in the ctx descriptor.
> @@ -622,11 +662,13 @@ static void guc_client_free(struct drm_device *dev,
> * The kernel client to replace ExecList submission is created with
> * NORMAL priority. Priority of a client for scheduler can be HIGH,
> * while a preemption context can use CRITICAL.
> + * @ctx: the context to own the client (we use the default render context)
> *
> * Return: An i915_guc_client object if success.
> */
> static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
> - uint32_t priority)
> + uint32_t priority,
> + struct intel_context *ctx)
> {
> struct i915_guc_client *client;
> struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -639,6 +681,7 @@ static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
>
> client->doorbell_id = GUC_INVALID_DOORBELL_ID;
> client->priority = priority;
> + client->owner = ctx;
>
> client->ctx_index = (uint32_t)ida_simple_get(&guc->ctx_ids, 0,
> GUC_MAX_GPU_CONTEXTS, GFP_KERNEL);
> @@ -772,10 +815,11 @@ int i915_guc_submission_enable(struct drm_device *dev)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> struct intel_guc *guc = &dev_priv->guc;
> + struct intel_context *ctx = dev_priv->ring[RCS].default_context;
> struct i915_guc_client *client;
>
> /* client for execbuf submission */
> - client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL);
> + client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL, ctx);
> if (!client) {
> DRM_ERROR("Failed to create execbuf guc_client\n");
> return -ENOMEM;
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index d249326..9571b56 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -29,6 +29,7 @@
>
> struct i915_guc_client {
> struct drm_i915_gem_object *client_obj;
> + struct intel_context *owner;
> uint32_t priority;
> uint32_t ctx_index;
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9e121d3..8294462 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -254,7 +254,8 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
> */
> u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
> {
> - u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj);
> + u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
> + LRC_PPHWSP_PN * PAGE_SIZE;
>
> /* LRCA is required to be 4K aligned so the more significant 20 bits
> * are globally unique */
> @@ -267,7 +268,8 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
> struct drm_device *dev = ring->dev;
> struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
> uint64_t desc;
> - uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
> + uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
> + LRC_PPHWSP_PN * PAGE_SIZE;
>
> WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>
> @@ -342,7 +344,7 @@ void intel_lr_context_update(struct drm_i915_gem_request *rq)
> WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
> WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
>
> - page = i915_gem_object_get_page(ctx_obj, 1);
> + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
> reg_state = kmap_atomic(page);
>
> reg_state[CTX_RING_TAIL+1] = rq->tail;
> @@ -687,13 +689,17 @@ static void
> intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
> {
> struct intel_engine_cs *ring = request->ring;
> + struct drm_i915_private *dev_priv = request->i915;
>
> intel_logical_ring_advance(request->ringbuf);
>
> if (intel_ring_stopped(ring))
> return;
>
> - execlists_context_queue(request);
> + if (dev_priv->guc.execbuf_client)
> + i915_guc_submit(dev_priv->guc.execbuf_client, request);
> + else
> + execlists_context_queue(request);
> }
>
> static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
> @@ -984,6 +990,7 @@ int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
>
> static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
> {
> + struct drm_i915_private *dev_priv = rq->i915;
> struct intel_engine_cs *ring = rq->ring;
> struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
> struct intel_ringbuffer *ringbuf = rq->ringbuf;
> @@ -991,14 +998,18 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
>
> WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
> if (rq->ctx->engine[ring->id].pin_count++ == 0) {
> - ret = i915_gem_obj_ggtt_pin(ctx_obj,
> - GEN8_LR_CONTEXT_ALIGN, 0);
> + ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
> + PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
> if (ret)
> goto reset_pin_count;
>
> ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
> if (ret)
> goto unpin_ctx_obj;
> +
> + /* Invalidate GuC TLB. */
> + if (i915.enable_guc_submission)
> + I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
> }
>
> return ret;
> @@ -1668,8 +1679,13 @@ out:
>
> static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
> {
> + struct drm_i915_private *dev_priv = req->i915;
> int ret;
>
> + /* Invalidate GuC TLB. */
[TOR:] This invalidation is in the init_context for render
ring but not the other rings. Is this needed for other
rings? Or, should this invalidation happen at a different
level? It seems this may depend on the render ring being
initialized first.
Thanks,
Tom
> + if (i915.enable_guc_submission)
> + I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
> +
> ret = intel_logical_ring_workarounds_emit(req);
> if (ret)
> return ret;
> @@ -2026,7 +2042,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>
> /* The second page of the context object contains some fields which must
> * be set up prior to the first execution. */
> - page = i915_gem_object_get_page(ctx_obj, 1);
> + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
> reg_state = kmap_atomic(page);
>
> /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
> @@ -2185,12 +2201,13 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
> struct drm_i915_gem_object *default_ctx_obj)
> {
> struct drm_i915_private *dev_priv = ring->dev->dev_private;
> + struct page *page;
>
> - /* The status page is offset 0 from the default context object
> - * in LRC mode. */
> - ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj);
> - ring->status_page.page_addr =
> - kmap(sg_page(default_ctx_obj->pages->sgl));
> + /* The HWSP is part of the default context object in LRC mode. */
> + ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
> + + LRC_PPHWSP_PN * PAGE_SIZE;
> + page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
> + ring->status_page.page_addr = kmap(page);
> ring->status_page.obj = default_ctx_obj;
>
> I915_WRITE(RING_HWS_PGA(ring->mmio_base),
> @@ -2226,6 +2243,9 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>
> context_size = round_up(get_lr_context_size(ring), 4096);
>
> + /* One extra page as the sharing data between driver and GuC */
> + context_size += PAGE_SIZE * LRC_PPHWSP_PN;
> +
> ctx_obj = i915_gem_alloc_object(dev, context_size);
> if (!ctx_obj) {
> DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
> @@ -2233,7 +2253,8 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
> }
>
> if (is_global_default_ctx) {
> - ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0);
> + ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
> + PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
> if (ret) {
> DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n",
> ret);
> @@ -2252,7 +2273,7 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>
> ringbuf->ring = ring;
>
> - ringbuf->size = 32 * PAGE_SIZE;
> + ringbuf->size = 4 * PAGE_SIZE;
> ringbuf->effective_size = ringbuf->size;
> ringbuf->head = 0;
> ringbuf->tail = 0;
> @@ -2352,7 +2373,7 @@ void intel_lr_context_reset(struct drm_device *dev,
> WARN(1, "Failed get_pages for context obj\n");
> continue;
> }
> - page = i915_gem_object_get_page(ctx_obj, 1);
> + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
> reg_state = kmap_atomic(page);
>
> reg_state[CTX_RING_HEAD+1] = 0;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 6ecc0b3..e04b5c2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -67,6 +67,12 @@ static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf,
> }
>
> /* Logical Ring Contexts */
> +
> +/* One extra page is added before LRC for GuC as shared data */
> +#define LRC_GUCSHR_PN (0)
> +#define LRC_PPHWSP_PN (LRC_GUCSHR_PN + 1)
> +#define LRC_STATE_PN (LRC_PPHWSP_PN + 1)
> +
> void intel_lr_context_free(struct intel_context *ctx);
> int intel_lr_context_deferred_create(struct intel_context *ctx,
> struct intel_engine_cs *ring);
> --
> 1.9.1
>
More information about the Intel-gfx
mailing list