[Intel-gfx] [PATCH 1/3] drm/i915/execlists: Reinitialise context image after GPU hang
Mika Kuoppala
mika.kuoppala at linux.intel.com
Mon Oct 3 12:25:16 UTC 2016
Chris Wilson <chris at chris-wilson.co.uk> writes:
> On Braswell, at least, we observe that the context image is written in
> multiple phases. The first phase is to clear the register state, and
> subsequently rewrite it. A GPU reset at the right moment can interrupt
> the context update leaving it corrupt, and our update of the RING_HEAD
> is not sufficient to restart the engine afterwards. To recover, we need
> to reset the registers back to their original values. The context state
> is lost. What we need is a better mechanism to serialise the reset with
> pending flushes from the GPU.
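Just to check my understanding of the fix before diving into the diff:
the recovery below amounts to roughly this (my own simplified sketch of
reset_common_ring(), with the port and tail bookkeeping left out):

static void reset_common_ring(struct intel_engine_cs *engine,
			      struct drm_i915_gem_request *request)
{
	struct intel_context *ce = &request->ctx->engine[engine->id];

	/* The image may be half-written after the hang, so rewrite
	 * every register default instead of only patching RING_HEAD.
	 */
	execlists_init_reg_state(ce->lrc_reg_state,
				 request->ctx, engine, ce->ring);

	/* Put back the pin-time ring address the defaults lack. */
	ce->lrc_reg_state[CTX_RING_BUFFER_START + 1] =
		i915_ggtt_offset(ce->ring->vma);

	/* Move RING_HEAD past the hanging batch, onto the breadcrumb. */
	ce->lrc_reg_state[CTX_RING_HEAD + 1] = request->postfix;
}
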
>
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/intel_lrc.c | 95 +++++++++++++++++++++++-----------------
> 1 file changed, 56 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2d8eb2eb2b72..d6e762718ff4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -226,10 +226,16 @@ enum {
> /* Typical size of the average request (2 pipecontrols and a MI_BB) */
> #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
>
> +#define WA_TAIL_DWORDS 2
> +
> static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
> struct intel_engine_cs *engine);
> static int intel_lr_context_pin(struct i915_gem_context *ctx,
> struct intel_engine_cs *engine);
> +static void execlists_init_reg_state(u32 *reg_state,
> + struct i915_gem_context *ctx,
> + struct intel_engine_cs *engine,
> + struct intel_ring *ring);
>
> /**
> * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
> @@ -707,7 +713,6 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
> {
> struct intel_context *ce = &ctx->engine[engine->id];
> void *vaddr;
> - u32 *lrc_reg_state;
> int ret;
>
> lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> @@ -726,17 +731,16 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
> goto unpin_vma;
> }
>
> - lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> -
> ret = intel_ring_pin(ce->ring);
> if (ret)
> goto unpin_map;
>
> intel_lr_context_descriptor_update(ctx, engine);
>
> - lrc_reg_state[CTX_RING_BUFFER_START+1] =
> + ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> + ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> i915_ggtt_offset(ce->ring->vma);
> - ce->lrc_reg_state = lrc_reg_state;
> +
> ce->state->obj->dirty = true;
>
> /* Invalidate GuC TLB. */
> @@ -1284,8 +1288,14 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> struct execlist_port *port = engine->execlist_port;
> struct intel_context *ce = &request->ctx->engine[engine->id];
>
> + execlists_init_reg_state(ce->lrc_reg_state,
> + request->ctx, engine, ce->ring);
> +
> /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> + ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> + i915_ggtt_offset(ce->ring->vma);
> ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +
> request->ring->head = request->postfix;
> request->ring->last_retired_head = -1;
> intel_ring_update_space(request->ring);
> @@ -1305,6 +1315,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> GEM_BUG_ON(request->ctx != port[0].request->ctx);
> port[0].count = 0;
> port[1].count = 0;
> +
> + /* Reset WaIdleLiteRestore:bdw,skl as well */
> + request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32);
> }
>
> static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
> @@ -1542,7 +1555,6 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
> * used as a workaround for not being allowed to do lite
> * restore with HEAD==TAIL (WaIdleLiteRestore).
> */
> -#define WA_TAIL_DWORDS 2
>
> static int gen8_emit_request(struct drm_i915_gem_request *request)
> {
> @@ -1889,38 +1901,13 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
> return indirect_ctx_offset;
> }
>
> -static int
> -populate_lr_context(struct i915_gem_context *ctx,
> - struct drm_i915_gem_object *ctx_obj,
> - struct intel_engine_cs *engine,
> - struct intel_ring *ring)
> +static void execlists_init_reg_state(u32 *reg_state,
> + struct i915_gem_context *ctx,
> + struct intel_engine_cs *engine,
> + struct intel_ring *ring)
> {
> - struct drm_i915_private *dev_priv = ctx->i915;
> - struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
> - void *vaddr;
> - u32 *reg_state;
> - int ret;
> -
> - if (!ppgtt)
> - ppgtt = dev_priv->mm.aliasing_ppgtt;
> -
> - ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> - if (ret) {
> - DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> - return ret;
> - }
> -
> - vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> - if (IS_ERR(vaddr)) {
> - ret = PTR_ERR(vaddr);
> - DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> - return ret;
> - }
> - ctx_obj->dirty = true;
> -
> - /* The second page of the context object contains some fields which must
> - * be set up prior to the first execution. */
> - reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> + struct drm_i915_private *dev_priv = engine->i915;
> + struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
>
> /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
> * commands followed by (reg, value) pairs. The values we are setting here are
> @@ -1934,7 +1921,7 @@ populate_lr_context(struct i915_gem_context *ctx,
> _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
> CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
> (HAS_RESOURCE_STREAMER(dev_priv) ?
> - CTX_CTRL_RS_CTX_ENABLE : 0)));
> + CTX_CTRL_RS_CTX_ENABLE : 0)));
> ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
> 0);
> ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
> @@ -1946,7 +1933,7 @@ populate_lr_context(struct i915_gem_context *ctx,
> RING_START(engine->mmio_base), 0);
> ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
> RING_CTL(engine->mmio_base),
> - ((ring->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
> + (ring->size - PAGE_SIZE) | RING_VALID);
Patch looks good.
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
Not exactly a problem with this patch, but while we are in
the territory: I would still like the ring->size setting to be
accompanied by a comment about it matching the page shift. I have
fallen for it twice now, so I suspect the next reader will too.
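Something along these lines would do (just a sketch; if I remember the
register layout right, the length field of RING_CTL starts at bit 12,
which is exactly PAGE_SHIFT):

	/*
	 * ring->size is page aligned and the buffer-length field of
	 * RING_CTL starts at bit 12 == PAGE_SHIFT, so ring->size -
	 * PAGE_SIZE is already the required "pages - 1" encoding,
	 * with no further shift or mask needed.
	 */
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
		       RING_CTL(engine->mmio_base),
		       (ring->size - PAGE_SIZE) | RING_VALID);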
And, for that matter, the removal of the misleading comment
/* It is written to the context image in execlists_update_context() */
in execlists_init_reg_state() would be welcome.
Thanks,
-Mika
> ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
> RING_BBADDR_UDW(engine->mmio_base), 0);
> ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
> @@ -2024,6 +2011,36 @@ populate_lr_context(struct i915_gem_context *ctx,
> ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
> make_rpcs(dev_priv));
> }
> +}
> +
> +static int
> +populate_lr_context(struct i915_gem_context *ctx,
> + struct drm_i915_gem_object *ctx_obj,
> + struct intel_engine_cs *engine,
> + struct intel_ring *ring)
> +{
> + void *vaddr;
> + int ret;
> +
> + ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
> + if (ret) {
> + DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
> + return ret;
> + }
> +
> + vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
> + if (IS_ERR(vaddr)) {
> + ret = PTR_ERR(vaddr);
> + DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
> + return ret;
> + }
> + ctx_obj->dirty = true;
> +
> + /* The second page of the context object contains some fields which must
> + * be set up prior to the first execution. */
> +
> + execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
> + ctx, engine, ring);
>
> i915_gem_object_unpin_map(ctx_obj);
>
> --
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx