[Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
Mika Kuoppala
mika.kuoppala at linux.intel.com
Fri Aug 15 19:03:25 CEST 2014
"Mcaulay, Alistair" <alistair.mcaulay at intel.com> writes:
> Hi Mika / Daniel,
>
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
> ....
> i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed.
> .....
> i915_gem_init_hw()
> .....
> i915_gem_context_enable() -> This used to return during reset. Now it doesn't
> .....
> for each ring, i915_switch_context(default)
> do_switch();
> .....
>
> .....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.
>
> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()
Our internal state tracking will be ok after i915_gem_context_enable()
has been called. All rings have been set to the default context.
But what happens with this sequence:
- render ring was running in default context
- reset happens
- we call i915_gem_context_enable
- do_switch will get called
- it figure out that last context is the same as we are switching to
(from == to) and it bails out
- we never wrote anything to ring, so we have pre reset context running.
MI_SET_CONTEXT was never run.
Even if reset would not clear the CCID, I think it is safest to
force a MI_SET_CONTEXT here.
Further, if the default context was mangled before the reset,
we should reinitialize it to a known state by running
i915_gem_render_state_init() for it. But this can be
considered as a possible future work.
-Mika
> Alistair.
>
>> -----Original Message-----
>> From: Mika Kuoppala [mailto:mika.kuoppala at linux.intel.com]
>> Sent: Wednesday, August 06, 2014 5:25 PM
>> To: Mcaulay, Alistair; intel-gfx at lists.freedesktop.org
>> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
>> match driver load & thaw
>>
>>
>> Hi,
>>
>> alistair.mcaulay at intel.com writes:
>>
>> > From: "McAulay, Alistair" <alistair.mcaulay at intel.com>
>> >
>> > This patch is to address Daniels concerns over different code during reset:
>> >
>> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>> >
>> > "The reason for aiming as hard as possible to use the exact same code
>> > for driver load, gpu reset and runtime pm/system resume is that we've
>> > simply seen too many bugs due to slight variations and unintended
>> omissions."
>> >
>> > Tested using igt drv_hangman.
>> >
>> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
>> >
>> > Signed-off-by: McAulay, Alistair <alistair.mcaulay at intel.com>
>> > ---
>> > drivers/gpu/drm/i915/i915_drv.c | 6 +++
>> > drivers/gpu/drm/i915/i915_drv.h | 3 ++
>> > drivers/gpu/drm/i915/i915_gem.c | 6 +--
>> > drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>> > drivers/gpu/drm/i915/i915_gem_gtt.c | 67 +++++----------------------------
>> > drivers/gpu/drm/i915/i915_gem_gtt.h | 3 +-
>> > 6 files changed, 23 insertions(+), 104 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
>> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.c
>> > +++ b/drivers/gpu/drm/i915/i915_drv.c
>> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>> > !dev_priv->ums.mm_suspended) {
>> > dev_priv->ums.mm_suspended = 0;
>> >
>> > + /* Used to prevent gem_check_wedged returning -EAGAIN
>> during gpu reset */
>> > + dev_priv->gpu_error.reload_in_reset = true;
>> > +
>> > ret = i915_gem_init_hw(dev);
>> > +
>> > + dev_priv->gpu_error.reload_in_reset = false;
>> > +
>> > mutex_unlock(&dev->struct_mutex);
>> > if (ret) {
>> > DRM_ERROR("Failed hw init on reset %d\n", ret); diff
>> --git
>> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index 991b663..116daff 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>> >
>> > /* For missed irq/seqno simulation. */
>> > unsigned int test_irq_rings;
>> > +
>> > + /* Used to prevent gem_check_wedged returning -EAGAIN during
>> gpu reset */
>> > + bool reload_in_reset;
>> > };
>> >
>> > enum modeset_restore {
>> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem.c
>> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
>> *error,
>> > if (i915_terminally_wedged(error))
>> > return -EIO;
>> >
>> > - return -EAGAIN;
>> > + /* Check if GPU Reset is in progress */
>> > + if (!error->reload_in_reset)
>> > + return -EAGAIN;
>> > }
>> >
>> > return 0;
>> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>> > for_each_ring(ring, dev_priv, i)
>> > i915_gem_reset_ring_cleanup(dev_priv, ring);
>> >
>> > - i915_gem_context_reset(dev);
>> > -
>> > i915_gem_restore_fences(dev);
>> > }
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> > b/drivers/gpu/drm/i915/i915_gem_context.c
>> > index de72a28..d96219f 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> > @@ -372,42 +372,6 @@ err_destroy:
>> > return ERR_PTR(ret);
>> > }
>> >
>> > -void i915_gem_context_reset(struct drm_device *dev) -{
>> > - struct drm_i915_private *dev_priv = dev->dev_private;
>> > - int i;
>> > -
>> > - /* Prevent the hardware from restoring the last context (which
>> hung) on
>> > - * the next switch */
>> > - for (i = 0; i < I915_NUM_RINGS; i++) {
>> > - struct intel_engine_cs *ring = &dev_priv->ring[i];
>> > - struct intel_context *dctx = ring->default_context;
>> > - struct intel_context *lctx = ring->last_context;
>> > -
>> > - /* Do a fake switch to the default context */
>> > - if (lctx == dctx)
>> > - continue;
>> > -
>> > - if (!lctx)
>> > - continue;
>> > -
>> > - if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> > - WARN_ON(i915_gem_obj_ggtt_pin(dctx-
>> >legacy_hw_ctx.rcs_state,
>> > -
>> get_context_alignment(dev), 0));
>> > - /* Fake a finish/inactive */
>> > - dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> > - dctx->legacy_hw_ctx.rcs_state->active = 0;
>> > - }
>> > -
>> > - if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> > - i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> > -
>> > - i915_gem_context_unreference(lctx);
>> > - i915_gem_context_reference(dctx);
>> > - ring->last_context = dctx;
>> > - }
>> > -}
>> > -
>>
>> I am with Daniel on this one. I don't understand how can we throw
>> everything in here away.
>>
>> We need to force hw to switch to a working context, after reset, so that our
>> internal state tracking matches.
>>
>> Further, if we aim to more unification I think we should make it so that the
>> initial render state will get run, also after reset.
>>
>> If we cleanup the last context for each ring set default context carefully,
>> i915_gem_context_enable() will then switch to default contexts and reinit
>> them using the initial render state. Something like this:
>>
>> void i915_gem_context_reset(struct drm_device *dev) {
>> struct drm_i915_private *dev_priv = dev->dev_private;
>> int i;
>>
>> for (i = 0; i < I915_NUM_RINGS; i++) {
>> struct intel_engine_cs *ring = &dev_priv->ring[i];
>> struct intel_context *lctx = ring->last_context;
>> struct intel_context *dctx = ring->default_context;
>>
>> if (lctx) {
>> if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>>
>> i915_gem_context_unreference(lctx);
>> ring->last_context = NULL;
>> }
>>
>> if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> dctx->legacy_hw_ctx.rcs_state->active = 0;
>> dctx->legacy_hw_ctx.initialized = false;
>> }
>> }
>> }
>>
>> The state would be closer of what we get after module reload.
>>
>> -Mika
>>
>> > int i915_gem_context_init(struct drm_device *dev) {
>> > struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
>> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
>> *dev_priv)
>> > ppgtt->enable(ppgtt);
>> > }
>> >
>> > - /* FIXME: We should make this work, even in reset */
>> > - if (i915_reset_in_progress(&dev_priv->gpu_error))
>> > - return 0;
>> > -
>> > BUG_ON(!dev_priv->ring[RCS].default_context);
>> >
>> > for_each_ring(ring, dev_priv, i) {
>> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>> > from = ring->last_context;
>> >
>> > if (USES_FULL_PPGTT(ring->dev)) {
>> > - ret = ppgtt->switch_mm(ppgtt, ring, false);
>> > + ret = ppgtt->switch_mm(ppgtt, ring);
>> > if (ret)
>> > goto unpin_out;
>> > }
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > index 5188936..450c8a9 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
>> iris_pte_encode(dma_addr_t
>> > addr,
>> >
>> > /* Broadwell Page Directory Pointer Descriptors */ static int
>> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>> > - uint64_t val, bool synchronous)
>> > + uint64_t val)
>> > {
>> > - struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> > int ret;
>> >
>> > BUG_ON(entry >= 4);
>> >
>> > - if (synchronous) {
>> > - I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
>> > - I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
>> > - return 0;
>> > - }
>> > -
>> > ret = intel_ring_begin(ring, 6);
>> > if (ret)
>> > return ret;
>> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
>> > *ring, unsigned entry, }
>> >
>> > static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > - struct intel_engine_cs *ring,
>> > - bool synchronous)
>> > + struct intel_engine_cs *ring)
>> > {
>> > int i, ret;
>> >
>> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,
>> >
>> > for (i = used_pd - 1; i >= 0; i--) {
>> > dma_addr_t addr = ppgtt->pd_dma_addr[i];
>> > - ret = gen8_write_pdp(ring, i, addr, synchronous);
>> > + ret = gen8_write_pdp(ring, i, addr);
>> > if (ret)
>> > return ret;
>> > }
>> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
>> > i915_hw_ppgtt *ppgtt) }
>> >
>> > static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > - struct intel_engine_cs *ring,
>> > - bool synchronous)
>> > + struct intel_engine_cs *ring)
>> > {
>> > - struct drm_device *dev = ppgtt->base.dev;
>> > - struct drm_i915_private *dev_priv = dev->dev_private;
>> > int ret;
>> >
>> > - /* If we're in reset, we can assume the GPU is sufficiently idle to
>> > - * manually frob these bits. Ideally we could use the ring functions,
>> > - * except our error handling makes it quite difficult (can't use
>> > - * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > - *
>> > - * FIXME: We should try not to special case reset
>> > - */
>> > - if (synchronous ||
>> > - i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > - WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > - I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > - I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > - POSTING_READ(RING_PP_DIR_BASE(ring));
>> > - return 0;
>> > - }
>> > -
>> > /* NB: TLBs must be flushed and invalidated before a switch */
>> > ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> > if (ret)
>> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt, }
>> >
>> > static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > - struct intel_engine_cs *ring,
>> > - bool synchronous)
>> > + struct intel_engine_cs *ring)
>> > {
>> > - struct drm_device *dev = ppgtt->base.dev;
>> > - struct drm_i915_private *dev_priv = dev->dev_private;
>> > int ret;
>> >
>> > - /* If we're in reset, we can assume the GPU is sufficiently idle to
>> > - * manually frob these bits. Ideally we could use the ring functions,
>> > - * except our error handling makes it quite difficult (can't use
>> > - * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > - *
>> > - * FIXME: We should try not to special case reset
>> > - */
>> > - if (synchronous ||
>> > - i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > - WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > - I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > - I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > - POSTING_READ(RING_PP_DIR_BASE(ring));
>> > - return 0;
>> > - }
>> > -
>> > /* NB: TLBs must be flushed and invalidated before a switch */
>> > ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> > if (ret)
>> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt, }
>> >
>> > static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > - struct intel_engine_cs *ring,
>> > - bool synchronous)
>> > + struct intel_engine_cs *ring)
>> > {
>> > struct drm_device *dev = ppgtt->base.dev;
>> > struct drm_i915_private *dev_priv = dev->dev_private;
>> >
>> > - if (!synchronous)
>> > - return 0;
>> >
>> > I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
>> 852,7
>> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>> > if (USES_FULL_PPGTT(dev))
>> > continue;
>> >
>> > - ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > + ret = ppgtt->switch_mm(ppgtt, ring);
>> > if (ret)
>> > goto err_out;
>> > }
>> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> > if (USES_FULL_PPGTT(dev))
>> > continue;
>> >
>> > - ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > + ret = ppgtt->switch_mm(ppgtt, ring);
>> > if (ret)
>> > return ret;
>> > }
>> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> > I915_WRITE(GFX_MODE,
>> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>> >
>> > for_each_ring(ring, dev_priv, i) {
>> > - int ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > + int ret = ppgtt->switch_mm(ppgtt, ring);
>> > if (ret)
>> > return ret;
>> > }
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > index 8d6f7c1..bf1e4fc 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>> >
>> > int (*enable)(struct i915_hw_ppgtt *ppgtt);
>> > int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>> > - struct intel_engine_cs *ring,
>> > - bool synchronous);
>> > + struct intel_engine_cs *ring);
>> > void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
>> *m);
>> > };
>> >
>> > --
>> > 2.0.0
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx at lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list