[Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw

Fri Aug 15 19:03:25 CEST 2014

"Mcaulay, Alistair" <alistair.mcaulay at intel.com> writes:

> Hi Mika / Daniel,
>
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
> 	....
> 	i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. 
> 	.....
> 	i915_gem_init_hw()
> 		.....
> 		i915_gem_context_enable() -> This used to return during reset. Now it doesn't
> 		.....
> 			for each ring, i915_switch_context(default)
> 				do_switch();
> 		.....
>
> 	.....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.
>
> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

Our internal state tracking will be ok after i915_gem_context_enable()
has been called. All rings have been set to the default context.

But what happens with this sequence:

- render ring was running in default context 
- reset happens
- we call i915_gem_context_enable 
- do_switch will get called 
- it figure out that last context is the same as we are switching to
  (from == to) and it bails out
- we never wrote anything to ring, so we have pre reset context running.
  MI_SET_CONTEXT was never run.

Even if reset would not clear the CCID, I think it is safest to
force a MI_SET_CONTEXT here.

Further, if the default context was mangled before the reset,
we should reinitialize it to a known state by running
i915_gem_render_state_init() for it. But this can be
considered as a possible future work.

-Mika

> Alistair.
>
>> -----Original Message-----
>> From: Mika Kuoppala [mailto:mika.kuoppala at linux.intel.com]
>> Sent: Wednesday, August 06, 2014 5:25 PM
>> To: Mcaulay, Alistair; intel-gfx at lists.freedesktop.org
>> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
>> match driver load & thaw
>> 
>> 
>> Hi,
>> 
>> alistair.mcaulay at intel.com writes:
>> 
>> > From: "McAulay, Alistair" <alistair.mcaulay at intel.com>
>> >
>> > This patch is to address Daniels concerns over different code during reset:
>> >
>> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>> >
>> > "The reason for aiming as hard as possible to use the exact same code
>> > for driver load, gpu reset and runtime pm/system resume is that we've
>> > simply seen too many bugs due to slight variations and unintended
>> omissions."
>> >
>> > Tested using igt drv_hangman.
>> >
>> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
>> >
>> > Signed-off-by: McAulay, Alistair <alistair.mcaulay at intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>> >  6 files changed, 23 insertions(+), 104 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
>> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.c
>> > +++ b/drivers/gpu/drm/i915/i915_drv.c
>> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>> >  			!dev_priv->ums.mm_suspended) {
>> >  		dev_priv->ums.mm_suspended = 0;
>> >
>> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
>> during gpu reset */
>> > +		dev_priv->gpu_error.reload_in_reset = true;
>> > +
>> >  		ret = i915_gem_init_hw(dev);
>> > +
>> > +		dev_priv->gpu_error.reload_in_reset = false;
>> > +
>> >  		mutex_unlock(&dev->struct_mutex);
>> >  		if (ret) {
>> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
>> --git
>> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index 991b663..116daff 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>> >
>> >  	/* For missed irq/seqno simulation. */
>> >  	unsigned int test_irq_rings;
>> > +
>> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
>> gpu reset   */
>> > +	bool reload_in_reset;
>> >  };
>> >
>> >  enum modeset_restore {
>> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem.c
>> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
>> *error,
>> >  		if (i915_terminally_wedged(error))
>> >  			return -EIO;
>> >
>> > -		return -EAGAIN;
>> > +		/* Check if GPU Reset is in progress */
>> > +		if (!error->reload_in_reset)
>> > +			return -EAGAIN;
>> >  	}
>> >
>> >  	return 0;
>> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>> >  	for_each_ring(ring, dev_priv, i)
>> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>> >
>> > -	i915_gem_context_reset(dev);
>> > -
>> >  	i915_gem_restore_fences(dev);
>> >  }
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> > b/drivers/gpu/drm/i915/i915_gem_context.c
>> > index de72a28..d96219f 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> > @@ -372,42 +372,6 @@ err_destroy:
>> >  	return ERR_PTR(ret);
>> >  }
>> >
>> > -void i915_gem_context_reset(struct drm_device *dev) -{
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> > -	int i;
>> > -
>> > -	/* Prevent the hardware from restoring the last context (which
>> hung) on
>> > -	 * the next switch */
>> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
>> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> > -		struct intel_context *dctx = ring->default_context;
>> > -		struct intel_context *lctx = ring->last_context;
>> > -
>> > -		/* Do a fake switch to the default context */
>> > -		if (lctx == dctx)
>> > -			continue;
>> > -
>> > -		if (!lctx)
>> > -			continue;
>> > -
>> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
>> >legacy_hw_ctx.rcs_state,
>> > -
>> get_context_alignment(dev), 0));
>> > -			/* Fake a finish/inactive */
>> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> > -		}
>> > -
>> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> > -			i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> > -
>> > -		i915_gem_context_unreference(lctx);
>> > -		i915_gem_context_reference(dctx);
>> > -		ring->last_context = dctx;
>> > -	}
>> > -}
>> > -
>> 
>> I am with Daniel on this one. I don't understand how can we throw
>> everything in here away.
>> 
>> We need to force hw to switch to a working context, after reset, so that our
>> internal state tracking matches.
>> 
>> Further, if we aim to more unification I think we should make it so that the
>> initial render state will get run, also after reset.
>> 
>> If we cleanup the last context for each ring set default context carefully,
>> i915_gem_context_enable() will then switch to default contexts and reinit
>> them using the initial render state. Something like this:
>> 
>> void i915_gem_context_reset(struct drm_device *dev) {
>> 	struct drm_i915_private *dev_priv = dev->dev_private;
>> 	int i;
>> 
>> 	for (i = 0; i < I915_NUM_RINGS; i++) {
>> 		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> 		struct intel_context *lctx = ring->last_context;
>> 		struct intel_context *dctx = ring->default_context;
>> 
>> 		if (lctx) {
>> 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> 				i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> 
>> 			i915_gem_context_unreference(lctx);
>> 			ring->last_context = NULL;
>> 		}
>> 
>> 		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> 			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> 			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> 			dctx->legacy_hw_ctx.initialized = false;
>> 		}
>> 	}
>> }
>> 
>> The state would be closer of what we get after module reload.
>> 
>> -Mika
>> 
>> >  int i915_gem_context_init(struct drm_device *dev)  {
>> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
>> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
>> *dev_priv)
>> >  		ppgtt->enable(ppgtt);
>> >  	}
>> >
>> > -	/* FIXME: We should make this work, even in reset */
>> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
>> > -		return 0;
>> > -
>> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>> >  	from = ring->last_context;
>> >
>> >  	if (USES_FULL_PPGTT(ring->dev)) {
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto unpin_out;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > index 5188936..450c8a9 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
>> iris_pte_encode(dma_addr_t
>> > addr,
>> >
>> >  /* Broadwell Page Directory Pointer Descriptors */  static int
>> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>> > -			   uint64_t val, bool synchronous)
>> > +			   uint64_t val)
>> >  {
>> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> >  	int ret;
>> >
>> >  	BUG_ON(entry >= 4);
>> >
>> > -	if (synchronous) {
>> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
>> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
>> > -		return 0;
>> > -	}
>> > -
>> >  	ret = intel_ring_begin(ring, 6);
>> >  	if (ret)
>> >  		return ret;
>> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
>> > *ring, unsigned entry,  }
>> >
>> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	int i, ret;
>> >
>> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,
>> >
>> >  	for (i = used_pd - 1; i >= 0; i--) {
>> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
>> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
>> > +		ret = gen8_write_pdp(ring, i, addr);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
>> > i915_hw_ppgtt *ppgtt)  }
>> >
>> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous)
>> > +			 struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	struct drm_device *dev = ppgtt->base.dev;
>> >  	struct drm_i915_private *dev_priv = dev->dev_private;
>> >
>> > -	if (!synchronous)
>> > -		return 0;
>> >
>> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
>> 852,7
>> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto err_out;
>> >  	}
>> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  	I915_WRITE(GFX_MODE,
>> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > index 8d6f7c1..bf1e4fc 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>> >
>> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous);
>> > +			 struct intel_engine_cs *ring);
>> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
>> *m);
>> > };
>> >
>> > --
>> > 2.0.0
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx at lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx