[Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw

Ben Widawsky ben at bwidawsk.net
Sat Jul 26 03:05:29 CEST 2014


On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay at intel.com wrote:
> From: "McAulay, Alistair" <alistair.mcaulay at intel.com>
> 
> This patch is to address Daniels concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> Signed-off-by: McAulay, Alistair <alistair.mcaulay at intel.com>

2 quick comments before I actually do a real review.
1. Did you actually run this with and without full ppgtt?
2. I don't think this is actually fulfilling what Daniel is requesting,
though we can let him comment.
3. Did you reall do #1?

Assuming you satisifed #1, can you please post the igt results for the
permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o ppgtt)

I really want this data because I spent *a lot* of time with these
specific areas in the PPGTT work, and I am somewhat skeptical enough of
the code has changed that this will magically work. I also tried the
trickiness with the ring handling functions, and never succeeded. Also,
with the context stuff, I'm simply not convinced it can magically
vanish. If igt looks good, and Daniel agrees that this is what he
actually wanted, I will go fishing for corner cases and do the review.

Thanks.

> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  2 -
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
>  5 files changed, 14 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ef047bc..b38e086 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>  
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
>  
> -void i915_gem_context_reset(struct drm_device *dev)
> -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> -						      get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -
>  int i915_gem_context_init(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
>  
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
>  
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
>  
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  /* Broadwell Page Directory Pointer Descriptors */
>  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
>  
>  	BUG_ON(entry >= 4);
>  
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>  }
>  
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
>  
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
>  }
>  
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> -	if (!synchronous)
> -		return 0;
>  
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> @@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>  
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 599709e..e33c2e1 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs *ring,
>  
>  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
>  				   dev_priv->mm.interruptible);
> -	if (ret)
> +
> +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> +	if (ret && (ret != -EAGAIN))
>  		return ret;
>  
>  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> -- 
> 2.0.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center



More information about the Intel-gfx mailing list