[Intel-gfx] [PATCH v2] drm/i915: fix SFC reset flow

Thu Sep 19 09:34:15 UTC 2019

On 19/09/2019 02:53, Daniele Ceraolo Spurio wrote:
> Our assumption that we can ask the HW to lock the SFC even if not
> currently in use does not match the HW commitment. The expectation from
> the HW is that SW will not try to lock the SFC if the engine is not
> using it and if we do that the behavior is undefined; on ICL the HW
> ends up returning the ack and ignoring our lock request, but this is
> not guaranteed and we shouldn't expect it going forward.
> 
> Also, failing to get the ack while the SFC is in use means that we can't
> cleanly reset it, so fail the engine reset in that scenario.
> 
> v2: drop rmw change, keep the log as debug and handle failure (Chris),
>      improve comments (Tvrtko).
> 
> Reported-by: Owen Zhang <owen.zhang at intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gt/intel_reset.c | 51 +++++++++++++++++----------
>   1 file changed, 33 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 8327220ac558..797cf50625cb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -309,7 +309,7 @@ static int gen6_reset_engines(struct intel_gt *gt,
>   	return gen6_hw_domain_reset(gt, hw_mask);
>   }
>   
> -static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
> +static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
>   {
>   	struct intel_uncore *uncore = engine->uncore;
>   	u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
> @@ -318,6 +318,7 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>   	i915_reg_t sfc_usage;
>   	u32 sfc_usage_bit;
>   	u32 sfc_reset_bit;
> +	int ret;
>   
>   	switch (engine->class) {
>   	case VIDEO_DECODE_CLASS:
> @@ -352,28 +353,33 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>   	}
>   
>   	/*
> -	 * Tell the engine that a software reset is going to happen. The engine
> -	 * will then try to force lock the SFC (if currently locked, it will
> -	 * remain so until we tell the engine it is safe to unlock; if currently
> -	 * unlocked, it will ignore this and all new lock requests). If SFC
> -	 * ends up being locked to the engine we want to reset, we have to reset
> -	 * it as well (we will unlock it once the reset sequence is completed).
> +	 * If the engine is using a SFC, tell the engine that a software reset
> +	 * is going to happen. The engine will then try to force lock the SFC.
> +	 * If SFC ends up being locked to the engine we want to reset, we have
> +	 * to reset it as well (we will unlock it once the reset sequence is
> +	 * completed).
>   	 */
> +	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
> +		return 0;
> +
>   	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
>   
> -	if (__intel_wait_for_register_fw(uncore,
> -					 sfc_forced_lock_ack,
> -					 sfc_forced_lock_ack_bit,
> -					 sfc_forced_lock_ack_bit,
> -					 1000, 0, NULL)) {
> -		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +	ret = __intel_wait_for_register_fw(uncore,
> +					   sfc_forced_lock_ack,
> +					   sfc_forced_lock_ack_bit,
> +					   sfc_forced_lock_ack_bit,
> +					   1000, 0, NULL);
> +
> +	/* was the SFC released while we were trying to lock it? */
> +	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
>   		return 0;
> -	}
>   
> -	if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
> -		return sfc_reset_bit;
> +	if (ret)
> +		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +	else
> +		*hw_mask |= sfc_reset_bit;
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static void gen11_unlock_sfc(struct intel_engine_cs *engine)
> @@ -430,12 +436,21 @@ static int gen11_reset_engines(struct intel_gt *gt,
>   		for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
>   			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
>   			hw_mask |= hw_engine_mask[engine->id];
> -			hw_mask |= gen11_lock_sfc(engine);
> +			ret = gen11_lock_sfc(engine, &hw_mask);
> +			if (ret)
> +				goto sfc_unlock;

Break on first failure looks unsafe to me. I think it would be more 
robust to continue, no? Like if we have been asked to reset multiple 
engines and only one failed, why not do the ones we can?

>   		}
>   	}
>   
>   	ret = gen6_hw_domain_reset(gt, hw_mask);
>   
> +sfc_unlock:
> +	/*
> +	 * we unlock the SFC based on the lock status and not the result of
> +	 * gen11_lock_sfc to make sure that we clean properly if something
> +	 * wrong happened during the lock (e.g. lock acquired after timeout
> +	 * expiration).
> +	 */
>   	if (engine_mask != ALL_ENGINES)
>   		for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
>   			gen11_unlock_sfc(engine);
> 

So you decided not to read the register and cross check?

Regards,

Tvrtko