[PATCH 2/2] drm/i915: Use internal class when counting engine resets

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Wed Dec 6 00:52:43 UTC 2023



On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> made the GSC0 engine not have a valid uabi class and so broke the engine
> reset counting, which in turn was made class based in cb823ed9915b ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>
> Although the title and commit text of the latter do not mention it (and
> it left the storage array incorrectly sized), tracking by class, despite
> adding aliasing in hypothetical multi-tile systems, is handy for
> virtual engines, which for instance do not have a valid engine->id.
>
> Therefore we keep that but just change it to use the internal class which
> is always valid. We also add a helper to increment the count, which
> aligns with the existing getter.
>
> What was broken without this fix were out-of-bounds reads every time a
> reset happened on the GSC0 engine, or during selftests when storing
> and cross-checking the counts in igt_live_test_begin and
> igt_live_test_end.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> Reported-by: Alan Previn Teres Alexis <alan.previn.teres.alexis at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Daniele

> ---
>   drivers/gpu/drm/i915/gt/intel_reset.c             |  2 +-
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
>   drivers/gpu/drm/i915/i915_gpu_error.h             | 12 ++++++++++--
>   3 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index d5ed904f355d..6801f8b95c53 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
>   	if (msg)
>   		drm_notice(&engine->i915->drm,
>   			   "Resetting %s for %s\n", engine->name, msg);
> -	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
> +	i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>   
>   	ret = intel_gt_reset_engine(engine);
>   	if (ret) {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 04f8377fd7a3..58ea285c51d4 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc *guc,
>   			if (match) {
>   				intel_engine_set_hung_context(e, ce);
>   				engine_mask |= e->mask;
> -				atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
> +				i915_increase_reset_engine_count(&i915->gpu_error,
> +								 e);
>   			}
>   		}
>   
> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc *guc,
>   	} else {
>   		intel_engine_set_hung_context(ce->engine, ce);
>   		engine_mask = ce->engine->mask;
> -		atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
> +		i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
>   	}
>   
>   	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index fa886620d3f8..7c255bb1c319 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -17,6 +17,7 @@
>   #include "display/intel_display_device.h"
>   #include "display/intel_display_params.h"
>   #include "gt/intel_engine.h"
> +#include "gt/intel_engine_types.h"
>   #include "gt/intel_gt_types.h"
>   #include "gt/uc/intel_uc_fw.h"
>   
> @@ -234,7 +235,7 @@ struct i915_gpu_error {
>   	atomic_t reset_count;
>   
>   	/** Number of times an engine has been reset */
> -	atomic_t reset_engine_count[I915_NUM_ENGINES];
> +	atomic_t reset_engine_count[MAX_ENGINE_CLASS];
>   };
>   
>   struct drm_i915_error_state_buf {
> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>   static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>   					  const struct intel_engine_cs *engine)
>   {
> -	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
> +	return atomic_read(&error->reset_engine_count[engine->class]);
> +}
> +
> +static inline void
> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
> +				 const struct intel_engine_cs *engine)
> +{
> +	atomic_inc(&error->reset_engine_count[engine->class]);
>   }
>   
>   #define CORE_DUMP_FLAG_NONE           0x0



More information about the dri-devel mailing list