[Intel-gfx] [PATCH 2/2] drm/i915: Use internal class when counting engine resets
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Wed Dec 6 00:52:43 UTC 2023
On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> made the GSC0 engine not have a valid uabi class and so broke the engine
> reset counting, which in turn was made class based in cb823ed9915b ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>
> Despite the title and commit text of the latter is not mentioning it (and
> has left the storage array incorrectly sized), tracking by class, despite
> it adding aliasing in hypthotetical multi-tile systems, is handy for
> virtual engines which for instance do not have a valid engine->id.
>
> Therefore we keep that but just change it to use the internal class which
> is always valid. We also add a helper to increment the count, which
> aligns with the existing getter.
>
> What was broken without this fix were out of bounds reads every time a
> reset would happen on the GSC0 engine, or during selftests when storing
> and cross-checking the counts in igt_live_test_begin and
> igt_live_test_end.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with reserved uabi class")
> Reported-by: Alan Previn Teres Alexis <alan.previn.teres.alexis at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Daniele
> ---
> drivers/gpu/drm/i915/gt/intel_reset.c | 2 +-
> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++--
> drivers/gpu/drm/i915/i915_gpu_error.h | 12 ++++++++++--
> 3 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index d5ed904f355d..6801f8b95c53 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
> if (msg)
> drm_notice(&engine->i915->drm,
> "Resetting %s for %s\n", engine->name, msg);
> - atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
> + i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>
> ret = intel_gt_reset_engine(engine);
> if (ret) {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 04f8377fd7a3..58ea285c51d4 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc *guc,
> if (match) {
> intel_engine_set_hung_context(e, ce);
> engine_mask |= e->mask;
> - atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
> + i915_increase_reset_engine_count(&i915->gpu_error,
> + e);
> }
> }
>
> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc *guc,
> } else {
> intel_engine_set_hung_context(ce->engine, ce);
> engine_mask = ce->engine->mask;
> - atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
> + i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
> }
>
> with_intel_runtime_pm(&i915->runtime_pm, wakeref)
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index fa886620d3f8..7c255bb1c319 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -17,6 +17,7 @@
> #include "display/intel_display_device.h"
> #include "display/intel_display_params.h"
> #include "gt/intel_engine.h"
> +#include "gt/intel_engine_types.h"
> #include "gt/intel_gt_types.h"
> #include "gt/uc/intel_uc_fw.h"
>
> @@ -234,7 +235,7 @@ struct i915_gpu_error {
> atomic_t reset_count;
>
> /** Number of times an engine has been reset */
> - atomic_t reset_engine_count[I915_NUM_ENGINES];
> + atomic_t reset_engine_count[MAX_ENGINE_CLASS];
> };
>
> struct drm_i915_error_state_buf {
> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
> static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
> const struct intel_engine_cs *engine)
> {
> - return atomic_read(&error->reset_engine_count[engine->uabi_class]);
> + return atomic_read(&error->reset_engine_count[engine->class]);
> +}
> +
> +static inline void
> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
> + const struct intel_engine_cs *engine)
> +{
> + atomic_inc(&error->reset_engine_count[engine->class]);
> }
>
> #define CORE_DUMP_FLAG_NONE 0x0
More information about the Intel-gfx
mailing list