[PATCH 2/2] drm/i915: Use internal class when counting engine resets
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Dec 7 11:12:11 UTC 2023
On 06/12/2023 00:52, Daniele Ceraolo Spurio wrote:
>
>
> On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with
>> reserved uabi class")
>> made the GSC0 engine not have a valid uabi class and so broke the engine
>> reset counting, which in turn was made class based in cb823ed9915b
>> ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>>
>> Although the title and commit text of the latter do not mention it (and
>> it left the storage array incorrectly sized), tracking by class, despite
>> adding aliasing in hypothetical multi-tile systems, is handy for
>> virtual engines which for instance do not have a valid engine->id.
>>
>> Therefore we keep that but just change it to use the internal class which
>> is always valid. We also add a helper to increment the count, which
>> aligns with the existing getter.
>>
>> What was broken without this fix were out of bounds reads every time a
>> reset would happen on the GSC0 engine, or during selftests when storing
>> and cross-checking the counts in igt_live_test_begin and
>> igt_live_test_end.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with
>> reserved uabi class")
>> Reported-by: Alan Previn Teres Alexis
>> <alan.previn.teres.alexis at intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Thanks! Let's see if 1/2 gets some attention so I don't have to split out
2/2 just for CI.
Regards,
Tvrtko
>
> Daniele
>
>> ---
>> drivers/gpu/drm/i915/gt/intel_reset.c | 2 +-
>> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++--
>> drivers/gpu/drm/i915/i915_gpu_error.h | 12 ++++++++++--
>> 3 files changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c
>> b/drivers/gpu/drm/i915/gt/intel_reset.c
>> index d5ed904f355d..6801f8b95c53 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct
>> intel_engine_cs *engine, const char *msg)
>> if (msg)
>> drm_notice(&engine->i915->drm,
>> "Resetting %s for %s\n", engine->name, msg);
>> - atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
>> + i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>> ret = intel_gt_reset_engine(engine);
>> if (ret) {
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 04f8377fd7a3..58ea285c51d4 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc
>> *guc,
>> if (match) {
>> intel_engine_set_hung_context(e, ce);
>> engine_mask |= e->mask;
>> - atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
>> + i915_increase_reset_engine_count(&i915->gpu_error,
>> + e);
>> }
>> }
>> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc
>> *guc,
>> } else {
>> intel_engine_set_hung_context(ce->engine, ce);
>> engine_mask = ce->engine->mask;
>> - atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
>> + i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
>> }
>> with_intel_runtime_pm(&i915->runtime_pm, wakeref)
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h
>> b/drivers/gpu/drm/i915/i915_gpu_error.h
>> index fa886620d3f8..7c255bb1c319 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>> @@ -17,6 +17,7 @@
>> #include "display/intel_display_device.h"
>> #include "display/intel_display_params.h"
>> #include "gt/intel_engine.h"
>> +#include "gt/intel_engine_types.h"
>> #include "gt/intel_gt_types.h"
>> #include "gt/uc/intel_uc_fw.h"
>> @@ -234,7 +235,7 @@ struct i915_gpu_error {
>> atomic_t reset_count;
>> /** Number of times an engine has been reset */
>> - atomic_t reset_engine_count[I915_NUM_ENGINES];
>> + atomic_t reset_engine_count[MAX_ENGINE_CLASS];
>> };
>> struct drm_i915_error_state_buf {
>> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct
>> i915_gpu_error *error)
>> static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>> const struct intel_engine_cs *engine)
>> {
>> - return atomic_read(&error->reset_engine_count[engine->uabi_class]);
>> + return atomic_read(&error->reset_engine_count[engine->class]);
>> +}
>> +
>> +static inline void
>> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
>> + const struct intel_engine_cs *engine)
>> +{
>> + atomic_inc(&error->reset_engine_count[engine->class]);
>> }
>> #define CORE_DUMP_FLAG_NONE 0x0
>
More information about the Intel-gfx
mailing list