[PATCH 2/2] drm/i915: Use internal class when counting engine resets

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Dec 7 11:12:11 UTC 2023


On 06/12/2023 00:52, Daniele Ceraolo Spurio wrote:
> 
> 
> On 12/1/2023 4:21 AM, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Commit 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with 
>> reserved uabi class")
>> made the GSC0 engine not have a valid uabi class and so broke the engine
>> reset counting, which in turn was made class based in cb823ed9915b 
>> ("drm/i915/gt: Use intel_gt as the primary object for handling resets").
>>
>> Although the title and commit text of the latter do not mention it (and
>> it has left the storage array incorrectly sized), tracking by class, despite
>> it adding aliasing in hypothetical multi-tile systems, is handy for
>> virtual engines which for instance do not have a valid engine->id.
>>
>> Therefore we keep that but just change it to use the internal class which
>> is always valid. We also add a helper to increment the count, which
>> aligns with the existing getter.
>>
>> What was broken without this fix were out of bounds reads every time a
>> reset would happen on the GSC0 engine, or during selftests when storing
>> and cross-checking the counts in igt_live_test_begin and
>> igt_live_test_end.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Fixes: 503579448db9 ("drm/i915/gsc: Mark internal GSC engine with 
>> reserved uabi class")
>> Reported-by: Alan Previn Teres Alexis 
>> <alan.previn.teres.alexis at intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> 
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Thanks! Let's see if 1/2 gets some attention so I don't have to split out 
2/2 just for CI.

Regards,

Tvrtko

> 
> Daniele
> 
>> ---
>>   drivers/gpu/drm/i915/gt/intel_reset.c             |  2 +-
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
>>   drivers/gpu/drm/i915/i915_gpu_error.h             | 12 ++++++++++--
>>   3 files changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
>> b/drivers/gpu/drm/i915/gt/intel_reset.c
>> index d5ed904f355d..6801f8b95c53 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>> @@ -1293,7 +1293,7 @@ int __intel_engine_reset_bh(struct 
>> intel_engine_cs *engine, const char *msg)
>>       if (msg)
>>           drm_notice(&engine->i915->drm,
>>                  "Resetting %s for %s\n", engine->name, msg);
>> -    
>> atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
>> +    i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
>>       ret = intel_gt_reset_engine(engine);
>>       if (ret) {
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 04f8377fd7a3..58ea285c51d4 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -5003,7 +5003,8 @@ static void capture_error_state(struct intel_guc 
>> *guc,
>>               if (match) {
>>                   intel_engine_set_hung_context(e, ce);
>>                   engine_mask |= e->mask;
>> -                
>> atomic_inc(&i915->gpu_error.reset_engine_count[e->uabi_class]);
>> +                i915_increase_reset_engine_count(&i915->gpu_error,
>> +                                 e);
>>               }
>>           }
>> @@ -5015,7 +5016,7 @@ static void capture_error_state(struct intel_guc 
>> *guc,
>>       } else {
>>           intel_engine_set_hung_context(ce->engine, ce);
>>           engine_mask = ce->engine->mask;
>> -        
>> atomic_inc(&i915->gpu_error.reset_engine_count[ce->engine->uabi_class]);
>> +        i915_increase_reset_engine_count(&i915->gpu_error, ce->engine);
>>       }
>>       with_intel_runtime_pm(&i915->runtime_pm, wakeref)
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
>> b/drivers/gpu/drm/i915/i915_gpu_error.h
>> index fa886620d3f8..7c255bb1c319 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>> @@ -17,6 +17,7 @@
>>   #include "display/intel_display_device.h"
>>   #include "display/intel_display_params.h"
>>   #include "gt/intel_engine.h"
>> +#include "gt/intel_engine_types.h"
>>   #include "gt/intel_gt_types.h"
>>   #include "gt/uc/intel_uc_fw.h"
>> @@ -234,7 +235,7 @@ struct i915_gpu_error {
>>       atomic_t reset_count;
>>       /** Number of times an engine has been reset */
>> -    atomic_t reset_engine_count[I915_NUM_ENGINES];
>> +    atomic_t reset_engine_count[MAX_ENGINE_CLASS];
>>   };
>>   struct drm_i915_error_state_buf {
>> @@ -257,7 +258,14 @@ static inline u32 i915_reset_count(struct 
>> i915_gpu_error *error)
>>   static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>>                         const struct intel_engine_cs *engine)
>>   {
>> -    return atomic_read(&error->reset_engine_count[engine->uabi_class]);
>> +    return atomic_read(&error->reset_engine_count[engine->class]);
>> +}
>> +
>> +static inline void
>> +i915_increase_reset_engine_count(struct i915_gpu_error *error,
>> +                 const struct intel_engine_cs *engine)
>> +{
>> +    atomic_inc(&error->reset_engine_count[engine->class]);
>>   }
>>   #define CORE_DUMP_FLAG_NONE           0x0
> 


More information about the Intel-gfx mailing list