[Intel-gfx] [PATCH] drm/i915/gt: Consider multi-gt at all places

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Apr 18 07:32:49 UTC 2023


On 17/04/2023 19:13, Upadhyay, Tejas wrote:
> 
> 
>> -----Original Message-----
>> From: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
>> Sent: Wednesday, April 5, 2023 2:01 PM
>> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; Intel-
>> GFX at Lists.FreeDesktop.Org
>> Subject: Re: [Intel-gfx] [PATCH] drm/i915/gt: Consider multi-gt at all places
>>
>>
>> On 05/04/2023 07:56, Upadhyay, Tejas wrote:
>>>>> -int igt_live_test_end(struct igt_live_test *t)
>>>>> +int igt_live_test_end(struct igt_live_test *t, struct intel_gt *gt)
>>>>>     {
>>>>> -	struct drm_i915_private *i915 = t->i915;
>>>>> +	struct drm_i915_private *i915 = gt->i915;
>>>>>     	struct intel_engine_cs *engine;
>>>>>     	enum intel_engine_id id;
>>>>>
>>>>> @@ -57,7 +57,7 @@ int igt_live_test_end(struct igt_live_test *t)
>>>>>     		return -EIO;
>>>>>     	}
>>>>>
>>>>> -	for_each_engine(engine, to_gt(i915), id) {
>>>>> +	for_each_engine(engine, gt, id) {
>>>>>     		if (t->reset_engine[id] ==
>>>>>     		    i915_reset_engine_count(&i915->gpu_error, engine))
>>>>>     			continue;
>>>>> diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.h
>>>>> b/drivers/gpu/drm/i915/selftests/igt_live_test.h
>>>>> index 36ed42736c52..209b0548c603 100644
>>>>> --- a/drivers/gpu/drm/i915/selftests/igt_live_test.h
>>>>> +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.h
>>>>> @@ -27,9 +27,9 @@ struct igt_live_test {
>>>>>      * e.g. if the GPU was reset.
>>>>>      */
>>>>>     int igt_live_test_begin(struct igt_live_test *t,
>>>>> -			struct drm_i915_private *i915,
>>>>> +			struct intel_gt *gt,
>>>>>     			const char *func,
>>>>>     			const char *name);
>>>>> -int igt_live_test_end(struct igt_live_test *t);
>>>>> +int igt_live_test_end(struct igt_live_test *t, struct intel_gt
>>>>> +*gt);
>>>>
>>>> Back in the day the plan was that live selftests are device focused
>>>> and then we also have intel_gt_live_subtests, which are obviously GT
>>>> focused. So in that sense adding a single GT parameter to
>>>> igt_live_test_begin isn't something I immediately understand.
>>>>
>>>> Could you explain in one or two practical examples what is not
>>>> working properly and how is this patch fixing it?
>>>
>>> For example you are running test "live_all_engines(void *arg)",
>>>
>>> -- Below test begin, will reset counters for primary GT - Tile0 -- err
>>> = igt_live_test_begin(&t, to_gt(i915), __func__, "");
>>>           if (err)
>>>                   goto out_free;
>>>
>>> --- Now we loop for all engines, note here for MTL vcs, vecs engines are not
>> on primary GT or tile 0,
>>>        So counters did not reset on test begin does not cover them. ---
>>>
>>>         In test_begin, below will not reset count for vcs, vecs engines on MTL,
>>> 	for_each_engine(engine, gt, id)
>>>                   t->reset_engine[id] =
>>>                           i915_reset_engine_count(&i915->gpu_error,
>>> engine);
>>>
>>> --- Then below will end test, again for primary GT where above
>>> mentioned engines are not there --- err = igt_live_test_end(&t,
>>> to_gt(i915));
>>>
>>> In short to me it looks like igt_live_test for device needs attention when we
>> have different engines on different GTs like MTL.
>>
>> If you either add for_each_gt to that for_each_engine in igt_live_test_begin
>> and igt_live_test_end, or alternatively for_each_uabi_engine (maybe misses
>> some internal engines?), everything works? That would be much smaller
>> patch and wouldn't falsely associate live tests with a single gt.
>>
> 
> Below would work, if you agree I will rearrange and send patch.
> 
> --- a/drivers/gpu/drm/i915/selftests/igt_live_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c
> @@ -16,28 +16,31 @@ int igt_live_test_begin(struct igt_live_test *t,
>                          const char *func,
>                          const char *name)
>   {
> -       struct intel_gt *gt = to_gt(i915);
> +       struct intel_gt *gt;
>          struct intel_engine_cs *engine;
>          enum intel_engine_id id;
>          int err;
> +       unsigned int i;
> 
> -       t->i915 = i915;
> -       t->func = func;
> -       t->name = name;
> +       for_each_gt(gt, i915, i) {
> +               t->i915 = i915;
> +               t->func = func;
> +               t->name = name;

These three assignments should stay outside the outer loop.

> 
> -       err = intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
> -       if (err) {
> -               pr_err("%s(%s): failed to idle before, with err=%d!",
> -                      func, name, err);
> -               return err;
> -       }
> +               err = intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
> +               if (err) {
> +                       pr_err("%s(%s): failed to idle before, with err=%d!",
> +                                       func, name, err);
> +                       return err;
> +               }
> 
> -       t->reset_global = i915_reset_count(&i915->gpu_error);
> +               t->reset_global = i915_reset_count(&i915->gpu_error);

Ditto.

Rest looks good to me.

Regards,

Tvrtko

> 
> -       for_each_engine(engine, gt, id)
> -               t->reset_engine[id] =
> +               for_each_engine(engine, gt, id)
> +                       t->reset_engine[id] =
>                          i915_reset_engine_count(&i915->gpu_error, engine);
> 
> +       }
>          return 0;
>   }
> 
> @@ -46,6 +49,7 @@ int igt_live_test_end(struct igt_live_test *t)
>          struct drm_i915_private *i915 = t->i915;
>          struct intel_engine_cs *engine;
>          enum intel_engine_id id;
> +       unsigned int i;
> 
>          if (igt_flush_test(i915))
>                  return -EIO;
> @@ -57,17 +61,18 @@ int igt_live_test_end(struct igt_live_test *t)
>                  return -EIO;
>          }
> 
> -       for_each_engine(engine, to_gt(i915), id) {
> -               if (t->reset_engine[id] ==
> -                   i915_reset_engine_count(&i915->gpu_error, engine))
> -                       continue;
> +       for_each_gt(gt, i915, i) {
> +               for_each_engine(engine, gt, id) {
> +                       if (t->reset_engine[id] ==
> +                                       i915_reset_engine_count(&i915->gpu_error, engine))
> +                               continue;
> 
> -               pr_err("%s(%s): engine '%s' was reset %d times!\n",
> -                      t->func, t->name, engine->name,
> -                      i915_reset_engine_count(&i915->gpu_error, engine) -
> -                      t->reset_engine[id]);
> -               return -EIO;
> +                       pr_err("%s(%s): engine '%s' was reset %d times!\n",
> +                                       t->func, t->name, engine->name,
> +                                       i915_reset_engine_count(&i915->gpu_error, engine) -
> +                                       t->reset_engine[id]);
> +                       return -EIO;
> +               }
>          }
>       
> Regards,
> Tejas
>> Regards,
>>
>> Tvrtko


More information about the Intel-gfx mailing list