[PATCH Review 1/1] drm/amdgpu: Support setting recover method

Thu Apr 11 11:49:17 UTC 2024

Am 11.04.24 um 13:30 schrieb Yang, Stanley:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken at gmail.com>
>> Sent: Thursday, April 11, 2024 7:17 PM
>> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-gfx at lists.freedesktop.org
>> Subject: Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
>>
>> Am 11.04.24 um 13:11 schrieb Stanley.Yang:
>>> Leave the amdgpu gpu recover get operation unchanged and add an amdgpu
>>> gpu recover set operation to select the reset method. Only mode1 and
>>> mode2 are supported currently.
>> Well I don't think setting this from userspace is valid.
>>
>> The reset method to use is determined by the hardware and environment (e.g.
>> SRIOV, passthrough, whatever) and can't be chosen simply.
> [Stanley]: Agreed, the setting is invalid for devices that do not support setting the reset method; those devices still reset with the default method.
> However, it is valid for devices that do support setting the reset method: the user can conduct combination testing, such as a mode1 test followed by a mode2 test, without
> reloading the driver with modprobe.

Well, and the user could also shoot himself in the foot.

I really don't think that this is a valuable functionality.

Regards,
Christian.

>
> Regards,
> Stanley
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37
>> +++++++++++++++++++---
>>>    3 files changed, 37 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 9c62552bec34..c82976b2b977 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1151,6 +1151,9 @@ struct amdgpu_device {
>>>      bool                            debug_largebar;
>>>      bool                            debug_disable_soft_recovery;
>>>      bool                            debug_use_vram_fw_buf;
>>> +
>>> +   /* Used to set gpu reset method */
>>> +   int                             recover_method;
>>>    };
>>>
>>>    static inline uint32_t amdgpu_ip_version(const struct amdgpu_device
>>> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 3204b8f6edeb..8411a793be18 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device
>> *adev,
>>>      else
>>>              adev->asic_type = flags & AMD_ASIC_MASK;
>>>
>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>      adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>>>      if (amdgpu_emu_mode == 1)
>>>              adev->usec_timeout *= 10;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 10832b470448..e388a50d11d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
>>>      return 0;
>>>    }
>>>
>>> +static int gpu_recover_set(void *data, u64 val) {
>>> +   struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +   struct drm_device *dev = adev_to_drm(adev);
>>> +   int r;
>>> +
>>> +   /* TODO: support mode1 and mode2 currently */
>>> +   if (val == AMD_RESET_METHOD_MODE1 ||
>>> +           val == AMD_RESET_METHOD_MODE2)
>>> +           adev->recover_method = val;
>>> +   else
>>> +           adev->recover_method = AMD_RESET_METHOD_NONE;
>>> +
>>> +   r = pm_runtime_get_sync(dev->dev);
>>> +   if (r < 0) {
>>> +           pm_runtime_put_autosuspend(dev->dev);
>>> +           return 0;
>>> +   }
>>> +
>>> +   if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
>>> +           flush_work(&adev->reset_work);
>>> +
>>> +   pm_runtime_mark_last_busy(dev->dev);
>>> +   pm_runtime_put_autosuspend(dev->dev);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>>    DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>>> -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get, NULL,
>>> -                    "%lld\n");
>>> +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get,
>>> +                    gpu_recover_set, "%lld\n");
>>>
>>>    static void amdgpu_debugfs_reset_work(struct work_struct *work)
>>>    {
>>> @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct
>>> work_struct *work)
>>>
>>>      memset(&reset_context, 0, sizeof(reset_context));
>>>
>>> -   reset_context.method = AMD_RESET_METHOD_NONE;
>>> +   reset_context.method = adev->recover_method;
>>>      reset_context.reset_req_dev = adev;
>>>      set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>
>>>      amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>>>    }
>>> @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct
>> amdgpu_device *adev)
>>>      if (!amdgpu_sriov_vf(adev)) {
>>>
>>>              INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>>> -           debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
>>> +           debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
>>>                                  &amdgpu_debugfs_gpu_recover_fops);
>>>      }
>>>    #endif