[PATCH Review 1/1] drm/amdgpu: Support setting recover method
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Apr 11 11:49:17 UTC 2024
Am 11.04.24 um 13:30 schrieb Yang, Stanley:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken at gmail.com>
>> Sent: Thursday, April 11, 2024 7:17 PM
>> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-gfx at lists.freedesktop.org
>> Subject: Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
>>
>> Am 11.04.24 um 13:11 schrieb Stanley.Yang:
>>> Don't modify amdgpu gpu recover get operation, add amdgpu gpu recover
>>> set operation to select reset method, only support mode1 and mode2
>>> currently.
>> Well I don't think setting this from userspace is valid.
>>
>> The reset method to use is determined by the hardware and environment (e.g.
>> SRIOV, passthrough, whatever) and can't be chosen simply.
> [Stanley]: Agree, the setting is invalid for some devices not supported setting method and devices still reset with default method,
> but it's valid for those devices supported setting reset method, user can conduct combination testing like mode1 test then mode2 test without
> re-modprobe driver.
Well and the user could also shoot himself into the foot.
I really don't think that this is a valuable functionality.
Regards,
Christian.
>
> Regards,
> Stanley
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 37
>> +++++++++++++++++++---
>>> 3 files changed, 37 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 9c62552bec34..c82976b2b977 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1151,6 +1151,9 @@ struct amdgpu_device {
>>> bool debug_largebar;
>>> bool debug_disable_soft_recovery;
>>> bool debug_use_vram_fw_buf;
>>> +
>>> + /* Used to set gpu reset method */
>>> + int recover_method;
>>> };
>>>
>>> static inline uint32_t amdgpu_ip_version(const struct amdgpu_device
>>> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 3204b8f6edeb..8411a793be18 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device
>> *adev,
>>> else
>>> adev->asic_type = flags & AMD_ASIC_MASK;
>>>
>>> + adev->recover_method = AMD_RESET_METHOD_NONE;
>>> adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>>> if (amdgpu_emu_mode == 1)
>>> adev->usec_timeout *= 10;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 10832b470448..e388a50d11d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
>>> return 0;
>>> }
>>>
>>> +static int gpu_recover_set(void *data, u64 val) {
>>> + struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> + struct drm_device *dev = adev_to_drm(adev);
>>> + int r;
>>> +
>>> + /* TODO: support mode1 and mode2 currently */
>>> + if (val == AMD_RESET_METHOD_MODE1 ||
>>> + val == AMD_RESET_METHOD_MODE2)
>>> + adev->recover_method = val;
>>> + else
>>> + adev->recover_method = AMD_RESET_METHOD_NONE;
>>> +
>>> + r = pm_runtime_get_sync(dev->dev);
>>> + if (r < 0) {
>>> + pm_runtime_put_autosuspend(dev->dev);
>>> + return 0;
>>> + }
>>> +
>>> + if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev-
>>> reset_work))
>>> + flush_work(&adev->reset_work);
>>> +
>>> + pm_runtime_mark_last_busy(dev->dev);
>>> + pm_runtime_put_autosuspend(dev->dev);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>>> -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get, NULL,
>>> - "%lld\n");
>>> +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get,
>>> + gpu_recover_set, "%lld\n");
>>>
>>> static void amdgpu_debugfs_reset_work(struct work_struct *work)
>>> {
>>> @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct
>>> work_struct *work)
>>>
>>> memset(&reset_context, 0, sizeof(reset_context));
>>>
>>> - reset_context.method = AMD_RESET_METHOD_NONE;
>>> + reset_context.method = adev->recover_method;
>>> reset_context.reset_req_dev = adev;
>>> set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>> + adev->recover_method = AMD_RESET_METHOD_NONE;
>>>
>>> amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>>> }
>>> @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct
>> amdgpu_device *adev)
>>> if (!amdgpu_sriov_vf(adev)) {
>>>
>>> INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>>> - debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
>>> + debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
>>> &amdgpu_debugfs_gpu_recover_fops);
>>> }
>>> #endif
More information about the amd-gfx
mailing list