[PATCH v3 4/7] drm/amdgpu: Add work_struct for GPU reset from debugfs

Mon May 30 15:46:10 UTC 2022

+ Monk

On 2022-05-30 03:52, Christian König wrote:
> 
> 
> Am 25.05.22 um 21:04 schrieb Andrey Grodzovsky:
>> We need to have a work_struct to cancel this reset if another
>> is already in progress.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 19 +++++++++++++++++--
>>   2 files changed, 19 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 76df583663c7..8165ee5b0457 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1048,6 +1048,8 @@ struct amdgpu_device {
>>       bool                            scpm_enabled;
>>       uint32_t                        scpm_status;
>> +
>> +    struct work_struct        reset_work;
>>   };
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index d16c8c1f72db..b0498ffcf7c3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -39,6 +39,7 @@
>>   #include <drm/drm_drv.h>
>>   #include "amdgpu.h"
>>   #include "amdgpu_trace.h"
>> +#include "amdgpu_reset.h"
>>   /*
>>    * Fences
>> @@ -798,7 +799,10 @@ static int gpu_recover_get(void *data, u64 *val)
>>           return 0;
>>       }
>> -    *val = amdgpu_device_gpu_recover(adev, NULL);
>> +    if (amdgpu_reset_domain_schedule(adev->reset_domain, 
>> &adev->reset_work))
>> +        flush_work(&adev->reset_work);
>> +
>> +    *val = atomic_read(&adev->reset_domain->reset_res);
>>       pm_runtime_mark_last_busy(dev->dev);
>>       pm_runtime_put_autosuspend(dev->dev);
>> @@ -810,6 +814,14 @@ DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>>   DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, 
>> gpu_recover_get, NULL,
>>                "%lld\n");
>> +static void amdgpu_debugfs_reset_work(struct work_struct *work)
>> +{
>> +    struct amdgpu_device *adev = container_of(work, struct 
>> amdgpu_device,
>> +                          reset_work);
>> +
>> +    amdgpu_device_gpu_recover_imp(adev, NULL);
>> +}
>> +
>>   #endif
>>   void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
>> @@ -821,9 +833,12 @@ void amdgpu_debugfs_fence_init(struct 
>> amdgpu_device *adev)
>>       debugfs_create_file("amdgpu_fence_info", 0444, root, adev,
>>                   &amdgpu_debugfs_fence_info_fops);
>> -    if (!amdgpu_sriov_vf(adev))
>> +    if (!amdgpu_sriov_vf(adev)) {
> 
> I think we should drop the check for amdgpu_sriov_vf() here. It's a 
> valid requirement to be able to trigger a GPU reset for a VF as well.
> 
> But that is not the topic of this patch, so feel free to add a
> Reviewed-by: Christian König <christian.koenig at amd.com>.
> 
> Regards,
> Christian.

Monk - any idea why we prevent creation of the debugfs GPU reset file for a VF?

Andrey

> 
>> +
>> +        INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>>           debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
>>                       &amdgpu_debugfs_gpu_recover_fops);
>> +    }
>>   #endif
>>   }
>