[RFC 4/6] drm/amdgpu: Serialize non TDR gpu recovery with TDRs

Mon Dec 20 22:17:01 UTC 2021

On 2021-12-20 2:20 a.m., Christian König wrote:
> Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:
>> Use reset domain wq also for non TDR gpu recovery trigers
>> such as sysfs and RAS. We must serialize all possible
>> GPU recoveries to gurantee no concurrency there.
>> For TDR call the original recovery function directly since
>> it's already executed from within the wq. For others just
>> use a wrapper to qeueue work and wait on it to finish.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 +++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  2 +-
>>   3 files changed, 35 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b5ff76aae7e0..8e96b9a14452 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1296,6 +1296,8 @@ bool amdgpu_device_has_job_running(struct 
>> amdgpu_device *adev);
>>   bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
>>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                     struct amdgpu_job* job);
>> +int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>> +                  struct amdgpu_job *job);
>>   void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
>>   int amdgpu_device_pci_reset(struct amdgpu_device *adev);
>>   bool amdgpu_device_need_post(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index b595e6d699b5..55cd67b9ede2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4979,7 +4979,7 @@ static void amdgpu_device_recheck_guilty_jobs(
>>    * Returns 0 for success or an error on failure.
>>    */
>>   -int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>> +int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>                     struct amdgpu_job *job)
>>   {
>>       struct list_head device_list, *device_list_handle =  NULL;
>> @@ -5236,6 +5236,37 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>       return r;
>>   }
>>   +struct recover_work_struct {
>
> Please add an amdgpu_ prefix to the name.
>
>> +    struct work_struct base;
>> +    struct amdgpu_device *adev;
>> +    struct amdgpu_job *job;
>> +    int ret;
>> +};
>> +
>> +static void amdgpu_device_queue_gpu_recover_work(struct work_struct 
>> *work)
>> +{
>> +    struct recover_work_struct *recover_work = container_of(work, 
>> struct recover_work_struct, base);
>> +
>> +    recover_work->ret = 
>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>> +}
>> +/*
>> + * Serialize gpu recover into reset domain single threaded wq
>> + */
>> +int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>> +                    struct amdgpu_job *job)
>> +{
>> +    struct recover_work_struct work = {.adev = adev, .job = job};
>> +
>> +    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>> +
>> +    if (!queue_work(adev->reset_domain.wq, &work.base))
>> +        return -EAGAIN;
>> +
>> +    flush_work(&work.base);
>> +
>> +    return work.ret;
>> +}
>
> Maybe that should be part of the scheduler code? Not really sure, just 
> an idea.

Seems to me that since the reset domain is almost always above a single 
scheduler granularity then it wouldn't feet very well there.

Andrey

>
> Apart from that looks good to me,
> Christian.
>
>> +
>>   /**
>>    * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
>>    *
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index bfc47bea23db..38c9fd7b7ad4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -63,7 +63,7 @@ static enum drm_gpu_sched_stat 
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>             ti.process_name, ti.tgid, ti.task_name, ti.pid);
>>         if (amdgpu_device_should_recover_gpu(ring->adev)) {
>> -        amdgpu_device_gpu_recover(ring->adev, job);
>> +        amdgpu_device_gpu_recover_imp(ring->adev, job);
>>       } else {
>>           drm_sched_suspend_timeout(&ring->sched);
>>           if (amdgpu_sriov_vf(adev))
>