[PATCH] drm/amdgpu: Fix repeatly flr issue

Tue Aug 18 02:48:55 UTC 2020

[AMD Official Use Only - Internal Distribution Only]

>-----Original Message-----
>From: Das, Nirmoy <Nirmoy.Das at amd.com>
>Sent: Wednesday, August 12, 2020 8:18 PM
>To: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix repeatly flr issue
>
>
>On 8/12/20 11:19 AM, Emily.Deng wrote:
>> From: jqdeng <Emily.Deng at amd.com>
>>
>> Only for no job running test case need to do recover in flr
>> notification.
>> For having job in mirror list, then let guest driver to hit job
>> timeout, and then do recover.
>>
>> Signed-off-by: jqdeng <Emily.Deng at amd.com>
>> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
>> ---
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++-
>-
>>   2 files changed, 39 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index fe31cbeccfe9..12fe5164aaf3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>   struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt,
>flr_work);
>>   struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>   int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>> +int i;
>> +bool need_do_recover = true;
>
>
>We should find a better name for "need_do_recover", may be
>"need_to_recover" ?
Thanks, will modify later.
>
>
>> +struct drm_sched_job *job;
>>
>>   /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>    * otherwise the mailbox msg will be ruined/reseted by
>> @@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>
>>   flr_done:
>>   up_read(&adev->reset_sem);
>> +for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +if (!ring || !ring->sched.thread)
>> +continue;
>> +
>> +spin_lock(&ring->sched.job_list_lock);
>> +job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
>> +struct drm_sched_job, node);
>> +spin_unlock(&ring->sched.job_list_lock);
>> +if (job) {
>> +need_do_recover = false;
>> +break;
>> +}
>> +}
>
>
>This 1st job retrieval logic can move to a function as there are two
>instance of it.
>Sorry, I didn't get your point.
>
>>
>>   /* Trigger recovery for world switch failure if no TDR */
>>   if (amdgpu_device_should_recover_gpu(adev)
>> -&& adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
>> +&& (need_do_recover || adev->sdma_timeout ==
>MAX_SCHEDULE_TIMEOUT))
>>   amdgpu_device_gpu_recover(adev, NULL);
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index 6f55172e8337..fc92c494df0b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>   struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt,
>flr_work);
>>   struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>   int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>> +int i;
>> +bool need_do_recover = true;
>> +struct drm_sched_job *job;
>>
>>   /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>    * otherwise the mailbox msg will be ruined/reseted by
>> @@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>
>>   flr_done:
>>   up_read(&adev->reset_sem);
>> +for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +if (!ring || !ring->sched.thread)
>> +continue;
>> +
>> +spin_lock(&ring->sched.job_list_lock);
>> +job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
>> +struct drm_sched_job, node);
>> +spin_unlock(&ring->sched.job_list_lock);
>> +if (job) {
>> +need_do_recover = false;
>> +break;
>> +}
>> +}
>>
>>   /* Trigger recovery for world switch failure if no TDR */
>> -if (amdgpu_device_should_recover_gpu(adev)
>> -&& (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>> +if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover
>||
>> +adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>>   adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
>>   adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
>>   adev->video_timeout == MAX_SCHEDULE_TIMEOUT))