[PATCH] drm/amdgpu: Fix repeated FLR issue

Deng, Emily Emily.Deng at amd.com
Tue Aug 18 08:31:23 UTC 2020


[AMD Official Use Only - Internal Distribution Only]

>-----Original Message-----
>From: Das, Nirmoy <Nirmoy.Das at amd.com>
>Sent: Tuesday, August 18, 2020 4:22 PM
>To: Deng, Emily <Emily.Deng at amd.com>; Das, Nirmoy
><Nirmoy.Das at amd.com>; amd-gfx at lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix repeated FLR issue
>
>
>On 8/18/20 4:48 AM, Deng, Emily wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: Das, Nirmoy <Nirmoy.Das at amd.com>
>>> Sent: Wednesday, August 12, 2020 8:18 PM
>>> To: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix repeated FLR issue
>>>
>>>
>>> On 8/12/20 11:19 AM, Emily.Deng wrote:
>>>> From: jqdeng <Emily.Deng at amd.com>
>>>>
>>>> Only the case with no job running needs to do the recovery in the
>>>> FLR notification.
>>>> If there is a job in the mirror list, let the guest driver hit the
>>>> job timeout and do the recovery from there.
>>>>
>>>> Signed-off-by: jqdeng <Emily.Deng at amd.com>
>>>> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++--
>>>>    2 files changed, 39 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> index fe31cbeccfe9..12fe5164aaf3 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> @@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>    struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>>>>    struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>>>>    int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>> +   int i;
>>>> +   bool need_do_recover = true;
>>>
>>> We should find a better name for "need_do_recover", maybe
>>> "need_to_recover"?
>> Thanks, will modify later.
>>>
>>>> +   struct drm_sched_job *job;
>>>>
>>>>    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>     * otherwise the mailbox msg will be ruined/reseted by
>>>> @@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>  flr_done:
>>>>    up_read(&adev->reset_sem);
>>>> +   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +       struct amdgpu_ring *ring = adev->rings[i];
>>>> +
>>>> +       if (!ring || !ring->sched.thread)
>>>> +           continue;
>>>> +
>>>> +       spin_lock(&ring->sched.job_list_lock);
>>>> +       job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
>>>> +                                      struct drm_sched_job, node);
>>>> +       spin_unlock(&ring->sched.job_list_lock);
>>>> +       if (job) {
>>>> +           need_do_recover = false;
>>>> +           break;
>>>> +       }
>>>> +   }
>>>
>>> This 1st job retrieval logic can move to a function, as there are two
>>> instances of it.
>> Sorry, I didn't get your point.
>
>
>xgpu_ai_mailbox_flr_work() and xgpu_nv_mailbox_flr_work() use the same
>logic under the "flr_done:" label, trying to retrieve the 1st job entry to
>determine whether we should do the recovery or not.
>
>We could move that logic into a function like:
>
>
>bool function_name(struct amdgpu_device *adev)
>{
>        struct drm_sched_job *job;
>        int i;
>
>        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                struct amdgpu_ring *ring = adev->rings[i];
>
>                if (!ring || !ring->sched.thread)
>                        continue;
>
>                spin_lock(&ring->sched.job_list_lock);
>                job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
>                                               struct drm_sched_job, node);
>                spin_unlock(&ring->sched.job_list_lock);
>                if (job)
>                        return true;
>        }
>
>        return false;
>}
>
>and use that in xgpu_ai_mailbox_flr_work() and
>xgpu_nv_mailbox_flr_work() instead of
>having two copies of that logic.
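>
>For illustration only (a rough sketch; "function_name" above is still a
>placeholder), the AI call site could then look something like:
>
>        /* Hypothetical call site in xgpu_ai_mailbox_flr_work(): skip the
>         * recovery here when the helper found a pending job, and let the
>         * guest TDR path handle it instead.
>         */
>        if (amdgpu_device_should_recover_gpu(adev)
>            && (!function_name(adev)
>                || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
>                amdgpu_device_gpu_recover(adev, NULL);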
Understood completely now. Thanks.
>
>
>
>Nirmoy
>
>>>
>>>>    /* Trigger recovery for world switch failure if no TDR */
>>>>    if (amdgpu_device_should_recover_gpu(adev)
>>>> -      && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
>>>> +      && (need_do_recover || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
>>>>        amdgpu_device_gpu_recover(adev, NULL);
>>>>  }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> index 6f55172e8337..fc92c494df0b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> @@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>    struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>>>>    struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>>>>    int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>> +   int i;
>>>> +   bool need_do_recover = true;
>>>> +   struct drm_sched_job *job;
>>>>
>>>>    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>     * otherwise the mailbox msg will be ruined/reseted by
>>>> @@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>  flr_done:
>>>>    up_read(&adev->reset_sem);
>>>> +   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +       struct amdgpu_ring *ring = adev->rings[i];
>>>> +
>>>> +       if (!ring || !ring->sched.thread)
>>>> +           continue;
>>>> +
>>>> +       spin_lock(&ring->sched.job_list_lock);
>>>> +       job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
>>>> +                                      struct drm_sched_job, node);
>>>> +       spin_unlock(&ring->sched.job_list_lock);
>>>> +       if (job) {
>>>> +           need_do_recover = false;
>>>> +           break;
>>>> +       }
>>>> +   }
>>>>
>>>>    /* Trigger recovery for world switch failure if no TDR */
>>>> -  if (amdgpu_device_should_recover_gpu(adev)
>>>> -      && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>>>> +  if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover ||
>>>> +      adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>>>>        adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
>>>>        adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
>>>>        adev->video_timeout == MAX_SCHEDULE_TIMEOUT))

