[PATCH] drm/amdgpu: Fix repeated FLR issue
Nirmoy
nirmodas at amd.com
Wed Aug 12 12:18:14 UTC 2020
On 8/12/20 11:19 AM, Emily.Deng wrote:
> From: jqdeng <Emily.Deng at amd.com>
>
> Only the case where no job is running needs to do a recover in the
> FLR notification.
> If there is a job in the mirror list, let the guest driver hit the
> job timeout instead, and then do the recover.
>
> Signed-off-by: jqdeng <Emily.Deng at amd.com>
> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
> ---
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++--
> 2 files changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index fe31cbeccfe9..12fe5164aaf3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
> struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
> + int i;
> + bool need_do_recover = true;
We should find a better name for "need_do_recover"; maybe
"need_to_recover"?
> + struct drm_sched_job *job;
>
> /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> * otherwise the mailbox msg will be ruined/reseted by
> @@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>
> flr_done:
> up_read(&adev->reset_sem);
> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> + struct amdgpu_ring *ring = adev->rings[i];
> +
> + if (!ring || !ring->sched.thread)
> + continue;
> +
> + spin_lock(&ring->sched.job_list_lock);
> + job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> + struct drm_sched_job, node);
> + spin_unlock(&ring->sched.job_list_lock);
> + if (job) {
> + need_do_recover = false;
> + break;
> + }
> + }
This first-job retrieval logic could be moved into a helper function,
as there are two instances of it.
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
> - && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
> + && (need_do_recover || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
> amdgpu_device_gpu_recover(adev, NULL);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 6f55172e8337..fc92c494df0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
> struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
> + int i;
> + bool need_do_recover = true;
> + struct drm_sched_job *job;
>
> /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> * otherwise the mailbox msg will be ruined/reseted by
> @@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>
> flr_done:
> up_read(&adev->reset_sem);
> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> + struct amdgpu_ring *ring = adev->rings[i];
> +
> + if (!ring || !ring->sched.thread)
> + continue;
> +
> + spin_lock(&ring->sched.job_list_lock);
> + job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> + struct drm_sched_job, node);
> + spin_unlock(&ring->sched.job_list_lock);
> + if (job) {
> + need_do_recover = false;
> + break;
> + }
> + }
>
> /* Trigger recovery for world switch failure if no TDR */
> - if (amdgpu_device_should_recover_gpu(adev)
> - && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> + if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover ||
> + adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
More information about the amd-gfx
mailing list