[PATCH] drm/amdgpu: Fix repeatly flr issue

Wed Aug 12 12:18:14 UTC 2020

On 8/12/20 11:19 AM, Emily.Deng wrote:
> From: jqdeng <Emily.Deng at amd.com>
>
> Only for no job running test case need to do recover in
> flr notification.
> For having job in mirror list, then let guest driver to
> hit job timeout, and then do recover.
>
> Signed-off-by: jqdeng <Emily.Deng at amd.com>
> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
> ---
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++--
>   2 files changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index fe31cbeccfe9..12fe5164aaf3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
> +	int i;
> +	bool need_do_recover = true;

We should find a better name for "need_do_recover", may be 
"need_to_recover" ?

> +	struct drm_sched_job *job;
>   
>   	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>   	 * otherwise the mailbox msg will be ruined/reseted by
> @@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   
>   flr_done:
>   	up_read(&adev->reset_sem);
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		spin_lock(&ring->sched.job_list_lock);
> +		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> +		struct drm_sched_job, node);
> +		spin_unlock(&ring->sched.job_list_lock);
> +		if (job) {
> +			need_do_recover = false;
> +			break;
> +		}
> +	}

This 1st job retrieval logic can move to a function as there are two 
instance of it.

>   
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
> -		&& adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
> +		&& (need_do_recover || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
>   		amdgpu_device_gpu_recover(adev, NULL);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 6f55172e8337..fc92c494df0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
> +	int i;
> +	bool need_do_recover = true;
> +	struct drm_sched_job *job;
>   
>   	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>   	 * otherwise the mailbox msg will be ruined/reseted by
> @@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   
>   flr_done:
>   	up_read(&adev->reset_sem);
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		spin_lock(&ring->sched.job_list_lock);
> +		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> +					struct drm_sched_job, node);
> +		spin_unlock(&ring->sched.job_list_lock);
> +		if (job) {
> +			need_do_recover = false;
> +			break;
> +		}
> +	}
>   
>   	/* Trigger recovery for world switch failure if no TDR */
> -	if (amdgpu_device_should_recover_gpu(adev)
> -		&& (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> +	if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover ||
> +		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>   		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
>   		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
>   		adev->video_timeout == MAX_SCHEDULE_TIMEOUT))