[PATCH 1/2] drm/amdgpu: Fix repeated FLR issue
Nirmoy
nirmodas at amd.com
Tue Aug 18 13:36:33 UTC 2020
Acked-by: Nirmoy Das <nirmoy.das at amd.com>
On 8/18/20 3:29 PM, Emily.Deng wrote:
> From: jqdeng <Emily.Deng at amd.com>
>
> Only the case with no job running needs to do recovery in
> the FLR notification.
> When there is a job in the mirror list, let the guest driver
> hit the job timeout and then do the recovery.
>
> Signed-off-by: jqdeng <Emily.Deng at amd.com>
> Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 28 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++--
> 4 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f9d97f61aa5..69115781be05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1136,6 +1136,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
> #define amdgpu_inc_vram_lost(adev) atomic_inc(&((adev)->vram_lost_counter));
>
> /* Common functions */
> +bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
> bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
> int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> struct amdgpu_job* job);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index fe8878761c29..de4bce6d7516 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3925,6 +3925,34 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
> return r;
> }
>
> +/**
> + * amdgpu_device_has_job_running - check if there is any job in mirror list
> + *
> + * @adev: amdgpu device pointer
> + *
> + * check if there is any job in mirror list
> + */
> +bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
> +{
> + int i;
> + struct drm_sched_job *job;
> +
> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> + struct amdgpu_ring *ring = adev->rings[i];
> +
> + if (!ring || !ring->sched.thread)
> + continue;
> +
> + spin_lock(&ring->sched.job_list_lock);
> + job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> + struct drm_sched_job, node);
> + spin_unlock(&ring->sched.job_list_lock);
> + if (job)
> + return true;
> + }
> + return false;
> +}
> +
> /**
> * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
> *
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index fe31cbeccfe9..bd4e7c2d0dd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -261,7 +261,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
> - && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
> + && (amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
> amdgpu_device_gpu_recover(adev, NULL);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 6f55172e8337..d5c14745a9d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -281,8 +281,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> up_read(&adev->reset_sem);
>
> /* Trigger recovery for world switch failure if no TDR */
> - if (amdgpu_device_should_recover_gpu(adev)
> - && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> + if (amdgpu_device_should_recover_gpu(adev) && (amdgpu_device_has_job_running(adev) ||
> + adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
More information about the amd-gfx
mailing list