[RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Feb 9 07:54:41 UTC 2022
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
> No need to trigger another work queue inside the work queue.
>
> v3:
>
> Problem:
> Extra reset caused by host side FLR notification
> following guest side triggered reset.
> Fix: Prevent queuing flr_work from mailbox irq if guest
> is already executing a reset.
>
> Suggested-by: Liu Shaoyun <Shaoyun.Liu at amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
Feel free to add an Acked-by: Christian König
<christian.koenig at amd.com>, but a Reviewed-by from somebody more familiar
with the code would be better.
Regards,
Christian.
> ---
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++---
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++---
> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++---
> 3 files changed, 18 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 56da5ab82987..5869d51d8bee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> if (amdgpu_device_should_recover_gpu(adev)
> && (!amdgpu_device_has_job_running(adev) ||
> adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
> - amdgpu_device_gpu_recover(adev, NULL);
> + amdgpu_device_gpu_recover_imp(adev, NULL);
> }
>
> static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> @@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
>
> switch (event) {
> case IDH_FLR_NOTIFICATION:
> - if (amdgpu_sriov_runtime(adev))
> - schedule_work(&adev->virt.flr_work);
> + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
> + WARN_ONCE(!queue_work(adev->reset_domain.wq,
> + &adev->virt.flr_work),
> + "Failed to queue work! at %s",
> + __func__);
> break;
> case IDH_QUERY_ALIVE:
> xgpu_ai_mailbox_send_ack(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 477d0dde19c5..5728a6401d73 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
> adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
> - amdgpu_device_gpu_recover(adev, NULL);
> + amdgpu_device_gpu_recover_imp(adev, NULL);
> }
>
> static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> @@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
>
> switch (event) {
> case IDH_FLR_NOTIFICATION:
> - if (amdgpu_sriov_runtime(adev))
> - schedule_work(&adev->virt.flr_work);
> + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
> + WARN_ONCE(!queue_work(adev->reset_domain.wq,
> + &adev->virt.flr_work),
> + "Failed to queue work! at %s",
> + __func__);
> break;
> /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
> * it byfar since that polling thread will handle it,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index aef9d059ae52..02290febfcf4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>
> /* Trigger recovery due to world switch failure */
> if (amdgpu_device_should_recover_gpu(adev))
> - amdgpu_device_gpu_recover(adev, NULL);
> + amdgpu_device_gpu_recover_imp(adev, NULL);
> }
>
> static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> @@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
> r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
>
> /* only handle FLR_NOTIFY now */
> - if (!r)
> - schedule_work(&adev->virt.flr_work);
> + if (!r && !amdgpu_in_reset(adev))
> + WARN_ONCE(!queue_work(adev->reset_domain.wq,
> + &adev->virt.flr_work),
> + "Failed to queue work! at %s",
> + __func__);
> }
>
> return 0;
More information about the amd-gfx
mailing list