[PATCH] drm/amdgpu: give more chance for tlb flush if failed
Christian König
ckoenig.leichtzumerken at gmail.com
Tue Mar 20 10:23:53 UTC 2018
Am 20.03.2018 um 07:29 schrieb Emily Deng:
> under SR-IOV sometimes CPU based tlb flush would timeout
> within the given 100ms period, instead let it fail and
> continue we can give it more chance to repeat the
> tlb flush on the failed VMHUB
>
> this could fix the massive "Timeout waiting for VM flush ACK"
> error during vk_encoder test.
Well that one is a big NAK since it once more just hides the real
problem that we sometimes drop register writes.
What we did during debugging to avoid the problem is the following:
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc45c4c1..3536d50375fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev,
> u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>
> WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> + while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) != tmp) {
> + DRM_ERROR("Need one more try to write the
> VMHUB flush request!");
> + WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> + }
>
> /* Busy wait for ACK.*/
> for (j = 0; j < 100; j++) {
But that can only be a temporary workaround as well.
The question is rather can you reliable reproduce this issue with the
vk_encoder test?
Thanks,
Christian.
>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++-----
> 1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc4..517712b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
> {
> /* Use register 17 for GART */
> const unsigned eng = 17;
> - unsigned i, j;
> + unsigned i, j, loop = 0;
> + unsigned flush_done = 0;
> +
> +retry:
>
> spin_lock(&adev->gmc.invalidate_lock);
>
> for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
> struct amdgpu_vmhub *hub = &adev->vmhub[i];
> u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> + if (flush_done & (1 << i)) /* this vmhub flushed */
> + continue;
>
> WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>
> @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
> break;
> cpu_relax();
> }
> - if (j < 100)
> + if (j < 100) {
> + flush_done |= (1 << i);
> continue;
> + }
>
> /* Wait for ACK with a delay.*/
> for (j = 0; j < adev->usec_timeout; j++) {
> @@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
> break;
> udelay(1);
> }
> - if (j < adev->usec_timeout)
> + if (j < adev->usec_timeout) {
> + flush_done |= (1 << i);
> continue;
> -
> - DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> + }
> }
>
> spin_unlock(&adev->gmc.invalidate_lock);
> + if (flush_done != 3) {
> + if (loop++ < 3)
> + goto retry;
> + else
> + DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> + }
> }
>
> +
> static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> unsigned vmid, uint64_t pd_addr)
> {
More information about the amd-gfx
mailing list