[PATCH] drm/amdgpu: change the fence ring wait timeout

Christian König ckoenig.leichtzumerken at gmail.com
Wed Jan 13 14:03:04 UTC 2021


Am 13.01.21 um 07:36 schrieb Roy Sun:
> This fixes a bug where, when the engine hangs, the fence ring waits indefinitely and causes a kernel crash

NAK, this blocking is intentionally unlimited because otherwise we would
cause memory corruption.

What is the actual bug you are trying to fix here?

Regards,
Christian.

>
> Signed-off-by: Roy Sun <Roy.Sun at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48 ++++++++++++++++++++---
>   1 file changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 6b0aeee61b8b..738ea65077ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -41,6 +41,8 @@
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   
> +#define AMDGPU_FENCE_TIMEOUT  msecs_to_jiffies(1000)
> +#define AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>   /*
>    * Fences
>    * Fences mark an event in the GPUs pipeline and are used
> @@ -104,6 +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
>   		*drv->cpu_addr = cpu_to_le32(seq);
>   }
>   
> +/**
> + * amdgpu_fence_wait_timeout - get the fence wait timeout
> + *
> + * @ring: ring the fence is associated with
> + *
> + * Returns the value of the fence wait timeout.
> + */
> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring)
> +{
> +	long tmo_gfx, tmo_mm, tmo;
> +	struct amdgpu_device *adev = ring->adev;
> +	tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
> +	if (amdgpu_sriov_vf(adev)) {
> +		tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
> +	}
> +	if (amdgpu_sriov_runtime(adev)) {
> +		tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
> +	} else if (adev->gmc.xgmi.hive_id) {
> +		tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
> +	}
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
> +		ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
> +		ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
> +		ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
> +		ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
> +		ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
> +		tmo = tmo_mm;
> +	else
> +		tmo = tmo_gfx;
> +	return tmo;
> +}
> +
>   /**
>    * amdgpu_fence_read - read a fence value
>    *
> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
>   		rcu_read_unlock();
>   
>   		if (old) {
> -			r = dma_fence_wait(old, false);
> +			long timeout;
> +			timeout = amdgpu_fence_wait_timeout(ring);
> +			r = dma_fence_wait_timeout(old, false, timeout);
>   			dma_fence_put(old);
>   			if (r)
> -				return r;
> +				return r < 0 ? r : 0;
>   		}
>   	}
>   
> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
>   		return 0;
>   	}
>   	rcu_read_unlock();
> -
> -	r = dma_fence_wait(fence, false);
> +	
> +	long timeout;
> +	timeout = amdgpu_fence_wait_timeout(ring);
> +	r = dma_fence_wait_timeout(fence, false, timeout);
>   	dma_fence_put(fence);
> -	return r;
> +	return r < 0 ? r : 0;
>   }
>   
>   /**



More information about the amd-gfx mailing list