[PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.

Christian König deathsimple at vodafone.de
Sun Jan 24 01:49:00 PST 2016


Am 24.01.2016 um 07:18 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB.  If these rings fail to respond, the
> process will wait forever.  Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so.  Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering.  These timeouts have been
> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>
> I haven't been able to test this on other cards, but everything compiles.
> I wasn't sure what to use for a timeout value, so for now I've used
> radeon_lockup_timeout.  When that is 0, the timeout functionality in this
> patch is also disabled.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> radeon_fence_wait except allowing a different timeout value to be passed to
> radeon_fence_wait_seq_timeout.
>
> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>

Really good idea, but please don't use radeon_lockup_timeout for this 
cause the 10 seconds default of that is way to long. Instead just use a 
fixed, let's say 100ms timeout defined in radeon.h.

Also don't define our own radeon_fence_wait_timeout, just use 
fence_wait_timeout(&fence->base, RADEON_IB_TEST_TIMEOUT) for this.

Apart from that the idea looks really good to me.

Regards,
Christian.

> ---
>   drivers/gpu/drm/radeon/cik_sdma.c     |  3 ++-
>   drivers/gpu/drm/radeon/r100.c         |  3 ++-
>   drivers/gpu/drm/radeon/r600.c         |  3 ++-
>   drivers/gpu/drm/radeon/r600_dma.c     |  3 ++-
>   drivers/gpu/drm/radeon/radeon.h       |  1 +
>   drivers/gpu/drm/radeon/radeon_fence.c | 25 ++++++++++++++++++++++---
>   drivers/gpu/drm/radeon/radeon_vce.c   |  3 ++-
>   drivers/gpu/drm/radeon/uvd_v1_0.c     |  3 ++-
>   8 files changed, 35 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..014f5ca 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,7 +737,8 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..5386c09 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,7 +3732,8 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..8fbe5d7 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,7 +3381,8 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..3ff614d 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,7 +368,8 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5ae6db9..0b698b6 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -382,6 +382,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..9fec805 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>   }
>   
>   /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>    *
>    * @fence: radeon fence object
>    * @intr: use interruptible sleep
> @@ -535,9 +535,10 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>    * Wait for the requested fence to signal (all asics).
>    * @intr selects whether to use interruptable (true) or non-interruptable
>    * (false) sleep when waiting for the fence.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
>    * Returns 0 if the fence has passed, error for all other cases.
>    */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
>   {
>   	uint64_t seq[RADEON_NUM_RINGS] = {};
>   	long r;
> @@ -552,9 +553,11 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   		return fence_wait(&fence->base, intr);
>   
>   	seq[fence->ring] = fence->seq;
> -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
> +	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
>   	if (r < 0) {
>   		return r;
> +	} else if (r == 0) {
> +		return -ETIMEDOUT;
>   	}
>   
>   	r = fence_signal(&fence->base);
> @@ -564,6 +567,22 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   }
>   
>   /**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> +	return radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> +}
> +
> +/**
>    * radeon_fence_wait_any - wait for a fence to signal on any ring
>    *
>    * @rdev: radeon device pointer
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..7d80dad 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,7 +810,8 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   	} else {
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..35caa89 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,7 +522,8 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto error;



More information about the dri-devel mailing list