[PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.

Christian König deathsimple at vodafone.de
Mon Feb 8 09:38:35 UTC 2016


Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB.  If these rings fail to respond, the
> process will wait forever.  Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so.  Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering.  These timeouts have been
> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> fence_wait_timeout.  It is used instead of fence_wait_timeout as it continues
> to work during a reset.  radeon_fence_wait is changed to be implemented
> using this function.
>
> V2:
>   - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
> too long.  A timeout of 100ms was tested and found to be too short.
>   - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.
>
> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>

Reviewed-by: Christian König <christian.koenig at amd.com>

Regards,
Christian.

> ---
>   drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
>   drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/radeon.h       |  2 ++
>   drivers/gpu/drm/radeon/radeon_fence.c | 40 ++++++++++++++++++++++++++++-------
>   drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
>   drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
>   9 files changed, 89 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index 4c30d8c..0600140 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		radeon_scratch_free(rdev, scratch);
>   		radeon_ib_free(rdev, &ib);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		radeon_scratch_free(rdev, scratch);
> +		radeon_ib_free(rdev, &ib);
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..9c351dc 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..6e478a2 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto free_ib;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF) {
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..ed12104 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto free_ib;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..fb65e6f 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 78a51b3..007be29 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -120,6 +120,7 @@ extern int radeon_mst;
>    */
>   #define RADEON_MAX_USEC_TIMEOUT			100000	/* 100 ms */
>   #define RADEON_FENCE_JIFFIES_TIMEOUT		(HZ / 2)
> +#define RADEON_USEC_IB_TEST_TIMEOUT		1000000 /* 1s */
>   /* RADEON_IB_POOL_SIZE must be a power of 2 */
>   #define RADEON_IB_POOL_SIZE			16
>   #define RADEON_DEBUGFS_MAX_COMPONENTS		32
> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..7ef075a 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>   }
>   
>   /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>    *
>    * @fence: radeon fence object
>    * @intr: use interruptible sleep
> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>    * Wait for the requested fence to signal (all asics).
>    * @intr selects whether to use interruptable (true) or non-interruptable
>    * (false) sleep when waiting for the fence.
> - * Returns 0 if the fence has passed, error for all other cases.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
> + * Returns remaining time if the sequence number has passed, 0 when
> + * the wait timeout, or an error for all other cases.
>    */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
>   {
>   	uint64_t seq[RADEON_NUM_RINGS] = {};
>   	long r;
> +	int r_sig;
>   
>   	/*
>   	 * This function should not be called on !radeon fences.
> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   		return fence_wait(&fence->base, intr);
>   
>   	seq[fence->ring] = fence->seq;
> -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
> -	if (r < 0) {
> +	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
> +	if (r <= 0) {
>   		return r;
>   	}
>   
> -	r = fence_signal(&fence->base);
> -	if (!r)
> +	r_sig = fence_signal(&fence->base);
> +	if (!r_sig)
>   		FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
> -	return 0;
> +	return r;
> +}
> +
> +/**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> +	long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> +	if (r > 0) {
> +		return 0;
> +	} else {
> +		return r;
> +	}
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..566a1a0 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
>   	} else {
> -	        DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +		DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +		r = 0;
>   	}
>   error:
>   	radeon_fence_unref(&fence);
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..12ddcfa 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto error;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto error;
>   	}
> +	r = 0;
>   	DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
>   error:
>   	radeon_fence_unref(&fence);



More information about the dri-devel mailing list