[PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
Christian König
deathsimple at vodafone.de
Mon Feb 8 09:38:35 UTC 2016
Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB. If these rings fail to respond, the
> process will wait forever. Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so. Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering. These timeouts have been
> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> fence_wait_timeout. It is used instead of fence_wait_timeout as it continues
> to work during a reset. radeon_fence_wait is changed to be implemented
> using this function.
>
> V2:
> - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
> too long. A timeout of 100ms was tested and found to be too short.
> - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.
>
> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>
Reviewed-by: Christian König <christian.koenig at amd.com>
Regards,
Christian.
> ---
> drivers/gpu/drm/radeon/cik.c | 11 ++++++++--
> drivers/gpu/drm/radeon/cik_sdma.c | 9 ++++++--
> drivers/gpu/drm/radeon/r100.c | 10 +++++++--
> drivers/gpu/drm/radeon/r600.c | 10 +++++++--
> drivers/gpu/drm/radeon/r600_dma.c | 9 ++++++--
> drivers/gpu/drm/radeon/radeon.h | 2 ++
> drivers/gpu/drm/radeon/radeon_fence.c | 40 ++++++++++++++++++++++++++++-------
> drivers/gpu/drm/radeon/radeon_vce.c | 11 +++++++---
> drivers/gpu/drm/radeon/uvd_v1_0.c | 10 +++++++--
> 9 files changed, 89 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index 4c30d8c..0600140 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> return r;
> }
> - r = radeon_fence_wait(ib.fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> radeon_scratch_free(rdev, scratch);
> radeon_ib_free(rdev, &ib);
> return r;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + radeon_scratch_free(rdev, scratch);
> + radeon_ib_free(rdev, &ib);
> + return -ETIMEDOUT;
> }
> + r = 0;
> for (i = 0; i < rdev->usec_timeout; i++) {
> tmp = RREG32(scratch);
> if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..9c351dc 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> return r;
> }
> - r = radeon_fence_wait(ib.fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> return r;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + return -ETIMEDOUT;
> }
> + r = 0;
> for (i = 0; i < rdev->usec_timeout; i++) {
> tmp = le32_to_cpu(rdev->wb.wb[index/4]);
> if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..6e478a2 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> goto free_ib;
> }
> - r = radeon_fence_wait(ib.fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> goto free_ib;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + r = -ETIMEDOUT;
> + goto free_ib;
> }
> + r = 0;
> for (i = 0; i < rdev->usec_timeout; i++) {
> tmp = RREG32(scratch);
> if (tmp == 0xDEADBEEF) {
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..ed12104 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> goto free_ib;
> }
> - r = radeon_fence_wait(ib.fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> goto free_ib;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + r = -ETIMEDOUT;
> + goto free_ib;
> }
> + r = 0;
> for (i = 0; i < rdev->usec_timeout; i++) {
> tmp = RREG32(scratch);
> if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..fb65e6f 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> return r;
> }
> - r = radeon_fence_wait(ib.fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> return r;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + return -ETIMEDOUT;
> }
> + r = 0;
> for (i = 0; i < rdev->usec_timeout; i++) {
> tmp = le32_to_cpu(rdev->wb.wb[index/4]);
> if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 78a51b3..007be29 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -120,6 +120,7 @@ extern int radeon_mst;
> */
> #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */
> #define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2)
> +#define RADEON_USEC_IB_TEST_TIMEOUT 1000000 /* 1s */
> /* RADEON_IB_POOL_SIZE must be a power of 2 */
> #define RADEON_IB_POOL_SIZE 16
> #define RADEON_DEBUGFS_MAX_COMPONENTS 32
> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
> int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
> void radeon_fence_process(struct radeon_device *rdev, int ring);
> bool radeon_fence_signaled(struct radeon_fence *fence);
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
> int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
> int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
> int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..7ef075a 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
> }
>
> /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
> *
> * @fence: radeon fence object
> * @intr: use interruptible sleep
> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
> * Wait for the requested fence to signal (all asics).
> * @intr selects whether to use interruptable (true) or non-interruptable
> * (false) sleep when waiting for the fence.
> - * Returns 0 if the fence has passed, error for all other cases.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
> + * Returns remaining time if the sequence number has passed, 0 when
> + * the wait timeout, or an error for all other cases.
> */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
> {
> uint64_t seq[RADEON_NUM_RINGS] = {};
> long r;
> + int r_sig;
>
> /*
> * This function should not be called on !radeon fences.
> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> return fence_wait(&fence->base, intr);
>
> seq[fence->ring] = fence->seq;
> - r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
> - if (r < 0) {
> + r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
> + if (r <= 0) {
> return r;
> }
>
> - r = fence_signal(&fence->base);
> - if (!r)
> + r_sig = fence_signal(&fence->base);
> + if (!r_sig)
> FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
> - return 0;
> + return r;
> +}
> +
> +/**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> + long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> + if (r > 0) {
> + return 0;
> + } else {
> + return r;
> + }
> }
>
> /**
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..566a1a0 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> goto error;
> }
>
> - r = radeon_fence_wait(fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + r = -ETIMEDOUT;
> } else {
> - DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> + DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> + r = 0;
> }
> error:
> radeon_fence_unref(&fence);
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..12ddcfa 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> goto error;
> }
>
> - r = radeon_fence_wait(fence, false);
> - if (r) {
> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> + RADEON_USEC_IB_TEST_TIMEOUT));
> + if (r < 0) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> goto error;
> + } else if (r == 0) {
> + DRM_ERROR("radeon: fence wait timed out.\n");
> + r = -ETIMEDOUT;
> + goto error;
> }
> + r = 0;
> DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> error:
> radeon_fence_unref(&fence);
More information about the dri-devel
mailing list