[PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
Alex Deucher
alexdeucher at gmail.com
Mon Feb 8 15:42:36 UTC 2016
On Mon, Feb 8, 2016 at 4:38 AM, Christian König <deathsimple at vodafone.de> wrote:
> Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
>>
>> When the radeon driver resets a gpu, it attempts to test whether all the
>> rings can successfully handle an IB. If these rings fail to respond, the
>> process will wait forever. Another gpu reset can't happen at this point,
>> as the current reset holds a lock required to do so. Instead, make all
>> the IB tests run with a timeout, so the system can attempt to recover
>> in this case.
>>
>> While this doesn't fix the underlying issue with card resets failing, it
>> gives the system a higher chance of recovering. These timeouts have been
>> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>>
>> This also adds a new function, radeon_fence_wait_timeout, that behaves
>> like
>> fence_wait_timeout. It is used instead of fence_wait_timeout as it
>> continues
>> to work during a reset. radeon_fence_wait is changed to be implemented
>> using this function.
>>
>> V2:
>> - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout
>> was
>> too long. A timeout of 100ms was tested and found to be too short.
>> - Changed radeon_fence_wait_timeout to behave more like
>> fence_wait_timeout.
>>
>> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>
>
>
> Reviewed-by: Christian König <christian.koenig at amd.com>
Applied. Thanks!
Alex
>
> Regards,
> Christian.
>
>
>> ---
>> drivers/gpu/drm/radeon/cik.c | 11 ++++++++--
>> drivers/gpu/drm/radeon/cik_sdma.c | 9 ++++++--
>> drivers/gpu/drm/radeon/r100.c | 10 +++++++--
>> drivers/gpu/drm/radeon/r600.c | 10 +++++++--
>> drivers/gpu/drm/radeon/r600_dma.c | 9 ++++++--
>> drivers/gpu/drm/radeon/radeon.h | 2 ++
>> drivers/gpu/drm/radeon/radeon_fence.c | 40
>> ++++++++++++++++++++++++++++-------
>> drivers/gpu/drm/radeon/radeon_vce.c | 11 +++++++---
>> drivers/gpu/drm/radeon/uvd_v1_0.c | 10 +++++++--
>> 9 files changed, 89 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>> index 4c30d8c..0600140 100644
>> --- a/drivers/gpu/drm/radeon/cik.c
>> +++ b/drivers/gpu/drm/radeon/cik.c
>> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct
>> radeon_ring *ring)
>> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>> return r;
>> }
>> - r = radeon_fence_wait(ib.fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> radeon_scratch_free(rdev, scratch);
>> radeon_ib_free(rdev, &ib);
>> return r;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + radeon_scratch_free(rdev, scratch);
>> + radeon_ib_free(rdev, &ib);
>> + return -ETIMEDOUT;
>> }
>> + r = 0;
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> tmp = RREG32(scratch);
>> if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c
>> b/drivers/gpu/drm/radeon/cik_sdma.c
>> index d16f2ee..9c351dc 100644
>> --- a/drivers/gpu/drm/radeon/cik_sdma.c
>> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
>> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>> return r;
>> }
>> - r = radeon_fence_wait(ib.fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> return r;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + return -ETIMEDOUT;
>> }
>> + r = 0;
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>> if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
>> index 5eae0a8..6e478a2 100644
>> --- a/drivers/gpu/drm/radeon/r100.c
>> +++ b/drivers/gpu/drm/radeon/r100.c
>> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>> goto free_ib;
>> }
>> - r = radeon_fence_wait(ib.fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> goto free_ib;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + r = -ETIMEDOUT;
>> + goto free_ib;
>> }
>> + r = 0;
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> tmp = RREG32(scratch);
>> if (tmp == 0xDEADBEEF) {
>> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
>> index cc2fdf0..ed12104 100644
>> --- a/drivers/gpu/drm/radeon/r600.c
>> +++ b/drivers/gpu/drm/radeon/r600.c
>> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>> goto free_ib;
>> }
>> - r = radeon_fence_wait(ib.fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> goto free_ib;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + r = -ETIMEDOUT;
>> + goto free_ib;
>> }
>> + r = 0;
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> tmp = RREG32(scratch);
>> if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r600_dma.c
>> b/drivers/gpu/drm/radeon/r600_dma.c
>> index d2dd29a..fb65e6f 100644
>> --- a/drivers/gpu/drm/radeon/r600_dma.c
>> +++ b/drivers/gpu/drm/radeon/r600_dma.c
>> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>> return r;
>> }
>> - r = radeon_fence_wait(ib.fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> return r;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + return -ETIMEDOUT;
>> }
>> + r = 0;
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>> if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 78a51b3..007be29 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -120,6 +120,7 @@ extern int radeon_mst;
>> */
>> #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms
>> */
>> #define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2)
>> +#define RADEON_USEC_IB_TEST_TIMEOUT 1000000 /* 1s */
>> /* RADEON_IB_POOL_SIZE must be a power of 2 */
>> #define RADEON_IB_POOL_SIZE 16
>> #define RADEON_DEBUGFS_MAX_COMPONENTS 32
>> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct
>> radeon_device *rdev, int ring);
>> int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence
>> **fence, int ring);
>> void radeon_fence_process(struct radeon_device *rdev, int ring);
>> bool radeon_fence_signaled(struct radeon_fence *fence);
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool
>> interruptible, long timeout);
>> int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>> int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>> int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
>> b/drivers/gpu/drm/radeon/radeon_fence.c
>> index 05815c4..7ef075a 100644
>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>> }
>> /**
>> - * radeon_fence_wait - wait for a fence to signal
>> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>> *
>> * @fence: radeon fence object
>> * @intr: use interruptible sleep
>> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>> * Wait for the requested fence to signal (all asics).
>> * @intr selects whether to use interruptable (true) or
>> non-interruptable
>> * (false) sleep when waiting for the fence.
>> - * Returns 0 if the fence has passed, error for all other cases.
>> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite
>> wait
>> + * Returns remaining time if the sequence number has passed, 0 when
>> + * the wait timeout, or an error for all other cases.
>> */
>> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr,
>> long timeout)
>> {
>> uint64_t seq[RADEON_NUM_RINGS] = {};
>> long r;
>> + int r_sig;
>> /*
>> * This function should not be called on !radeon fences.
>> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence,
>> bool intr)
>> return fence_wait(&fence->base, intr);
>> seq[fence->ring] = fence->seq;
>> - r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> - if (r < 0) {
>> + r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> timeout);
>> + if (r <= 0) {
>> return r;
>> }
>> - r = fence_signal(&fence->base);
>> - if (!r)
>> + r_sig = fence_signal(&fence->base);
>> + if (!r_sig)
>> FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
>> - return 0;
>> + return r;
>> +}
>> +
>> +/**
>> + * radeon_fence_wait - wait for a fence to signal
>> + *
>> + * @fence: radeon fence object
>> + * @intr: use interruptible sleep
>> + *
>> + * Wait for the requested fence to signal (all asics).
>> + * @intr selects whether to use interruptable (true) or non-interruptable
>> + * (false) sleep when waiting for the fence.
>> + * Returns 0 if the fence has passed, error for all other cases.
>> + */
>> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +{
>> + long r = radeon_fence_wait_timeout(fence, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> + if (r > 0) {
>> + return 0;
>> + } else {
>> + return r;
>> + }
>> }
>> /**
>> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c
>> b/drivers/gpu/drm/radeon/radeon_vce.c
>> index 7eb1ae7..566a1a0 100644
>> --- a/drivers/gpu/drm/radeon/radeon_vce.c
>> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
>> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> goto error;
>> }
>> - r = radeon_fence_wait(fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + r = -ETIMEDOUT;
>> } else {
>> - DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> + DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> + r = 0;
>> }
>> error:
>> radeon_fence_unref(&fence);
>> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> index c6b1cbc..12ddcfa 100644
>> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>> goto error;
>> }
>> - r = radeon_fence_wait(fence, false);
>> - if (r) {
>> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> + RADEON_USEC_IB_TEST_TIMEOUT));
>> + if (r < 0) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> goto error;
>> + } else if (r == 0) {
>> + DRM_ERROR("radeon: fence wait timed out.\n");
>> + r = -ETIMEDOUT;
>> + goto error;
>> }
>> + r = 0;
>> DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> error:
>> radeon_fence_unref(&fence);
>
>
More information about the dri-devel
mailing list