[PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.

Alex Deucher alexdeucher at gmail.com
Mon Feb 8 15:42:36 UTC 2016


On Mon, Feb 8, 2016 at 4:38 AM, Christian König <deathsimple at vodafone.de> wrote:
> Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
>>
>> When the radeon driver resets a gpu, it attempts to test whether all the
>> rings can successfully handle an IB.  If these rings fail to respond, the
>> process will wait forever.  Another gpu reset can't happen at this point,
>> as the current reset holds a lock required to do so.  Instead, make all
>> the IB tests run with a timeout, so the system can attempt to recover
>> in this case.
>>
>> While this doesn't fix the underlying issue with card resets failing, it
>> gives the system a higher chance of recovering.  These timeouts have been
>> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>>
>> This also adds a new function, radeon_fence_wait_timeout, that behaves
>> like
>> fence_wait_timeout.  It is used instead of fence_wait_timeout as it
>> continues
>> to work during a reset.  radeon_fence_wait is changed to be implemented
>> using this function.
>>
>> V2:
>>   - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout
>> was
>> too long.  A timeout of 100ms was tested and found to be too short.
>>   - Changed radeon_fence_wait_timeout to behave more like
>> fence_wait_timeout.
>>
>> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>
>
>
> Reviewed-by: Christian König <christian.koenig at amd.com>


Applied.  Thanks!

Alex

>
> Regards,
> Christian.
>
>
>> ---
>>   drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
>>   drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
>>   drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
>>   drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
>>   drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
>>   drivers/gpu/drm/radeon/radeon.h       |  2 ++
>>   drivers/gpu/drm/radeon/radeon_fence.c | 40
>> ++++++++++++++++++++++++++++-------
>>   drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
>>   drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
>>   9 files changed, 89 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>> index 4c30d8c..0600140 100644
>> --- a/drivers/gpu/drm/radeon/cik.c
>> +++ b/drivers/gpu/drm/radeon/cik.c
>> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct
>> radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 radeon_scratch_free(rdev, scratch);
>>                 radeon_ib_free(rdev, &ib);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               radeon_scratch_free(rdev, scratch);
>> +               radeon_ib_free(rdev, &ib);
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c
>> b/drivers/gpu/drm/radeon/cik_sdma.c
>> index d16f2ee..9c351dc 100644
>> --- a/drivers/gpu/drm/radeon/cik_sdma.c
>> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
>> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
>> index 5eae0a8..6e478a2 100644
>> --- a/drivers/gpu/drm/radeon/r100.c
>> +++ b/drivers/gpu/drm/radeon/r100.c
>> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 goto free_ib;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto free_ib;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto free_ib;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF) {
>> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
>> index cc2fdf0..ed12104 100644
>> --- a/drivers/gpu/drm/radeon/r600.c
>> +++ b/drivers/gpu/drm/radeon/r600.c
>> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 goto free_ib;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto free_ib;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto free_ib;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r600_dma.c
>> b/drivers/gpu/drm/radeon/r600_dma.c
>> index d2dd29a..fb65e6f 100644
>> --- a/drivers/gpu/drm/radeon/r600_dma.c
>> +++ b/drivers/gpu/drm/radeon/r600_dma.c
>> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 78a51b3..007be29 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -120,6 +120,7 @@ extern int radeon_mst;
>>    */
>>   #define RADEON_MAX_USEC_TIMEOUT                       100000  /* 100 ms
>> */
>>   #define RADEON_FENCE_JIFFIES_TIMEOUT          (HZ / 2)
>> +#define RADEON_USEC_IB_TEST_TIMEOUT            1000000 /* 1s */
>>   /* RADEON_IB_POOL_SIZE must be a power of 2 */
>>   #define RADEON_IB_POOL_SIZE                   16
>>   #define RADEON_DEBUGFS_MAX_COMPONENTS         32
>> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct
>> radeon_device *rdev, int ring);
>>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence
>> **fence, int ring);
>>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>>   bool radeon_fence_signaled(struct radeon_fence *fence);
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool
>> interruptible, long timeout);
>>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
>> b/drivers/gpu/drm/radeon/radeon_fence.c
>> index 05815c4..7ef075a 100644
>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>>   }
>>     /**
>> - * radeon_fence_wait - wait for a fence to signal
>> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>>    *
>>    * @fence: radeon fence object
>>    * @intr: use interruptible sleep
>> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>>    * Wait for the requested fence to signal (all asics).
>>    * @intr selects whether to use interruptable (true) or
>> non-interruptable
>>    * (false) sleep when waiting for the fence.
>> - * Returns 0 if the fence has passed, error for all other cases.
>> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite
>> wait
>> + * Returns remaining time if the sequence number has passed, 0 when
>> + * the wait timeout, or an error for all other cases.
>>    */
>> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr,
>> long timeout)
>>   {
>>         uint64_t seq[RADEON_NUM_RINGS] = {};
>>         long r;
>> +       int r_sig;
>>         /*
>>          * This function should not be called on !radeon fences.
>> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence,
>> bool intr)
>>                 return fence_wait(&fence->base, intr);
>>         seq[fence->ring] = fence->seq;
>> -       r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> -       if (r < 0) {
>> +       r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> timeout);
>> +       if (r <= 0) {
>>                 return r;
>>         }
>>   -     r = fence_signal(&fence->base);
>> -       if (!r)
>> +       r_sig = fence_signal(&fence->base);
>> +       if (!r_sig)
>>                 FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
>> -       return 0;
>> +       return r;
>> +}
>> +
>> +/**
>> + * radeon_fence_wait - wait for a fence to signal
>> + *
>> + * @fence: radeon fence object
>> + * @intr: use interruptible sleep
>> + *
>> + * Wait for the requested fence to signal (all asics).
>> + * @intr selects whether to use interruptable (true) or non-interruptable
>> + * (false) sleep when waiting for the fence.
>> + * Returns 0 if the fence has passed, error for all other cases.
>> + */
>> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +{
>> +       long r = radeon_fence_wait_timeout(fence, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> +       if (r > 0) {
>> +               return 0;
>> +       } else {
>> +               return r;
>> +       }
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c
>> b/drivers/gpu/drm/radeon/radeon_vce.c
>> index 7eb1ae7..566a1a0 100644
>> --- a/drivers/gpu/drm/radeon/radeon_vce.c
>> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
>> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 goto error;
>>         }
>>   -     r = radeon_fence_wait(fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>>         } else {
>> -               DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> +               DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> +               r = 0;
>>         }
>>   error:
>>         radeon_fence_unref(&fence);
>> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> index c6b1cbc..12ddcfa 100644
>> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 goto error;
>>         }
>>   -     r = radeon_fence_wait(fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto error;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto error;
>>         }
>> +       r = 0;
>>         DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
>>   error:
>>         radeon_fence_unref(&fence);
>
>


More information about the dri-devel mailing list