[PATCH 2/2] drm/radeon: rework and fix reset detection v2
Dieter Nützel
Dieter at nuetzel-hh.de
Mon Oct 14 15:20:07 CEST 2013
On 14.10.2013 11:32, Christian König wrote:
> From: Christian König <christian.koenig at amd.com>
>
> Stop fiddling with jiffies, always wait for RADEON_FENCE_JIFFIES_TIMEOUT.
> Consolidate the two wait sequence implementations into just one function.
> Activate all waiters and remember if the reset was already done instead of
> trying to reset from only one thread.
>
> v2: clear reset flag earlier to avoid timeout in IB test
Hello Christian,
even for radeon_fence.c, Dan Carpenter had a question/patch on October 6,
"[patch] drm/radeon: tweak a timeout condition slightly",
and nobody replied.
Greetings,
Dieter
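
P.S.: For anyone skimming the archive, the waiter/lockup handshake the
patch introduces boils down to the pattern sketched below. This is a
minimal user-space pthread analogue, illustrative only and not the
driver code: the driver sleeps in wait_event_timeout() on
rdev->fence_queue for RADEON_FENCE_JIFFIES_TIMEOUT and confirms a stall
via radeon_ring_is_lockup(), whereas the sketch simply treats one full
timeout without progress on the sequence counter as a lockup.

/* build: cc -pthread sketch.c */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t fence_queue = PTHREAD_COND_INITIALIZER;
static atomic_uint_least64_t last_seq;	/* advanced by the "hardware" */
static bool needs_reset;		/* protected by lock */

/* Wait until last_seq reaches target, or until a lockup is flagged. */
static int fence_wait(uint64_t target)
{
	int r = 0;

	pthread_mutex_lock(&lock);
	while (atomic_load(&last_seq) < target && !needs_reset) {
		uint64_t before = atomic_load(&last_seq);
		struct timespec ts;

		/* always sleep a fixed slice, the analogue of
		 * RADEON_FENCE_JIFFIES_TIMEOUT */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;
		int rc = pthread_cond_timedwait(&fence_queue, &lock, &ts);

		/* a full timeout with no progress means a lockup:
		 * flag it and wake every other waiter */
		if (rc == ETIMEDOUT && atomic_load(&last_seq) == before) {
			needs_reset = true;
			pthread_cond_broadcast(&fence_queue);
		}
	}
	if (needs_reset && atomic_load(&last_seq) < target)
		r = -1;	/* stands in for -EDEADLK */
	pthread_mutex_unlock(&lock);
	return r;
}

static void *waiter(void *arg)
{
	(void)arg;
	return (void *)(long)fence_wait(1);	/* nothing signals seq 1 */
}

int main(void)
{
	pthread_t t;
	void *ret;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_join(t, &ret);
	printf("waiter returned %ld, needs_reset=%d\n",
	       (long)ret, (int)needs_reset);
	return 0;
}

The point is the last step: the first waiter to notice the stall sets
the shared flag and broadcasts, so all other waiters wake up and return
the -EDEADLK analogue, instead of a single thread trying to perform the
reset on its own.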
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/radeon/radeon.h | 2 +-
> drivers/gpu/drm/radeon/radeon_device.c | 8 +
> drivers/gpu/drm/radeon/radeon_fence.c | 347 +++++++++++----------------------
> 3 files changed, 127 insertions(+), 230 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index a400ac1..0201c6e 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -327,7 +327,6 @@ struct radeon_fence_driver {
> /* sync_seq is protected by ring emission lock */
> uint64_t sync_seq[RADEON_NUM_RINGS];
> atomic64_t last_seq;
> - unsigned long last_activity;
> bool initialized;
> };
>
> @@ -2170,6 +2169,7 @@ struct radeon_device {
> bool need_dma32;
> bool accel_working;
> bool fastfb_working; /* IGP feature*/
> + bool needs_reset;
> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
> const struct firmware *me_fw; /* all family ME firmware */
> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index 841d0e0..3f35f21 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1549,6 +1549,14 @@ int radeon_gpu_reset(struct radeon_device *rdev)
> int resched;
>
> down_write(&rdev->exclusive_lock);
> +
> + if (!rdev->needs_reset) {
> + up_write(&rdev->exclusive_lock);
> + return 0;
> + }
> +
> + rdev->needs_reset = false;
> +
> radeon_save_bios_scratch_regs(rdev);
> /* block TTM */
> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index ddb8f8e..b8f68b2 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -190,10 +190,8 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
> }
> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
>
> - if (wake) {
> - rdev->fence_drv[ring].last_activity = jiffies;
> + if (wake)
> wake_up_all(&rdev->fence_queue);
> - }
> }
>
> /**
> @@ -212,13 +210,13 @@ static void radeon_fence_destroy(struct kref *kref)
> }
>
> /**
> - * radeon_fence_seq_signaled - check if a fence sequeuce number has signaled
> + * radeon_fence_seq_signaled - check if a fence sequence number has signaled
> *
> * @rdev: radeon device pointer
> * @seq: sequence number
> * @ring: ring index the fence is associated with
> *
> - * Check if the last singled fence sequnce number is >= the requested
> + * Check if the last signaled fence sequence number is >= the requested
> * sequence number (all asics).
> * Returns true if the fence has signaled (current fence value
> * is >= requested value) or false if it has not (current fence
> @@ -263,113 +261,131 @@ bool radeon_fence_signaled(struct radeon_fence *fence)
> }
>
> /**
> - * radeon_fence_wait_seq - wait for a specific sequence number
> + * radeon_fence_any_seq_signaled - check if any sequence number is signaled
> *
> * @rdev: radeon device pointer
> - * @target_seq: sequence number we want to wait for
> - * @ring: ring index the fence is associated with
> + * @seq: sequence numbers
> + *
> + * Check if the last signaled fence sequence number is >= the requested
> + * sequence number (all asics).
> + * Returns true if any has signaled (current value is >= requested value)
> + * or false if it has not. Helper function for radeon_fence_wait_seq.
> + */
> +static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
> +{
> + unsigned i;
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i))
> + return true;
> + }
> + return false;
> +}
> +
> +/**
> + * radeon_fence_wait_seq - wait for specific sequence numbers
> + *
> + * @rdev: radeon device pointer
> + * @target_seq: sequence number(s) we want to wait for
> * @intr: use interruptable sleep
> * @lock_ring: whether the ring should be locked or not
> *
> - * Wait for the requested sequence number to be written (all asics).
> + * Wait for the requested sequence number(s) to be written by any ring
> + * (all asics). Sequence number array is indexed by ring id.
> * @intr selects whether to use interruptable (true) or non-interruptable
> * (false) sleep when waiting for the sequence number. Helper function
> - * for radeon_fence_wait(), et al.
> + * for radeon_fence_wait_*().
> * Returns 0 if the sequence number has passed, error for all other cases.
> - * -EDEADLK is returned when a GPU lockup has been detected and the ring is
> - * marked as not ready so no further jobs get scheduled until a successful
> - * reset.
> + * -EDEADLK is returned when a GPU lockup has been detected.
> */
> -static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
> - unsigned ring, bool intr, bool lock_ring)
> +static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
> + bool intr, bool lock_ring)
> {
> - unsigned long timeout, last_activity;
> - uint64_t seq;
> - unsigned i;
> + uint64_t last_seq[RADEON_NUM_RINGS];
> bool signaled;
> - int r;
> + int i, r;
> +
> + while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> +
> + /* Save current sequence values, used to check for GPU lockups */
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
>
> - while (target_seq > atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> - if (!rdev->ring[ring].ready) {
> - return -EBUSY;
> + last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
> + trace_radeon_fence_wait_begin(rdev->ddev, target_seq[i]);
> + radeon_irq_kms_sw_irq_get(rdev, i);
> }
>
> - timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> - if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
> - /* the normal case, timeout is somewhere before last_activity */
> - timeout = rdev->fence_drv[ring].last_activity - timeout;
> + if (intr) {
> + r = wait_event_interruptible_timeout(rdev->fence_queue, (
> + (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> + || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> } else {
> - /* either jiffies wrapped around, or no fence was signaled in the last 500ms
> - * anyway we will just wait for the minimum amount and then check for a lockup
> - */
> - timeout = 1;
> + r = wait_event_timeout(rdev->fence_queue, (
> + (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> + || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> }
> - seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
> - /* Save current last activity valuee, used to check for GPU lockups */
> - last_activity = rdev->fence_drv[ring].last_activity;
>
> - trace_radeon_fence_wait_begin(rdev->ddev, seq);
> - radeon_irq_kms_sw_irq_get(rdev, ring);
> - if (intr) {
> - r = wait_event_interruptible_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> - timeout);
> - } else {
> - r = wait_event_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> - timeout);
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + radeon_irq_kms_sw_irq_put(rdev, i);
> + trace_radeon_fence_wait_end(rdev->ddev, target_seq[i]);
> }
> - radeon_irq_kms_sw_irq_put(rdev, ring);
> - if (unlikely(r < 0)) {
> +
> + if (unlikely(r < 0))
> return r;
> - }
> - trace_radeon_fence_wait_end(rdev->ddev, seq);
>
> if (unlikely(!signaled)) {
> + if (rdev->needs_reset)
> + return -EDEADLK;
> +
> /* we were interrupted for some reason and fence
> * isn't signaled yet, resume waiting */
> - if (r) {
> + if (r)
> continue;
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
> + break;
> }
>
> - /* check if sequence value has changed since last_activity */
> - if (seq != atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> + if (i != RADEON_NUM_RINGS)
> continue;
> - }
>
> - if (lock_ring) {
> + if (lock_ring)
> mutex_lock(&rdev->ring_lock);
> - }
>
> - /* test if somebody else has already decided that this is a lockup */
> - if (last_activity != rdev->fence_drv[ring].last_activity) {
> - if (lock_ring) {
> - mutex_unlock(&rdev->ring_lock);
> - }
> - continue;
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
> + break;
> }
>
> - if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> + if (i < RADEON_NUM_RINGS) {
> /* good news we believe it's a lockup */
> - dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
> - target_seq, seq);
> -
> - /* change last activity so nobody else think there is a lockup */
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - rdev->fence_drv[i].last_activity = jiffies;
> - }
> -
> - /* mark the ring as not ready any more */
> - rdev->ring[ring].ready = false;
> - if (lock_ring) {
> + dev_warn(rdev->dev, "GPU lockup (waiting for "
> + "0x%016llx last fence id 0x%016llx on"
> + " ring %d)\n",
> + target_seq[i], last_seq[i], i);
> +
> + /* remember that we need a reset */
> + rdev->needs_reset = true;
> + if (lock_ring)
> mutex_unlock(&rdev->ring_lock);
> - }
> + wake_up_all(&rdev->fence_queue);
> return -EDEADLK;
> }
>
> - if (lock_ring) {
> + if (lock_ring)
> mutex_unlock(&rdev->ring_lock);
> - }
> }
> }
> return 0;
> @@ -388,6 +404,7 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
> */
> int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> {
> + uint64_t seq[RADEON_NUM_RINGS] = {};
> int r;
>
> if (fence == NULL) {
> @@ -395,147 +412,15 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> return -EINVAL;
> }
>
> - r = radeon_fence_wait_seq(fence->rdev, fence->seq,
> - fence->ring, intr, true);
> - if (r) {
> - return r;
> - }
> - fence->seq = RADEON_FENCE_SIGNALED_SEQ;
> - return 0;
> -}
> -
> -static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
> -{
> - unsigned i;
> -
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i)) {
> - return true;
> - }
> - }
> - return false;
> -}
> -
> -/**
> - * radeon_fence_wait_any_seq - wait for a sequence number on any ring
> - *
> - * @rdev: radeon device pointer
> - * @target_seq: sequence number(s) we want to wait for
> - * @intr: use interruptable sleep
> - *
> - * Wait for the requested sequence number(s) to be written by any ring
> - * (all asics). Sequnce number array is indexed by ring id.
> - * @intr selects whether to use interruptable (true) or non-interruptable
> - * (false) sleep when waiting for the sequence number. Helper function
> - * for radeon_fence_wait_any(), et al.
> - * Returns 0 if the sequence number has passed, error for all other cases.
> - */
> -static int radeon_fence_wait_any_seq(struct radeon_device *rdev,
> - u64 *target_seq, bool intr)
> -{
> - unsigned long timeout, last_activity, tmp;
> - unsigned i, ring = RADEON_NUM_RINGS;
> - bool signaled;
> - int r;
> -
> - for (i = 0, last_activity = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (!target_seq[i]) {
> - continue;
> - }
> -
> - /* use the most recent one as indicator */
> - if (time_after(rdev->fence_drv[i].last_activity, last_activity)) {
> - last_activity = rdev->fence_drv[i].last_activity;
> - }
> -
> - /* For lockup detection just pick the lowest ring we are
> - * actively waiting for
> - */
> - if (i < ring) {
> - ring = i;
> - }
> - }
> -
> - /* nothing to wait for ? */
> - if (ring == RADEON_NUM_RINGS) {
> - return -ENOENT;
> - }
> -
> - while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> - timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> - if (time_after(last_activity, timeout)) {
> - /* the normal case, timeout is somewhere before last_activity */
> - timeout = last_activity - timeout;
> - } else {
> - /* either jiffies wrapped around, or no fence was signaled in the last 500ms
> - * anyway we will just wait for the minimum amount and then check for a lockup
> - */
> - timeout = 1;
> - }
> + seq[fence->ring] = fence->seq;
> + if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
> + return 0;
>
> - trace_radeon_fence_wait_begin(rdev->ddev, target_seq[ring]);
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (target_seq[i]) {
> - radeon_irq_kms_sw_irq_get(rdev, i);
> - }
> - }
> - if (intr) {
> - r = wait_event_interruptible_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> - timeout);
> - } else {
> - r = wait_event_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> - timeout);
> - }
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (target_seq[i]) {
> - radeon_irq_kms_sw_irq_put(rdev, i);
> - }
> - }
> - if (unlikely(r < 0)) {
> - return r;
> - }
> - trace_radeon_fence_wait_end(rdev->ddev, target_seq[ring]);
> -
> - if (unlikely(!signaled)) {
> - /* we were interrupted for some reason and fence
> - * isn't signaled yet, resume waiting */
> - if (r) {
> - continue;
> - }
> -
> - mutex_lock(&rdev->ring_lock);
> - for (i = 0, tmp = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (time_after(rdev->fence_drv[i].last_activity, tmp)) {
> - tmp = rdev->fence_drv[i].last_activity;
> - }
> - }
> - /* test if somebody else has already decided that this is a lockup */
> - if (last_activity != tmp) {
> - last_activity = tmp;
> - mutex_unlock(&rdev->ring_lock);
> - continue;
> - }
> -
> - if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> - /* good news we believe it's a lockup */
> - dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx)\n",
> - target_seq[ring]);
> -
> - /* change last activity so nobody else think there is a lockup */
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - rdev->fence_drv[i].last_activity = jiffies;
> - }
> + r = radeon_fence_wait_seq(fence->rdev, seq, intr, true);
> + if (r)
> + return r;
>
> - /* mark the ring as not ready any more */
> - rdev->ring[ring].ready = false;
> - mutex_unlock(&rdev->ring_lock);
> - return -EDEADLK;
> - }
> - mutex_unlock(&rdev->ring_lock);
> - }
> - }
> + fence->seq = RADEON_FENCE_SIGNALED_SEQ;
> return 0;
> }
>
> @@ -557,7 +442,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
> bool intr)
> {
> uint64_t seq[RADEON_NUM_RINGS];
> - unsigned i;
> + unsigned i, num_rings = 0;
> int r;
>
> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -567,15 +452,19 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
> continue;
> }
>
> - if (fences[i]->seq == RADEON_FENCE_SIGNALED_SEQ) {
> - /* something was allready signaled */
> - return 0;
> - }
> -
> seq[i] = fences[i]->seq;
> + ++num_rings;
> +
> + /* test if something was already signaled */
> + if (seq[i] == RADEON_FENCE_SIGNALED_SEQ)
> + return 0;
> }
>
> - r = radeon_fence_wait_any_seq(rdev, seq, intr);
> + /* nothing to wait for ? */
> + if (num_rings == 0)
> + return -ENOENT;
> +
> + r = radeon_fence_wait_seq(rdev, seq, intr, true);
> if (r) {
> return r;
> }
> @@ -594,15 +483,15 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
> */
> int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
> {
> - uint64_t seq;
> + uint64_t seq[RADEON_NUM_RINGS] = {};
>
> - seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> - if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
> + seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> + if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
> /* nothing to wait for, last_seq is
> already the last emited fence */
> return -ENOENT;
> }
> - return radeon_fence_wait_seq(rdev, seq, ring, false, false);
> + return radeon_fence_wait_seq(rdev, seq, false, false);
> }
>
> /**
> @@ -617,14 +506,15 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
> */
> int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
> {
> - uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
> + uint64_t seq[RADEON_NUM_RINGS] = {};
> int r;
>
> - r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
> + seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
> + r = radeon_fence_wait_seq(rdev, seq, false, false);
> if (r) {
> - if (r == -EDEADLK) {
> + if (r == -EDEADLK)
> return -EDEADLK;
> - }
> +
> dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
> ring, r);
> }
> @@ -826,7 +716,6 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
> for (i = 0; i < RADEON_NUM_RINGS; ++i)
> rdev->fence_drv[ring].sync_seq[i] = 0;
> atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
> - rdev->fence_drv[ring].last_activity = jiffies;
> rdev->fence_drv[ring].initialized = false;
> }