[PATCH 2/2] drm/radeon: rework and fix reset detection v2

Dieter Nützel Dieter at nuetzel-hh.de
Mon Oct 14 15:20:07 CEST 2013


On 14.10.2013 11:32, Christian König wrote:
> From: Christian König <christian.koenig at amd.com>
> 
> Stop fiddling with jiffies, always wait for RADEON_FENCE_JIFFIES_TIMEOUT.
> Consolidate the two wait sequence implementations into just one function.
> Activate all waiters and remember if the reset was already done instead of
> trying to reset from only one thread.
> 
> v2: clear reset flag earlier to avoid timeout in IB test

Hello Christian,

even for radeon_fence.c, Dan Carpenter had a question/patch on October 6
("[patch] drm/radeon: tweak a timeout condition slightly")
and nobody replied.

Greetings,
   Dieter


> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/radeon/radeon.h        |   2 +-
>  drivers/gpu/drm/radeon/radeon_device.c |   8 +
>  drivers/gpu/drm/radeon/radeon_fence.c  | 347 +++++++++++----------------------
>  3 files changed, 127 insertions(+), 230 deletions(-)
> 
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index a400ac1..0201c6e 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -327,7 +327,6 @@ struct radeon_fence_driver {
>  	/* sync_seq is protected by ring emission lock */
>  	uint64_t			sync_seq[RADEON_NUM_RINGS];
>  	atomic64_t			last_seq;
> -	unsigned long			last_activity;
>  	bool				initialized;
>  };
> 
> @@ -2170,6 +2169,7 @@ struct radeon_device {
>  	bool				need_dma32;
>  	bool				accel_working;
>  	bool				fastfb_working; /* IGP feature*/
> +	bool				needs_reset;
>  	struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>  	const struct firmware *me_fw;	/* all family ME firmware */
>  	const struct firmware *pfp_fw;	/* r6/700 PFP firmware */
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index 841d0e0..3f35f21 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1549,6 +1549,14 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>  	int resched;
> 
>  	down_write(&rdev->exclusive_lock);
> +
> +	if (!rdev->needs_reset) {
> +		up_write(&rdev->exclusive_lock);
> +		return 0;
> +	}
> +
> +	rdev->needs_reset = false;
> +
>  	radeon_save_bios_scratch_regs(rdev);
>  	/* block TTM */
>  	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
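
A side note on the radeon_gpu_reset() hunk above, for other readers:
because needs_reset is tested and cleared under the exclusive_lock,
every thread that got -EDEADLK from a fence wait may call
radeon_gpu_reset(), but only the first one through the lock performs
the reset; latecomers see the flag already cleared and return 0.
A minimal sketch of the caller pattern this implies (the error
handling below is illustrative, not taken from the patch):

	r = radeon_fence_wait(fence, true);
	if (r == -EDEADLK) {
		/* every waiter may end up here; only the first caller
		 * that still sees rdev->needs_reset does the reset */
		r = radeon_gpu_reset(rdev);
	}
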
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index ddb8f8e..b8f68b2 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -190,10 +190,8 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
>  		}
>  	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
> 
> -	if (wake) {
> -		rdev->fence_drv[ring].last_activity = jiffies;
> +	if (wake)
>  		wake_up_all(&rdev->fence_queue);
> -	}
>  }
> 
>  /**
> @@ -212,13 +210,13 @@ static void radeon_fence_destroy(struct kref *kref)
>  }
> 
>  /**
> - * radeon_fence_seq_signaled - check if a fence sequeuce number has signaled
> + * radeon_fence_seq_signaled - check if a fence sequence number has signaled
>   *
>   * @rdev: radeon device pointer
>   * @seq: sequence number
>   * @ring: ring index the fence is associated with
>   *
> - * Check if the last singled fence sequnce number is >= the requested
> + * Check if the last signaled fence sequence number is >= the requested
>   * sequence number (all asics).
>   * Returns true if the fence has signaled (current fence value
>   * is >= requested value) or false if it has not (current fence
> @@ -263,113 +261,131 @@ bool radeon_fence_signaled(struct radeon_fence *fence)
>  }
> 
>  /**
> - * radeon_fence_wait_seq - wait for a specific sequence number
> + * radeon_fence_any_seq_signaled - check if any sequence number is signaled
>   *
>   * @rdev: radeon device pointer
> - * @target_seq: sequence number we want to wait for
> - * @ring: ring index the fence is associated with
> + * @seq: sequence numbers
> + *
> + * Check if the last signaled fence sequence number is >= the requested
> + * sequence number (all asics).
> + * Returns true if any has signaled (current value is >= requested value)
> + * or false if it has not. Helper function for radeon_fence_wait_seq.
> + */
> +static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
> +{
> +	unsigned i;
> +
> +	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +		if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/**
> + * radeon_fence_wait_seq - wait for specific sequence numbers
> + *
> + * @rdev: radeon device pointer
> + * @target_seq: sequence number(s) we want to wait for
>   * @intr: use interruptable sleep
>   * @lock_ring: whether the ring should be locked or not
>   *
> - * Wait for the requested sequence number to be written (all asics).
> + * Wait for the requested sequence number(s) to be written by any ring
> + * (all asics).  Sequence number array is indexed by ring id.
>   * @intr selects whether to use interruptable (true) or non-interruptable
>   * (false) sleep when waiting for the sequence number.  Helper function
> - * for radeon_fence_wait(), et al.
> + * for radeon_fence_wait_*().
>   * Returns 0 if the sequence number has passed, error for all other cases.
> - * -EDEADLK is returned when a GPU lockup has been detected and the ring is
> - * marked as not ready so no further jobs get scheduled until a successful
> - * reset.
> + * -EDEADLK is returned when a GPU lockup has been detected.
>   */
> -static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
> -				 unsigned ring, bool intr, bool lock_ring)
> +static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
> +				 bool intr, bool lock_ring)
>  {
> -	unsigned long timeout, last_activity;
> -	uint64_t seq;
> -	unsigned i;
> +	uint64_t last_seq[RADEON_NUM_RINGS];
>  	bool signaled;
> -	int r;
> +	int i, r;
> +
> +	while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> +
> +		/* Save current sequence values, used to check for GPU lockups */
> +		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +			if (!target_seq[i])
> +				continue;
> 
> -	while (target_seq > atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> -		if (!rdev->ring[ring].ready) {
> -			return -EBUSY;
> +			last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
> +			trace_radeon_fence_wait_begin(rdev->ddev, target_seq[i]);
> +			radeon_irq_kms_sw_irq_get(rdev, i);
>  		}
> 
> -		timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> -		if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
> -			/* the normal case, timeout is somewhere before last_activity */
> -			timeout = rdev->fence_drv[ring].last_activity - timeout;
> +		if (intr) {
> +			r = wait_event_interruptible_timeout(rdev->fence_queue, (
> +				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> +				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
>  		} else {
> -			/* either jiffies wrapped around, or no fence was signaled in the last 500ms
> -			 * anyway we will just wait for the minimum amount and then check for a lockup
> -			 */
> -			timeout = 1;
> +			r = wait_event_timeout(rdev->fence_queue, (
> +				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> +				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
>  		}
> -		seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
> -		/* Save current last activity valuee, used to check for GPU lockups */
> -		last_activity = rdev->fence_drv[ring].last_activity;
> 
> -		trace_radeon_fence_wait_begin(rdev->ddev, seq);
> -		radeon_irq_kms_sw_irq_get(rdev, ring);
> -		if (intr) {
> -			r = wait_event_interruptible_timeout(rdev->fence_queue,
> -				(signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> -				timeout);
> -                } else {
> -			r = wait_event_timeout(rdev->fence_queue,
> -				(signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> -				timeout);
> +		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +			if (!target_seq[i])
> +				continue;
> +
> +			radeon_irq_kms_sw_irq_put(rdev, i);
> +			trace_radeon_fence_wait_end(rdev->ddev, target_seq[i]);
>  		}
> -		radeon_irq_kms_sw_irq_put(rdev, ring);
> -		if (unlikely(r < 0)) {
> +
> +		if (unlikely(r < 0))
>  			return r;
> -		}
> -		trace_radeon_fence_wait_end(rdev->ddev, seq);
> 
>  		if (unlikely(!signaled)) {
> +			if (rdev->needs_reset)
> +				return -EDEADLK;
> +
>  			/* we were interrupted for some reason and fence
>  			 * isn't signaled yet, resume waiting */
> -			if (r) {
> +			if (r)
>  				continue;
> +
> +			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +				if (!target_seq[i])
> +					continue;
> +
> +				if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
> +					break;
>  			}
> 
> -			/* check if sequence value has changed since last_activity */
> -			if (seq != atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> +			if (i != RADEON_NUM_RINGS)
>  				continue;
> -			}
> 
> -			if (lock_ring) {
> +			if (lock_ring)
>  				mutex_lock(&rdev->ring_lock);
> -			}
> 
> -			/* test if somebody else has already decided that this is a lockup */
> -			if (last_activity != rdev->fence_drv[ring].last_activity) {
> -				if (lock_ring) {
> -					mutex_unlock(&rdev->ring_lock);
> -				}
> -				continue;
> +			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +				if (!target_seq[i])
> +					continue;
> +
> +				if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
> +					break;
>  			}
> 
> -			if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> +			if (i < RADEON_NUM_RINGS) {
>  				/* good news we believe it's a lockup */
> -				dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
> -					 target_seq, seq);
> -
> -				/* change last activity so nobody else think there is a lockup */
> -				for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -					rdev->fence_drv[i].last_activity = jiffies;
> -				}
> -
> -				/* mark the ring as not ready any more */
> -				rdev->ring[ring].ready = false;
> -				if (lock_ring) {
> +				dev_warn(rdev->dev, "GPU lockup (waiting for "
> +					 "0x%016llx last fence id 0x%016llx on"
> +					 " ring %d)\n",
> +					 target_seq[i], last_seq[i], i);
> +
> +				/* remember that we need a reset */
> +				rdev->needs_reset = true;
> +				if (lock_ring)
>  					mutex_unlock(&rdev->ring_lock);
> -				}
> +				wake_up_all(&rdev->fence_queue);
>  				return -EDEADLK;
>  			}
> 
> -			if (lock_ring) {
> +			if (lock_ring)
>  				mutex_unlock(&rdev->ring_lock);
> -			}
>  		}
>  	}
>  	return 0;
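
For anyone skimming the big hunk above: the consolidated
radeon_fence_wait_seq() takes a per-ring array in which a zero entry
means "don't wait on this ring", which is what lets one loop serve
both the single-fence and wait-any paths. A minimal usage sketch,
waiting only on the GFX ring (illustrative; RADEON_RING_TYPE_GFX_INDEX
is the existing ring index constant from radeon.h):

	uint64_t seq[RADEON_NUM_RINGS] = {};

	/* zero entries are skipped by radeon_fence_any_seq_signaled() */
	seq[RADEON_RING_TYPE_GFX_INDEX] = fence->seq;
	r = radeon_fence_wait_seq(rdev, seq, true, true);
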
> @@ -388,6 +404,7 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
>   */
>  int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>  {
> +	uint64_t seq[RADEON_NUM_RINGS] = {};
>  	int r;
> 
>  	if (fence == NULL) {
> @@ -395,147 +412,15 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>  		return -EINVAL;
>  	}
> 
> -	r = radeon_fence_wait_seq(fence->rdev, fence->seq,
> -				  fence->ring, intr, true);
> -	if (r) {
> -		return r;
> -	}
> -	fence->seq = RADEON_FENCE_SIGNALED_SEQ;
> -	return 0;
> -}
> -
> -static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
> -{
> -	unsigned i;
> -
> -	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -		if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i)) {
> -			return true;
> -		}
> -	}
> -	return false;
> -}
> -
> -/**
> - * radeon_fence_wait_any_seq - wait for a sequence number on any ring
> - *
> - * @rdev: radeon device pointer
> - * @target_seq: sequence number(s) we want to wait for
> - * @intr: use interruptable sleep
> - *
> - * Wait for the requested sequence number(s) to be written by any ring
> - * (all asics).  Sequnce number array is indexed by ring id.
> - * @intr selects whether to use interruptable (true) or non-interruptable
> - * (false) sleep when waiting for the sequence number.  Helper function
> - * for radeon_fence_wait_any(), et al.
> - * Returns 0 if the sequence number has passed, error for all other cases.
> - */
> -static int radeon_fence_wait_any_seq(struct radeon_device *rdev,
> -				     u64 *target_seq, bool intr)
> -{
> -	unsigned long timeout, last_activity, tmp;
> -	unsigned i, ring = RADEON_NUM_RINGS;
> -	bool signaled;
> -	int r;
> -
> -	for (i = 0, last_activity = 0; i < RADEON_NUM_RINGS; ++i) {
> -		if (!target_seq[i]) {
> -			continue;
> -		}
> -
> -		/* use the most recent one as indicator */
> -		if (time_after(rdev->fence_drv[i].last_activity, last_activity)) {
> -			last_activity = rdev->fence_drv[i].last_activity;
> -		}
> -
> -		/* For lockup detection just pick the lowest ring we are
> -		 * actively waiting for
> -		 */
> -		if (i < ring) {
> -			ring = i;
> -		}
> -	}
> -
> -	/* nothing to wait for ? */
> -	if (ring == RADEON_NUM_RINGS) {
> -		return -ENOENT;
> -	}
> -
> -	while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> -		timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> -		if (time_after(last_activity, timeout)) {
> -			/* the normal case, timeout is somewhere before last_activity */
> -			timeout = last_activity - timeout;
> -		} else {
> -			/* either jiffies wrapped around, or no fence was signaled in the last 500ms
> -			 * anyway we will just wait for the minimum amount and then check for a lockup
> -			 */
> -			timeout = 1;
> -		}
> +	seq[fence->ring] = fence->seq;
> +	if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
> +		return 0;
> 
> -		trace_radeon_fence_wait_begin(rdev->ddev, target_seq[ring]);
> -		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -			if (target_seq[i]) {
> -				radeon_irq_kms_sw_irq_get(rdev, i);
> -			}
> -		}
> -		if (intr) {
> -			r = wait_event_interruptible_timeout(rdev->fence_queue,
> -				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> -				timeout);
> -		} else {
> -			r = wait_event_timeout(rdev->fence_queue,
> -				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> -				timeout);
> -		}
> -		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -			if (target_seq[i]) {
> -				radeon_irq_kms_sw_irq_put(rdev, i);
> -			}
> -		}
> -		if (unlikely(r < 0)) {
> -			return r;
> -		}
> -		trace_radeon_fence_wait_end(rdev->ddev, target_seq[ring]);
> -
> -		if (unlikely(!signaled)) {
> -			/* we were interrupted for some reason and fence
> -			 * isn't signaled yet, resume waiting */
> -			if (r) {
> -				continue;
> -			}
> -
> -			mutex_lock(&rdev->ring_lock);
> -			for (i = 0, tmp = 0; i < RADEON_NUM_RINGS; ++i) {
> -				if (time_after(rdev->fence_drv[i].last_activity, tmp)) {
> -					tmp = rdev->fence_drv[i].last_activity;
> -				}
> -			}
> -			/* test if somebody else has already decided that this is a lockup */
> -			if (last_activity != tmp) {
> -				last_activity = tmp;
> -				mutex_unlock(&rdev->ring_lock);
> -				continue;
> -			}
> -
> -			if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> -				/* good news we believe it's a lockup */
> -				dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx)\n",
> -					 target_seq[ring]);
> -
> -				/* change last activity so nobody else think there is a lockup */
> -				for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -					rdev->fence_drv[i].last_activity = jiffies;
> -				}
> +	r = radeon_fence_wait_seq(fence->rdev, seq, intr, true);
> +	if (r)
> +		return r;
> 
> -				/* mark the ring as not ready any more */
> -				rdev->ring[ring].ready = false;
> -				mutex_unlock(&rdev->ring_lock);
> -				return -EDEADLK;
> -			}
> -			mutex_unlock(&rdev->ring_lock);
> -		}
> -	}
> +	fence->seq = RADEON_FENCE_SIGNALED_SEQ;
>  	return 0;
>  }
> 
> @@ -557,7 +442,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
>  			  bool intr)
>  {
>  	uint64_t seq[RADEON_NUM_RINGS];
> -	unsigned i;
> +	unsigned i, num_rings = 0;
>  	int r;
> 
>  	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -567,15 +452,19 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
>  			continue;
>  		}
> 
> -		if (fences[i]->seq == RADEON_FENCE_SIGNALED_SEQ) {
> -			/* something was allready signaled */
> -			return 0;
> -		}
> -
>  		seq[i] = fences[i]->seq;
> +		++num_rings;
> +
> +		/* test if something was already signaled */
> +		if (seq[i] == RADEON_FENCE_SIGNALED_SEQ)
> +			return 0;
>  	}
> 
> -	r = radeon_fence_wait_any_seq(rdev, seq, intr);
> +	/* nothing to wait for ? */
> +	if (num_rings == 0)
> +		return -ENOENT;
> +
> +	r = radeon_fence_wait_seq(rdev, seq, intr, true);
>  	if (r) {
>  		return r;
>  	}
> @@ -594,15 +483,15 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
>   */
>  int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
>  {
> -	uint64_t seq;
> +	uint64_t seq[RADEON_NUM_RINGS] = {};
> 
> -	seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> -	if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
> +	seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> +	if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
>  		/* nothing to wait for, last_seq is
>  		   already the last emited fence */
>  		return -ENOENT;
>  	}
> -	return radeon_fence_wait_seq(rdev, seq, ring, false, false);
> +	return radeon_fence_wait_seq(rdev, seq, false, false);
>  }
> 
>  /**
> @@ -617,14 +506,15 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
>   */
>  int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
>  {
> -	uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
> +	uint64_t seq[RADEON_NUM_RINGS] = {};
>  	int r;
> 
> -	r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
> +	seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
> +	r = radeon_fence_wait_seq(rdev, seq, false, false);
>  	if (r) {
> -		if (r == -EDEADLK) {
> +		if (r == -EDEADLK)
>  			return -EDEADLK;
> -		}
> +
>  		dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
>  			ring, r);
>  	}
> @@ -826,7 +716,6 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
>  	for (i = 0; i < RADEON_NUM_RINGS; ++i)
>  		rdev->fence_drv[ring].sync_seq[i] = 0;
>  	atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
> -	rdev->fence_drv[ring].last_activity = jiffies;
>  	rdev->fence_drv[ring].initialized = false;
>  }
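
If I read v2 correctly, lockup handling is now decoupled into three
steps. A sketch of the protocol as commentary, not literal code from
the patch:

	/* 1. detect: a waiter that saw no fence progress for
	 *    RADEON_FENCE_JIFFIES_TIMEOUT and a ring lockup sets the
	 *    flag and wakes everybody on the fence queue */
	rdev->needs_reset = true;
	wake_up_all(&rdev->fence_queue);

	/* 2. broadcast: every other waiter re-evaluates its wait
	 *    condition, sees needs_reset and returns -EDEADLK too */

	/* 3. reset: all of those threads may call radeon_gpu_reset();
	 *    the first one through the exclusive_lock clears the flag
	 *    and performs the actual reset */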

