[PATCH] drm/radeon: avoid deadlock in pm path when waiting for fence

Christian König deathsimple at vodafone.de
Mon Dec 17 08:41:31 PST 2012


On 17.12.2012 17:04, j.glisse at gmail.com wrote:
> From: Jerome Glisse <jglisse at redhat.com>
>
> radeon_fence_wait_empty_locked should not trigger a GPU reset, as no
> place it is called from would benefit from such a thing, and it
> actually leads to a kernel deadlock in case the reset is triggered
> from the pm codepath. Instead, force ring completion in places where it
> makes sense, or return early in others.
>
> Signed-off-by: Jerome Glisse <jglisse at redhat.com>

I wanted to stop losing GPU reset signals by moving the reset into 
radeon_fence_wait_empty_locked, but it's true that in most cases it 
doesn't make much sense (suspend/finish), and I didn't know that it could 
cause a hang in the PM code.

We should probably fix the PM code to properly signal this condition to 
its caller and reset the GPU when it is possible to do so, but fixing 
the deadlock is of course more important.

This should probably go into the stable kernel as well, but anyway it is:

Reviewed-by: Christian König <christian.koenig at amd.com>

> ---
>   drivers/gpu/drm/radeon/radeon.h        |  2 +-
>   drivers/gpu/drm/radeon/radeon_device.c | 13 +++++++++++--
>   drivers/gpu/drm/radeon/radeon_fence.c  | 30 ++++++++++++++----------------
>   drivers/gpu/drm/radeon/radeon_pm.c     | 15 ++++++++++++---
>   4 files changed, 38 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 9c7625c..071b2d7 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -231,7 +231,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring);
> -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
> +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_any(struct radeon_device *rdev,
>   			  struct radeon_fence **fences,
>   			  bool intr);
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index 774fae7..53a9223 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1163,6 +1163,7 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state)
>   	struct drm_crtc *crtc;
>   	struct drm_connector *connector;
>   	int i, r;
> +	bool force_completion = false;
>   
>   	if (dev == NULL || dev->dev_private == NULL) {
>   		return -ENODEV;
> @@ -1205,8 +1206,16 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state)
>   
>   	mutex_lock(&rdev->ring_lock);
>   	/* wait for gpu to finish processing current batch */
> -	for (i = 0; i < RADEON_NUM_RINGS; i++)
> -		radeon_fence_wait_empty_locked(rdev, i);
> +	for (i = 0; i < RADEON_NUM_RINGS; i++) {
> +		r = radeon_fence_wait_empty_locked(rdev, i);
> +		if (r) {
> +			/* delay GPU reset to resume */
> +			force_completion = true;
> +		}
> +	}
> +	if (force_completion) {
> +		radeon_fence_driver_force_completion(rdev);
> +	}
>   	mutex_unlock(&rdev->ring_lock);
>   
>   	radeon_save_bios_scratch_regs(rdev);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index bf7b20e..28c09b6 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -609,26 +609,20 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
>    * Returns 0 if the fences have passed, error for all other cases.
>    * Caller must hold ring lock.
>    */
> -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
> +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
>   {
>   	uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
> +	int r;
>   
> -	while(1) {
> -		int r;
> -		r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
> +	r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
> +	if (r) {
>   		if (r == -EDEADLK) {
> -			mutex_unlock(&rdev->ring_lock);
> -			r = radeon_gpu_reset(rdev);
> -			mutex_lock(&rdev->ring_lock);
> -			if (!r)
> -				continue;
> -		}
> -		if (r) {
> -			dev_err(rdev->dev, "error waiting for ring to become"
> -				" idle (%d)\n", r);
> +			return -EDEADLK;
>   		}
> -		return;
> +		dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
> +			ring, r);
>   	}
> +	return 0;
>   }
>   
>   /**
> @@ -854,13 +848,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev)
>    */
>   void radeon_fence_driver_fini(struct radeon_device *rdev)
>   {
> -	int ring;
> +	int ring, r;
>   
>   	mutex_lock(&rdev->ring_lock);
>   	for (ring = 0; ring < RADEON_NUM_RINGS; ring++) {
>   		if (!rdev->fence_drv[ring].initialized)
>   			continue;
> -		radeon_fence_wait_empty_locked(rdev, ring);
> +		r = radeon_fence_wait_empty_locked(rdev, ring);
> +		if (r) {
> +			/* no need to trigger GPU reset as we are unloading */
> +			radeon_fence_driver_force_completion(rdev);
> +		}
>   		wake_up_all(&rdev->fence_queue);
>   		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
>   		rdev->fence_drv[ring].initialized = false;
> diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c
> index aa14dbb..0bfa656 100644
> --- a/drivers/gpu/drm/radeon/radeon_pm.c
> +++ b/drivers/gpu/drm/radeon/radeon_pm.c
> @@ -234,7 +234,7 @@ static void radeon_set_power_state(struct radeon_device *rdev)
>   
>   static void radeon_pm_set_clocks(struct radeon_device *rdev)
>   {
> -	int i;
> +	int i, r;
>   
>   	/* no need to take locks, etc. if nothing's going to change */
>   	if ((rdev->pm.requested_clock_mode_index == rdev->pm.current_clock_mode_index) &&
> @@ -248,8 +248,17 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev)
>   	/* wait for the rings to drain */
>   	for (i = 0; i < RADEON_NUM_RINGS; i++) {
>   		struct radeon_ring *ring = &rdev->ring[i];
> -		if (ring->ready)
> -			radeon_fence_wait_empty_locked(rdev, i);
> +		if (!ring->ready) {
> +			continue;
> +		}
> +		r = radeon_fence_wait_empty_locked(rdev, i);
> +		if (r) {
> +			/* needs a GPU reset; don't reset here */
> +			mutex_unlock(&rdev->ring_lock);
> +			up_write(&rdev->pm.mclk_lock);
> +			mutex_unlock(&rdev->ddev->struct_mutex);
> +			return;
> +		}
>   	}
>   
>   	radeon_unmap_vram_bos(rdev);



More information about the dri-devel mailing list