[PATCH] [RFC] drm/radeon: rework gpu reset and fence signalling error handling

Dave Airlie airlied at gmail.com
Fri Feb 3 07:37:42 PST 2012


From: Dave Airlie <airlied at redhat.com>

This is an RFC for an idea I had to make GPU lockup recovery less likely to deadlock.

This starts using error values for the gpu lockup and fence signalling.

We can now return more information, like whether the GPU is in reset
-EAGAIN, or crashed -EIO, or fence is busy -EBUSY.

We then use this information to put gpu reset into a workqueue.

We avoid doing IB test while the GPU is still in reset state,
we could move IB test to the workqueue after we free up to lockup state
but before we drop the CS lock.

We also take the VRAM mutex around the reset operation, but drop it before resume
the modes.

This should allow us to drop the recursive locks. (I've got patches to do that locally)

TODO:
testing more
I think I broke fencing slightly somewhere

Tested on rv770 with the EXT_transform_feedback lockup, ran gears.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 drivers/gpu/drm/radeon/r100.c           |    6 ++-
 drivers/gpu/drm/radeon/r300.c           |    2 +-
 drivers/gpu/drm/radeon/r600.c           |    4 ++
 drivers/gpu/drm/radeon/radeon.h         |    6 ++-
 drivers/gpu/drm/radeon/radeon_cs.c      |    2 +-
 drivers/gpu/drm/radeon/radeon_device.c  |   24 ++++++++---
 drivers/gpu/drm/radeon/radeon_display.c |    2 +-
 drivers/gpu/drm/radeon/radeon_fence.c   |   69 +++++++++++++++++-------------
 drivers/gpu/drm/radeon/radeon_ring.c    |    5 +-
 drivers/gpu/drm/radeon/radeon_test.c    |   12 +++---
 drivers/gpu/drm/radeon/radeon_ttm.c     |    2 +-
 drivers/gpu/drm/radeon/rs600.c          |    2 +-
 12 files changed, 83 insertions(+), 53 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 99bb006..a0fefe3 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -2265,7 +2265,7 @@ int r100_asic_reset(struct radeon_device *rdev)
 	if (G_000E40_SE_BUSY(status) || G_000E40_RE_BUSY(status) ||
 		G_000E40_TAM_BUSY(status) || G_000E40_PB_BUSY(status)) {
 		dev_err(rdev->dev, "failed to reset GPU\n");
-		rdev->gpu_lockup = true;
+		atomic_set(&rdev->gpu_lockup_error, -EIO);
 		ret = -1;
 	} else
 		dev_info(rdev->dev, "GPU reset succeed\n");
@@ -3717,6 +3717,10 @@ int r100_ib_test(struct radeon_device *rdev)
 	unsigned i;
 	int r;
 
+	/* don't attempt to run IB during GPU lockup */
+	if (atomic_read(&rdev->gpu_lockup_error) != 0)
+		return 0;
+
 	r = radeon_scratch_get(rdev, &scratch);
 	if (r) {
 		DRM_ERROR("radeon: failed to get scratch reg (%d).\n", r);
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 3fc0d29..cea8f67 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -450,7 +450,7 @@ int r300_asic_reset(struct radeon_device *rdev)
 	/* Check if GPU is idle */
 	if (G_000E40_GA_BUSY(status) || G_000E40_VAP_BUSY(status)) {
 		dev_err(rdev->dev, "failed to reset GPU\n");
-		rdev->gpu_lockup = true;
+		atomic_set(&rdev->gpu_lockup_error, -EIO);
 		ret = -1;
 	} else
 		dev_info(rdev->dev, "GPU reset succeed\n");
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 4f08e5e..22336f9 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2705,6 +2705,10 @@ int r600_ib_test(struct radeon_device *rdev, int ring)
 	unsigned i;
 	int r;
 
+	/* don't attempt to run IB during GPU lockup */
+	if (atomic_read(&rdev->gpu_lockup_error) != 0)
+		return 0;
+
 	r = radeon_scratch_get(rdev, &scratch);
 	if (r) {
 		DRM_ERROR("radeon: failed to get scratch reg (%d).\n", r);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 1668ec1..0fdba57 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -280,7 +280,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev);
 int radeon_fence_create(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence);
 void radeon_fence_process(struct radeon_device *rdev, int ring);
-bool radeon_fence_signaled(struct radeon_fence *fence);
+int radeon_fence_signaled(struct radeon_fence *fence);
 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
 int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_last(struct radeon_device *rdev, int ring);
@@ -1452,7 +1452,7 @@ struct radeon_device {
 	struct radeon_mutex		cs_mutex;
 	struct radeon_wb		wb;
 	struct radeon_dummy_page	dummy_page;
-	bool				gpu_lockup;
+	atomic_t			gpu_lockup_error;
 	bool				shutdown;
 	bool				suspend;
 	bool				need_dma32;
@@ -1467,6 +1467,8 @@ struct radeon_device {
 	int msi_enabled; /* msi enabled */
 	struct r600_ih ih; /* r6/700 interrupt ring */
 	struct work_struct hotplug_work;
+
+	struct work_struct reset_work;
 	int num_crtc; /* number of crtcs */
 	struct mutex dc_hw_i2c_mutex; /* display controller hw i2c mutex */
 	struct mutex vram_mutex;
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 435a3d9..2a20e2e 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -87,7 +87,7 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 
 			if (p->relocs[i].robj->tbo.sync_obj && !(r->flags & RADEON_RELOC_DONT_SYNC)) {
 				struct radeon_fence *fence = p->relocs[i].robj->tbo.sync_obj;
-				if (!radeon_fence_signaled(fence)) {
+				if (radeon_fence_signaled(fence) != 0) {
 					p->sync_to_ring[fence->ring] = true;
 				}
 			}
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 49f7cb7..ff307c7 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -692,6 +692,13 @@ static bool radeon_switcheroo_can_switch(struct pci_dev *pdev)
 	return can_switch;
 }
 
+static void radeon_gpu_reset_work_func(struct work_struct *work)
+{
+	struct radeon_device *rdev = container_of(work, struct radeon_device,
+						  reset_work);
+	radeon_gpu_reset(rdev);
+}
+
 
 int radeon_device_init(struct radeon_device *rdev,
 		       struct drm_device *ddev,
@@ -710,7 +717,7 @@ int radeon_device_init(struct radeon_device *rdev,
 	rdev->is_atom_bios = false;
 	rdev->usec_timeout = RADEON_MAX_USEC_TIMEOUT;
 	rdev->mc.gtt_size = radeon_gart_size * 1024 * 1024;
-	rdev->gpu_lockup = false;
+	atomic_set(&rdev->gpu_lockup_error, 0);
 	rdev->accel_working = false;
 
 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X).\n",
@@ -740,6 +747,7 @@ int radeon_device_init(struct radeon_device *rdev,
 	rdev->vm_manager.max_pfn = 1 << 20;
 	INIT_LIST_HEAD(&rdev->vm_manager.lru_vm);
 
+	INIT_WORK(&rdev->reset_work, radeon_gpu_reset_work_func);
 	/* Set asic functions */
 	r = radeon_asic_init(rdev);
 	if (r)
@@ -987,6 +995,8 @@ int radeon_gpu_reset(struct radeon_device *rdev)
 	/* Prevent CS ioctl from interfering */
 	radeon_mutex_lock(&rdev->cs_mutex);
 
+	mutex_lock(&rdev->vram_mutex);
+
 	radeon_save_bios_scratch_regs(rdev);
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
@@ -997,17 +1007,19 @@ int radeon_gpu_reset(struct radeon_device *rdev)
 		dev_info(rdev->dev, "GPU reset succeed\n");
 		radeon_resume(rdev);
 		radeon_restore_bios_scratch_regs(rdev);
+		/* drop VRAM mutex before resetting mode */
+		mutex_unlock(&rdev->vram_mutex);
 		drm_helper_resume_force_mode(rdev->ddev);
 		ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
-	}
-
-	radeon_mutex_unlock(&rdev->cs_mutex);
-
-	if (r) {
+		atomic_set(&rdev->gpu_lockup_error, 0);
+	} else {
 		/* bad news, how to tell it to userspace ? */
+		mutex_unlock(&rdev->vram_mutex);
 		dev_info(rdev->dev, "GPU reset failed\n");
+		atomic_set(&rdev->gpu_lockup_error, -EIO);
 	}
 
+	radeon_mutex_unlock(&rdev->cs_mutex);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index 8c49fef..698cd8d 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -280,7 +280,7 @@ void radeon_crtc_handle_flip(struct radeon_device *rdev, int crtc_id)
 	spin_lock_irqsave(&rdev->ddev->event_lock, flags);
 	work = radeon_crtc->unpin_work;
 	if (work == NULL ||
-	    (work->fence && !radeon_fence_signaled(work->fence))) {
+	    (work->fence && radeon_fence_signaled(work->fence))) {
 		spin_unlock_irqrestore(&rdev->ddev->event_lock, flags);
 		return;
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 64ea3dd..94dd376 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -182,34 +182,38 @@ int radeon_fence_create(struct radeon_device *rdev,
 	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 	return 0;
 }
-
-bool radeon_fence_signaled(struct radeon_fence *fence)
+/*
+ * returns
+ * 0 : fence is signaled with no error
+ * -EBUSY: fence still not signalled - no error
+ * -EAGAIN: fence not signalled yet - possible error
+ * -EIO: GPU has gone and isn't coming back
+ */
+int radeon_fence_signaled(struct radeon_fence *fence)
 {
 	unsigned long irq_flags;
-	bool signaled = false;
+	int ret = 0;
 
 	if (!fence)
-		return true;
+		return 0;
 
-	if (fence->rdev->gpu_lockup)
-		return true;
+	ret = atomic_read(&fence->rdev->gpu_lockup_error);
+	if (ret)
+		return ret;
 
 	write_lock_irqsave(&fence->rdev->fence_lock, irq_flags);
-	signaled = fence->signaled;
 	/* if we are shuting down report all fence as signaled */
 	if (fence->rdev->shutdown) {
-		signaled = true;
-	}
-	if (!fence->emitted) {
+		ret = 0;
+	} else if (!fence->emitted) {
 		WARN(1, "Querying an unemitted fence : %p !\n", fence);
-		signaled = true;
-	}
-	if (!signaled) {
+		ret = 0;
+	} else if (!fence->signaled) {
 		radeon_fence_poll_locked(fence->rdev, fence->ring);
-		signaled = fence->signaled;
+		ret = fence->signaled ? 0 : -EBUSY;
 	}
 	write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags);
-	return signaled;
+	return ret;
 }
 
 int radeon_fence_wait(struct radeon_fence *fence, bool intr)
@@ -224,9 +228,14 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 		return 0;
 	}
 	rdev = fence->rdev;
-	if (radeon_fence_signaled(fence)) {
+	r = radeon_fence_signaled(fence);
+	if (r == 0) {
 		return 0;
-	}
+	} else if (r == -EIO || r == -EAGAIN)
+	    return r;
+
+	/* only go forward for -EBUSY */
+
 	timeout = rdev->fence_drv[fence->ring].last_timeout;
 retry:
 	/* save current sequence used to check for GPU lockup */
@@ -235,7 +244,7 @@ retry:
 	if (intr) {
 		radeon_irq_kms_sw_irq_get(rdev, fence->ring);
 		r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue,
-				radeon_fence_signaled(fence), timeout);
+						     (radeon_fence_signaled(fence) == 0), timeout);
 		radeon_irq_kms_sw_irq_put(rdev, fence->ring);
 		if (unlikely(r < 0)) {
 			return r;
@@ -243,11 +252,11 @@ retry:
 	} else {
 		radeon_irq_kms_sw_irq_get(rdev, fence->ring);
 		r = wait_event_timeout(rdev->fence_drv[fence->ring].queue,
-			 radeon_fence_signaled(fence), timeout);
+				       (radeon_fence_signaled(fence) == 0), timeout);
 		radeon_irq_kms_sw_irq_put(rdev, fence->ring);
 	}
 	trace_radeon_fence_wait_end(rdev->ddev, seq);
-	if (unlikely(!radeon_fence_signaled(fence))) {
+	if (unlikely(radeon_fence_signaled(fence))) {
 		/* we were interrupted for some reason and fence isn't
 		 * isn't signaled yet, resume wait
 		 */
@@ -266,12 +275,10 @@ retry:
 			/* FIXME: what should we do ? marking everyone
 			 * as signaled for now
 			 */
-			rdev->gpu_lockup = true;
-			r = radeon_gpu_reset(rdev);
-			if (r)
-				return r;
-			radeon_fence_write(rdev, fence->seq, fence->ring);
-			rdev->gpu_lockup = false;
+			if (atomic_cmpxchg(&rdev->gpu_lockup_error, 0, -EAGAIN) == 0) {
+				schedule_work(&rdev->reset_work);
+			}
+			return -EAGAIN;
 		}
 		timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
 		write_lock_irqsave(&rdev->fence_lock, irq_flags);
@@ -289,8 +296,9 @@ int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
 	struct radeon_fence *fence;
 	int r;
 
-	if (rdev->gpu_lockup) {
-		return 0;
+	r = atomic_read(&rdev->gpu_lockup_error);
+	if (r) {
+		return r;
 	}
 	write_lock_irqsave(&rdev->fence_lock, irq_flags);
 	if (list_empty(&rdev->fence_drv[ring].emitted)) {
@@ -312,8 +320,9 @@ int radeon_fence_wait_last(struct radeon_device *rdev, int ring)
 	struct radeon_fence *fence;
 	int r;
 
-	if (rdev->gpu_lockup) {
-		return 0;
+	r = atomic_read(&rdev->gpu_lockup_error);
+	if (r) {
+		return r;
 	}
 	write_lock_irqsave(&rdev->fence_lock, irq_flags);
 	if (list_empty(&rdev->fence_drv[ring].emitted)) {
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 30a4c50..d4a70b6 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -83,7 +83,7 @@ bool radeon_ib_try_free(struct radeon_device *rdev, struct radeon_ib *ib)
 
 	/* only free ib which have been emited */
 	if (ib->fence && ib->fence->emitted) {
-		if (radeon_fence_signaled(ib->fence)) {
+		if (radeon_fence_signaled(ib->fence) == 0) {
 			radeon_fence_unref(&ib->fence);
 			radeon_sa_bo_free(rdev, &ib->sa_bo);
 			done = true;
@@ -150,9 +150,8 @@ retry:
 	for (i = 0; i < RADEON_IB_POOL_SIZE; i++) {
 		if (rdev->ib_pool.ibs[idx].fence && rdev->ib_pool.ibs[idx].fence->emitted) {
 			r = radeon_fence_wait(rdev->ib_pool.ibs[idx].fence, false);
-			if (!r) {
+			if (!r)
 				goto retry;
-			}
 			/* an error happened */
 			break;
 		}
diff --git a/drivers/gpu/drm/radeon/radeon_test.c b/drivers/gpu/drm/radeon/radeon_test.c
index dc5dcf4..a51c5e7 100644
--- a/drivers/gpu/drm/radeon/radeon_test.c
+++ b/drivers/gpu/drm/radeon/radeon_test.c
@@ -275,7 +275,7 @@ void radeon_test_ring_sync(struct radeon_device *rdev,
 
 	mdelay(1000);
 
-	if (radeon_fence_signaled(fence1)) {
+	if (radeon_fence_signaled(fence1) == 0) {
 		DRM_ERROR("Fence 1 signaled without waiting for semaphore.\n");
 		goto out_cleanup;
 	}
@@ -296,7 +296,7 @@ void radeon_test_ring_sync(struct radeon_device *rdev,
 
 	mdelay(1000);
 
-	if (radeon_fence_signaled(fence2)) {
+	if (radeon_fence_signaled(fence2) == 0) {
 		DRM_ERROR("Fence 2 signaled without waiting for semaphore.\n");
 		goto out_cleanup;
 	}
@@ -379,11 +379,11 @@ void radeon_test_ring_sync2(struct radeon_device *rdev,
 
 	mdelay(1000);
 
-	if (radeon_fence_signaled(fenceA)) {
+	if (radeon_fence_signaled(fenceA) == 0) {
 		DRM_ERROR("Fence A signaled without waiting for semaphore.\n");
 		goto out_cleanup;
 	}
-	if (radeon_fence_signaled(fenceB)) {
+	if (radeon_fence_signaled(fenceB) == 0) {
 		DRM_ERROR("Fence A signaled without waiting for semaphore.\n");
 		goto out_cleanup;
 	}
@@ -398,8 +398,8 @@ void radeon_test_ring_sync2(struct radeon_device *rdev,
 
 	for (i = 0; i < 30; ++i) {
 		mdelay(100);
-		sigA = radeon_fence_signaled(fenceA);
-		sigB = radeon_fence_signaled(fenceB);
+		sigA = (radeon_fence_signaled(fenceA) == 0);
+		sigB = (radeon_fence_signaled(fenceB) == 0);
 		if (sigA || sigB)
 			break;
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index c421e77..6dc7106 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -524,7 +524,7 @@ static void *radeon_sync_obj_ref(void *sync_obj)
 
 static bool radeon_sync_obj_signaled(void *sync_obj, void *sync_arg)
 {
-	return radeon_fence_signaled((struct radeon_fence *)sync_obj);
+	return (radeon_fence_signaled((struct radeon_fence *)sync_obj) == 0);
 }
 
 /*
diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c
index ec46eb4..4224d97 100644
--- a/drivers/gpu/drm/radeon/rs600.c
+++ b/drivers/gpu/drm/radeon/rs600.c
@@ -377,7 +377,7 @@ int rs600_asic_reset(struct radeon_device *rdev)
 	/* Check if GPU is idle */
 	if (G_000E40_GA_BUSY(status) || G_000E40_VAP_BUSY(status)) {
 		dev_err(rdev->dev, "failed to reset GPU\n");
-		rdev->gpu_lockup = true;
+		atomic_set(&rdev->gpu_lockup_error, -EIO);
 		ret = -1;
 	} else
 		dev_info(rdev->dev, "GPU reset succeed\n");
-- 
1.7.7.6



More information about the dri-devel mailing list