[PATCH] drm/radeon: disable any GPU activity after unrecovered lockup v2

j.glisse at gmail.com j.glisse at gmail.com
Wed Jun 27 08:34:38 PDT 2012


From: Jerome Glisse <jglisse at redhat.com>

After an unrecovered GPU lockup, stop all GPU activity to prevent
things like kernel segfaults from happening in any of the paths
that assume the hw is working.

The segfault is due to the PCIE vram gart table being unmapped after
suspend in the GPU reset path. To avoid the segfault, and to avoid
further GPU activity if resetting the GPU was unsuccessful, we
use the accel_working boolean to transform ttm activities into
noops. It does not impact the module load path because in that
path ttm has an empty schedule queue and accel_working will be
set to true as soon as the gart table is in a valid state. Because
ttm might have work queued, it is better to use accel_working
than to disable the radeon_bo ioctl.

To trigger the segfault, launch a program that repeatedly creates bos
in ttm and let it run in the background, then trigger a gpu lockup
from another process.

v2: Fix spelling errors. Disable accel before suspend and re-enable
    it after pcie gart initialization to be even more cautious about
    possible segfaults. Improve the commit message.

cc: stable at vger.kernel.org
Signed-off-by: Jerome Glisse <jglisse at redhat.com>
---
 drivers/gpu/drm/radeon/evergreen.c     |    2 +-
 drivers/gpu/drm/radeon/ni.c            |    2 +-
 drivers/gpu/drm/radeon/r300.c          |    2 +-
 drivers/gpu/drm/radeon/r520.c          |    2 +-
 drivers/gpu/drm/radeon/r600.c          |    2 +-
 drivers/gpu/drm/radeon/radeon_device.c |    9 ++++---
 drivers/gpu/drm/radeon/radeon_object.c |    7 ++++++
 drivers/gpu/drm/radeon/radeon_ttm.c    |   41 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/radeon/rs400.c         |    2 +-
 drivers/gpu/drm/radeon/rs600.c         |    2 +-
 drivers/gpu/drm/radeon/rs690.c         |    2 +-
 drivers/gpu/drm/radeon/rv515.c         |    2 +-
 drivers/gpu/drm/radeon/rv770.c         |    2 +-
 drivers/gpu/drm/radeon/si.c            |    2 +-
 drivers/gpu/drm/ttm/ttm_tt.c           |    1 +
 15 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index 7fb3d2e..2a4be53 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3055,6 +3055,7 @@ static int evergreen_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 	evergreen_gpu_init(rdev);
 
 	r = evergreen_blit_init(rdev);
@@ -3129,7 +3130,6 @@ int evergreen_resume(struct radeon_device *rdev)
 	/* post card */
 	atom_asic_init(rdev->mode_info.atom_context);
 
-	rdev->accel_working = true;
 	r = evergreen_startup(rdev);
 	if (r) {
 		DRM_ERROR("evergreen startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index b7bf18e..18f87ca 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1229,6 +1229,7 @@ static int cayman_startup(struct radeon_device *rdev)
 	r = cayman_pcie_gart_enable(rdev);
 	if (r)
 		return r;
+	rdev->accel_working = true;
 	cayman_gpu_init(rdev);
 
 	r = evergreen_blit_init(rdev);
@@ -1321,7 +1322,6 @@ int cayman_resume(struct radeon_device *rdev)
 	/* post card */
 	atom_asic_init(rdev->mode_info.atom_context);
 
-	rdev->accel_working = true;
 	r = cayman_startup(rdev);
 	if (r) {
 		DRM_ERROR("cayman startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 97722a3..206ac1f 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -1358,6 +1358,7 @@ static int r300_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 
 	if (rdev->family == CHIP_R300 ||
 	    rdev->family == CHIP_R350 ||
@@ -1426,7 +1427,6 @@ int r300_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r = r300_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/r520.c b/drivers/gpu/drm/radeon/r520.c
index b5cf837..6409eb0 100644
--- a/drivers/gpu/drm/radeon/r520.c
+++ b/drivers/gpu/drm/radeon/r520.c
@@ -181,6 +181,7 @@ static int r520_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 
 	/* allocate wb buffer */
 	r = radeon_wb_init(rdev);
@@ -236,7 +237,6 @@ int r520_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r = r520_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 78c0d0d..692b48b 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2382,6 +2382,7 @@ int r600_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 	r600_gpu_init(rdev);
 	r = r600_blit_init(rdev);
 	if (r) {
@@ -2465,7 +2466,6 @@ int r600_resume(struct radeon_device *rdev)
 	/* post card */
 	atom_asic_init(rdev->mode_info.atom_context);
 
-	rdev->accel_working = true;
 	r = r600_startup(rdev);
 	if (r) {
 		DRM_ERROR("r600 startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 066c98b..c4d55af 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -992,17 +992,20 @@ int radeon_gpu_reset(struct radeon_device *rdev)
 	radeon_save_bios_scratch_regs(rdev);
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
+	rdev->accel_working = false;
 	radeon_suspend(rdev);
 
 	r = radeon_asic_reset(rdev);
 	if (!r) {
 		dev_info(rdev->dev, "GPU reset succeed\n");
 		radeon_resume(rdev);
-		radeon_restore_bios_scratch_regs(rdev);
-		drm_helper_resume_force_mode(rdev->ddev);
-		ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
 	}
 
+	/* no matter what restore video mode */
+	radeon_restore_bios_scratch_regs(rdev);
+	drm_helper_resume_force_mode(rdev->ddev);
+	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
 		dev_info(rdev->dev, "GPU reset failed\n");
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 830f1a7..27e8e53 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -89,6 +89,13 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain)
 	rbo->placement.lpfn = 0;
 	rbo->placement.placement = rbo->placements;
 	rbo->placement.busy_placement = rbo->placements;
+	if (!rbo->rdev->accel_working) {
+		/* force new bo to system ram when GPU is not working */
+		rbo->placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM;
+		rbo->placement.num_placement = c;
+		rbo->placement.num_busy_placement = c;
+		return;
+	}
 	if (domain & RADEON_GEM_DOMAIN_VRAM)
 		rbo->placements[c++] = TTM_PL_FLAG_WC | TTM_PL_FLAG_UNCACHED |
 					TTM_PL_FLAG_VRAM;
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index c94a225..b17a951 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -215,6 +215,25 @@ static void radeon_move_null(struct ttm_buffer_object *bo,
 	new_mem->mm_node = NULL;
 }
 
+static void radeon_move_noop(struct ttm_buffer_object *bo,
+			     struct ttm_mem_reg *new_mem)
+{
+	struct ttm_bo_device *bdev = bo->bdev;
+	struct ttm_mem_type_manager *man = &bdev->man[new_mem->mem_type];
+	struct ttm_mem_reg *old_mem = &bo->mem;
+	struct ttm_mem_reg old_copy = *old_mem;
+
+	*old_mem = *new_mem;
+	new_mem->mm_node = NULL;
+
+	if ((man->flags & TTM_MEMTYPE_FLAG_FIXED) && (bo->ttm != NULL)) {
+		ttm_tt_destroy(bo->ttm);
+		bo->ttm = NULL;
+	}
+
+	ttm_bo_mem_put(bo, &old_copy);
+}
+
 static int radeon_move_blit(struct ttm_buffer_object *bo,
 			bool evict, int no_wait_reserve, bool no_wait_gpu,
 			struct ttm_mem_reg *new_mem,
@@ -399,6 +418,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo,
 		radeon_move_null(bo, new_mem);
 		return 0;
 	}
+	if (!rdev->accel_working) {
+		/* when accel is not working GPU is in broken state just
+		 * do nothing for any ttm operation to avoid making the
+		 * situation worse than it is
+		 */
+		radeon_move_noop(bo, new_mem);
+		return 0;
+	}
 	if ((old_mem->mem_type == TTM_PL_TT &&
 	     new_mem->mem_type == TTM_PL_SYSTEM) ||
 	    (old_mem->mem_type == TTM_PL_SYSTEM &&
@@ -545,6 +572,13 @@ static int radeon_ttm_backend_bind(struct ttm_tt *ttm,
 		WARN(1, "nothing to bind %lu pages for mreg %p back %p!\n",
 		     ttm->num_pages, bo_mem, ttm);
 	}
+	if (!gtt->rdev->accel_working) {
+		/* when accel is not working GPU is in broken state just
+		 * do nothing for any ttm operation to avoid making the
+		 * situation worse than it is
+		 */
+		return 0;
+	}
 	r = radeon_gart_bind(gtt->rdev, gtt->offset,
 			     ttm->num_pages, ttm->pages, gtt->ttm.dma_address);
 	if (r) {
@@ -559,6 +593,13 @@ static int radeon_ttm_backend_unbind(struct ttm_tt *ttm)
 {
 	struct radeon_ttm_tt *gtt = (void *)ttm;
 
+	if (!gtt->rdev->accel_working) {
+		/* when accel is not working GPU is in broken state just
+		 * do nothing for any ttm operation to avoid making the
+		 * situation worse than it is
+		 */
+		return 0;
+	}
 	radeon_gart_unbind(gtt->rdev, gtt->offset, ttm->num_pages);
 	return 0;
 }
diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c
index a464eb5..39fac11 100644
--- a/drivers/gpu/drm/radeon/rs400.c
+++ b/drivers/gpu/drm/radeon/rs400.c
@@ -404,6 +404,7 @@ static int rs400_startup(struct radeon_device *rdev)
 	r = rs400_gart_enable(rdev);
 	if (r)
 		return r;
+	rdev->accel_working = true;
 
 	/* allocate wb buffer */
 	r = radeon_wb_init(rdev);
@@ -460,7 +461,6 @@ int rs400_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r = rs400_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c
index e95c5e6..5633203 100644
--- a/drivers/gpu/drm/radeon/rs600.c
+++ b/drivers/gpu/drm/radeon/rs600.c
@@ -886,6 +886,7 @@ static int rs600_startup(struct radeon_device *rdev)
 	r = rs600_gart_enable(rdev);
 	if (r)
 		return r;
+	rdev->accel_working = true;
 
 	/* allocate wb buffer */
 	r = radeon_wb_init(rdev);
@@ -946,7 +947,6 @@ int rs600_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r = rs600_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c
index 159b6a4..633220e 100644
--- a/drivers/gpu/drm/radeon/rs690.c
+++ b/drivers/gpu/drm/radeon/rs690.c
@@ -615,6 +615,7 @@ static int rs690_startup(struct radeon_device *rdev)
 	r = rs400_gart_enable(rdev);
 	if (r)
 		return r;
+	rdev->accel_working = true;
 
 	/* allocate wb buffer */
 	r = radeon_wb_init(rdev);
@@ -675,7 +676,6 @@ int rs690_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r = rs690_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 7f08ced..81af328 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -386,6 +386,7 @@ static int rv515_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 
 	/* allocate wb buffer */
 	r = radeon_wb_init(rdev);
@@ -441,7 +442,6 @@ int rv515_resume(struct radeon_device *rdev)
 	/* Initialize surface registers */
 	radeon_surface_init(rdev);
 
-	rdev->accel_working = true;
 	r =  rv515_startup(rdev);
 	if (r) {
 		rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
index b4f51c5..95a594c 100644
--- a/drivers/gpu/drm/radeon/rv770.c
+++ b/drivers/gpu/drm/radeon/rv770.c
@@ -910,6 +910,7 @@ static int rv770_startup(struct radeon_device *rdev)
 		if (r)
 			return r;
 	}
+	rdev->accel_working = true;
 
 	rv770_gpu_init(rdev);
 	r = r600_blit_init(rdev);
@@ -979,7 +980,6 @@ int rv770_resume(struct radeon_device *rdev)
 	/* post card */
 	atom_asic_init(rdev->mode_info.atom_context);
 
-	rdev->accel_working = true;
 	r = rv770_startup(rdev);
 	if (r) {
 		DRM_ERROR("r600 startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index c7b61f1..813cc34 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -3675,6 +3675,7 @@ static int si_startup(struct radeon_device *rdev)
 	r = si_pcie_gart_enable(rdev);
 	if (r)
 		return r;
+	rdev->accel_working = true;
 	si_gpu_init(rdev);
 
 #if 0
@@ -3795,7 +3796,6 @@ int si_resume(struct radeon_device *rdev)
 	/* post card */
 	atom_asic_init(rdev->mode_info.atom_context);
 
-	rdev->accel_working = true;
 	r = si_startup(rdev);
 	if (r) {
 		DRM_ERROR("si startup failed on resume\n");
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index fa09daf..f7bdb04 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -181,6 +181,7 @@ void ttm_tt_destroy(struct ttm_tt *ttm)
 	ttm->swap_storage = NULL;
 	ttm->func->destroy(ttm);
 }
+EXPORT_SYMBOL(ttm_tt_destroy);
 
 int ttm_tt_init(struct ttm_tt *ttm, struct ttm_bo_device *bdev,
 		unsigned long size, uint32_t page_flags,
-- 
1.7.10.2



More information about the dri-devel mailing list