[Intel-xe] [PATCH] drm/xe: fix pvc unload issue

Chang, Bruce yu.bruce.chang at intel.com
Thu Mar 30 15:40:26 UTC 2023


Currently, unload pvc driver will generate a null dereference
and the call stack is as below.

[ 4850.618000] Call Trace:
[ 4850.620740]  <TASK>
[ 4850.623134]  ttm_bo_cleanup_memtype_use+0x3f/0x50 [ttm]
[ 4850.628661]  ttm_bo_release+0x154/0x2c0 [ttm]
[ 4850.633317]  ? drm_buddy_fini+0x62/0x80 [drm_buddy]
[ 4850.638487]  ? __kmem_cache_free+0x27d/0x2c0
[ 4850.643054]  ttm_bo_put+0x38/0x60 [ttm]
[ 4850.647190]  xe_gem_object_free+0x1f/0x30 [xe]
[ 4850.651945]  drm_gem_object_free+0x1e/0x30 [drm]
[ 4850.656904]  ggtt_fini_noalloc+0x9d/0xe0 [xe]
[ 4850.661574]  drm_managed_release+0xb5/0x150 [drm]
[ 4850.666617]  drm_dev_release+0x30/0x50 [drm]
[ 4850.671209]  devm_drm_dev_init_release+0x3c/0x60 [drm]

There are a couple issues, but the main one is due to TTM has only
one TTM_PL_TT region, but since pvc has 2 tiles and tries to setup
1 TTM_PL_TT each tile. The second will overwrite the first one.

During unload time, the first tile will reset the TTM_PL_TT manger
and when the second tile is trying to free Bo and it will generate
the null reference since the TTM manage is already got reset to 0.

The fix is to share the TTM_PL_TT manager and use a count to only
allow the last instance to release.

Cc: Stuart Summers <stuart.summers at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Signed-off-by: Bruce Chang <yu.bruce.chang at intel.com>
---
 drivers/gpu/drm/xe/xe_device_types.h |  3 +++
 drivers/gpu/drm/xe/xe_gt.c           |  9 +++++++--
 drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c  | 25 +++++++++++++++----------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 88f863edc41c..ea771f120c0f 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -195,6 +195,9 @@ struct xe_device {
 			/** @mapping: pointer to VRAM mappable space */
 			void *__iomem mapping;
 		} vram;
+		/** @gtt_mr: GTT TTM manager */
+		struct xe_ttm_gtt_mgr *gtt_mgr;
+		int instance;
 	} mem;
 
 	/** @usm: unified memory state */
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index fd7a5b43ba3e..e7ca141478ef 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -77,8 +77,13 @@ int xe_gt_alloc(struct xe_device *xe, struct xe_gt *gt)
 		if (!gt->mem.vram_mgr)
 			return -ENOMEM;
 
-		gt->mem.gtt_mgr = drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr),
-					       GFP_KERNEL);
+		if (!xe->mem.gtt_mgr) {
+			xe->mem.gtt_mgr =
+				drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr),
+					     GFP_KERNEL);
+			xe->mem.instance = 0;
+		}
+		gt->mem.gtt_mgr = xe->mem.gtt_mgr;
 		if (!gt->mem.gtt_mgr)
 			return -ENOMEM;
 	} else {
diff --git a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
index 8075781070f2..05300c71928e 100644
--- a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
@@ -94,6 +94,9 @@ static void ttm_gtt_mgr_fini(struct drm_device *drm, void *arg)
 	struct ttm_resource_manager *man = &mgr->manager;
 	int err;
 
+	if (--xe->mem.instance)
+		return;
+
 	ttm_resource_manager_set_used(man, false);
 
 	err = ttm_resource_manager_evict_all(&xe->ttm, man);
@@ -113,18 +116,20 @@ int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr,
 
 	XE_BUG_ON(xe_gt_is_media_type(gt));
 
-	mgr->gt = gt;
-	man->use_tt = true;
-	man->func = &xe_ttm_gtt_mgr_func;
-
-	ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
+	if (!xe->mem.instance) {
+		mgr->gt = gt;
+		man->use_tt = true;
+		man->func = &xe_ttm_gtt_mgr_func;
 
-	ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager);
-	ttm_resource_manager_set_used(man, true);
+		ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
 
-	err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr);
-	if (err)
-		return err;
+		ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager);
+		ttm_resource_manager_set_used(man, true);
+		err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr);
+		if (err)
+			return err;
+	}
+	xe->mem.instance ++;
 
 	return 0;
 }
-- 
2.25.1



More information about the Intel-xe mailing list