[PATCH v2] drm/xe: improve hibernation on igpu
Lucas De Marchi
lucas.demarchi at intel.com
Fri Nov 1 17:38:19 UTC 2024
On Fri, Nov 01, 2024 at 05:01:57PM +0000, Matthew Auld wrote:
>The GGTT looks to be stored inside stolen memory on igpu which is not
>treated as normal RAM. The core kernel skips this memory range when
>creating the hibernation image, therefore when coming back from
Can you add the e820 mapping log to confirm?
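Something like this from dmesg should show it, with the stolen range
reported as reserved rather than usable (addresses below are made up,
just to illustrate what to look for):

  BIOS-e820: [mem 0x0000000100000000-0x000000044f7fffff] usable
  BIOS-e820: [mem 0x000000044f800000-0x000000046fffffff] reserved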
>hibernation the GGTT programming is lost. This seems to cause issues
>with broken resume where GuC FW fails to load:
>
>[drm] *ERROR* GT0: load failed: status = 0x400000A0, time = 10ms, freq = 1250MHz (req 1300MHz), done = -1
>[drm] *ERROR* GT0: load failed: status: Reset = 0, BootROM = 0x50, UKernel = 0x00, MIA = 0x00, Auth = 0x01
>[drm] *ERROR* GT0: firmware signature verification failed
>[drm] *ERROR* CRITICAL: Xe has declared device 0000:00:02.0 as wedged.
It seems the log above is cut short. Just above these lines, don't you
have a log from __xe_guc_upload? That would mean we do upload the
firmware to stolen again, so it shouldn't matter that we lost the
contents while hibernating.

It'd be good to know the size of the RSA key in the failing scenarios
(see the debug sketch below). It also seems this reproduces on DG2, and
I wonder if it's the same issue or something different:
[drm:__xe_guc_upload.isra.0 [xe]] GT0: load still in progress, timeouts = 0, freq = 1700MHz (req 2050MHz), status = 0x00000064 [0x32/00]
[drm:__xe_guc_upload.isra.0 [xe]] GT0: load still in progress, timeouts = 0, freq = 1700MHz (req 2050MHz), status = 0x00000072 [0x39/00]
[drm:__xe_guc_upload.isra.0 [xe]] GT0: load still in progress, timeouts = 0, freq = 1700MHz (req 2050MHz), status = 0x00000086 [0x43/00]
[drm] *ERROR* GT0: load failed: status = 0x400000A0, time = 5ms, freq = 1700MHz (req 2050MHz), done = -1
[drm] *ERROR* GT0: load failed: status: Reset = 0, BootROM = 0x50, UKernel = 0x00, MIA = 0x00, Auth = 0x01
[drm] *ERROR* GT0: firmware signature verification failed
Cc Ulisses.
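To get the RSA size, something like the below in the upload path would
do it. Untested, and it assumes xe_uc_fw tracks the RSA blob size in a
rsa_size field like i915's intel_uc_fw does:

	/* hypothetical debug print; the rsa_size field name is assumed */
	xe_gt_dbg(gt, "GuC RSA size: %u\n", guc->fw.rsa_size);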
>
>Current GGTT users are kernel internal and tracked as pinned, so it
>should be possible to hook into the existing save/restore logic that we
>use for dgpu, where the actual evict is skipped but on restore we
>importantly restore the GGTT programming. This has been confirmed to
>fix hibernation on at least ADL and MTL, though likely all igpu
>platforms are affected.
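For anyone following along, my understanding of the resume flow this
hooks into, from reading xe_pm.c / xe_bo_evict.c (so take with a grain
of salt):

	xe_pm_resume()
	  -> xe_bo_restore_kernel()     /* walks xe->pinned.evicted */
	    -> xe_bo_restore_pinned()   /* re-validates the BO and, per the
	                                   commit message, restores its GGTT
	                                   programming */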
>
>This also means we have a hole in our testing, where the existing s4
>tests only really test the driver hooks, and don't go as far as actually
>rebooting and restoring from the hibernation image and in turn powering
>down RAM (and therefore losing the contents of stolen).
Yeah, the problem is that when we enable the full sequence, we
reproduce all kinds of issues in other parts of the kernel and the
userspace environment, leading to flaky tests that are usually red in
CI. The most annoying one is the network not coming back, so we mark
the test as a failure (actually an abort, since we stop running
everything after that).
>
>v2 (Brost)
> - Remove extra newline and drop unnecessary parentheses.
>
>Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
>Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3275
>Signed-off-by: Matthew Auld <matthew.auld at intel.com>
>Cc: Matthew Brost <matthew.brost at intel.com>
>Cc: <stable at vger.kernel.org> # v6.8+
>Reviewed-by: Matthew Brost <matthew.brost at intel.com>
>---
> drivers/gpu/drm/xe/xe_bo.c | 37 ++++++++++++++------------------
> drivers/gpu/drm/xe/xe_bo_evict.c | 6 ------
> 2 files changed, 16 insertions(+), 27 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>index 8286cbc23721..549866da5cd1 100644
>--- a/drivers/gpu/drm/xe/xe_bo.c
>+++ b/drivers/gpu/drm/xe/xe_bo.c
>@@ -952,7 +952,10 @@ int xe_bo_restore_pinned(struct xe_bo *bo)
> if (WARN_ON(!xe_bo_is_pinned(bo)))
> return -EINVAL;
>
>- if (WARN_ON(xe_bo_is_vram(bo) || !bo->ttm.ttm))
>+ if (WARN_ON(xe_bo_is_vram(bo)))
>+ return -EINVAL;
>+
>+ if (WARN_ON(!bo->ttm.ttm && !xe_bo_is_stolen(bo)))
> return -EINVAL;
>
> if (!mem_type_is_vram(place->mem_type))
>@@ -1774,6 +1777,7 @@ int xe_bo_pin_external(struct xe_bo *bo)
>
> int xe_bo_pin(struct xe_bo *bo)
> {
>+ struct ttm_place *place = &bo->placements[0];
> struct xe_device *xe = xe_bo_device(bo);
> int err;
>
>@@ -1804,8 +1808,6 @@ int xe_bo_pin(struct xe_bo *bo)
> */
> if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
> bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
>- struct ttm_place *place = &(bo->placements[0]);
>-
> if (mem_type_is_vram(place->mem_type)) {
> xe_assert(xe, place->flags & TTM_PL_FLAG_CONTIGUOUS);
>
>@@ -1813,13 +1815,12 @@ int xe_bo_pin(struct xe_bo *bo)
> vram_region_gpu_offset(bo->ttm.resource)) >> PAGE_SHIFT;
> place->lpfn = place->fpfn + (bo->size >> PAGE_SHIFT);
> }
>+ }
>
>- if (mem_type_is_vram(place->mem_type) ||
>- bo->flags & XE_BO_FLAG_GGTT) {
>- spin_lock(&xe->pinned.lock);
>- list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
>- spin_unlock(&xe->pinned.lock);
>- }
>+ if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
Should this also test for devmem, so we restore everything rather than
just the BOs with a GGTT mapping?
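i.e. something like this (untested, just to illustrate; not sure
xe_bo_is_stolen() is the right predicate here):

	if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT ||
	    xe_bo_is_stolen(bo)) {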
Lucas De Marchi
>+ spin_lock(&xe->pinned.lock);
>+ list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
>+ spin_unlock(&xe->pinned.lock);
> }
>
> ttm_bo_pin(&bo->ttm);
>@@ -1867,24 +1868,18 @@ void xe_bo_unpin_external(struct xe_bo *bo)
>
> void xe_bo_unpin(struct xe_bo *bo)
> {
>+ struct ttm_place *place = &bo->placements[0];
> struct xe_device *xe = xe_bo_device(bo);
>
> xe_assert(xe, !bo->ttm.base.import_attach);
> xe_assert(xe, xe_bo_is_pinned(bo));
>
>- if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
>- bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
>- struct ttm_place *place = &(bo->placements[0]);
>-
>- if (mem_type_is_vram(place->mem_type) ||
>- bo->flags & XE_BO_FLAG_GGTT) {
>- spin_lock(&xe->pinned.lock);
>- xe_assert(xe, !list_empty(&bo->pinned_link));
>- list_del_init(&bo->pinned_link);
>- spin_unlock(&xe->pinned.lock);
>- }
>+ if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
>+ spin_lock(&xe->pinned.lock);
>+ xe_assert(xe, !list_empty(&bo->pinned_link));
>+ list_del_init(&bo->pinned_link);
>+ spin_unlock(&xe->pinned.lock);
> }
>-
> ttm_bo_unpin(&bo->ttm);
> }
>
>diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
>index 32043e1e5a86..b01bc20eb90b 100644
>--- a/drivers/gpu/drm/xe/xe_bo_evict.c
>+++ b/drivers/gpu/drm/xe/xe_bo_evict.c
>@@ -34,9 +34,6 @@ int xe_bo_evict_all(struct xe_device *xe)
> u8 id;
> int ret;
>
>- if (!IS_DGFX(xe))
>- return 0;
>-
> /* User memory */
> for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
> struct ttm_resource_manager *man =
>@@ -125,9 +122,6 @@ int xe_bo_restore_kernel(struct xe_device *xe)
> struct xe_bo *bo;
> int ret;
>
>- if (!IS_DGFX(xe))
>- return 0;
>-
> spin_lock(&xe->pinned.lock);
> for (;;) {
> bo = list_first_entry_or_null(&xe->pinned.evicted,
>--
>2.47.0
>