[PATCH 1/2] drm/xe: Fix tlb invalidation when wedging
Matthew Brost
matthew.brost at intel.com
Fri Jan 3 01:02:22 UTC 2025
On Thu, Jan 02, 2025 at 04:11:10PM -0800, Lucas De Marchi wrote:
> If GuC fails to load, the driver wedges, but in the process it tries to
> do stuff that may not be initialized yet. This moves the
> xe_gt_tlb_invalidation_init() to be done earlier: as its own doc says,
> it's a software-only initialization and should have been named with the
> _early() suffix.
>
> Move it to be called by xe_gt_init_early(), so the locks and seqno are
> initialized, avoiding a NULL ptr deref when wedging:
>
> xe 0000:03:00.0: [drm] *ERROR* GT0: load failed: status: Reset = 0, BootROM = 0x50, UKernel = 0x00, MIA = 0x00, Auth = 0x01
> xe 0000:03:00.0: [drm] *ERROR* GT0: firmware signature verification failed
> xe 0000:03:00.0: [drm] *ERROR* CRITICAL: Xe has declared device 0000:03:00.0 as wedged.
> ...
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 9 UID: 0 PID: 3908 Comm: modprobe Tainted: G U W 6.13.0-rc4-xe+ #3
> Tainted: [U]=USER, [W]=WARN
> Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-S ADP-S DDR5 UDIMM CRB, BIOS ADLSFWI1.R00.3275.A00.2207010640 07/01/2022
> RIP: 0010:xe_gt_tlb_invalidation_reset+0x75/0x110 [xe]
>
> This can be easily triggered by poking the GuC binary to force a
> signature failure. There will still be an extra message,
>
> xe 0000:03:00.0: [drm] *ERROR* GT0: GuC mmio request 0x4100: no reply 0x4100
>
> but that's better than a NULL ptr deref.
>
> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3956
> Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 8 ++++----
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 4 ++--
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 3 ++-
> 3 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 41ab7fbebc193..26e64530ada27 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -387,6 +387,10 @@ int xe_gt_init_early(struct xe_gt *gt)
> xe_force_wake_init_gt(gt, gt_to_fw(gt));
> spin_lock_init(&gt->global_invl_lock);
>
> + err = xe_gt_tlb_invalidation_init_early(gt);
> + if (err)
> + return err;
> +
> return 0;
> }
>
> @@ -588,10 +592,6 @@ int xe_gt_init(struct xe_gt *gt)
> xe_hw_fence_irq_init(&gt->fence_irq[i]);
> }
>
> - err = xe_gt_tlb_invalidation_init(gt);
> - if (err)
> - return err;
> -
> err = xe_gt_pagefault_init(gt);
> if (err)
> return err;
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index 665927b80e9ea..257b500e17037 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -106,7 +106,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> }
>
> /**
> - * xe_gt_tlb_invalidation_init - Initialize GT TLB invalidation state
> + * xe_gt_tlb_invalidation_init_early - Initialize GT TLB invalidation state
> * @gt: graphics tile
> *
> * Initialize GT TLB invalidation state, purely software initialization, should
> @@ -114,7 +114,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> *
> * Return: 0 on success, negative error code on error.
> */
> -int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
> +int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt)
> {
> gt->tlb_invalidation.seqno = 1;
> INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> index 00b1c6c01e8d9..672acfcdf0d70 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> @@ -14,7 +14,8 @@ struct xe_gt;
> struct xe_guc;
> struct xe_vma;
>
> -int xe_gt_tlb_invalidation_init(struct xe_gt *gt);
> +int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt);
> +
> void xe_gt_tlb_invalidation_reset(struct xe_gt *gt);
> int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt);
> int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
> --
> 2.47.0
>
More information about the Intel-xe
mailing list