<div id="divRplyFwdMsg" dir="ltr"><span style="font-family: Calibri, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);"><b>From:</b> Cavitt, Jonathan <jonathan.cavitt@intel.com><br>
<b>Sent:</b> Friday, February 28, 2025 10:21 AM<br>
<b>To:</b> intel-xe@lists.freedesktop.org <intel-xe@lists.freedesktop.org><br>
<b>Cc:</b> Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Cavitt, Jonathan <jonathan.cavitt@intel.com>; joonas.lahtinen@linux.intel.com <joonas.lahtinen@linux.intel.com>; Brost, Matthew <matthew.brost@intel.com>; Zhang, Jianxun
<jianxun.zhang@intel.com>; dri-devel@lists.freedesktop.org <dri-devel@lists.freedesktop.org><br>
<b>Subject:</b> [PATCH v3 3/6] drm/xe/xe_vm: Add per VM pagefault info</span>
<div> </div>
</div>
<div class="elementToProof" style="font-size: 11pt;">Add additional information to each VM so they can report up to the last<br>
50 seen pagefaults. Only failed pagefaults are saved this way, as<br>
successful pagefaults should recover and not need to be reported to<br>
userspace.<br>
<br>
Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com><br>
Suggested-by: Matthew Brost <matthew.brost@intel.com><br>
---<br>
 drivers/gpu/drm/xe/xe_gt_pagefault.c | 17 +++++++++++
 drivers/gpu/drm/xe/xe_vm.c           | 47 +++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_vm.h           |  6 ++++
 drivers/gpu/drm/xe/xe_vm_types.h     | 20 +++++++++++++
 4 files changed, 90 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 07b52d3c1a60..84907fb4295e 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -335,6 +335,22 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	return full ? -ENOSPC : 0;
 }
 
+static void save_pagefault_to_vm(struct xe_device *xe, struct xe_pagefault *pf)
+{
+	struct xe_vm *vm;
+	struct xe_pagefault *store;
+
+	vm = asid_to_vm(xe, pf->asid);
+	if (IS_ERR(vm))
+		return;
+
+	store = kmemdup(pf, sizeof(*pf), GFP_KERNEL);
+	if (!store)
+		return;
+
+	xe_vm_add_pf_entry(vm, store);
+}
+
 #define USM_QUEUE_MAX_RUNTIME_MS 20
 
 static void pf_queue_work_func(struct work_struct *w)
@@ -353,6 +369,7 @@ static void pf_queue_work_func(struct work_struct *w)
 		ret = handle_pagefault(gt, &pf);
 		if (unlikely(ret)) {
 			print_pagefault(xe, &pf);
+			save_pagefault_to_vm(xe, &pf);
 			pf.fault_unsuccessful = 1;
 			drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
 		}
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 996000f2424e..6211b971bbbd 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -746,6 +746,48 @@ int xe_vm_userptr_check_repin(struct xe_vm *vm)
 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
 }
 
+static void free_pf_entry(struct xe_vm *vm, struct xe_vm_pf_entry *e)
+{
+	list_del(&e->list);
+	kfree(e->pf);
+	kfree(e);
+	vm->pfs.len--;
+}
+
+void xe_vm_add_pf_entry(struct xe_vm *vm, struct xe_pagefault *pf)
+{
+	struct xe_vm_pf_entry *e;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (!e) {
+		kfree(pf);
+		return;
+	}
+	e->pf = pf;
+
+	spin_lock(&vm->pfs.lock);
+	list_add_tail(&e->list, &vm->pfs.list);
+	vm->pfs.len++;
+	/* Limit the number of pfs in the pf list to prevent memory overuse. */
+	if (vm->pfs.len > MAX_PFS) {
+		struct xe_vm_pf_entry *rem =
+			list_first_entry(&vm->pfs.list, struct xe_vm_pf_entry, list);
+
<div class="elementToProof" style="font-size: 11pt; color: rgb(0, 0, 0);">I think the first page fault could be more valuable than the following in actual debug work though I cannot provide a concrete case. Maybe we should just stop adding new page faults once
the list is full? 50 faults perphaps is enough for a developer to work out...</div>
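
Something along these lines is what I have in mind, as an untested sketch that reuses the names from this patch (vm->pfs, MAX_PFS, e, pf), so treat it as illustration only:

	spin_lock(&vm->pfs.lock);
	/* Keep the earliest faults: drop new entries once the list is full. */
	if (vm->pfs.len >= MAX_PFS) {
		spin_unlock(&vm->pfs.lock);
		kfree(pf);
		kfree(e);
		return;
	}
	list_add_tail(&e->list, &vm->pfs.list);
	vm->pfs.len++;
	spin_unlock(&vm->pfs.lock);

That would also make the oldest-entry eviction below unnecessary.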
<div class="elementToProof" style="font-size: 11pt;">+ free_pf_entry(vm, rem);</div>
<div class="elementToProof" style="font-size: 11pt;"><br>
+ }<br>
+ spin_unlock(&vm->pfs.lock);<br>
+}<br>
+
+void xe_vm_remove_pf_entries(struct xe_vm *vm)
+{
+	struct xe_vm_pf_entry *e, *tmp;
+
+	spin_lock(&vm->pfs.lock);
+	list_for_each_entry_safe(e, tmp, &vm->pfs.list, list)
+		free_pf_entry(vm, e);
+	spin_unlock(&vm->pfs.lock);
+}
+
 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
 {
 	int i;
@@ -1448,6 +1490,9 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 	init_rwsem(&vm->userptr.notifier_lock);
 	spin_lock_init(&vm->userptr.invalidated_lock);
 
+	INIT_LIST_HEAD(&vm->pfs.list);
+	spin_lock_init(&vm->pfs.lock);
+
 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
 
 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
@@ -1672,6 +1717,8 @@ void xe_vm_close_and_put(struct xe_vm *vm)
 	}
 	up_write(&xe->usm.lock);
 
+	xe_vm_remove_pf_entries(vm);
+
 	for_each_tile(tile, xe, id)
 		xe_range_fence_tree_fini(&vm->rftree[id]);
 
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index f66075f8a6fe..4d94ab5c8ea4 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -12,6 +12,8 @@
 #include "xe_map.h"
 #include "xe_vm_types.h"
 
+#define MAX_PFS 50
+
 struct drm_device;
 struct drm_printer;
 struct drm_file;
@@ -244,6 +246,10 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma);
 
 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma);
 
+void xe_vm_add_pf_entry(struct xe_vm *vm, struct xe_pagefault *pf);
+
+void xe_vm_remove_pf_entries(struct xe_vm *vm);
+
 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
 
 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 52467b9b5348..10b0952db56c 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -18,6 +18,7 @@
 #include "xe_range_fence.h"
 
 struct xe_bo;
+struct xe_pagefault;
 struct xe_sync_entry;
 struct xe_user_fence;
 struct xe_vm;
@@ -135,6 +136,13 @@ struct xe_userptr_vma {
 
 struct xe_device;
 
+struct xe_vm_pf_entry {
+	/** @pf: observed pagefault */
+	struct xe_pagefault *pf;
+	/** @list: link into @xe_vm.pfs.list */
+	struct list_head list;
+};
+
 struct xe_vm {
 	/** @gpuvm: base GPUVM used to track VMAs */
 	struct drm_gpuvm gpuvm;
@@ -274,6 +282,18 @@ struct xe_vm {
 		bool capture_once;
 	} error_capture;
 
+	/**
+	 * @pfs: List of all pagefaults associated with this VM
+	 */
+	struct {
+		/** @lock: lock protecting @pfs.list */
+		spinlock_t lock;
+		/** @list: list of xe_vm_pf_entry entries */
+		struct list_head list;
+		/** @len: length of @pfs.list */
+		unsigned int len;
+	} pfs;
+
 	/**
 	 * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
 	 * protected by the vm resv.
--
2.43.0