[Intel-xe] [PATCH 1/1] drm/xe: Fix pagefault and access counter worker functions

Brian Welty brian.welty at intel.com
Wed Nov 1 22:52:44 UTC 2023


When processing G2H messages for pagefault or access counters, we queue a
work item and call queue_work(). queue_work() returns false and does nothing
if the work is already pending, so no extra worker run is scheduled.
The expectation is that the worker function will do more than process a
single item and return. It needs to either process all pending items or
requeue itself (when items are pending) before returning.

Here, we choose to requeue the worker if faults are pending. This will
introduce some unnecessary latency, but if this is the preference, we can
start here and tune this later.

This resolves issues seen with several igt@xe_exec_fault_mode subtests
where the GPU will hang when KMD ignores a pending pagefault.

v2: requeue the worker instead of having an internal processing loop.
    Likely, we want a hybrid model which processes more than a single
    fault before requeuing.  (Matt B)

Signed-off-by: Brian Welty <brian.welty at intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c | 33 ++++++++++++++++++----------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index ab6daebbd77c..9b082817c877 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -276,10 +276,11 @@ static void print_pagefault(struct xe_device *xe, struct pagefault *pf)
 
 #define PF_MSG_LEN_DW	4
 
-static int get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
+static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf,
+			  bool *pending)
 {
 	const struct xe_guc_pagefault_desc *desc;
-	int ret = 0;
+	bool ret = false;
 
 	spin_lock_irq(&pf_queue->lock);
 	if (pf_queue->head != pf_queue->tail) {
@@ -303,8 +304,8 @@ static int get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
 
 		pf_queue->head = (pf_queue->head + PF_MSG_LEN_DW) %
 			PF_QUEUE_NUM_DW;
-	} else {
-		ret = -1;
+		*pending = pf_queue->head != pf_queue->tail;
+		ret = true;
 	}
 	spin_unlock_irq(&pf_queue->lock);
 
@@ -355,10 +356,10 @@ static void pf_queue_work_func(struct work_struct *w)
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_guc_pagefault_reply reply = {};
 	struct pagefault pf = {};
+	bool requeue;
 	int ret;
 
-	ret = get_pagefault(pf_queue, &pf);
-	if (ret)
+	if (!get_pagefault(pf_queue, &pf, &requeue))
 		return;
 
 	ret = handle_pagefault(gt, &pf);
@@ -380,6 +381,10 @@ static void pf_queue_work_func(struct work_struct *w)
 		FIELD_PREP(PFR_PDATA, pf.pdata);
 
 	send_pagefault_reply(&gt->uc.guc, &reply);
+
+	/* TODO: evaluate to service 'x' number of faults before requeue */
+	if (requeue)
+		queue_work(gt->usm.pf_wq, w);
 }
 
 static void acc_queue_work_func(struct work_struct *w);
@@ -544,10 +549,10 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
 
 #define ACC_MSG_LEN_DW        4
 
-static int get_acc(struct acc_queue *acc_queue, struct acc *acc)
+static bool get_acc(struct acc_queue *acc_queue, struct acc *acc, bool *pending)
 {
 	const struct xe_guc_acc_desc *desc;
-	int ret = 0;
+	bool ret = false;
 
 	spin_lock(&acc_queue->lock);
 	if (acc_queue->head != acc_queue->tail) {
@@ -567,8 +572,8 @@ static int get_acc(struct acc_queue *acc_queue, struct acc *acc)
 
 		acc_queue->head = (acc_queue->head + ACC_MSG_LEN_DW) %
 				  ACC_QUEUE_NUM_DW;
-	} else {
-		ret = -1;
+		*pending = acc_queue->head != acc_queue->tail;
+		ret = true;
 	}
 	spin_unlock(&acc_queue->lock);
 
@@ -581,10 +586,10 @@ static void acc_queue_work_func(struct work_struct *w)
 	struct xe_gt *gt = acc_queue->gt;
 	struct xe_device *xe = gt_to_xe(gt);
 	struct acc acc = {};
+	bool requeue;
 	int ret;
 
-	ret = get_acc(acc_queue, &acc);
-	if (ret)
+	if (!get_acc(acc_queue, &acc, &requeue))
 		return;
 
 	ret = handle_acc(gt, &acc);
@@ -592,6 +597,10 @@ static void acc_queue_work_func(struct work_struct *w)
 		print_acc(xe, &acc);
 		drm_warn(&xe->drm, "ACC: Unsuccessful %d\n", ret);
 	}
+
+	/* TODO: evaluate to service 'x' number of acc before requeue */
+	if (requeue)
+		queue_work(gt->usm.acc_wq, w);
 }
 
 static bool acc_queue_full(struct acc_queue *acc_queue)
-- 
2.38.0



More information about the Intel-xe mailing list