[PATCH v2 2/2] drm/xe: upon page fault, use generic uc fw instead of guc

Tue Apr 29 17:07:14 UTC 2025

On Tue, Apr 29, 2025 at 05:14:42PM +0300, Dafna Hirschfeld wrote:
> On 29.04.2025 15:49, Dafna Hirschfeld wrote:
> > On 28.04.2025 14:03, Matthew Brost wrote:
> > > On Tue, Feb 18, 2025 at 01:27:02PM +0200, Dafna Hirschfeld wrote:
> > > > Future asic will have different firmware to handle page faults.
> > > > Change the code in xe_gt_pagefault.c to use xe_uc_fw instead of
> > > > xe_guc and add a callback to struct xe_uc_fw to send the page
> > > > fault replay to the fw.
> > > > 
> > > 
> > > I think this is on the right track - the current code is clearly layered
> > > incorrectly.
> > > 
> > > > Signed-off-by: Dafna Hirschfeld <dafna.hirschfeld at intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_gt_pagefault.c | 24 ++++--------------------
> > > > drivers/gpu/drm/xe/xe_gt_pagefault.h |  6 +++---
> > > > drivers/gpu/drm/xe/xe_gt_types.h     |  2 ++
> > > > drivers/gpu/drm/xe/xe_guc.c          | 15 +++++++++++++++
> > > > drivers/gpu/drm/xe/xe_guc_ct.c       |  8 ++++----
> > > > drivers/gpu/drm/xe/xe_uc_fw_types.h  |  5 +++++
> > > > 6 files changed, 33 insertions(+), 27 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > > index 51c4bcc769d2..c1e076e14219 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > > @@ -11,13 +11,10 @@
> > > > #include <drm/drm_exec.h>
> > > > #include <drm/drm_managed.h>
> > > > 
> > > > -#include "abi/guc_actions_abi.h"
> > > > #include "xe_bo.h"
> > > > #include "xe_gt.h"
> > > > #include "xe_gt_stats.h"
> > > > #include "xe_gt_tlb_invalidation.h"
> > > > -#include "xe_guc.h"
> > > > -#include "xe_guc_ct.h"
> > > 
> > > Yes, removing all GuC specific code in xe_gt_pagefaul.c is a good idea.
> > > 
> > > > #include "xe_migrate.h"
> > > > #include "xe_trace_bo.h"
> > > > #include "xe_vm.h"
> > > > @@ -246,18 +243,6 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> > > > 	return err;
> > > > }
> > > > 
> > > > -static int send_pagefault_reply(struct xe_guc *guc,
> > > > -				struct xe_uc_pagefault_reply *reply)
> > > > -{
> > > > -	u32 action[] = {
> > > > -		XE_GUC_ACTION_PAGE_FAULT_RES_DESC,
> > > > -		reply->dw0,
> > > > -		reply->dw1,
> > > > -	};
> > > > -
> > > > -	return xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
> > > > -}
> > > > -
> > > > static void print_pagefault(struct xe_device *xe, struct pagefault *pf)
> > > > {
> > > > 	drm_dbg(&xe->drm, "\n\tASID: %d\n"
> > > > @@ -322,9 +307,8 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
> > > > 		PF_MSG_LEN_DW;
> > > > }
> > > > 
> > > > -int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > > > +int xe_pagefault_handler(struct xe_gt *gt, struct xe_uc_fw *fw, u32 *msg, u32 len)
> > > 
> > > s/xe_pagefault_handler/xe_gt_pagefault_handler/
> > > 
> > > I think this needs a interface refactor.
> > > 
> > > This is still assuming the 'msg' field will be the same format across
> > > firmwares which I don't think is a assumption we can safely make.
> > > 
> > > > {
> > > > -	struct xe_gt *gt = guc_to_gt(guc);
> > > > 	struct xe_device *xe = gt_to_xe(gt);
> > > > 	struct pf_queue *pf_queue;
> > > > 	unsigned long flags;
> > > > @@ -336,6 +320,7 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > > > 
> > > > 	asid = FIELD_GET(PFD_ASID, msg[1]);
> > > > 	pf_queue = gt->usm.pf_queue + (asid % NUM_PF_QUEUE);
> > > > +	pf_queue->fw = fw;
> > > > 
> > > 
> > > This doesn't work as if you have multiple FWs queuing up page faults
> > > this could be pointing to the wrong firmware by the time the fault is
> > > processed.
> > 
> > right, but this will work if setting the FW is inside the condition 'if (!full)' right?
> 
> oh, I see now , we have 'data' holding a queue of different msg opaques that might come
> from different firmwares
> 

Yep, I don't we should assume a single GT or PF msg queue is only being
feed by 1 firmware.

Matt

> Dafna
> 
> > 
> > > 
> > > > 	/*
> > > > 	 * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
> > > > @@ -390,7 +375,7 @@ static void pf_queue_work_func(struct work_struct *w)
> > > > 			FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
> > > > 			FIELD_PREP(PFR_PDATA, pf.pdata);
> > > > 
> > > > -		send_pagefault_reply(&gt->uc.guc, &reply);
> > > 
> > > I also don't think it is the job GT page fault layer to encode the
> > > response to the firmware as that could be different between firmwares
> > > too.
> > > 
> > > So roughly I think we something along these lines...
> > > 
> > > - The firmwares parse the fields which GT page fault handler needs to
> > > understand (page_addr, asid, access_type) and passing these in as
> > > arguments to xe_gt_pagefault_handler
> > > - The original FW message is also passed in, we define max length for
> > > this, this is just opaque data to GT page fault layer
> > > - The firmware passes in vfunc for send_pagefault_reply, this accepts
> > > original FW message + pass / fail value so the firmware can encode the
> > > reply
> > 
> > Why pass a vfunc instead of the FW itself like I did in this patch?
> > If we pass a vfunc, then it should also accept a pointer to the FW right?
> > 
> > Thanks,
> > Dafna
> > 
> > 
> > > - All the above is stored in a pf_queue entry
> > > - Internally in GT page fault layer we drop looking at firmware specifc
> > > 'struct pagefault' rather the new format which encodes page_addr,
> > > asid, access_type, the reply vfunc, and opaque firmware data.
> > > 
> > > This will be more data in the pf_queue but it makes the GT page fault
> > > layer truly opaque to any firmware interfaces.
> > > 
> > > Does this make sense?
> > > 
> > > Matt
> > > 
> > > > +		pf_queue->fw->send_pagefault_reply(gt, pf_queue->fw, &reply);
> > > > 
> > > > 		if (time_after(jiffies, threshold) &&
> > > > 		    pf_queue->tail != pf_queue->head) {
> > > > @@ -664,9 +649,8 @@ static bool acc_queue_full(struct acc_queue *acc_queue)
> > > > 		ACC_MSG_LEN_DW;
> > > > }
> > > > 
> > > > -int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > > > +int xe_access_counter_notify_handler(struct xe_gt *gt, struct xe_uc_fw *fw, u32 *msg, u32 len)
> > > > {
> > > > -	struct xe_gt *gt = guc_to_gt(guc);
> > > > 	struct acc_queue *acc_queue;
> > > > 	u32 asid;
> > > > 	bool full;
> > > > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.h b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > > > index 839c065a5e4c..0c01af3b7124 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > > > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > > > @@ -9,11 +9,11 @@
> > > > #include <linux/types.h>
> > > > 
> > > > struct xe_gt;
> > > > -struct xe_guc;
> > > > +struct xe_uc_fw;
> > > > 
> > > > int xe_gt_pagefault_init(struct xe_gt *gt);
> > > > void xe_gt_pagefault_reset(struct xe_gt *gt);
> > > > -int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len);
> > > > -int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len);
> > > > +int xe_pagefault_handler(struct xe_gt *gt, struct xe_uc_fw *fw, u32 *msg, u32 len);
> > > > +int xe_access_counter_notify_handler(struct xe_gt *gt, struct xe_uc_fw *fw, u32 *msg, u32 len);
> > > > 
> > > > #endif	/* _XE_GT_PAGEFAULT_ */
> > > > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> > > > index 6e66bf0e8b3f..73f7bd044828 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > > > @@ -249,6 +249,8 @@ struct xe_gt {
> > > > 		struct pf_queue {
> > > > 			/** @usm.pf_queue.gt: back pointer to GT */
> > > > 			struct xe_gt *gt;
> > > > +			/** @usm.pf_queue.fw the fw that handles the pf */
> > > > +			struct xe_uc_fw *fw;
> > > > 			/** @usm.pf_queue.data: data in the page fault queue */
> > > > 			u32 *data;
> > > > 			/**
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> > > > index 1619c0a52db9..1ea3678f1463 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc.c
> > > > @@ -5,6 +5,7 @@
> > > > 
> > > > #include "xe_guc.h"
> > > > 
> > > > +#include "xe_uc_fw_types.h"
> > > > #include <drm/drm_managed.h>
> > > > 
> > > > #include <generated/xe_wa_oob.h>
> > > > @@ -642,6 +643,19 @@ static int vf_guc_init(struct xe_guc *guc)
> > > > 	return 0;
> > > > }
> > > > 
> > > > +static int send_pagefault_reply(struct xe_gt *gt, struct xe_uc_fw *fw,
> > > > +				struct xe_uc_pagefault_reply *reply)
> > > > +{
> > > > +	struct xe_guc *guc = container_of(fw, struct xe_guc, fw);
> > > > +	u32 action[] = {
> > > > +		XE_GUC_ACTION_PAGE_FAULT_RES_DESC,
> > > > +		reply->dw0,
> > > > +		reply->dw1,
> > > > +	};
> > > > +
> > > > +	return xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
> > > > +}
> > > > +
> > > > int xe_guc_init(struct xe_guc *guc)
> > > > {
> > > > 	struct xe_device *xe = guc_to_xe(guc);
> > > > @@ -652,6 +666,7 @@ int xe_guc_init(struct xe_guc *guc)
> > > > 	ret = xe_uc_fw_init(&guc->fw);
> > > > 	if (ret)
> > > > 		goto out;
> > > > +	guc->fw.send_pagefault_reply = &send_pagefault_reply;
> > > > 
> > > > 	if (!xe_uc_fw_is_enabled(&guc->fw))
> > > > 		return 0;
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > index 72ad576fc18e..14ff5f255d45 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > @@ -1293,15 +1293,15 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
> > > > 								 adj_len);
> > > > 		break;
> > > > 	case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
> > > > -		ret = xe_guc_pagefault_handler(guc, payload, adj_len);
> > > > +		ret = xe_pagefault_handler(gt, &guc->fw, payload, adj_len);
> > > > 		break;
> > > > 	case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
> > > > 		ret = xe_guc_tlb_invalidation_done_handler(guc, payload,
> > > > 							   adj_len);
> > > > 		break;
> > > > 	case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY:
> > > > -		ret = xe_guc_access_counter_notify_handler(guc, payload,
> > > > -							   adj_len);
> > > > +		ret = xe_access_counter_notify_handler(gt, &guc->fw, payload,
> > > > +						       adj_len);
> > > > 		break;
> > > > 	case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF:
> > > > 		ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len);
> > > > @@ -1494,7 +1494,7 @@ static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
> > > > 
> > > > 	switch (action) {
> > > > 	case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
> > > > -		ret = xe_guc_pagefault_handler(guc, payload, adj_len);
> > > > +		ret = xe_pagefault_handler(gt, &guc->fw, payload, adj_len);
> > > > 		break;
> > > > 	case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
> > > > 		__g2h_release_space(ct, len);
> > > > diff --git a/drivers/gpu/drm/xe/xe_uc_fw_types.h b/drivers/gpu/drm/xe/xe_uc_fw_types.h
> > > > index d8714ccb3f78..4d82430e3274 100644
> > > > --- a/drivers/gpu/drm/xe/xe_uc_fw_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_uc_fw_types.h
> > > > @@ -9,6 +9,7 @@
> > > > #include <linux/types.h>
> > > > 
> > > > struct xe_bo;
> > > > +struct xe_gt;
> > > > 
> > > > /*
> > > >  * +------------+---------------------------------------------------+
> > > > @@ -228,6 +229,10 @@ struct xe_uc_fw {
> > > > 
> > > > 	/** @private_data_size: size of private data found in uC css header */
> > > > 	u32 private_data_size;
> > > > +
> > > > +	/** a callback to send the page fault replay to the fw */
> > > > +	int (*send_pagefault_reply)(struct xe_gt *gt, struct xe_uc_fw *xe_fw,
> > > > +				    struct xe_uc_pagefault_reply *reply);
> > > > };
> > > > 
> > > > #endif
> > > > --
> > > > 2.34.1
> > > >