[PATCH 3/5] drm/xe: Avoid reading RMW registers in emit_wa_job

Mon Mar 3 18:06:53 UTC 2025

On Mon, Mar 03, 2025 at 06:35:20PM +0100, Michal Wajdeczko wrote:
>To allow VFs to properly handle LRC WAs, we should postpone doing
>all RMW register operations and let them be run by the engine
>itself, since any attempt to read registers from within the
>driver will fail on the VF. Use MI_MATH and ALU for that.
>
>Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>Cc: Michał Winiarski <michal.winiarski at intel.com>
>Cc: Matt Roper <matthew.d.roper at intel.com>
>---
> drivers/gpu/drm/xe/xe_gt.c | 84 ++++++++++++++++++++++++++++----------
> 1 file changed, 63 insertions(+), 21 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>index 10a9e3c72b36..8068b4bc0a09 100644
>--- a/drivers/gpu/drm/xe/xe_gt.c
>+++ b/drivers/gpu/drm/xe/xe_gt.c
>@@ -12,8 +12,10 @@
>
> #include <generated/xe_wa_oob.h>
>
>+#include "instructions/xe_alu_commands.h"
> #include "instructions/xe_gfxpipe_commands.h"
> #include "instructions/xe_mi_commands.h"
>+#include "regs/xe_engine_regs.h"
> #include "regs/xe_gt_regs.h"
> #include "xe_assert.h"
> #include "xe_bb.h"
>@@ -176,15 +178,6 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q)
> 	return 0;
> }
>
>-/*
>- * Convert back from encoded value to type-safe, only to be used when reg.mcr
>- * is true
>- */
>-static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg)
>-{
>-	return (const struct xe_reg_mcr){.__reg.raw = reg.raw };
>-}
>-
> static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> {
> 	struct xe_reg_sr *sr = &q->hwe->reg_lrc;
>@@ -194,6 +187,7 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> 	struct xe_bb *bb;
> 	struct dma_fence *fence;
> 	long timeout;
>+	int count_rmw = 0;
> 	int count = 0;
>
> 	if (q->hwe->class == XE_ENGINE_CLASS_RENDER)
>@@ -206,30 +200,32 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> 	if (IS_ERR(bb))
> 		return PTR_ERR(bb);
>
>-	xa_for_each(&sr->xa, idx, entry)
>-		++count;
>+	/* count RMW registers as those will be handled separately */
>+	xa_for_each(&sr->xa, idx, entry) {
>+		if (entry->reg.masked || entry->clr_bits == ~0)
>+			++count;
>+		else
>+			++count_rmw;
>+	}
>
>-	if (count) {
>+	if (count || count_rmw)
> 		xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name);
>
>+	if (count) {
>+		/* emit single LRI with all non RMW regs */
>+
> 		bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
>
> 		xa_for_each(&sr->xa, idx, entry) {
> 			struct xe_reg reg = entry->reg;
>-			struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg);
> 			u32 val;
>
>-			/*
>-			 * Skip reading the register if it's not really needed
>-			 */
> 			if (reg.masked)
> 				val = entry->clr_bits << 16;
>-			else if (entry->clr_bits + 1)
>-				val = (reg.mcr ?
>-				       xe_gt_mcr_unicast_read_any(gt, reg_mcr) :
>-				       xe_mmio_read32(&gt->mmio, reg)) & (~entry->clr_bits);
>-			else
>+			else if (entry->clr_bits == ~0)
> 				val = 0;
>+			else
>+				continue;
>
> 			val |= entry->set_bits;
>
>@@ -239,6 +235,52 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> 		}
> 	}
>
>+	if (count_rmw) {
>+		/* emit MI_MATH for each RMW reg */
>+
>+		xa_for_each(&sr->xa, idx, entry) {
>+			if (entry->reg.masked || entry->clr_bits == ~0)
>+				continue;

Why can't we handle the normal writes here as well, and avoid having some
registers written from the CPU side and some from the GPU side?

Lucas De Marchi