[PATCH 3/5] drm/xe: Avoid reading RMW registers in emit_wa_job
Matt Roper
matthew.d.roper at intel.com
Mon Mar 3 22:06:49 UTC 2025
On Mon, Mar 03, 2025 at 06:35:20PM +0100, Michal Wajdeczko wrote:
> To allow VFs properly handle LRC WAs, we should postpone doing
> all RMW register operations and let them be run by the engine
> itself, since attempt to perform read registers from within the
> driver will fail on the VF. Use MI_MATH and ALU for that.
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Matt Roper <matthew.d.roper at intel.com>
It always confuses when reading code like this that MI_LRR and
MI_MATH_STORE use opposite orders for src/dest registers. But
everything here looks correct.
Reviewed-by: Matt Roper <matthew.d.roper at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 84 ++++++++++++++++++++++++++++----------
> 1 file changed, 63 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 10a9e3c72b36..8068b4bc0a09 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -12,8 +12,10 @@
>
> #include <generated/xe_wa_oob.h>
>
> +#include "instructions/xe_alu_commands.h"
> #include "instructions/xe_gfxpipe_commands.h"
> #include "instructions/xe_mi_commands.h"
> +#include "regs/xe_engine_regs.h"
> #include "regs/xe_gt_regs.h"
> #include "xe_assert.h"
> #include "xe_bb.h"
> @@ -176,15 +178,6 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q)
> return 0;
> }
>
> -/*
> - * Convert back from encoded value to type-safe, only to be used when reg.mcr
> - * is true
> - */
> -static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg)
> -{
> - return (const struct xe_reg_mcr){.__reg.raw = reg.raw };
> -}
> -
> static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> {
> struct xe_reg_sr *sr = &q->hwe->reg_lrc;
> @@ -194,6 +187,7 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> struct xe_bb *bb;
> struct dma_fence *fence;
> long timeout;
> + int count_rmw = 0;
> int count = 0;
>
> if (q->hwe->class == XE_ENGINE_CLASS_RENDER)
> @@ -206,30 +200,32 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> if (IS_ERR(bb))
> return PTR_ERR(bb);
>
> - xa_for_each(&sr->xa, idx, entry)
> - ++count;
> + /* count RMW registers as those will be handled separately */
> + xa_for_each(&sr->xa, idx, entry) {
> + if (entry->reg.masked || entry->clr_bits == ~0)
> + ++count;
> + else
> + ++count_rmw;
> + }
>
> - if (count) {
> + if (count || count_rmw)
> xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name);
>
> + if (count) {
> + /* emit single LRI with all non RMW regs */
> +
> bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
>
> xa_for_each(&sr->xa, idx, entry) {
> struct xe_reg reg = entry->reg;
> - struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg);
> u32 val;
>
> - /*
> - * Skip reading the register if it's not really needed
> - */
> if (reg.masked)
> val = entry->clr_bits << 16;
> - else if (entry->clr_bits + 1)
> - val = (reg.mcr ?
> - xe_gt_mcr_unicast_read_any(gt, reg_mcr) :
> - xe_mmio_read32(>->mmio, reg)) & (~entry->clr_bits);
> - else
> + else if (entry->clr_bits == ~0)
> val = 0;
> + else
> + continue;
>
> val |= entry->set_bits;
>
> @@ -239,6 +235,52 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
> }
> }
>
> + if (count_rmw) {
> + /* emit MI_MATH for each RMW reg */
> +
> + xa_for_each(&sr->xa, idx, entry) {
> + if (entry->reg.masked || entry->clr_bits == ~0)
> + continue;
> +
> + bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
> + bb->cs[bb->len++] = entry->reg.addr;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> +
> + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
> + MI_LRI_LRM_CS_MMIO;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
> + bb->cs[bb->len++] = entry->clr_bits;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
> + bb->cs[bb->len++] = entry->set_bits;
> +
> + bb->cs[bb->len++] = MI_MATH(8);
> + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
> + bb->cs[bb->len++] = CS_ALU_INSTR_LOADINV(SRCB, REG1);
> + bb->cs[bb->len++] = CS_ALU_INSTR_AND;
> + bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
> + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
> + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCB, REG2);
> + bb->cs[bb->len++] = CS_ALU_INSTR_OR;
> + bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
> +
> + bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> + bb->cs[bb->len++] = entry->reg.addr;
> +
> + xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n",
> + entry->reg.addr, entry->clr_bits, entry->set_bits);
> + }
> +
> + /* reset used GPR */
> + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) | MI_LRI_LRM_CS_MMIO;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> + bb->cs[bb->len++] = 0;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
> + bb->cs[bb->len++] = 0;
> + bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
> + bb->cs[bb->len++] = 0;
> + }
> +
> xe_lrc_emit_hwe_state_instructions(q, bb);
>
> job = xe_bb_create_job(q, bb);
> --
> 2.47.1
>
--
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation
More information about the Intel-xe
mailing list