[PATCH 3/5] drm/xe: Avoid reading RMW registers in emit_wa_job
Lucas De Marchi
lucas.demarchi at intel.com
Tue Mar 4 16:50:48 UTC 2025
On Mon, Mar 03, 2025 at 07:47:50PM +0100, Michal Wajdeczko wrote:
>
>
>On 03.03.2025 19:06, Lucas De Marchi wrote:
>> On Mon, Mar 03, 2025 at 06:35:20PM +0100, Michal Wajdeczko wrote:
>>> To allow VFs properly handle LRC WAs, we should postpone doing
>>> all RMW register operations and let them be run by the engine
>>> itself, since attempt to perform read registers from within the
>>> driver will fail on the VF. Use MI_MATH and ALU for that.
>>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>>> Cc: Michał Winiarski <michal.winiarski at intel.com>
>>> Cc: Matt Roper <matthew.d.roper at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_gt.c | 84 ++++++++++++++++++++++++++++----------
>>> 1 file changed, 63 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>> index 10a9e3c72b36..8068b4bc0a09 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>> @@ -12,8 +12,10 @@
>>>
>>> #include <generated/xe_wa_oob.h>
>>>
>>> +#include "instructions/xe_alu_commands.h"
>>> #include "instructions/xe_gfxpipe_commands.h"
>>> #include "instructions/xe_mi_commands.h"
>>> +#include "regs/xe_engine_regs.h"
>>> #include "regs/xe_gt_regs.h"
>>> #include "xe_assert.h"
>>> #include "xe_bb.h"
>>> @@ -176,15 +178,6 @@ static int emit_nop_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>> return 0;
>>> }
>>>
>>> -/*
>>> - * Convert back from encoded value to type-safe, only to be used when
>>> reg.mcr
>>> - * is true
>>> - */
>>> -static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg)
>>> -{
>>> - return (const struct xe_reg_mcr){.__reg.raw = reg.raw };
>>> -}
>>> -
>>> static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>>> {
>>> struct xe_reg_sr *sr = &q->hwe->reg_lrc;
>>> @@ -194,6 +187,7 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>> struct xe_bb *bb;
>>> struct dma_fence *fence;
>>> long timeout;
>>> + int count_rmw = 0;
>>> int count = 0;
>>>
>>> if (q->hwe->class == XE_ENGINE_CLASS_RENDER)
>>> @@ -206,30 +200,32 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>> if (IS_ERR(bb))
>>> return PTR_ERR(bb);
>>>
>>> - xa_for_each(&sr->xa, idx, entry)
>>> - ++count;
>>> + /* count RMW registers as those will be handled separately */
>>> + xa_for_each(&sr->xa, idx, entry) {
>>> + if (entry->reg.masked || entry->clr_bits == ~0)
>>> + ++count;
>>> + else
>>> + ++count_rmw;
>>> + }
>>>
>>> - if (count) {
>>> + if (count || count_rmw)
>>> xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name);
>>>
>>> + if (count) {
>>> + /* emit single LRI with all non RMW regs */
>>> +
>>> bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM |
>>> MI_LRI_NUM_REGS(count);
>>>
>>> xa_for_each(&sr->xa, idx, entry) {
>>> struct xe_reg reg = entry->reg;
>>> - struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg);
>>> u32 val;
>>>
>>> - /*
>>> - * Skip reading the register if it's not really needed
>>> - */
>>> if (reg.masked)
>>> val = entry->clr_bits << 16;
>>> - else if (entry->clr_bits + 1)
>>> - val = (reg.mcr ?
>>> - xe_gt_mcr_unicast_read_any(gt, reg_mcr) :
>>> - xe_mmio_read32(>->mmio, reg)) & (~entry-
>>> >clr_bits);
>>> - else
>>> + else if (entry->clr_bits == ~0)
>>> val = 0;
>>> + else
>>> + continue;
>>>
>>> val |= entry->set_bits;
>>>
>>> @@ -239,6 +235,52 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>> }
>>> }
>>>
>>> + if (count_rmw) {
>>> + /* emit MI_MATH for each RMW reg */
>>> +
>>> + xa_for_each(&sr->xa, idx, entry) {
>>> + if (entry->reg.masked || entry->clr_bits == ~0)
>>> + continue;
>>
>> why can't we handle the normal writes here as well and avoid having some
>> written from the CPU side and some from the GPU side?
>>
>
>there were/are no writes here, we had reads only in case of the RMW
>value that had to be programmed in LRI (previous approach)
>
>and we have two loops since first covers all simple writes as all could
>be programmed as single LRI command, while second loop emits separate
>MATH commands per each RMW
alright - I was misremembering the part of this code. After double
checking and clarifying it offline, it makes sense. Thanks.
Lucas De Marchi
More information about the Intel-xe
mailing list