[PATCH 3/5] drm/xe: Avoid reading RMW registers in emit_wa_job

Tue Mar 4 16:50:48 UTC 2025

On Mon, Mar 03, 2025 at 07:47:50PM +0100, Michal Wajdeczko wrote:
>
>
>On 03.03.2025 19:06, Lucas De Marchi wrote:
>> On Mon, Mar 03, 2025 at 06:35:20PM +0100, Michal Wajdeczko wrote:
>>> To allow VFs properly handle LRC WAs, we should postpone doing
>>> all RMW register operations and let them be run by the engine
>>> itself, since attempt to perform read registers from within the
>>> driver will fail on the VF. Use MI_MATH and ALU for that.
>>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>>> Cc: Michał Winiarski <michal.winiarski at intel.com>
>>> Cc: Matt Roper <matthew.d.roper at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_gt.c | 84 ++++++++++++++++++++++++++++----------
>>> 1 file changed, 63 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>> index 10a9e3c72b36..8068b4bc0a09 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>> @@ -12,8 +12,10 @@
>>>
>>> #include <generated/xe_wa_oob.h>
>>>
>>> +#include "instructions/xe_alu_commands.h"
>>> #include "instructions/xe_gfxpipe_commands.h"
>>> #include "instructions/xe_mi_commands.h"
>>> +#include "regs/xe_engine_regs.h"
>>> #include "regs/xe_gt_regs.h"
>>> #include "xe_assert.h"
>>> #include "xe_bb.h"
>>> @@ -176,15 +178,6 @@ static int emit_nop_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>>     return 0;
>>> }
>>>
>>> -/*
>>> - * Convert back from encoded value to type-safe, only to be used when
>>> reg.mcr
>>> - * is true
>>> - */
>>> -static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg)
>>> -{
>>> -    return (const struct xe_reg_mcr){.__reg.raw = reg.raw };
>>> -}
>>> -
>>> static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>>> {
>>>     struct xe_reg_sr *sr = &q->hwe->reg_lrc;
>>> @@ -194,6 +187,7 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>>     struct xe_bb *bb;
>>>     struct dma_fence *fence;
>>>     long timeout;
>>> +    int count_rmw = 0;
>>>     int count = 0;
>>>
>>>     if (q->hwe->class == XE_ENGINE_CLASS_RENDER)
>>> @@ -206,30 +200,32 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>>     if (IS_ERR(bb))
>>>         return PTR_ERR(bb);
>>>
>>> -    xa_for_each(&sr->xa, idx, entry)
>>> -        ++count;
>>> +    /* count RMW registers as those will be handled separately */
>>> +    xa_for_each(&sr->xa, idx, entry) {
>>> +        if (entry->reg.masked || entry->clr_bits == ~0)
>>> +            ++count;
>>> +        else
>>> +            ++count_rmw;
>>> +    }
>>>
>>> -    if (count) {
>>> +    if (count || count_rmw)
>>>         xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name);
>>>
>>> +    if (count) {
>>> +        /* emit single LRI with all non RMW regs */
>>> +
>>>         bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM |
>>> MI_LRI_NUM_REGS(count);
>>>
>>>         xa_for_each(&sr->xa, idx, entry) {
>>>             struct xe_reg reg = entry->reg;
>>> -            struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg);
>>>             u32 val;
>>>
>>> -            /*
>>> -             * Skip reading the register if it's not really needed
>>> -             */
>>>             if (reg.masked)
>>>                 val = entry->clr_bits << 16;
>>> -            else if (entry->clr_bits + 1)
>>> -                val = (reg.mcr ?
>>> -                       xe_gt_mcr_unicast_read_any(gt, reg_mcr) :
>>> -                       xe_mmio_read32(&gt->mmio, reg)) & (~entry-
>>> >clr_bits);
>>> -            else
>>> +            else if (entry->clr_bits == ~0)
>>>                 val = 0;
>>> +            else
>>> +                continue;
>>>
>>>             val |= entry->set_bits;
>>>
>>> @@ -239,6 +235,52 @@ static int emit_wa_job(struct xe_gt *gt, struct
>>> xe_exec_queue *q)
>>>         }
>>>     }
>>>
>>> +    if (count_rmw) {
>>> +        /* emit MI_MATH for each RMW reg */
>>> +
>>> +        xa_for_each(&sr->xa, idx, entry) {
>>> +            if (entry->reg.masked || entry->clr_bits == ~0)
>>> +                continue;
>>
>> why can't we handle the normal writes here as well and avoid having some
>> written from the CPU side and some from the GPU side?
>>
>
>there were/are no writes here, we had reads only in case of the RMW
>value that had to be programmed in LRI (previous approach)
>
>and we have two loops since first covers all simple writes as all could
>be programmed as single LRI command, while second loop emits separate
>MATH commands per each RMW

alright - I was misremembering the part of this code. After double
checking and clarifying it offline, it makes sense. Thanks.

Lucas De Marchi