Fw: Re: [PATCH v2 3/3] drm/xe: Add WA BB to capture active context utilization
Matthew Brost
matthew.brost at intel.com
Tue Apr 29 03:30:05 UTC 2025
On Mon, Apr 28, 2025 at 06:01:54PM -0700, Umesh Nerlige Ramappa wrote:
> Resending, since I forgot to add the list
>
> ----- Forwarded message from Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com> -----
>
> Date: Mon, 28 Apr 2025 16:58:37 -0700
> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> To: Matthew Brost <matthew.brost at intel.com>
> Subject: Re: [PATCH v2 3/3] drm/xe: Add WA BB to capture active context utilization
>
> On Fri, Apr 25, 2025 at 09:54:52PM -0700, Matthew Brost wrote:
> > On Fri, Apr 25, 2025 at 05:29:51PM -0700, Umesh Nerlige Ramappa wrote:
> > > Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks
> > > of the context, but only gets updated when the context switches out. In
> > > order to check how long a context has been active before it switches
> > > out, two things are required:
> > > > (1) Determine if the context is running:
> > > > To do so, we program the wa bb to set an initial value for
> > CTX_TIMESTAMP
> > > in the lrc. The value chosen is 1 since 0 is the initial value when the
> > > lrc is initialized. During a query, we just check for this value to
> > > determine if the context is active. If the context switched out, it
> > > would overwrite this location with the actual CTX_TIMESTAMP MMIO value.
> > > Note that wa bb will run after the context has been restored, so reusing
> > > this lrc location will not clobber anything.
> > > > (2) Calculate the time that the context has been active for:
> > > > The CTX_TIMESTAMP ticks only when the context is active. If a
> > context is
> > > active, we just use the CTX_TIMESTAMP MMIO as the new value of
> > > utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO
> > > for the specific engine instance. Since we do not know which instance
> > > the context is running on until it is scheduled, we also read the
> > > ENGINE_ID MMIO in the wa bb and store it in the PPHSWP.
> > > > Using the above 2 instructions in a WA BB, capture active context
> > > utilization.
> > > > v2: (Matt Brost)
> > > - This breaks TDR, fix it by saving the CTX_TIMESTAMP register
> > > "drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value"
> > > - Drop tile from lrc if using gt
> > > "drm/xe: Save the gt pointer in lrc and drop the tile"
> > > > Signed-off-by: Umesh Nerlige Ramappa
> > <umesh.nerlige.ramappa at intel.com>
> > > ---
> > > .../gpu/drm/xe/instructions/xe_mi_commands.h | 4 +
> > > drivers/gpu/drm/xe/regs/xe_engine_regs.h | 4 +
> > > drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
> > > drivers/gpu/drm/xe/xe_lrc.c | 167 +++++++++++++++++-
> > > drivers/gpu/drm/xe/xe_lrc_types.h | 3 +
> > > 5 files changed, 177 insertions(+), 2 deletions(-)
> > > > diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > index eba582058d55..9153a7cd2ceb 100644
> > > --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > @@ -62,6 +62,10 @@
> > > #define MI_LOAD_REGISTER_MEM (__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
> > > #define MI_LRM_USE_GGTT REG_BIT(22)
> > > > +#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) |
> > XE_INSTR_NUM_DW(4))
> > > +#define MI_SRM_USE_GGTT REG_BIT(22)
> > > +#define MI_SRM_ADD_CS_OFFSET REG_BIT(19)
> > > +
> > > #define MI_LOAD_REGISTER_REG (__MI_INSTR(0x2a) | XE_INSTR_NUM_DW(3))
> > > #define MI_LRR_DST_CS_MMIO REG_BIT(19)
> > > #define MI_LRR_SRC_CS_MMIO REG_BIT(18)
> > > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > index da713634d6a0..cc698136a6f6 100644
> > > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > @@ -43,6 +43,10 @@
> > > #define XEHPC_BCS8_RING_BASE 0x3ee000
> > > #define GSCCS_RING_BASE 0x11a000
> > > > +#define ENGINE_ID(base) XE_REG((base) + 0x8c)
> > > +#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
> > > +#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
> > > +
> > > #define RING_TAIL(base) XE_REG((base) + 0x30)
> > > #define TAIL_ADDR REG_GENMASK(20, 3)
> > > > diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > index 57944f90bbf6..210be0ef12f2 100644
> > > --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > @@ -11,6 +11,7 @@
> > > #define CTX_RING_TAIL (0x06 + 1)
> > > #define CTX_RING_START (0x08 + 1)
> > > #define CTX_RING_CTL (0x0a + 1)
> > > +#define CTX_BB_PER_CTX_PTR (0x12 + 1)
> > > #define CTX_TIMESTAMP (0x22 + 1)
> > > #define CTX_INDIRECT_RING_STATE (0x26 + 1)
> > > #define CTX_PDP0_UDW (0x30 + 1)
> > > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > > index 089b195e2635..a78acec1a358 100644
> > > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > > @@ -24,6 +24,7 @@
> > > #include "xe_hw_fence.h"
> > > #include "xe_map.h"
> > > #include "xe_memirq.h"
> > > +#include "xe_mmio.h"
> > > #include "xe_sriov.h"
> > > #include "xe_trace_lrc.h"
> > > #include "xe_vm.h"
> > > @@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
> > > #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
> > > #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
> > > #define LRC_PARALLEL_PPHWSP_OFFSET 2048
> > > +#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
> > > > u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
> > > {
> > > @@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
> > > return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
> > > }
> > > > +static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
> > > +{
> > > + return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
> > > +}
> > > +
> > > static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
> > > {
> > > return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
> > > }
> > > > +static u32 __xe_lrc_bb_per_ctx_ptr_offset(struct xe_lrc *lrc)
> > > +{
> > > + return __xe_lrc_regs_offset(lrc) + CTX_BB_PER_CTX_PTR * sizeof(u32);
> > > +}
>
> This also looks like it's unused, so will drop it.
>
> > > +
> > > static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
> > > {
> > > /* Indirect ring state page is at the very end of LRC */
> > > @@ -731,6 +743,8 @@ DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
> > > DECL_MAP_ADDR_HELPERS(ctx_timestamp)
> > > DECL_MAP_ADDR_HELPERS(parallel)
> > > DECL_MAP_ADDR_HELPERS(indirect_ring)
> > > +DECL_MAP_ADDR_HELPERS(bb_per_ctx_ptr)
> >
> > This appears to be unused.
>
> oh, you are right, will remove
>
> >
> > > +DECL_MAP_ADDR_HELPERS(engine_id)
> > > > #undef DECL_MAP_ADDR_HELPERS
> > > > @@ -880,6 +894,58 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
> > > xe_bo_unpin(lrc->bo);
> > > xe_bo_unlock(lrc->bo);
> > > xe_bo_put(lrc->bo);
> > > + xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
> > > +}
> > > +
> > > +/*
> > > + * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
> > > + * context run ticks.
> > > + * @lrc: Pointer to the lrc.
> > > + *
> > > + * Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks of the
> > > + * context, but only gets updated when the context switches out. In order to
> > > + * check how long a context has been running before it switches out, two things
> > > + * are required:
> > > + *
> > > + * (1) Determine if the context is running
> > > + * To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP in
> > > + * the lrc. The value chosen is 1 since 0 is the initial value when the lrc is
> > > + * initialized. During a query, we just check for this value to determine if the
> > > + * context is active. If the context switched out, it would overwrite this
> > > + * location with the actual CTX_TIMESTAMP MMIO value. Note that wa bb will run
> > > + * after the context has been restored, so reusing this lrc location will not
> > > + * clobber anything.
> > > + *
> > > + * (2) Calculate the time that the context has been running for
> > > + * The CTX_TIMESTAMP ticks only when the context is active. If a context is
> > > + * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
> > > + * While doing so, we need to read the CTX_TIMESTAMP MMIO from the specific
> > > + * engine instance. Since we do not know which instance the context is running
> > > + * on until it is scheduled, we also read the ENGINE_ID MMIO in the wa bb and
> > > + * store it in the PPHSWP.
> > > + */
> > > +
> > > +static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
> > > +{
> > > + u32 *cmd;
> > > +
> > > + cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
> > > +
> > > + *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
> > > + *cmd++ = ENGINE_ID(0).addr;
> > > + *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
> > > + *cmd++ = 0;
> > > +
> > > + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
> > > + *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
> > > + *cmd++ = 0;
> > > + *cmd++ = 1;
> >
> > Nit: Rather than '1' can we have define indicating this is a special value.
>
> Sure, will add define.
>
> >
> > Also there is a small possibily (1 in 4 billion - I think) of a race
> > here. It is possible than an LRC switches out with a value of '1' and I
> > believe we'd read garbage from get_ctx_timestamp() ... Is there really
> > not another register in the LRC we can steal a bit from?
>
> There are, but the ones that are saved/restored during lite-restore is
> entirely hw implementation dependent. Looks like CTX_TIMESTAMP is NOT
> saved/restored during lite-restore and also it directly relates to this
> feature, so a safer bet.
>
> Other alternatives explored
> 1) ppHWSP locations that might be updated on context save. These are not
> consistent across platforms and are disabled in some.
>
> For 2, 3 below, Ref: https://www.intel.com/content/www/us/en/docs/graphics-for-linux/developer-reference/1-0/alchemist-arctic-sound-m.html
>
> 2) Vol 8: Page 57/64
> If we look for anything after CSEL_BE in the LRC, then we would need to add
> logic to traverse the LRC to find the right offset because it is engine
> specific and also platform specific. Can be done, but not a lot of ROI
> considering the probability of hitting this.
>
> 3) Vol 9: Page 39/730
> If we omit support for media and copy engines, then we could use upper half
> of PREEMPTION_STATUS field. Needs more experiments though. This doesn't get
> restored at all, only saved. So we could just write
> something like (ENGINE_ID + 1) to this location from the WA BB.
>
> >
> > If not, maybe we can live with this race? At minimum we'd need comment
> > somewhere explaining this - at worst we'd get some bad stats or in a
> > very unlikely case get false job timeout if we switch the TDR to use
> > this new code path.
>
> I asked around and got some suggestions - like check HEAD/TAIL of the ring
> in the LRC and that should tell if the context is really active. But if
> HEAD != TAIL, we still run into the same issue of checking validity again.
>
> IMO, since the probability of hitting this is very small, I would go with
> the current implementation and like you said, add a comment describing this
> race.
>
I think getting another persons opinion (e.g., Lucas) here might be a
good idea. I'm not sure what the maintainer view is on accepting
something that we know can be an issue into the KMD.
Matt
> >
> > The patch / design LGTM to good me though.
>
> Thanks,
> Umesh
>
> >
> > Matt
> >
> > > +
> > > + *cmd++ = MI_BATCH_BUFFER_END;
> > > +
> > > + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
> > > + xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
> > > +
> > > }
> > > > #define PVC_CTX_ASID (0x2e + 1)
> > > @@ -921,6 +987,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > > if (IS_ERR(lrc->bo))
> > > return PTR_ERR(lrc->bo);
> > > > + lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
> > > + ttm_bo_type_kernel,
> > > + bo_flags);
> > > + if (IS_ERR(lrc->bb_per_ctx_bo)) {
> > > + err = PTR_ERR(lrc->bb_per_ctx_bo);
> > > + goto err_lrc_finish;
> > > + }
> > > +
> > > lrc->size = lrc_size;
> > > lrc->ring.size = ring_size;
> > > lrc->ring.tail = 0;
> > > @@ -1026,6 +1100,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > > map = __xe_lrc_start_seqno_map(lrc);
> > > xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
> > > > + xe_lrc_setup_utilization(lrc);
> > > +
> > > return 0;
> > > > err_lrc_finish:
> > > @@ -1245,6 +1321,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
> > > return __xe_lrc_parallel_map(lrc);
> > > }
> > > > +/**
> > > + * xe_lrc_engine_id() - Read engine id value
> > > + * @lrc: Pointer to the lrc.
> > > + *
> > > + * Returns: context id value
> > > + */
> > > +static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
> > > +{
> > > + struct xe_device *xe = lrc_to_xe(lrc);
> > > + struct iosys_map map;
> > > +
> > > + map = __xe_lrc_engine_id_map(lrc);
> > > + return xe_map_read32(xe, &map);
> > > +}
> > > +
> > > static int instr_dw(u32 cmd_header)
> > > {
> > > /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
> > > @@ -1792,21 +1883,93 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
> > > kfree(snapshot);
> > > }
> > > > +static u32 get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id)
> > > +{
> > > + static u32 base[XE_ENGINE_CLASS_MAX][XE_HW_ENGINE_MAX_INSTANCE] = {
> > > + {
> > > + RENDER_RING_BASE,
> > > + },
> > > + {
> > > + BSD_RING_BASE,
> > > + BSD2_RING_BASE,
> > > + BSD3_RING_BASE,
> > > + BSD4_RING_BASE,
> > > + XEHP_BSD5_RING_BASE,
> > > + XEHP_BSD6_RING_BASE,
> > > + XEHP_BSD7_RING_BASE,
> > > + XEHP_BSD8_RING_BASE,
> > > + },
> > > + {
> > > + VEBOX_RING_BASE,
> > > + VEBOX2_RING_BASE,
> > > + XEHP_VEBOX3_RING_BASE,
> > > + XEHP_VEBOX4_RING_BASE,
> > > + },
> > > + {
> > > + BLT_RING_BASE,
> > > + XEHPC_BCS1_RING_BASE,
> > > + XEHPC_BCS2_RING_BASE,
> > > + XEHPC_BCS3_RING_BASE,
> > > + XEHPC_BCS4_RING_BASE,
> > > + XEHPC_BCS5_RING_BASE,
> > > + XEHPC_BCS6_RING_BASE,
> > > + XEHPC_BCS7_RING_BASE,
> > > + XEHPC_BCS8_RING_BASE,
> > > + },
> > > + { 0 },
> > > + {
> > > + COMPUTE0_RING_BASE,
> > > + COMPUTE1_RING_BASE,
> > > + COMPUTE2_RING_BASE,
> > > + COMPUTE3_RING_BASE,
> > > + },
> > > + };
> > > + u32 c = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
> > > + u32 i = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
> > > + u32 mmio_base = base[c][i];
> > > +
> > > + if (xe_gt_WARN_ONCE(lrc->gt, !mmio_base,
> > > + "Unexpected engine c:i %d:%d for context utilization\n",
> > > + c, i))
> > > + return 0;
> > > +
> > > + return xe_mmio_read32(&lrc->gt->mmio, RING_CTX_TIMESTAMP(mmio_base));
> > > +}
> > > +
> > > /**
> > > * xe_lrc_update_timestamp() - Update ctx timestamp
> > > * @lrc: Pointer to the lrc.
> > > * @old_ts: Old timestamp value
> > > *
> > > * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
> > > - * update saved value.
> > > + * update saved value. With support for active contexts, the calculation may be
> > > + * slightly racy, so follow a read-again logic to ensure that the context is
> > > + * still active before returning the right timestamp.
> > > *
> > > * Returns: New ctx timestamp value
> > > */
> > > u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
> > > {
> > > + u32 ctx_lrc, ctx_reg, engine_id;
> > > +
> > > *old_ts = lrc->ctx_timestamp;
> > > > - lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
> > > + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> > > + if (ctx_lrc == 1) {
> > > + engine_id = xe_lrc_engine_id(lrc);
> > > + ctx_reg = get_ctx_timestamp(lrc, engine_id);
> > > + if (!ctx_reg)
> > > + lrc->ctx_timestamp = *old_ts;
> > > + else
> > > + lrc->ctx_timestamp = ctx_reg;
> > > +
> > > + /* read lrc again to ensure context is still active */
> > > + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> > > + }
> > > +
> > > + /* If context switched out, just use the lrc value */
> > > + if (ctx_lrc != 1)
> > > + lrc->ctx_timestamp = ctx_lrc;
> > > > trace_xe_lrc_update_timestamp(lrc, *old_ts);
> > > > diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h
> > b/drivers/gpu/drm/xe/xe_lrc_types.h
> > > index cd38586ae989..a62d0b8ca8a4 100644
> > > --- a/drivers/gpu/drm/xe/xe_lrc_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
> > > @@ -53,6 +53,9 @@ struct xe_lrc {
> > > > /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update
> > */
> > > u32 ctx_timestamp;
> > > +
> > > + /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
> > > + struct xe_bo *bb_per_ctx_bo;
> > > };
> > > > struct xe_lrc_snapshot;
> > > --
> > > 2.43.0
> > >
>
> ----- End forwarded message -----
More information about the Intel-xe
mailing list