Fw: Re: [PATCH v2 3/3] drm/xe: Add WA BB to capture active context utilization
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Tue Apr 29 01:01:54 UTC 2025
Resending, since I forgot to add the list
----- Forwarded message from Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com> -----
Date: Mon, 28 Apr 2025 16:58:37 -0700
From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
To: Matthew Brost <matthew.brost at intel.com>
Subject: Re: [PATCH v2 3/3] drm/xe: Add WA BB to capture active context utilization
On Fri, Apr 25, 2025 at 09:54:52PM -0700, Matthew Brost wrote:
> On Fri, Apr 25, 2025 at 05:29:51PM -0700, Umesh Nerlige Ramappa wrote:
> > Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks
> > of the context, but only gets updated when the context switches out. In
> > order to check how long a context has been active before it switches
> > out, two things are required:
> >
> > (1) Determine if the context is running:
> >
> > To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP
> > in the lrc. The value chosen is 1 since 0 is the initial value when the
> > lrc is initialized. During a query, we just check for this value to
> > determine if the context is active. If the context switched out, it
> > would overwrite this location with the actual CTX_TIMESTAMP MMIO value.
> > Note that wa bb will run after the context has been restored, so reusing
> > this lrc location will not clobber anything.
> >
> > (2) Calculate the time that the context has been active for:
> >
> > The CTX_TIMESTAMP ticks only when the context is active. If a context is
> > active, we just use the CTX_TIMESTAMP MMIO as the new value of
> > utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO
> > for the specific engine instance. Since we do not know which instance
> > the context is running on until it is scheduled, we also read the
> > ENGINE_ID MMIO in the wa bb and store it in the PPHWSP.
> >
> > Using the above 2 instructions in a WA BB, capture active context
> > utilization.
> >
> > v2: (Matt Brost)
> > - This breaks TDR, fix it by saving the CTX_TIMESTAMP register
> > "drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value"
> > - Drop tile from lrc if using gt
> > "drm/xe: Save the gt pointer in lrc and drop the tile"
> >
> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> > ---
> > .../gpu/drm/xe/instructions/xe_mi_commands.h | 4 +
> > drivers/gpu/drm/xe/regs/xe_engine_regs.h | 4 +
> > drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
> > drivers/gpu/drm/xe/xe_lrc.c | 167 +++++++++++++++++-
> > drivers/gpu/drm/xe/xe_lrc_types.h | 3 +
> > 5 files changed, 177 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > index eba582058d55..9153a7cd2ceb 100644
> > --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > @@ -62,6 +62,10 @@
> > #define MI_LOAD_REGISTER_MEM (__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
> > #define MI_LRM_USE_GGTT REG_BIT(22)
> >
> > +#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4))
> > +#define MI_SRM_USE_GGTT REG_BIT(22)
> > +#define MI_SRM_ADD_CS_OFFSET REG_BIT(19)
> > +
> > #define MI_LOAD_REGISTER_REG (__MI_INSTR(0x2a) | XE_INSTR_NUM_DW(3))
> > #define MI_LRR_DST_CS_MMIO REG_BIT(19)
> > #define MI_LRR_SRC_CS_MMIO REG_BIT(18)
> > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > index da713634d6a0..cc698136a6f6 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > @@ -43,6 +43,10 @@
> > #define XEHPC_BCS8_RING_BASE 0x3ee000
> > #define GSCCS_RING_BASE 0x11a000
> >
> > +#define ENGINE_ID(base) XE_REG((base) + 0x8c)
> > +#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
> > +#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
> > +
> > #define RING_TAIL(base) XE_REG((base) + 0x30)
> > #define TAIL_ADDR REG_GENMASK(20, 3)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > index 57944f90bbf6..210be0ef12f2 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > @@ -11,6 +11,7 @@
> > #define CTX_RING_TAIL (0x06 + 1)
> > #define CTX_RING_START (0x08 + 1)
> > #define CTX_RING_CTL (0x0a + 1)
> > +#define CTX_BB_PER_CTX_PTR (0x12 + 1)
> > #define CTX_TIMESTAMP (0x22 + 1)
> > #define CTX_INDIRECT_RING_STATE (0x26 + 1)
> > #define CTX_PDP0_UDW (0x30 + 1)
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > index 089b195e2635..a78acec1a358 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > @@ -24,6 +24,7 @@
> > #include "xe_hw_fence.h"
> > #include "xe_map.h"
> > #include "xe_memirq.h"
> > +#include "xe_mmio.h"
> > #include "xe_sriov.h"
> > #include "xe_trace_lrc.h"
> > #include "xe_vm.h"
> > @@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
> > #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
> > #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
> > #define LRC_PARALLEL_PPHWSP_OFFSET 2048
> > +#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
> >
> > u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
> > {
> > @@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
> > return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
> > }
> >
> > +static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
> > +{
> > + return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
> > +}
> > +
> > static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
> > {
> > return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
> > }
> >
> > +static u32 __xe_lrc_bb_per_ctx_ptr_offset(struct xe_lrc *lrc)
> > +{
> > + return __xe_lrc_regs_offset(lrc) + CTX_BB_PER_CTX_PTR * sizeof(u32);
> > +}
This also looks like it's unused, so will drop it.
> > +
> > static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
> > {
> > /* Indirect ring state page is at the very end of LRC */
> > @@ -731,6 +743,8 @@ DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
> > DECL_MAP_ADDR_HELPERS(ctx_timestamp)
> > DECL_MAP_ADDR_HELPERS(parallel)
> > DECL_MAP_ADDR_HELPERS(indirect_ring)
> > +DECL_MAP_ADDR_HELPERS(bb_per_ctx_ptr)
>
> This appears to be unused.
oh, you are right, will remove
>
> > +DECL_MAP_ADDR_HELPERS(engine_id)
> >
> > #undef DECL_MAP_ADDR_HELPERS
> >
> > @@ -880,6 +894,58 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
> > xe_bo_unpin(lrc->bo);
> > xe_bo_unlock(lrc->bo);
> > xe_bo_put(lrc->bo);
> > + xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
> > +}
> > +
> > +/*
> > + * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
> > + * context run ticks.
> > + * @lrc: Pointer to the lrc.
> > + *
> > + * Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks of the
> > + * context, but only gets updated when the context switches out. In order to
> > + * check how long a context has been running before it switches out, two things
> > + * are required:
> > + *
> > + * (1) Determine if the context is running
> > + * To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP in
> > + * the lrc. The value chosen is 1 since 0 is the initial value when the lrc is
> > + * initialized. During a query, we just check for this value to determine if the
> > + * context is active. If the context switched out, it would overwrite this
> > + * location with the actual CTX_TIMESTAMP MMIO value. Note that wa bb will run
> > + * after the context has been restored, so reusing this lrc location will not
> > + * clobber anything.
> > + *
> > + * (2) Calculate the time that the context has been running for
> > + * The CTX_TIMESTAMP ticks only when the context is active. If a context is
> > + * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
> > + * While doing so, we need to read the CTX_TIMESTAMP MMIO from the specific
> > + * engine instance. Since we do not know which instance the context is running
> > + * on until it is scheduled, we also read the ENGINE_ID MMIO in the wa bb and
> > + * store it in the PPHWSP.
> > + */
> > +
> > +static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
> > +{
> > + u32 *cmd;
> > +
> > + cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
> > +
> > + *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
> > + *cmd++ = ENGINE_ID(0).addr;
> > + *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
> > + *cmd++ = 0;
> > +
> > + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
> > + *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
> > + *cmd++ = 0;
> > + *cmd++ = 1;
>
> Nit: Rather than '1' can we have a define indicating this is a special value.
Sure, will add define.
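Something along these lines maybe (macro name is just a placeholder):

	/* Marker written by the wa bb on restore so a query can tell the context is running */
	#define CTX_TIMESTAMP_RUNNING_MARKER	1

	...
	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = CTX_TIMESTAMP_RUNNING_MARKER;

and then xe_lrc_update_timestamp() would compare against the same define
instead of the literal 1.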
>
> Also there is a small possibility (1 in 4 billion - I think) of a race
> here. It is possible that an LRC switches out with a value of '1' and I
> believe we'd read garbage from get_ctx_timestamp() ... Is there really
> not another register in the LRC we can steal a bit from?
There are, but the set of registers that is saved/restored during lite-restore
is entirely hw implementation dependent. It looks like CTX_TIMESTAMP is NOT
saved/restored during lite-restore, and it also directly relates to this
feature, so it's a safer bet.
Other alternatives explored:
1) ppHWSP locations that might be updated on context save. These are not
consistent across platforms and are disabled in some.
For 2, 3 below, Ref: https://www.intel.com/content/www/us/en/docs/graphics-for-linux/developer-reference/1-0/alchemist-arctic-sound-m.html
2) Vol 8: Page 57/64
If we look for anything after CSEL_BE in the LRC, then we would need to add
logic to traverse the LRC to find the right offset, because it is engine
specific and also platform specific. It can be done, but there's not a lot
of ROI considering the probability of hitting this.
3) Vol 9: Page 39/730
If we omit support for media and copy engines, then we could use the upper
half of the PREEMPTION_STATUS field. That needs more experimentation though.
This field doesn't get restored at all, only saved, so we could just write
something like (ENGINE_ID + 1) to this location from the WA BB.
>
> If not, maybe we can live with this race? At minimum we'd need a comment
> somewhere explaining this - at worst we'd get some bad stats or, in a
> very unlikely case, a false job timeout if we switch the TDR to use
> this new code path.
I asked around and got some suggestions - like checking HEAD/TAIL of the ring
in the LRC, which should tell us whether the context is really active. But if
HEAD != TAIL, we still run into the same issue of checking validity again.
IMO, since the probability of hitting this is very small, I would go with the
current implementation and, like you said, add a comment describing this race.
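As a rough sketch, I'd add something like this above the check in
xe_lrc_update_timestamp() (exact wording TBD):

	/*
	 * The wa bb sets CTX_TIMESTAMP in the lrc to 1 when the context is
	 * restored, so a value of 1 here means the context is (very likely)
	 * still running and we should sample the engine's CTX_TIMESTAMP MMIO
	 * instead. There is a ~1 in 2^32 chance that the context switched out
	 * with an accumulated value of exactly 1, in which case we may report
	 * a bogus sample for this interval. We accept that race.
	 */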
>
> The patch / design looks good to me though.
Thanks,
Umesh
>
> Matt
>
> > +
> > + *cmd++ = MI_BATCH_BUFFER_END;
> > +
> > + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
> > + xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
> > +
> > }
> >
> > #define PVC_CTX_ASID (0x2e + 1)
> > @@ -921,6 +987,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > if (IS_ERR(lrc->bo))
> > return PTR_ERR(lrc->bo);
> >
> > + lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
> > + ttm_bo_type_kernel,
> > + bo_flags);
> > + if (IS_ERR(lrc->bb_per_ctx_bo)) {
> > + err = PTR_ERR(lrc->bb_per_ctx_bo);
> > + goto err_lrc_finish;
> > + }
> > +
> > lrc->size = lrc_size;
> > lrc->ring.size = ring_size;
> > lrc->ring.tail = 0;
> > @@ -1026,6 +1100,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > map = __xe_lrc_start_seqno_map(lrc);
> > xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
> >
> > + xe_lrc_setup_utilization(lrc);
> > +
> > return 0;
> >
> > err_lrc_finish:
> > @@ -1245,6 +1321,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
> > return __xe_lrc_parallel_map(lrc);
> > }
> >
> > +/**
> > + * xe_lrc_engine_id() - Read engine id value
> > + * @lrc: Pointer to the lrc.
> > + *
> > + * Returns: engine id value
> > + */
> > +static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
> > +{
> > + struct xe_device *xe = lrc_to_xe(lrc);
> > + struct iosys_map map;
> > +
> > + map = __xe_lrc_engine_id_map(lrc);
> > + return xe_map_read32(xe, &map);
> > +}
> > +
> > static int instr_dw(u32 cmd_header)
> > {
> > /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
> > @@ -1792,21 +1883,93 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
> > kfree(snapshot);
> > }
> >
> > +static u32 get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id)
> > +{
> > + static u32 base[XE_ENGINE_CLASS_MAX][XE_HW_ENGINE_MAX_INSTANCE] = {
> > + {
> > + RENDER_RING_BASE,
> > + },
> > + {
> > + BSD_RING_BASE,
> > + BSD2_RING_BASE,
> > + BSD3_RING_BASE,
> > + BSD4_RING_BASE,
> > + XEHP_BSD5_RING_BASE,
> > + XEHP_BSD6_RING_BASE,
> > + XEHP_BSD7_RING_BASE,
> > + XEHP_BSD8_RING_BASE,
> > + },
> > + {
> > + VEBOX_RING_BASE,
> > + VEBOX2_RING_BASE,
> > + XEHP_VEBOX3_RING_BASE,
> > + XEHP_VEBOX4_RING_BASE,
> > + },
> > + {
> > + BLT_RING_BASE,
> > + XEHPC_BCS1_RING_BASE,
> > + XEHPC_BCS2_RING_BASE,
> > + XEHPC_BCS3_RING_BASE,
> > + XEHPC_BCS4_RING_BASE,
> > + XEHPC_BCS5_RING_BASE,
> > + XEHPC_BCS6_RING_BASE,
> > + XEHPC_BCS7_RING_BASE,
> > + XEHPC_BCS8_RING_BASE,
> > + },
> > + { 0 },
> > + {
> > + COMPUTE0_RING_BASE,
> > + COMPUTE1_RING_BASE,
> > + COMPUTE2_RING_BASE,
> > + COMPUTE3_RING_BASE,
> > + },
> > + };
> > + u32 c = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
> > + u32 i = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
> > + u32 mmio_base = base[c][i];
> > +
> > + if (xe_gt_WARN_ONCE(lrc->gt, !mmio_base,
> > + "Unexpected engine c:i %d:%d for context utilization\n",
> > + c, i))
> > + return 0;
> > +
> > + return xe_mmio_read32(&lrc->gt->mmio, RING_CTX_TIMESTAMP(mmio_base));
> > +}
> > +
> > /**
> > * xe_lrc_update_timestamp() - Update ctx timestamp
> > * @lrc: Pointer to the lrc.
> > * @old_ts: Old timestamp value
> > *
> > * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
> > - * update saved value.
> > + * update saved value. With support for active contexts, the calculation may be
> > + * slightly racy, so follow a read-again logic to ensure that the context is
> > + * still active before returning the right timestamp.
> > *
> > * Returns: New ctx timestamp value
> > */
> > u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
> > {
> > + u32 ctx_lrc, ctx_reg, engine_id;
> > +
> > *old_ts = lrc->ctx_timestamp;
> >
> > - lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
> > + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> > + if (ctx_lrc == 1) {
> > + engine_id = xe_lrc_engine_id(lrc);
> > + ctx_reg = get_ctx_timestamp(lrc, engine_id);
> > + if (!ctx_reg)
> > + lrc->ctx_timestamp = *old_ts;
> > + else
> > + lrc->ctx_timestamp = ctx_reg;
> > +
> > + /* read lrc again to ensure context is still active */
> > + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> > + }
> > +
> > + /* If context switched out, just use the lrc value */
> > + if (ctx_lrc != 1)
> > + lrc->ctx_timestamp = ctx_lrc;
> >
> > trace_xe_lrc_update_timestamp(lrc, *old_ts);
> >
> > diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
> > index cd38586ae989..a62d0b8ca8a4 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc_types.h
> > +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
> > @@ -53,6 +53,9 @@ struct xe_lrc {
> >
> > /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
> > u32 ctx_timestamp;
> > +
> > + /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
> > + struct xe_bo *bb_per_ctx_bo;
> > };
> >
> > struct xe_lrc_snapshot;
> > --
> > 2.43.0
> >
----- End forwarded message -----