[PATCH] drm/xe: Add WA BB to capture active context utilization
Matthew Brost
matthew.brost at intel.com
Fri Apr 25 19:42:04 UTC 2025
On Fri, Apr 25, 2025 at 12:10:39PM -0700, Umesh Nerlige Ramappa wrote:
> Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks
> of the context, but only gets updated when the context switches out. In
> order to check how long a context has been active before it switches
> out, two things are required:
>
> (1) Determine if the context is running:
>
> To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP
> in the lrc. The value chosen is 1 since 0 is the initial value when the
> lrc is initialized. During a query, we just check for this value to
> determine if the context is active. If the context switched out, it
> would overwrite this location with the actual CTX_TIMESTAMP MMIO value.
> Note that wa bb will run after the context has been restored, so reusing
> this lrc location will not clobber anything.
>
> (2) Calculate the time that the context has been active for:
>
> The CTX_TIMESTAMP ticks only when the context is active. If a context is
> active, we just use the CTX_TIMESTAMP MMIO as the new value of
> utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO
> for the specific engine instance. Since we do not know which instance
> the context is running on until it is scheduled, we also read the
> ENGINE_ID MMIO in the wa bb and store it in the PPHWSP.
>
> Using the above 2 instructions in a WA BB, capture active context
> utilization.
>
I think this may break the GuC code, which samples the context timestamp
to determine if a job has run long enough to timeout.
Let me explain how that code works.
When a job is started, it stores the context timestamp into a scratch
memory area in the LRC — LRC_CTX_JOB_TIMESTAMP_OFFSET. See
emit_copy_timestamp in xe_ring_ops.c.
When a TDR (DRM scheduler job timeout mechanism) fires — see
guc_exec_queue_timedout_job in xe_guc_submit.c — we toggle the
scheduling state to update the LRC timestamp. We then compare the values
to determine whether the job has actually been running too long.
It’s common, especially when the engine is timeslicing multiple jobs,
that a job hasn’t truly exceeded the timeout. For example, a batch of
parallel dEQP test cases (e.g., Vulkan) could easily trigger false job
timeouts without this code.
If I'm understanding this code correctly, with the WA BB we'd now always
copy a value of 1 into LRC_CTX_JOB_TIMESTAMP_OFFSET, so I think this
will break.
Can we avoid clobbering the context timestamp in the WA BB and instead
set a bit somewhere else that gets cleared on LRC switch out? Or maybe
we could update emit_copy_timestamp to read an MMIO value rather than
the LRC CTX timestamp?
Presumably in the TDR we could then also avoid toggling the scheduling
state and hook into this new code to get an accurate view of the job's
execution.
One more nit below.
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> ---
> .../gpu/drm/xe/instructions/xe_mi_commands.h | 4 +
> drivers/gpu/drm/xe/regs/xe_engine_regs.h | 4 +
> drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
> drivers/gpu/drm/xe/xe_lrc.c | 168 +++++++++++++++++-
> drivers/gpu/drm/xe/xe_lrc_types.h | 6 +
> 5 files changed, 181 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> index eba582058d55..9153a7cd2ceb 100644
> --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> @@ -62,6 +62,10 @@
> #define MI_LOAD_REGISTER_MEM (__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
> #define MI_LRM_USE_GGTT REG_BIT(22)
>
> +#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4))
> +#define MI_SRM_USE_GGTT REG_BIT(22)
> +#define MI_SRM_ADD_CS_OFFSET REG_BIT(19)
> +
> #define MI_LOAD_REGISTER_REG (__MI_INSTR(0x2a) | XE_INSTR_NUM_DW(3))
> #define MI_LRR_DST_CS_MMIO REG_BIT(19)
> #define MI_LRR_SRC_CS_MMIO REG_BIT(18)
> diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> index da713634d6a0..cc698136a6f6 100644
> --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> @@ -43,6 +43,10 @@
> #define XEHPC_BCS8_RING_BASE 0x3ee000
> #define GSCCS_RING_BASE 0x11a000
>
> +#define ENGINE_ID(base) XE_REG((base) + 0x8c)
> +#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
> +#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
> +
> #define RING_TAIL(base) XE_REG((base) + 0x30)
> #define TAIL_ADDR REG_GENMASK(20, 3)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> index 57944f90bbf6..210be0ef12f2 100644
> --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> @@ -11,6 +11,7 @@
> #define CTX_RING_TAIL (0x06 + 1)
> #define CTX_RING_START (0x08 + 1)
> #define CTX_RING_CTL (0x0a + 1)
> +#define CTX_BB_PER_CTX_PTR (0x12 + 1)
> #define CTX_TIMESTAMP (0x22 + 1)
> #define CTX_INDIRECT_RING_STATE (0x26 + 1)
> #define CTX_PDP0_UDW (0x30 + 1)
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 855c8acaf3f1..11eabe4216ef 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -24,6 +24,7 @@
> #include "xe_hw_fence.h"
> #include "xe_map.h"
> #include "xe_memirq.h"
> +#include "xe_mmio.h"
> #include "xe_sriov.h"
> #include "xe_trace_lrc.h"
> #include "xe_vm.h"
> @@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
> #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
> #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
> #define LRC_PARALLEL_PPHWSP_OFFSET 2048
> +#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
>
> u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
> {
> @@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
> return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
> }
>
> +static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
> +{
> + return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
> +}
> +
> static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
> {
> return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
> }
>
> +static u32 __xe_lrc_bb_per_ctx_ptr_offset(struct xe_lrc *lrc)
> +{
> + return __xe_lrc_regs_offset(lrc) + CTX_BB_PER_CTX_PTR * sizeof(u32);
> +}
> +
> static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
> {
> /* Indirect ring state page is at the very end of LRC */
> @@ -731,6 +743,8 @@ DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
> DECL_MAP_ADDR_HELPERS(ctx_timestamp)
> DECL_MAP_ADDR_HELPERS(parallel)
> DECL_MAP_ADDR_HELPERS(indirect_ring)
> +DECL_MAP_ADDR_HELPERS(bb_per_ctx_ptr)
Is this used anywhere? I'm not seeing it in my search of the patch.
Matt
> +DECL_MAP_ADDR_HELPERS(engine_id)
>
> #undef DECL_MAP_ADDR_HELPERS
>
> @@ -880,6 +894,58 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
> xe_bo_unpin(lrc->bo);
> xe_bo_unlock(lrc->bo);
> xe_bo_put(lrc->bo);
> + xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
> +}
> +
> +/*
> + * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
> + * context run ticks.
> + * @lrc: Pointer to the lrc.
> + *
> + * Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks of the
> + * context, but only gets updated when the context switches out. In order to
> + * check how long a context has been running before it switches out, two things
> + * are required:
> + *
> + * (1) Determine if the context is running
> + * To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP in
> + * the lrc. The value chosen is 1 since 0 is the initial value when the lrc is
> + * initialized. During a query, we just check for this value to determine if the
> + * context is active. If the context switched out, it would overwrite this
> + * location with the actual CTX_TIMESTAMP MMIO value. Note that wa bb will run
> + * after the context has been restored, so reusing this lrc location will not
> + * clobber anything.
> + *
> + * (2) Calculate the time that the context has been running for
> + * The CTX_TIMESTAMP ticks only when the context is active. If a context is
> + * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
> + * While doing so, we need to read the CTX_TIMESTAMP MMIO from the specific
> + * engine instance. Since we do not know which instance the context is running
> + * on until it is scheduled, we also read the ENGINE_ID MMIO in the wa bb and
> + * store it in the PPHWSP.
> + */
> +
> +static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
> +{
> + u32 *cmd;
> +
> + cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
> +
> + *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
> + *cmd++ = ENGINE_ID(0).addr;
> + *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
> + *cmd++ = 0;
> +
> + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
> + *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
> + *cmd++ = 0;
> + *cmd++ = 1;
> +
> + *cmd++ = MI_BATCH_BUFFER_END;
> +
> + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
> + xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
> +
> }
>
> #define PVC_CTX_ASID (0x2e + 1)
> @@ -900,6 +966,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> int err;
>
> kref_init(&lrc->refcount);
> + lrc->gt = gt;
> lrc->flags = 0;
> lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
> if (xe_gt_has_indirect_ring_state(gt))
> @@ -920,6 +987,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> if (IS_ERR(lrc->bo))
> return PTR_ERR(lrc->bo);
>
> + lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
> + ttm_bo_type_kernel,
> + bo_flags);
> + if (IS_ERR(lrc->bb_per_ctx_bo)) {
> + err = PTR_ERR(lrc->bb_per_ctx_bo);
> + goto err_lrc_finish;
> + }
> +
> lrc->size = lrc_size;
> lrc->tile = gt_to_tile(hwe->gt);
> lrc->ring.size = ring_size;
> @@ -1026,6 +1101,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> map = __xe_lrc_start_seqno_map(lrc);
> xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
>
> + xe_lrc_setup_utilization(lrc);
> +
> return 0;
>
> err_lrc_finish:
> @@ -1245,6 +1322,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
> return __xe_lrc_parallel_map(lrc);
> }
>
> +/**
> + * xe_lrc_engine_id() - Read engine id value
> + * @lrc: Pointer to the lrc.
> + *
> + * Returns: engine id value
> + */
> +static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
> +{
> + struct xe_device *xe = lrc_to_xe(lrc);
> + struct iosys_map map;
> +
> + map = __xe_lrc_engine_id_map(lrc);
> + return xe_map_read32(xe, &map);
> +}
> +
> static int instr_dw(u32 cmd_header)
> {
> /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
> @@ -1792,21 +1884,93 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
> kfree(snapshot);
> }
>
> +static u32 get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id)
> +{
> + static u32 base[XE_ENGINE_CLASS_MAX][XE_HW_ENGINE_MAX_INSTANCE] = {
> + {
> + RENDER_RING_BASE,
> + },
> + {
> + BSD_RING_BASE,
> + BSD2_RING_BASE,
> + BSD3_RING_BASE,
> + BSD4_RING_BASE,
> + XEHP_BSD5_RING_BASE,
> + XEHP_BSD6_RING_BASE,
> + XEHP_BSD7_RING_BASE,
> + XEHP_BSD8_RING_BASE,
> + },
> + {
> + VEBOX_RING_BASE,
> + VEBOX2_RING_BASE,
> + XEHP_VEBOX3_RING_BASE,
> + XEHP_VEBOX4_RING_BASE,
> + },
> + {
> + BLT_RING_BASE,
> + XEHPC_BCS1_RING_BASE,
> + XEHPC_BCS2_RING_BASE,
> + XEHPC_BCS3_RING_BASE,
> + XEHPC_BCS4_RING_BASE,
> + XEHPC_BCS5_RING_BASE,
> + XEHPC_BCS6_RING_BASE,
> + XEHPC_BCS7_RING_BASE,
> + XEHPC_BCS8_RING_BASE,
> + },
> + { 0 },
> + {
> + COMPUTE0_RING_BASE,
> + COMPUTE1_RING_BASE,
> + COMPUTE2_RING_BASE,
> + COMPUTE3_RING_BASE,
> + },
> + };
> + u32 c = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
> + u32 i = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
> + u32 mmio_base = base[c][i];
> +
> + if (xe_gt_WARN_ONCE(lrc->gt, !mmio_base,
> + "Unexpected engine c:i %d:%d for context utilization\n",
> + c, i))
> + return 0;
> +
> + return xe_mmio_read32(&lrc->gt->mmio, RING_CTX_TIMESTAMP(mmio_base));
> +}
> +
> /**
> * xe_lrc_update_timestamp() - Update ctx timestamp
> * @lrc: Pointer to the lrc.
> * @old_ts: Old timestamp value
> *
> * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
> - * update saved value.
> + * update saved value. With support for active contexts, the calculation may be
> + * slightly racy, so follow a read-again logic to ensure that the context is
> + * still active before returning the right timestamp.
> *
> * Returns: New ctx timestamp value
> */
> u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
> {
> + u32 ctx_lrc, ctx_reg, engine_id;
> +
> *old_ts = lrc->ctx_timestamp;
>
> - lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
> + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> + if (ctx_lrc == 1) {
> + engine_id = xe_lrc_engine_id(lrc);
> + ctx_reg = get_ctx_timestamp(lrc, engine_id);
> + if (!ctx_reg)
> + lrc->ctx_timestamp = *old_ts;
> + else
> + lrc->ctx_timestamp = ctx_reg;
> +
> + /* read lrc again to ensure context is still active */
> + ctx_lrc = xe_lrc_ctx_timestamp(lrc);
> + }
> +
> + /* If context switched out, just use the lrc value */
> + if (ctx_lrc != 1)
> + lrc->ctx_timestamp = ctx_lrc;
>
> trace_xe_lrc_update_timestamp(lrc, *old_ts);
>
> diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
> index 71ecb453f811..1c973d71ba98 100644
> --- a/drivers/gpu/drm/xe/xe_lrc_types.h
> +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
> @@ -28,6 +28,9 @@ struct xe_lrc {
> /** @tile: tile which this LRC belongs to */
> struct xe_tile *tile;
>
> + /** @gt: gt which this LRC belongs to */
> + struct xe_gt *gt;
> +
> /** @flags: LRC flags */
> #define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
> u32 flags;
> @@ -53,6 +56,9 @@ struct xe_lrc {
>
> /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
> u32 ctx_timestamp;
> +
> + /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
> + struct xe_bo *bb_per_ctx_bo;
> };
>
> struct xe_lrc_snapshot;
> --
> 2.43.0
>
More information about the Intel-xe
mailing list