[PATCH] drm/xe: Add WA BB to capture active context utilization

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Wed Apr 30 18:28:40 UTC 2025


On Tue, Apr 29, 2025 at 04:39:33PM -0500, Lucas De Marchi wrote:
>On Fri, Apr 25, 2025 at 12:10:39PM -0700, Umesh Nerlige Ramappa wrote:
>>Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks
>>of the context, but only gets updated when the context switches out. In
>>order to check how long a context has been active before it switches
>>out, two things are required:
>>
>>(1) Determine if the context is running:
>>
>>To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP
>>in the lrc. The value chosen is 1 since 0 is the initial value when the
>>lrc is initialized. During a query, we just check for this value to
>>determine if the context is active. If the context switched out, it
>>would overwrite this location with the actual CTX_TIMESTAMP MMIO value.
>>Note that wa bb will run after the context has been restored, so reusing
>
>AFAICS it's part of the "context restore" which is composed by, in
>order: Ring Context Restore, Engine Context Restore (with possible jump
>to Indirect Ctx, controlled by the INDIRECT_CTX_OFFSET) and lastly
>Context Restore Batch Buffer.
>
>>this lrc location will not clobber anything.
>>
>>(2) Calculate the time that the context has been active for:
>>
>>The CTX_TIMESTAMP ticks only when the context is active. If a context is
>>active, we just use the CTX_TIMESTAMP MMIO as the new value of
>>utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO
>>for the specific engine instance. Since we do not know which instance
>>the context is running on until it is scheduled, we also read the
>>ENGINE_ID MMIO in the wa bb and store it in the PPHSWP.
>>
>>Using the above 2 instructions in a WA BB, capture active context
>>utilization.
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>---
>>.../gpu/drm/xe/instructions/xe_mi_commands.h  |   4 +
>>drivers/gpu/drm/xe/regs/xe_engine_regs.h      |   4 +
>>drivers/gpu/drm/xe/regs/xe_lrc_layout.h       |   1 +
>>drivers/gpu/drm/xe/xe_lrc.c                   | 168 +++++++++++++++++-
>>drivers/gpu/drm/xe/xe_lrc_types.h             |   6 +
>>5 files changed, 181 insertions(+), 2 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>index eba582058d55..9153a7cd2ceb 100644
>>--- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>+++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>@@ -62,6 +62,10 @@
>>#define MI_LOAD_REGISTER_MEM		(__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
>>#define   MI_LRM_USE_GGTT		REG_BIT(22)
>>
>>+#define MI_STORE_REGISTER_MEM		(__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4))
>>+#define   MI_SRM_USE_GGTT		REG_BIT(22)
>>+#define   MI_SRM_ADD_CS_OFFSET		REG_BIT(19)
>>+
>>#define MI_LOAD_REGISTER_REG		(__MI_INSTR(0x2a) | XE_INSTR_NUM_DW(3))
>>#define   MI_LRR_DST_CS_MMIO		REG_BIT(19)
>>#define   MI_LRR_SRC_CS_MMIO		REG_BIT(18)
>>diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>>index da713634d6a0..cc698136a6f6 100644
>>--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>>+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>>@@ -43,6 +43,10 @@
>>#define XEHPC_BCS8_RING_BASE			0x3ee000
>>#define GSCCS_RING_BASE				0x11a000
>>
>>+#define ENGINE_ID(base)				XE_REG((base) + 0x8c)
>>+#define   ENGINE_INSTANCE_ID			REG_GENMASK(9, 4)
>>+#define   ENGINE_CLASS_ID			REG_GENMASK(2, 0)
>>+
>>#define RING_TAIL(base)				XE_REG((base) + 0x30)
>>#define   TAIL_ADDR				REG_GENMASK(20, 3)
>>
>>diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>index 57944f90bbf6..210be0ef12f2 100644
>>--- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>+++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>@@ -11,6 +11,7 @@
>>#define CTX_RING_TAIL			(0x06 + 1)
>>#define CTX_RING_START			(0x08 + 1)
>>#define CTX_RING_CTL			(0x0a + 1)
>>+#define CTX_BB_PER_CTX_PTR		(0x12 + 1)
>>#define CTX_TIMESTAMP			(0x22 + 1)
>>#define CTX_INDIRECT_RING_STATE		(0x26 + 1)
>>#define CTX_PDP0_UDW			(0x30 + 1)
>>diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>>index 855c8acaf3f1..11eabe4216ef 100644
>>--- a/drivers/gpu/drm/xe/xe_lrc.c
>>+++ b/drivers/gpu/drm/xe/xe_lrc.c
>>@@ -24,6 +24,7 @@
>>#include "xe_hw_fence.h"
>>#include "xe_map.h"
>>#include "xe_memirq.h"
>>+#include "xe_mmio.h"
>>#include "xe_sriov.h"
>>#include "xe_trace_lrc.h"
>>#include "xe_vm.h"
>>@@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
>>#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
>>#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
>>#define LRC_PARALLEL_PPHWSP_OFFSET 2048
>>+#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
>>
>>u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
>>{
>>@@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
>>	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
>>}
>>
>>+static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
>>+{
>>+	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
>>+}
>>+
>>static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
>>{
>>	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
>>}
>>
>>+static u32 __xe_lrc_bb_per_ctx_ptr_offset(struct xe_lrc *lrc)
>>+{
>>+	return __xe_lrc_regs_offset(lrc) + CTX_BB_PER_CTX_PTR * sizeof(u32);
>>+}
>>+
>>static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
>>{
>>	/* Indirect ring state page is at the very end of LRC */
>>@@ -731,6 +743,8 @@ DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
>>DECL_MAP_ADDR_HELPERS(ctx_timestamp)
>>DECL_MAP_ADDR_HELPERS(parallel)
>>DECL_MAP_ADDR_HELPERS(indirect_ring)
>>+DECL_MAP_ADDR_HELPERS(bb_per_ctx_ptr)
>>+DECL_MAP_ADDR_HELPERS(engine_id)
>>
>>#undef DECL_MAP_ADDR_HELPERS
>>
>>@@ -880,6 +894,58 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
>>	xe_bo_unpin(lrc->bo);
>>	xe_bo_unlock(lrc->bo);
>>	xe_bo_put(lrc->bo);
>>+	xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
>>+}
>>+
>>+/*
>>+ * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
>>+ * context run ticks.
>>+ * @lrc: Pointer to the lrc.
>>+ *
>>+ * Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks of the
>>+ * context, but only gets updated when the context switches out. In order to
>>+ * check how long a context has been running before it switches out, two things
>>+ * are required:
>>+ *
>>+ * (1) Determine if the context is running
>>+ * To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP in
>>+ * the lrc. The value chosen is 1 since 0 is the initial value when the lrc is
>>+ * initialized. During a query, we just check for this value to determine if the
>>+ * context is active. If the context switched out, it would overwrite this
>>+ * location with the actual CTX_TIMESTAMP MMIO value. Note that wa bb will run
>>+ * after the context has been restored, so reusing this lrc location will not
>>+ * clobber anything.
>>+ *
>>+ * (2) Calculate the time that the context has been running for
>>+ * The CTX_TIMESTAMP ticks only when the context is active. If a context is
>>+ * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
>>+ * While doing so, we need to read the CTX_TIMESTAMP MMIO from the specific
>>+ * engine instance. Since we do not know which instance the context is running
>>+ * on until it is scheduled, we also read the ENGINE_ID MMIO in the wa bb and
>>+ * store it in the PPHSWP.
>>+ */
>>+
>>+static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
>>+{
>>+	u32 *cmd;
>>+
>>+	cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
>>+
>>+	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
>>+	*cmd++ = ENGINE_ID(0).addr;
>>+	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
>>+	*cmd++ = 0;
>>+
>>+        *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
>>+        *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
>>+        *cmd++ = 0;
>>+        *cmd++ = 1;
>
>some wrong indentantion here.
>
>>+
>>+	*cmd++ = MI_BATCH_BUFFER_END;
>>+
>>+	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
>>+			     xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
>>+
>>}
>>
>>#define PVC_CTX_ASID		(0x2e + 1)
>>@@ -900,6 +966,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>	int err;
>>
>>	kref_init(&lrc->refcount);
>>+	lrc->gt = gt;
>>	lrc->flags = 0;
>>	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
>>	if (xe_gt_has_indirect_ring_state(gt))
>>@@ -920,6 +987,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>	if (IS_ERR(lrc->bo))
>>		return PTR_ERR(lrc->bo);
>>
>>+	lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
>>+						  ttm_bo_type_kernel,
>>+						  bo_flags);
>>+	if (IS_ERR(lrc->bb_per_ctx_bo)) {
>>+		err = PTR_ERR(lrc->bb_per_ctx_bo);
>>+		goto err_lrc_finish;
>>+	}
>>+
>>	lrc->size = lrc_size;
>>	lrc->tile = gt_to_tile(hwe->gt);
>>	lrc->ring.size = ring_size;
>>@@ -1026,6 +1101,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>	map = __xe_lrc_start_seqno_map(lrc);
>>	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
>>
>>+	xe_lrc_setup_utilization(lrc);
>>+
>>	return 0;
>>
>>err_lrc_finish:
>>@@ -1245,6 +1322,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
>>	return __xe_lrc_parallel_map(lrc);
>>}
>>
>>+/**
>>+ * xe_lrc_engine_id() - Read engine id value
>>+ * @lrc: Pointer to the lrc.
>>+ *
>>+ * Returns: context id value
>>+ */
>>+static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
>>+{
>>+	struct xe_device *xe = lrc_to_xe(lrc);
>>+	struct iosys_map map;
>>+
>>+	map = __xe_lrc_engine_id_map(lrc);
>>+	return xe_map_read32(xe, &map);
>>+}
>>+
>>static int instr_dw(u32 cmd_header)
>>{
>>	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
>>@@ -1792,21 +1884,93 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
>>	kfree(snapshot);
>>}
>>
>>+static u32 get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id)
>>+{
>>+	static u32 base[XE_ENGINE_CLASS_MAX][XE_HW_ENGINE_MAX_INSTANCE] = {
>>+		{
>>+			RENDER_RING_BASE,
>>+		},
>>+		{
>>+			BSD_RING_BASE,
>>+			BSD2_RING_BASE,
>>+			BSD3_RING_BASE,
>>+			BSD4_RING_BASE,
>>+			XEHP_BSD5_RING_BASE,
>>+			XEHP_BSD6_RING_BASE,
>>+			XEHP_BSD7_RING_BASE,
>>+			XEHP_BSD8_RING_BASE,
>>+		},
>>+		{
>>+			VEBOX_RING_BASE,
>>+			VEBOX2_RING_BASE,
>>+			XEHP_VEBOX3_RING_BASE,
>>+			XEHP_VEBOX4_RING_BASE,
>>+		},
>>+		{
>>+			BLT_RING_BASE,
>>+			XEHPC_BCS1_RING_BASE,
>>+			XEHPC_BCS2_RING_BASE,
>>+			XEHPC_BCS3_RING_BASE,
>>+			XEHPC_BCS4_RING_BASE,
>>+			XEHPC_BCS5_RING_BASE,
>>+			XEHPC_BCS6_RING_BASE,
>>+			XEHPC_BCS7_RING_BASE,
>>+			XEHPC_BCS8_RING_BASE,
>>+		},
>>+		{ 0 },
>>+		{
>>+			COMPUTE0_RING_BASE,
>>+			COMPUTE1_RING_BASE,
>>+			COMPUTE2_RING_BASE,
>>+			COMPUTE3_RING_BASE,
>>+		},
>>+	};
>>+	u32 c = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
>>+	u32 i = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
>
>humn... do we really need this mapping? Couldn't we lookup a hwe from
>class + instance and then xe_hw_engine_mmio_read32() It doesn't seem like
>this lookup belongs here.
>

I thought it would be faster this way. I can change it to lookup the 
hwe. 

>>+	u32 mmio_base = base[c][i];
>>+
>>+	if (xe_gt_WARN_ONCE(lrc->gt, !mmio_base,
>>+			    "Unexpected engine c:i %d:%d for context utilization\n",
>>+			    c, i))
>>+		return 0;
>>+
>>+	return xe_mmio_read32(&lrc->gt->mmio, RING_CTX_TIMESTAMP(mmio_base));
>>+}
>>+
>>/**
>> * xe_lrc_update_timestamp() - Update ctx timestamp
>> * @lrc: Pointer to the lrc.
>> * @old_ts: Old timestamp value
>> *
>> * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
>>- * update saved value.
>>+ * update saved value. With support for active contexts, the calculation may be
>>+ * slightly racy, so follow a read-again logic to ensure that the context is
>>+ * still active before returning the right timestamp.
>> *
>> * Returns: New ctx timestamp value
>> */
>>u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
>>{
>>+	u32 ctx_lrc, ctx_reg, engine_id;
>>+
>>	*old_ts = lrc->ctx_timestamp;
>>
>>-	lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
>>+	ctx_lrc = xe_lrc_ctx_timestamp(lrc);
>>+	if (ctx_lrc == 1) {
>
>so in the rare case this clashes with a real sched-out value, the
>outcome will be that we will read a bogus timestamp and the math will be
>totally invalid. Maybe start using the entire u64 counter for this since
>it would then not wrap in practice?

I was trying to keep it compatible with platforms without 64 bit 
CTX_TIMESTAMP. Also th issue is reported on DG2.

Thanks,
Umesh

>
>Lucas De Marchi
>
>>+		engine_id = xe_lrc_engine_id(lrc);
>>+		ctx_reg = get_ctx_timestamp(lrc, engine_id);
>>+		if (!ctx_reg)
>>+			lrc->ctx_timestamp = *old_ts;
>>+		else
>>+			lrc->ctx_timestamp = ctx_reg;
>>+
>>+		/* read lrc again to ensure context is still active */
>>+		ctx_lrc = xe_lrc_ctx_timestamp(lrc);
>>+	}
>>+
>>+	/* If context switched out, just use the lrc value */
>>+	if (ctx_lrc != 1)
>>+		lrc->ctx_timestamp = ctx_lrc;
>>
>>	trace_xe_lrc_update_timestamp(lrc, *old_ts);
>>
>>diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
>>index 71ecb453f811..1c973d71ba98 100644
>>--- a/drivers/gpu/drm/xe/xe_lrc_types.h
>>+++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>>@@ -28,6 +28,9 @@ struct xe_lrc {
>>	/** @tile: tile which this LRC belongs to */
>>	struct xe_tile *tile;
>>
>>+	/** @gt: gt which this LRC belongs to */
>>+	struct xe_gt *gt;
>>+
>>	/** @flags: LRC flags */
>>#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x1
>>	u32 flags;
>>@@ -53,6 +56,9 @@ struct xe_lrc {
>>
>>	/** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
>>	u32 ctx_timestamp;
>>+
>>+	/** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
>>+	struct xe_bo *bb_per_ctx_bo;
>>};
>>
>>struct xe_lrc_snapshot;
>>-- 
>>2.43.0
>>


More information about the Intel-xe mailing list