[PATCH] drm/xe: Add WA BB to capture active context utilization
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Fri Apr 25 19:10:39 UTC 2025
Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks
of the context, but only gets updated when the context switches out. In
order to check how long a context has been active before it switches
out, two things are required:
(1) Determine if the context is running:
To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP
in the lrc. The value chosen is 1 since 0 is the initial value when the
lrc is initialized. During a query, we just check for this value to
determine if the context is active. If the context switched out, it
would overwrite this location with the actual CTX_TIMESTAMP MMIO value.
Note that wa bb will run after the context has been restored, so reusing
this lrc location will not clobber anything.
(2) Calculate the time that the context has been active for:
The CTX_TIMESTAMP ticks only when the context is active. If a context is
active, we just use the CTX_TIMESTAMP MMIO as the new value of
utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO
for the specific engine instance. Since we do not know which instance
the context is running on until it is scheduled, we also read the
ENGINE_ID MMIO in the wa bb and store it in the PPHSWP.
Using the above 2 instructions in a WA BB, capture active context
utilization.
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
---
.../gpu/drm/xe/instructions/xe_mi_commands.h | 4 +
drivers/gpu/drm/xe/regs/xe_engine_regs.h | 4 +
drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
drivers/gpu/drm/xe/xe_lrc.c | 168 +++++++++++++++++-
drivers/gpu/drm/xe/xe_lrc_types.h | 6 +
5 files changed, 181 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
index eba582058d55..9153a7cd2ceb 100644
--- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
+++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
@@ -62,6 +62,10 @@
#define MI_LOAD_REGISTER_MEM (__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
#define MI_LRM_USE_GGTT REG_BIT(22)
+#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4))
+#define MI_SRM_USE_GGTT REG_BIT(22)
+#define MI_SRM_ADD_CS_OFFSET REG_BIT(19)
+
#define MI_LOAD_REGISTER_REG (__MI_INSTR(0x2a) | XE_INSTR_NUM_DW(3))
#define MI_LRR_DST_CS_MMIO REG_BIT(19)
#define MI_LRR_SRC_CS_MMIO REG_BIT(18)
diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
index da713634d6a0..cc698136a6f6 100644
--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
@@ -43,6 +43,10 @@
#define XEHPC_BCS8_RING_BASE 0x3ee000
#define GSCCS_RING_BASE 0x11a000
+#define ENGINE_ID(base) XE_REG((base) + 0x8c)
+#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
+#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
+
#define RING_TAIL(base) XE_REG((base) + 0x30)
#define TAIL_ADDR REG_GENMASK(20, 3)
diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
index 57944f90bbf6..210be0ef12f2 100644
--- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
+++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
@@ -11,6 +11,7 @@
#define CTX_RING_TAIL (0x06 + 1)
#define CTX_RING_START (0x08 + 1)
#define CTX_RING_CTL (0x0a + 1)
+#define CTX_BB_PER_CTX_PTR (0x12 + 1)
#define CTX_TIMESTAMP (0x22 + 1)
#define CTX_INDIRECT_RING_STATE (0x26 + 1)
#define CTX_PDP0_UDW (0x30 + 1)
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index 855c8acaf3f1..11eabe4216ef 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -24,6 +24,7 @@
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
+#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
@@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
+#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
@@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}
+static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
+{
+ return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
+}
+
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}
+static u32 __xe_lrc_bb_per_ctx_ptr_offset(struct xe_lrc *lrc)
+{
+ return __xe_lrc_regs_offset(lrc) + CTX_BB_PER_CTX_PTR * sizeof(u32);
+}
+
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
/* Indirect ring state page is at the very end of LRC */
@@ -731,6 +743,8 @@ DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
+DECL_MAP_ADDR_HELPERS(bb_per_ctx_ptr)
+DECL_MAP_ADDR_HELPERS(engine_id)
#undef DECL_MAP_ADDR_HELPERS
@@ -880,6 +894,58 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
xe_bo_unpin(lrc->bo);
xe_bo_unlock(lrc->bo);
xe_bo_put(lrc->bo);
+ xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
+}
+
+/*
+ * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
+ * context run ticks.
+ * @lrc: Pointer to the lrc.
+ *
+ * Context Timestamp (CTX_TIMESTAMP) in the lrc accumulates the run ticks of the
+ * context, but only gets updated when the context switches out. In order to
+ * check how long a context has been running before it switches out, two things
+ * are required:
+ *
+ * (1) Determine if the context is running
+ * To do so, we program the wa bb to set an initial value for CTX_TIMESTAMP in
+ * the lrc. The value chosen is 1 since 0 is the initial value when the lrc is
+ * initialized. During a query, we just check for this value to determine if the
+ * context is active. If the context switched out, it would overwrite this
+ * location with the actual CTX_TIMESTAMP MMIO value. Note that wa bb will run
+ * after the context has been restored, so reusing this lrc location will not
+ * clobber anything.
+ *
+ * (2) Calculate the time that the context has been running for
+ * The CTX_TIMESTAMP ticks only when the context is active. If a context is
+ * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
+ * While doing so, we need to read the CTX_TIMESTAMP MMIO from the specific
+ * engine instance. Since we do not know which instance the context is running
+ * on until it is scheduled, we also read the ENGINE_ID MMIO in the wa bb and
+ * store it in the PPHSWP.
+ */
+
+static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
+{
+ u32 *cmd;
+
+ cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
+
+ *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
+ *cmd++ = ENGINE_ID(0).addr;
+ *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
+ *cmd++ = 0;
+
+ *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
+ *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+ *cmd++ = 0;
+ *cmd++ = 1;
+
+ *cmd++ = MI_BATCH_BUFFER_END;
+
+ xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
+ xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
+
}
#define PVC_CTX_ASID (0x2e + 1)
@@ -900,6 +966,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
int err;
kref_init(&lrc->refcount);
+ lrc->gt = gt;
lrc->flags = 0;
lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
if (xe_gt_has_indirect_ring_state(gt))
@@ -920,6 +987,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
if (IS_ERR(lrc->bo))
return PTR_ERR(lrc->bo);
+ lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
+ ttm_bo_type_kernel,
+ bo_flags);
+ if (IS_ERR(lrc->bb_per_ctx_bo)) {
+ err = PTR_ERR(lrc->bb_per_ctx_bo);
+ goto err_lrc_finish;
+ }
+
lrc->size = lrc_size;
lrc->tile = gt_to_tile(hwe->gt);
lrc->ring.size = ring_size;
@@ -1026,6 +1101,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
map = __xe_lrc_start_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
+ xe_lrc_setup_utilization(lrc);
+
return 0;
err_lrc_finish:
@@ -1245,6 +1322,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
return __xe_lrc_parallel_map(lrc);
}
+/**
+ * xe_lrc_engine_id() - Read engine id value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: context id value
+ */
+static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_engine_id_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
static int instr_dw(u32 cmd_header)
{
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
@@ -1792,21 +1884,93 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
kfree(snapshot);
}
+static u32 get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id)
+{
+ static u32 base[XE_ENGINE_CLASS_MAX][XE_HW_ENGINE_MAX_INSTANCE] = {
+ {
+ RENDER_RING_BASE,
+ },
+ {
+ BSD_RING_BASE,
+ BSD2_RING_BASE,
+ BSD3_RING_BASE,
+ BSD4_RING_BASE,
+ XEHP_BSD5_RING_BASE,
+ XEHP_BSD6_RING_BASE,
+ XEHP_BSD7_RING_BASE,
+ XEHP_BSD8_RING_BASE,
+ },
+ {
+ VEBOX_RING_BASE,
+ VEBOX2_RING_BASE,
+ XEHP_VEBOX3_RING_BASE,
+ XEHP_VEBOX4_RING_BASE,
+ },
+ {
+ BLT_RING_BASE,
+ XEHPC_BCS1_RING_BASE,
+ XEHPC_BCS2_RING_BASE,
+ XEHPC_BCS3_RING_BASE,
+ XEHPC_BCS4_RING_BASE,
+ XEHPC_BCS5_RING_BASE,
+ XEHPC_BCS6_RING_BASE,
+ XEHPC_BCS7_RING_BASE,
+ XEHPC_BCS8_RING_BASE,
+ },
+ { 0 },
+ {
+ COMPUTE0_RING_BASE,
+ COMPUTE1_RING_BASE,
+ COMPUTE2_RING_BASE,
+ COMPUTE3_RING_BASE,
+ },
+ };
+ u32 c = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
+ u32 i = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
+ u32 mmio_base = base[c][i];
+
+ if (xe_gt_WARN_ONCE(lrc->gt, !mmio_base,
+ "Unexpected engine c:i %d:%d for context utilization\n",
+ c, i))
+ return 0;
+
+ return xe_mmio_read32(&lrc->gt->mmio, RING_CTX_TIMESTAMP(mmio_base));
+}
+
/**
* xe_lrc_update_timestamp() - Update ctx timestamp
* @lrc: Pointer to the lrc.
* @old_ts: Old timestamp value
*
* Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
- * update saved value.
+ * update saved value. With support for active contexts, the calculation may be
+ * slightly racy, so follow a read-again logic to ensure that the context is
+ * still active before returning the right timestamp.
*
* Returns: New ctx timestamp value
*/
u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
{
+ u32 ctx_lrc, ctx_reg, engine_id;
+
*old_ts = lrc->ctx_timestamp;
- lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
+ ctx_lrc = xe_lrc_ctx_timestamp(lrc);
+ if (ctx_lrc == 1) {
+ engine_id = xe_lrc_engine_id(lrc);
+ ctx_reg = get_ctx_timestamp(lrc, engine_id);
+ if (!ctx_reg)
+ lrc->ctx_timestamp = *old_ts;
+ else
+ lrc->ctx_timestamp = ctx_reg;
+
+ /* read lrc again to ensure context is still active */
+ ctx_lrc = xe_lrc_ctx_timestamp(lrc);
+ }
+
+ /* If context switched out, just use the lrc value */
+ if (ctx_lrc != 1)
+ lrc->ctx_timestamp = ctx_lrc;
trace_xe_lrc_update_timestamp(lrc, *old_ts);
diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
index 71ecb453f811..1c973d71ba98 100644
--- a/drivers/gpu/drm/xe/xe_lrc_types.h
+++ b/drivers/gpu/drm/xe/xe_lrc_types.h
@@ -28,6 +28,9 @@ struct xe_lrc {
/** @tile: tile which this LRC belongs to */
struct xe_tile *tile;
+ /** @gt: gt which this LRC belongs to */
+ struct xe_gt *gt;
+
/** @flags: LRC flags */
#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
u32 flags;
@@ -53,6 +56,9 @@ struct xe_lrc {
/** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
u32 ctx_timestamp;
+
+ /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
+ struct xe_bo *bb_per_ctx_bo;
};
struct xe_lrc_snapshot;
--
2.43.0
More information about the Intel-xe
mailing list