[PATCH] drm/xe: Make WA BB part of LRC BO

Thu Jun 12 05:19:46 UTC 2025

On Wed, Jun 11, 2025 at 08:19:25PM -0700, Matthew Brost wrote:
>No idea why, but without this GuC context switches randomly fail when
>running IGTs in a loop. Need to follow up why this fixes the
>aforementioned issue but can live with a stable driver for now.
>
>Fixes: 617d824c5323 ("drm/xe: Add WA BB to capture active context utilization")
>Cc: stable at vger.kernel.org
>Signed-off-by: Matthew Brost <matthew.brost at intel.com>

As an optimization, I agree with this. It also gets us more than halfway
to the other optimization I discussed with Matt Roper regarding the extra
allocation we are now doing: eventually the additional 4K could be part
of empty_lrc_data() (and thus of lrc_size), so we only calculate it once
(per class).

However, there's something funky going on here: why aren't we able to
pin the second bo and use it?

The code here looks correct.

Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>

thanks
Lucas De Marchi

>---
> drivers/gpu/drm/xe/xe_lrc.c       | 36 +++++++++++++++----------------
> drivers/gpu/drm/xe/xe_lrc_types.h |  3 ---
> 2 files changed, 18 insertions(+), 21 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>index 529c6a972a55..a875b93697a5 100644
>--- a/drivers/gpu/drm/xe/xe_lrc.c
>+++ b/drivers/gpu/drm/xe/xe_lrc.c
>@@ -40,6 +40,7 @@
>
> #define LRC_PPHWSP_SIZE				SZ_4K
> #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
>+#define LRC_WA_BB_SIZE				SZ_4K
>
> static struct xe_device *
> lrc_to_xe(struct xe_lrc *lrc)
>@@ -910,7 +911,6 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
> {
> 	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
> 	xe_bo_unpin_map_no_vm(lrc->bo);
>-	xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
> }
>
> /*
>@@ -973,22 +973,27 @@ struct wa_bb_setup {
> 			 u32 *batch, size_t max_size);
> };
>
>+static size_t wa_bb_offset(struct xe_lrc *lrc)
>+{
>+	return lrc->bo->size - LRC_WA_BB_SIZE;
>+}
>+
> static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
> {
>-	const size_t max_size = lrc->bb_per_ctx_bo->size;
>+	const size_t max_size = LRC_WA_BB_SIZE;
> 	static const struct wa_bb_setup funcs[] = {
> 		{ .setup = wa_bb_setup_utilization },
> 	};
> 	ssize_t remain;
> 	u32 *cmd, *buf = NULL;
>
>-	if (lrc->bb_per_ctx_bo->vmap.is_iomem) {
>+	if (lrc->bo->vmap.is_iomem) {
> 		buf = kmalloc(max_size, GFP_KERNEL);
> 		if (!buf)
> 			return -ENOMEM;
> 		cmd = buf;
> 	} else {
>-		cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
>+		cmd = lrc->bo->vmap.vaddr + wa_bb_offset(lrc);
> 	}
>
> 	remain = max_size / sizeof(*cmd);
>@@ -1011,13 +1016,14 @@ static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
> 	*cmd++ = MI_BATCH_BUFFER_END;
>
> 	if (buf) {
>-		xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bb_per_ctx_bo->vmap, 0,
>-				 buf, (cmd - buf) * sizeof(*cmd));
>+		xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap,
>+				 wa_bb_offset(lrc), buf,
>+				 (cmd - buf) * sizeof(*cmd));
> 		kfree(buf);
> 	}
>
>-	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
>-			     xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
>+	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bo) +
>+			     wa_bb_offset(lrc) + 1);
>
> 	return 0;
>
>@@ -1059,20 +1065,13 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> 	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
> 	 * via VM bind calls.
> 	 */
>-	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, lrc_size,
>+	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL,
>+				       lrc_size + LRC_WA_BB_SIZE,
> 				       ttm_bo_type_kernel,
> 				       bo_flags);
> 	if (IS_ERR(lrc->bo))
> 		return PTR_ERR(lrc->bo);
>
>-	lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
>-						  ttm_bo_type_kernel,
>-						  bo_flags);
>-	if (IS_ERR(lrc->bb_per_ctx_bo)) {
>-		err = PTR_ERR(lrc->bb_per_ctx_bo);
>-		goto err_lrc_finish;
>-	}
>-
> 	lrc->size = lrc_size;
> 	lrc->ring.size = ring_size;
> 	lrc->ring.tail = 0;
>@@ -1860,7 +1859,8 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
> 	snapshot->seqno = xe_lrc_seqno(lrc);
> 	snapshot->lrc_bo = xe_bo_get(lrc->bo);
> 	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
>-	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
>+	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset -
>+		LRC_WA_BB_SIZE;
> 	snapshot->lrc_snapshot = NULL;
> 	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
> 	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
>diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
>index ae24cf6f8dd9..883e550a9423 100644
>--- a/drivers/gpu/drm/xe/xe_lrc_types.h
>+++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>@@ -53,9 +53,6 @@ struct xe_lrc {
>
> 	/** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
> 	u64 ctx_timestamp;
>-
>-	/** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
>-	struct xe_bo *bb_per_ctx_bo;
> };
>
> struct xe_lrc_snapshot;
>-- 
>2.34.1
>