[PATCH v7 2/8] drm/xe: Generalize wa bb emission code
Lucas De Marchi
lucas.demarchi at intel.com
Wed Jul 2 15:16:11 UTC 2025
On Wed, Jul 02, 2025 at 10:49:26AM +0100, Tvrtko Ursulin wrote:
>
>On 02/07/2025 06:13, Lucas De Marchi wrote:
>>On Mon, Jun 30, 2025 at 01:22:38PM +0100, Tvrtko Ursulin wrote:
>>>Generalize the wa bb emission by splitting it into three phases - setup,
>>>emit and finish, and extract setup and finish steps into helpers.
>>>
>>>This will enable using the same infrastructure for emitting the indirect
>>>context workarounds.
>>>
>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>>>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>>>---
>>>drivers/gpu/drm/xe/xe_lrc.c | 77 +++++++++++++++++++++++++------------
>>>1 file changed, 53 insertions(+), 24 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>>>index d2ad8fe737eb..c07fe23e5e73 100644
>>>--- a/drivers/gpu/drm/xe/xe_lrc.c
>>>+++ b/drivers/gpu/drm/xe/xe_lrc.c
>>>@@ -972,32 +972,37 @@ static ssize_t
>>>wa_bb_setup_utilization(struct xe_lrc *lrc, struct xe_hw_engine *
>>> return cmd - batch;
>>>}
>>>
>>>-struct wa_bb_setup {
>>>+struct wa_bo_setup {
>>> ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>> u32 *batch, size_t max_size);
>>>};
>>>
>>>-static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>>>+static u32 *
>>>+setup_wa_bo(struct xe_lrc *lrc,
>>>+ struct xe_hw_engine *hwe,
>>>+ const size_t max_size,
>>>+ unsigned int offset,
>>>+ const struct wa_bo_setup *funcs,
>>>+ unsigned int num_funcs,
>>>+ u32 **free)
>>>{
>>>- const size_t max_size = LRC_WA_BB_SIZE;
>>>- static const struct wa_bb_setup funcs[] = {
>>>- { .setup = wa_bb_setup_utilization },
>>>- };
>>>+ u32 *cmd, *buf = NULL;
>>> ssize_t remain;
>>>- u32 *cmd, *buf = NULL;
>>>
>>> if (lrc->bo->vmap.is_iomem) {
>>> buf = kmalloc(max_size, GFP_KERNEL);
>>> if (!buf)
>>>- return -ENOMEM;
>>>+ return ERR_PTR(-ENOMEM);
>>> cmd = buf;
>>>+ *free = buf;
>>> } else {
>>>- cmd = lrc->bo->vmap.vaddr + __xe_lrc_wa_bb_offset(lrc);
>>>+ cmd = lrc->bo->vmap.vaddr + offset;
>>>+ *free = NULL;
>>> }
>>>
>>> remain = max_size / sizeof(*cmd);
>>>
>>>- for (size_t i = 0; i < ARRAY_SIZE(funcs); i++) {
>>>+ for (size_t i = 0; i < num_funcs; i++) {
>>> ssize_t len = funcs[i].setup(lrc, hwe, cmd, remain);
>>>
>>> remain -= len;
>>>@@ -1012,23 +1017,47 @@ static int setup_wa_bb(struct xe_lrc *lrc,
>>>struct xe_hw_engine *hwe)
>>> cmd += len;
>>> }
>>>
>>>- *cmd++ = MI_BATCH_BUFFER_END;
>>>-
>>>- if (buf) {
>>>- xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap,
>>>- __xe_lrc_wa_bb_offset(lrc), buf,
>>>- (cmd - buf) * sizeof(*cmd));
>>>- kfree(buf);
>>>- }
>>>-
>>>- xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
>>>xe_bo_ggtt_addr(lrc->bo) +
>>>- __xe_lrc_wa_bb_offset(lrc) + 1);
>>>-
>>>- return 0;
>>>+ return cmd;
>>>
>>>fail:
>>> kfree(buf);
>>>- return -ENOSPC;
>>>+ return ERR_PTR(-ENOSPC);
>>>+}
>>>+
>>>+static void finish_wa_bo(struct xe_lrc *lrc,
>>>+ unsigned int offset,
>>>+ u32 *cmd,
>>>+ u32 *free)
>>>+{
>>>+ if (!free)
>>>+ return;
>>>+
>>>+ xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap, offset, free,
>>>+ (cmd - free) * sizeof(*cmd));
>>>+ kfree(free);
>>>+}
>>>+
>>>+static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>>>+{
>>>+ static const struct wa_bo_setup funcs[] = {
>>>+ { .setup = wa_bb_setup_utilization },
>>>+ };
>>>+ unsigned int offset = __xe_lrc_wa_bb_offset(lrc);
>>>+ u32 *cmd, *buf = NULL;
>>>+
>>>+ cmd = setup_wa_bo(lrc, hwe, LRC_WA_BB_SIZE, offset, funcs,
>>>+ ARRAY_SIZE(funcs), &buf);
>>
>>I'm not seeing much advantage in this generalization when then you need
>>to extend it with the free pointer (which is then also used for offset),
>>and number of written words etc. I understand what you did with
>>setup_wa_bo and setup_wa_bb, but they are soo similar it looks like
>>a
>>typo.
>>
>>Can we keep the wa_bb and indirect_ctx separate and eventually merge
>>them through helpers rather than making the wa_bb stuff fit the needs
>>of indirect ctx? In other words, go straight to patch 6.
>
>Hm I don't exactly follow the critique. Apart from the bb vs bo in the
>naming, but that can be easily improved.
>
>Purpose of the generalisation is that there is one copy of the much of
>the code and vfuncs which emit stuff can be shared and used for either
>wa_bb or indirect ctx. In a way setup_wa_bo() and finish_wa_bo() are
>helpers. Helpers for anyone who wants to emit a series of commads to
>some BO+offset.
>
>Apart from avoiding duplication the ability to share is super
>interesting for workarounds which need to use different mechanism
>depending on the engine. For example the CTX_TIMESTAMP wa, which Matt
>found BSpec calls to be emitted from the indirect ctx for RCS and as
>wa_bb for the rest.
>
>If we look at the end result (trimmed a bit for the purpose of comparison):
>
>static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>{
> static const struct wa_bo_setup funcs[] = {
> { .setup = setup_timestamp_wa },
> { .setup = setup_invalidate_state_cache_wa },
> { .setup = setup_utilization_wa },
> };
> unsigned int offset = __xe_lrc_wa_bb_offset(lrc);
> unsigned int written = 0;
> u32 *cmd, *buf = NULL;
>
> cmd = setup_wa_bo(lrc, hwe, LRC_WA_BB_SIZE, 1, offset, funcs,
> ARRAY_SIZE(funcs), &buf, &written);
> if (IS_ERR(cmd))
> return PTR_ERR(cmd);
>
> *cmd++ = MI_BATCH_BUFFER_END;
> written++;
>
> finish_wa_bo(lrc, offset, written, buf);
>
> xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
> xe_bo_ggtt_addr(lrc->bo) + offset + 1);
>
> return 0;
>}
>
>static int
>setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>{
> static struct wa_bo_setup rcs_funcs[] = {
> { .setup = setup_timestamp_wa },
> { .setup = setup_invalidate_auxccs_wa },
> };
> unsigned int offset, written = 0;
> u32 *cmd, *buf = NULL;
>
> offset = __xe_lrc_indirect_ctx_offset(lrc);
>
> cmd = setup_wa_bo(lrc, hwe, LRC_INDIRECT_CTX_SIZE, 15, offset, rcs_funcs,
> ARRAY_SIZE(rcs_funcs), &buf, &written);
> if (IS_ERR(cmd))
> return PTR_ERR(cmd);
>
> /* Align to 64B cacheline. */
> while ((unsigned long)cmd & 0x3f) {
> *cmd++ = MI_NOOP;
> written++;
> }
>
> finish_wa_bo(lrc, offset, written, buf);
>
> xe_lrc_write_ctx_reg(lrc,
> CTX_CS_INDIRECT_CTX,
> (xe_bo_ggtt_addr(lrc->bo) + offset) |
> /* Size in CLs. */
> (written * sizeof(u32) / 64));
> xe_lrc_write_ctx_reg(lrc,
> CTX_CS_INDIRECT_CTX_OFFSET,
> XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6);
>
> return 0;
>}
>
>I thought something like this was what you wanted from the earlier
>discussion, when I waited for you to land "drm/xe/lrc: Prepare WA BB
>setup for more users" and then rebased this on top of it.
looking at the final result above helped. Maybe the names threw me off
as you are using wa_bo naming for both wa_bb and indirect_ctx.
If we have both setup_wa_bb / setup_indirect_ctx and use helpers
called e.g. setup_bo, it would be clearer the function layering
setup_wa_bb() {
...
cmd = setup_bo(...);
...
finish_bo(...);
}
setup_indirect_ctx() {
...
cmd = setup_bo(...);
...
finish_bo(...);
}
or even hide the bounce buffer usage inside the struct you are passing
as param, but that could come later to avoid multiple changes here.
>
>But if it still does not look clean to you I can rewrite it with code
>duplication if that is what you definitely want, no problem.
Nah, what you wrote above clarifies it. Thanks
Lucas De Marchi
>
>Regards,
>
>Tvrtko
>
>>>+ if (IS_ERR(cmd))
>>>+ return PTR_ERR(cmd);
>>>+
>>>+ *cmd++ = MI_BATCH_BUFFER_END;
>>>+
>>>+ finish_wa_bo(lrc, offset, cmd, buf);
>>>+
>>>+ xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
>>>+ xe_bo_ggtt_addr(lrc->bo) + offset + 1);
>>>+
>>>+ return 0;
>>>}
>>>
>>>#define PVC_CTX_ASID (0x2e + 1)
>>>--
>>>2.48.0
>>>
>
More information about the Intel-xe
mailing list