[PATCH v7 2/3] drm/xe/vf: Attach and detach CCS copy commands with BO
Matthew Brost
matthew.brost at intel.com
Tue Jun 17 23:17:45 UTC 2025
On Tue, Jun 17, 2025 at 04:11:15PM -0700, Matthew Brost wrote:
> On Mon, Jun 16, 2025 at 08:38:04PM +0530, Satyanarayana K V P wrote:
> > Attach CCS read/write copy commands to BO for old and new mem types as
> > NULL -> tt or system -> tt.
> > Detach the CCS read/write copy commands from BO while deleting ttm bo
> > from xe_ttm_bo_delete_mem_notify().
> >
> > Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> > Cc: Matthew Brost <matthew.brost at intel.com>
> > Cc: Matthew Auld <matthew.auld at intel.com>
> > Cc: Michał Winiarski <michal.winiarski at intel.com>
> > ---
> > Cc: Tomasz Lis <tomasz.lis at intel.com>
> >
> > V6 -> V7:
> > - Created xe_bb_ccs_realloc() to create a single BB instead of maintaining
> > a list. (Matthew Brost)
> >
> > V5 -> V6:
> > - Removed dead code from xe_migrate_ccs_rw_copy() function. (Matthew Brost)
> >
> > V4 -> V5:
> > - Create a list of BBs for the given BO and fixed memory leak while
> > detaching BOs. (Matthew Brost).
> > - Fixed review comments (Matthew Brost & Matthew Auld).
> > - Yet to cleanup xe_migrate_ccs_rw_copy() function.
> >
> > V3 -> V4:
> > - Fixed issues reported by patchworks.
> >
> > V2 -> V3:
> > - Attach and detach functions check for IS_VF_CCS_READY().
> >
> > V1 -> V2:
> > - Fixed review comments.
> > ---
> > drivers/gpu/drm/xe/xe_bb.c | 54 ++++++++++++
> > drivers/gpu/drm/xe/xe_bb.h | 6 ++
> > drivers/gpu/drm/xe/xe_bo.c | 23 ++++++
> > drivers/gpu/drm/xe/xe_bo_types.h | 3 +
> > drivers/gpu/drm/xe/xe_migrate.c | 95 ++++++++++++++++++++++
> > drivers/gpu/drm/xe/xe_migrate.h | 6 ++
> > drivers/gpu/drm/xe/xe_sriov_vf_ccs.c | 72 ++++++++++++++++
> > drivers/gpu/drm/xe/xe_sriov_vf_ccs.h | 3 +
> > drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h | 8 ++
> > 9 files changed, 270 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c
> > index 9570672fce33..9a9c33c7362a 100644
> > --- a/drivers/gpu/drm/xe/xe_bb.c
> > +++ b/drivers/gpu/drm/xe/xe_bb.c
> > @@ -60,6 +60,60 @@ struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm)
> > return ERR_PTR(err);
> > }
> >
> > +struct xe_bb *xe_bb_ccs_new(struct xe_gt *gt, u32 dwords,
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id)
> > +{
> > + struct xe_bb *bb = kmalloc(sizeof(*bb), GFP_KERNEL);
> > + struct xe_tile *tile = gt_to_tile(gt);
> > + struct xe_sa_manager *bb_pool;
> > + int err;
> > +
> > + if (!bb)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + /*
> > + * We need to allocate space for the requested number of dwords,
> > + * one additional MI_BATCH_BUFFER_END dword, and additional buffer
> > + * space to accommodate the platform-specific hardware prefetch
> > + * requirements.
> > + */
>
> This comment doesn't make sense as you don't insert a
> MI_BATCH_BUFFER_END instruction.
>
> > + bb_pool = tile->sriov.vf.ccs[ctx_id].mem.ccs_bb_pool;
> > + bb->bo = xe_sa_bo_new(bb_pool, 4 * (dwords + 1) + bb_prefetch(gt));
>
> I think you can skip the bb_prefetch here too. However when you insert
> the single MI_BATCH_BUFFER_END I think it actually needs to go at BO size -
> bb_prefetch(gt). Also don't allow the SA to allocate from the last
> bb_prefetch(gt) in the BO. We could (and should) actually do this in our
> existing SA code too.
>
I actually take back my suggestion that the existing code should be
changed - it is actually right as-is. In the existing code we don't want
things fetched into caches which could potentially be modified by the KMD,
with a subsequent job then run looking at the stale values in the cache.
In your new code, since this entire BO is run in one shot, we don't have
such a concern, so my suggestion of not including bb_prefetch() on each SA
makes sense. So does moving the single MI_BATCH_BUFFER_END to BO size -
bb_prefetch(gt).
Matt
> > +
> > + if (IS_ERR(bb->bo)) {
> > + err = PTR_ERR(bb->bo);
> > + goto err;
> > + }
> > +
> > + bb->cs = xe_sa_bo_cpu_addr(bb->bo);
> > + bb->len = 0;
> > +
> > + return bb;
> > +err:
> > + kfree(bb);
> > + return ERR_PTR(err);
> > +}
> > +
> > +struct xe_bb *xe_bb_ccs_realloc(struct xe_bb *bb, struct xe_gt *gt,
> > + u32 dwords,
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id)
>
> Not what I was suggesting.
>
> > +{
> > + struct xe_bb *new_bb;
> > +
> > + if (!bb)
> > + return xe_bb_ccs_new(gt, dwords, ctx_id);
> > +
> > + new_bb = xe_bb_ccs_new(gt, dwords, ctx_id);
> > + if (IS_ERR(new_bb))
> > + return new_bb;
> > +
> > + memcpy(new_bb->cs, bb->cs, bb->len * sizeof(u32));
> > + new_bb->len = bb->len;
> > + xe_bb_free(bb, NULL);
> > +
> > + return new_bb;
> > +}
> > +
> > static struct xe_sched_job *
> > __xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr)
> > {
> > diff --git a/drivers/gpu/drm/xe/xe_bb.h b/drivers/gpu/drm/xe/xe_bb.h
> > index fafacd73dcc3..7dfe9422dfab 100644
> > --- a/drivers/gpu/drm/xe/xe_bb.h
> > +++ b/drivers/gpu/drm/xe/xe_bb.h
> > @@ -13,8 +13,14 @@ struct dma_fence;
> > struct xe_gt;
> > struct xe_exec_queue;
> > struct xe_sched_job;
> > +enum xe_sriov_vf_ccs_rw_ctxs;
> >
> > struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 size, bool usm);
> > +struct xe_bb *xe_bb_ccs_new(struct xe_gt *gt, u32 dwords,
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id);
> > +struct xe_bb *xe_bb_ccs_realloc(struct xe_bb *bb, struct xe_gt *gt,
> > + u32 dwords,
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id);
> > struct xe_sched_job *xe_bb_create_job(struct xe_exec_queue *q,
> > struct xe_bb *bb);
> > struct xe_sched_job *xe_bb_create_migration_job(struct xe_exec_queue *q,
> > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > index 4e39188a021a..beaf8544bf08 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.c
> > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > @@ -31,6 +31,7 @@
> > #include "xe_pxp.h"
> > #include "xe_res_cursor.h"
> > #include "xe_shrinker.h"
> > +#include "xe_sriov_vf_ccs.h"
> > #include "xe_trace_bo.h"
> > #include "xe_ttm_stolen_mgr.h"
> > #include "xe_vm.h"
> > @@ -947,6 +948,20 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
> > dma_fence_put(fence);
> > xe_pm_runtime_put(xe);
> >
> > + /*
> > + * CCS meta data is migrated from TT -> SMEM. So, let us detach the
> > + * BBs from BO as it is no longer needed.
> > + */
> > + if (IS_VF_CCS_BB_VALID(xe, bo) && old_mem_type == XE_PL_TT &&
> > + new_mem->mem_type == XE_PL_SYSTEM)
> > + xe_sriov_vf_ccs_detach_bo(bo);
> > +
> > + if (IS_SRIOV_VF(xe) &&
> > + ((move_lacks_source && new_mem->mem_type == XE_PL_TT) ||
> > + (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT)) &&
> > + handle_system_ccs)
> > + ret = xe_sriov_vf_ccs_attach_bo(bo);
> > +
> > out:
> > if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) &&
> > ttm_bo->ttm) {
> > @@ -957,6 +972,9 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
> > if (timeout < 0)
> > ret = timeout;
> >
> > + if (IS_VF_CCS_BB_VALID(xe, bo))
> > + xe_sriov_vf_ccs_detach_bo(bo);
> > +
> > xe_tt_unmap_sg(xe, ttm_bo->ttm);
> > }
> >
> > @@ -1483,9 +1501,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)
> >
> > static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
> > {
> > + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
> > +
> > if (!xe_bo_is_xe_bo(ttm_bo))
> > return;
> >
> > + if (IS_VF_CCS_BB_VALID(ttm_to_xe_device(ttm_bo->bdev), bo))
> > + xe_sriov_vf_ccs_detach_bo(bo);
> > +
> > /*
> > * Object is idle and about to be destroyed. Release the
> > * dma-buf attachment.
> > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
> > index eb5e83c5f233..642e519fcfd1 100644
> > --- a/drivers/gpu/drm/xe/xe_bo_types.h
> > +++ b/drivers/gpu/drm/xe/xe_bo_types.h
> > @@ -78,6 +78,9 @@ struct xe_bo {
> > /** @ccs_cleared */
> > bool ccs_cleared;
> >
> > + /** @bb_ccs_rw: BB instructions of CCS read/write. Valid only for VF */
> > + struct xe_bb *bb_ccs[XE_SRIOV_VF_CCS_CTX_COUNT];
> > +
> > /**
> > * @cpu_caching: CPU caching mode. Currently only used for userspace
> > * objects. Exceptions are system memory on DGFX, which is always
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index 8f8e9fdfb2a8..6c30b106b817 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -940,6 +940,101 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> > return fence;
> > }
> >
> > +/**
> > + * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
> > + * @m: The migration context.
> > + * @src_bo: The buffer object @src is currently bound to.
> > + * @read_write : Creates BB commands for CCS read/write.
> > + *
> > + * Creates batch buffer instructions to copy CCS metadata from CCS pool to
> > + * memory and vice versa.
> > + *
> > + * This function should only be called for IGPU.
> > + *
> > + * Return: 0 if successful, negative error code on failure.
> > + */
> > +int xe_migrate_ccs_rw_copy(struct xe_migrate *m,
> > + struct xe_bo *src_bo,
> > + enum xe_sriov_vf_ccs_rw_ctxs read_write)
> > +
> > +{
> > + bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
> > + bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
> > + struct ttm_resource *src = src_bo->ttm.resource;
> > + struct xe_gt *gt = m->tile->primary_gt;
> > + struct xe_device *xe = gt_to_xe(gt);
> > + struct xe_res_cursor src_it, ccs_it;
> > + u64 size = src_bo->size;
> > + struct xe_bb *bb = NULL;
> > + u64 src_L0, src_L0_ofs;
> > + u32 batch_size;
> > + u32 src_L0_pt;
> > + int err;
> > +
> > + xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
> > +
> > + xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
> > + PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
> > + &ccs_it);
> > +
> > + batch_size = 0;
> > + while (size) {
> > + batch_size += 8; /* arb_clear() + MI_BATCH_BUFFER_END + Flush + NOP */
>
> This comment / math doesn't make sense - it isn't arb_clear() +
> MI_BATCH_BUFFER_END, but rather just 2 flushes. I think a value of 6 is
> actually correct too, as that is the size of the 2 flushes.
>
> > + u32 flush_flags = 0;
> > + u64 ccs_ofs, ccs_size;
> > + u32 ccs_pt;
> > +
> > + u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
> > +
> > + src_L0 = xe_migrate_res_sizes(m, &src_it);
> > +
> > + batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
> > + &src_L0_ofs, &src_L0_pt, 0, 0,
> > + avail_pts);
> > +
> > + ccs_size = xe_device_ccs_bytes(xe, src_L0);
> > + batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
> > + &ccs_pt, 0, avail_pts, avail_pts);
> > + xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
> > +
> > + /* Add copy commands size here */
> > + batch_size += EMIT_COPY_CCS_DW;
> > +
>
> This is where you should split the loop. All of the above is loop
> calculate the size of BB.
>
> Then allocate the BB, reset iterators, size to initial value.
>
> All of the below is 2nd loop which emits instructions.
>
> > + bb = xe_bb_ccs_realloc(bb, gt, batch_size, read_write);
>
> Again, no realloc, rather just a single BB allocation.
>
> > + if (IS_ERR(bb)) {
> > + drm_err(&xe->drm, "BB allocation failed.\n");
> > + err = PTR_ERR(bb);
> > + goto err_ret;
> > + }
> > +
> > + emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);
> > +
> > + emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
> > +
> > + bb->cs[bb->len++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> > + MI_FLUSH_IMM_DW;
> > + bb->cs[bb->len++] = MI_NOOP;
> > + bb->cs[bb->len++] = MI_NOOP;
> > +
> > + flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
> > + src_L0_ofs, dst_is_pltt,
> > + src_L0, ccs_ofs, true);
> > +
> > + bb->cs[bb->len++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> > + MI_FLUSH_IMM_DW | flush_flags;
> > + bb->cs[bb->len++] = MI_NOOP;
> > + bb->cs[bb->len++] = MI_NOOP;
> > +
> > + src_bo->bb_ccs[read_write] = bb;
>
> Maybe add an assert when storing the BB that bb->len == calculated size,
> to ensure our math is correct.
>
> Matt
>
> > +
> > + size -= src_L0;
> > + }
> > + return 0;
> > +
> > +err_ret:
> > + return err;
> > +}
> > +
> > static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
> > u32 size, u32 pitch)
> > {
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> > index fb9839c1bae0..96b0449e7edb 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -24,6 +24,8 @@ struct xe_vm;
> > struct xe_vm_pgtable_update;
> > struct xe_vma;
> >
> > +enum xe_sriov_vf_ccs_rw_ctxs;
> > +
> > /**
> > * struct xe_migrate_pt_update_ops - Callbacks for the
> > * xe_migrate_update_pgtables() function.
> > @@ -112,6 +114,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> > struct ttm_resource *dst,
> > bool copy_only_ccs);
> >
> > +int xe_migrate_ccs_rw_copy(struct xe_migrate *m,
> > + struct xe_bo *src_bo,
> > + enum xe_sriov_vf_ccs_rw_ctxs read_write);
> > +
> > int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> > unsigned long offset, void *buf, int len,
> > int write);
> > diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > index ff5ad472eb96..3e37cb95ff32 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> > @@ -5,6 +5,7 @@
> >
> > #include "instructions/xe_mi_commands.h"
> > #include "instructions/xe_gpu_commands.h"
> > +#include "xe_bb.h"
> > #include "xe_bo.h"
> > #include "xe_device.h"
> > #include "xe_migrate.h"
> > @@ -208,3 +209,74 @@ int xe_sriov_vf_ccs_init(struct xe_device *xe)
> > err_ret:
> > return err;
> > }
> > +
> > +/**
> > + * xe_sriov_vf_ccs_attach_bo - Insert CCS read write commands in the BO.
> > + * @bo: the &buffer object to which batch buffer commands will be added.
> > + *
> > + * This function shall be called only by VF. It inserts the PTEs and copy
> > + * command instructions in the BO by calling xe_migrate_ccs_rw_copy()
> > + * function.
> > + *
> > + * Returns: 0 if successful, negative error code on failure.
> > + */
> > +int xe_sriov_vf_ccs_attach_bo(struct xe_bo *bo)
> > +{
> > + struct xe_device *xe = xe_bo_device(bo);
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
> > + struct xe_migrate *migrate;
> > + struct xe_tile *tile;
> > + struct xe_bb *bb;
> > + int tile_id;
> > + int err = 0;
> > +
> > + if (!IS_VF_CCS_READY(xe))
> > + return 0;
> > +
> > + for_each_tile(tile, xe, tile_id) {
> > + for_each_ccs_rw_ctx(ctx_id) {
> > + bb = bo->bb_ccs[ctx_id];
> > + if (bb)
> > + xe_sriov_err(xe, "Probable memory leak\n");
> > +
> > + migrate = tile->sriov.vf.ccs[ctx_id].migrate;
> > + err = xe_migrate_ccs_rw_copy(migrate, bo, ctx_id);
> > + }
> > + }
> > + return err;
> > +}
> > +
> > +/**
> > + * xe_sriov_vf_ccs_detach_bo - Remove CCS read write commands from the BO.
> > + * @bo: the &buffer object from which batch buffer commands will be removed.
> > + *
> > + * This function shall be called only by VF. It removes the PTEs and copy
> > + * command instructions from the BO. Make sure to update the BB with MI_NOOP
> > + * before freeing.
> > + *
> > + * Returns: 0 if successful.
> > + */
> > +int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
> > +{
> > + struct xe_device *xe = xe_bo_device(bo);
> > + enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
> > + struct xe_bb *bb;
> > + struct xe_tile *tile;
> > + int tile_id;
> > +
> > + if (!IS_VF_CCS_READY(xe))
> > + return 0;
> > +
> > + for_each_tile(tile, xe, tile_id) {
> > + for_each_ccs_rw_ctx(ctx_id) {
> > + bb = bo->bb_ccs[ctx_id];
> > + if (!bb)
> > + continue;
> > +
> > + memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
> > + xe_bb_free(bb, NULL);
> > + bo->bb_ccs[ctx_id] = NULL;
> > + }
> > + }
> > + return 0;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> > index 5df9ba028d14..5d5e4bd25904 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> > +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> > @@ -7,7 +7,10 @@
> > #define _XE_SRIOV_VF_CCS_H_
> >
> > struct xe_device;
> > +struct xe_bo;
> >
> > int xe_sriov_vf_ccs_init(struct xe_device *xe);
> > +int xe_sriov_vf_ccs_attach_bo(struct xe_bo *bo);
> > +int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo);
> >
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h b/drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h
> > index 6dc279d206ec..e240f3fd18af 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h
> > @@ -27,6 +27,14 @@ enum xe_sriov_vf_ccs_rw_ctxs {
> > XE_SRIOV_VF_CCS_CTX_COUNT
> > };
> >
> > +#define IS_VF_CCS_BB_VALID(xe, bo) ({ \
> > + struct xe_device *___xe = (xe); \
> > + struct xe_bo *___bo = (bo); \
> > + IS_SRIOV_VF(___xe) && \
> > + ___bo->bb_ccs[XE_SRIOV_VF_CCS_READ_CTX] && \
> > + ___bo->bb_ccs[XE_SRIOV_VF_CCS_WRITE_CTX]; \
> > + })
> > +
> > struct xe_migrate;
> > struct xe_sa_manager;
> >
> > --
> > 2.43.0
> >
More information about the Intel-xe
mailing list