[ONLY FOR INTERNAL REVIEW 2/3] drm/xe/vf: Attach and detach CCS copy commands with BO.

Matthew Auld matthew.auld at intel.com
Fri May 16 13:48:11 UTC 2025


On 16/05/2025 12:48, Satyanarayana K V P wrote:
> Attach CCS read/write copy commands to the BO for the following old ->
> new mem type transitions: NULL -> tt, tt -> system and system -> tt.
> Detach the CCS read/write copy commands from the BO when the ttm bo is
> deleted, from xe_ttm_bo_delete_mem_notify().
> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> ---
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_bb.c           |  33 ++++++
>   drivers/gpu/drm/xe/xe_bb.h           |   1 +
>   drivers/gpu/drm/xe/xe_bo.c           |  13 +++
>   drivers/gpu/drm/xe/xe_bo_types.h     |   3 +
>   drivers/gpu/drm/xe/xe_migrate.c      | 159 +++++++++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_migrate.h      |   8 ++
>   drivers/gpu/drm/xe/xe_sriov_vf_ccs.c |  68 ++++++++++++
>   drivers/gpu/drm/xe/xe_sriov_vf_ccs.h |   3 +
>   8 files changed, 288 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c
> index 9570672fce33..1e752a6d5a94 100644
> --- a/drivers/gpu/drm/xe/xe_bb.c
> +++ b/drivers/gpu/drm/xe/xe_bb.c
> @@ -60,6 +60,39 @@ struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm)
>   	return ERR_PTR(err);
>   }
>   
> +struct xe_bb *xe_bb_ccs_rw_new(struct xe_gt *gt, u32 dwords, bool ctx_id)
> +{
> +	struct xe_bb *bb = kmalloc(sizeof(*bb), GFP_KERNEL);
> +	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_sa_manager *bb_pool;
> +	int err;
> +
> +	if (!bb)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * We need to allocate space for the requested number of dwords,
> +	 * one additional MI_BATCH_BUFFER_END dword, and additional buffer
> +	 * space to accommodate the platform-specific hardware prefetch
> +	 * requirements.
> +	 */
> +	bb_pool = tile->sriov.vf.ccs_rw_ctx[ctx_id].mem.ccs_rw_bb_pool;
> +	bb->bo = xe_sa_bo_new(bb_pool, 4 * (dwords + 1) + bb_prefetch(gt));
> +
> +	if (IS_ERR(bb->bo)) {
> +		err = PTR_ERR(bb->bo);
> +		goto err;
> +	}
> +
> +	bb->cs = xe_sa_bo_cpu_addr(bb->bo);
> +	bb->len = 0;
> +
> +	return bb;
> +err:
> +	kfree(bb);
> +	return ERR_PTR(err);
> +}
> +
>   static struct xe_sched_job *
>   __xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr)
>   {
> diff --git a/drivers/gpu/drm/xe/xe_bb.h b/drivers/gpu/drm/xe/xe_bb.h
> index fafacd73dcc3..5fc1f8b14c07 100644
> --- a/drivers/gpu/drm/xe/xe_bb.h
> +++ b/drivers/gpu/drm/xe/xe_bb.h
> @@ -15,6 +15,7 @@ struct xe_exec_queue;
>   struct xe_sched_job;
>   
>   struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 size, bool usm);
> +struct xe_bb *xe_bb_ccs_rw_new(struct xe_gt *gt, u32 dwords, bool ctx);
>   struct xe_sched_job *xe_bb_create_job(struct xe_exec_queue *q,
>   				      struct xe_bb *bb);
>   struct xe_sched_job *xe_bb_create_migration_job(struct xe_exec_queue *q,
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index d99d91fe8aa9..4826c431171d 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -31,6 +31,7 @@
>   #include "xe_pxp.h"
>   #include "xe_res_cursor.h"
>   #include "xe_shrinker.h"
> +#include "xe_sriov_vf_ccs.h"
>   #include "xe_trace_bo.h"
>   #include "xe_ttm_stolen_mgr.h"
>   #include "xe_vm.h"
> @@ -964,6 +965,12 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   	xe_pm_runtime_put(xe);
>   
>   out:
> +	if (((move_lacks_source && new_mem->mem_type == XE_PL_TT) ||
> +	     (old_mem_type == XE_PL_TT && new_mem->mem_type == XE_PL_SYSTEM) ||

I think for the tt -> sys direction we would have already kicked off the 
native ccs save (likely this is the start of swap-out or something)...
> +	     (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT)) &&
> +	    handle_system_ccs)
> +		ret = xe_sriov_vf_ccs_rw_attach_bo(bo);
> +
>   	if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) &&
>   	    ttm_bo->ttm) {
>   		long timeout = dma_resv_wait_timeout(ttm_bo->base.resv,

...so here, after we sync, I think we need to remove the bb programming 
(or maybe later in tt_unpopulate), since the native ccs save will have 
completed by then, so there is no need to do it twice (and no need to 
handle the restore either). But the bigger issue is that the pages might 
be freed shortly after this, and we don't want the VF pause running 
later and trampling memory which is potentially owned by someone else.
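
Untested, but roughly what I had in mind for the tt -> sys case, just 
after the dma_resv_wait_timeout() above (assuming detach is safe to 
call once the bo is idle):

	/* The native ccs save has completed and the TT pages may be
	 * freed right after this point, so drop the VF ccs bb
	 * programming before the pause/resume flow can touch them.
	 */
	if (old_mem_type == XE_PL_TT &&
	    new_mem->mem_type == XE_PL_SYSTEM && handle_system_ccs)
		xe_sriov_vf_ccs_rw_detach_bo(bo);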

Also, related to that, I think xe_sriov_vf_ccs_rw_attach_bo() then ends 
up being called multiple times for the same bo, which leaks the 
previously attached bb?
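
Maybe the simplest fix is for attach to first free any bb that is 
already there; something like this in the attach loop (again untested):

	for_each_ccs_rw_ctx(ctx_id) {
		/* don't leak a bb left over from an earlier attach */
		if (bo->bb_ccs_rw[ctx_id]) {
			xe_bb_free(bo->bb_ccs_rw[ctx_id], NULL);
			bo->bb_ccs_rw[ctx_id] = NULL;
		}

		migrate = tile->sriov.vf.ccs_rw_ctx[ctx_id].migrate;
		err = xe_migrate_ccs_rw_copy(migrate, bo, bo,
					     bo->ttm.resource,
					     bo->ttm.resource,
					     copy_only_ccs, ctx_id);
	}

Or alternatively just skip the attach when the bb is already present, 
assuming the PTEs can't change between moves.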

> @@ -1478,9 +1485,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)
>   
>   static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
>   {
> +	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
> +
>   	if (!xe_bo_is_xe_bo(ttm_bo))
>   		return;
>   
> +	if (bo->bb_ccs_rw[0])
> +		xe_sriov_vf_ccs_rw_detach_bo(bo);
> +
>   	/*
>   	 * Object is idle and about to be destroyed. Release the
>   	 * dma-buf attachment.
> @@ -2739,6 +2751,7 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
>   		goto out_bulk;
>   
>   	args->handle = handle;
> +
>   	goto out_put;
>   
>   out_bulk:
> diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
> index eb5e83c5f233..6eb2061886d4 100644
> --- a/drivers/gpu/drm/xe/xe_bo_types.h
> +++ b/drivers/gpu/drm/xe/xe_bo_types.h
> @@ -78,6 +78,9 @@ struct xe_bo {
>   	/** @ccs_cleared */
>   	bool ccs_cleared;
>   
> +	/** @bb_ccs_rw: BB instructions of CCS read/write. Valid only for VF */
> +	struct xe_bb *bb_ccs_rw[XE_CCS_RW_MAX_CTXS];
> +
>   	/**
>   	 * @cpu_caching: CPU caching mode. Currently only used for userspace
>   	 * objects. Exceptions are system memory on DGFX, which is always
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 8f8e9fdfb2a8..df0560f2239b 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -940,6 +940,165 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>   	return fence;
>   }
>   
> +/**
> + * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
> + * @m: The migration context.
> + * @src_bo: The buffer object @src is currently bound to.
> + * @dst_bo: If copying between resources created for the same bo, set this to
> + * the same value as @src_bo. If copying between buffer objects, set it to
> + * the buffer object @dst is currently bound to.
> + * @src: The source TTM resource.
> + * @dst: The dst TTM resource.
> + * @copy_only_ccs: If true copy only CCS metadata
> + * @read_write: Whether to create BB commands for the CCS read or write context.
> + *
> + * Creates batch buffer instructions to copy CCS metadata from CCS pool to
> + * memory and vice versa.
> + *
> + * This function should only be called for IGPU.
> + *
> + * Return: 0 if successful, negative error code on failure.
> + */
> +int xe_migrate_ccs_rw_copy(struct xe_migrate *m,
> +			   struct xe_bo *src_bo,
> +			   struct xe_bo *dst_bo,
> +			   struct ttm_resource *src,
> +			   struct ttm_resource *dst,
> +			   bool copy_only_ccs,
> +			   bool read_write)
> +
> +{
> +	struct xe_gt *gt = m->tile->primary_gt;
> +	struct xe_device *xe = gt_to_xe(gt);
> +	u64 size = src_bo->size;
> +	struct xe_res_cursor src_it, dst_it, ccs_it;
> +	u64 src_L0_ofs, dst_L0_ofs;
> +	u32 src_L0_pt, dst_L0_pt;
> +	u64 src_L0, dst_L0;
> +	int err;
> +	bool src_is_pltt = read_write == XE_CCS_READ_CTX;
> +	bool dst_is_pltt = read_write == XE_CCS_WRITE_CTX;
> +	bool src_is_vram = mem_type_is_vram(src->mem_type);
> +	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
> +	bool type_device = src_bo->ttm.type == ttm_bo_type_device;
> +	bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe);
> +	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
> +		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
> +	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
> +	bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) &&
> +		GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram;
> +
> +	/* Copying CCS between two different BOs is not supported yet. */
> +	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
> +		return -EINVAL;
> +
> +	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
> +		return -EINVAL;
> +
> +	if (!src_is_vram)
> +		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
> +	else
> +		xe_res_first(src, 0, size, &src_it);
> +	if (!dst_is_vram)
> +		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
> +	else
> +		xe_res_first(dst, 0, size, &dst_it);
> +
> +	if (copy_system_ccs)
> +		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
> +				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
> +				&ccs_it);
> +
> +	while (size) {
> +		u32 batch_size = 8; /* arb_clear() + MI_BATCH_BUFFER_END + Flush + NOP */
> +		struct xe_bb *bb;
> +		u32 flush_flags = 0;
> +		u64 ccs_ofs, ccs_size;
> +		u32 ccs_pt;
> +		u32 pte_flags;
> +
> +		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
> +
> +		src_L0 = xe_migrate_res_sizes(m, &src_it);
> +		dst_L0 = xe_migrate_res_sizes(m, &dst_it);
> +
> +		src_L0 = min(src_L0, dst_L0);
> +
> +		pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
> +		pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
> +		batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0,
> +					      &src_L0_ofs, &src_L0_pt, 0, 0,
> +					      avail_pts);
> +
> +		pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
> +		batch_size += pte_update_size(m, pte_flags, dst, &dst_it, &src_L0,
> +					      &dst_L0_ofs, &dst_L0_pt, 0,
> +					      avail_pts, avail_pts);
> +
> +		if (copy_system_ccs) {
> +			xe_assert(xe, type_device);
> +			ccs_size = xe_device_ccs_bytes(xe, src_L0);
> +			batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size,
> +						      &ccs_ofs, &ccs_pt, 0,
> +						      2 * avail_pts,
> +						      avail_pts);
> +			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
> +		}
> +
> +		/* Add copy commands size here */
> +		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
> +			((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0));
> +
> +		bb = xe_bb_ccs_rw_new(gt, batch_size, read_write);
> +		if (IS_ERR(bb)) {
> +			err = PTR_ERR(bb);
> +			goto err_ret;
> +		}
> +
> +		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
> +			xe_res_next(&src_it, src_L0);
> +		else
> +			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs,
> +				 &src_it, src_L0, src);
> +
> +		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
> +			xe_res_next(&dst_it, src_L0);
> +		else
> +			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
> +				 &dst_it, src_L0, dst);
> +
> +		if (copy_system_ccs)
> +			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
> +
> +		bb->cs[bb->len++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> +					MI_FLUSH_IMM_DW;
> +		bb->cs[bb->len++] = MI_NOOP;
> +		bb->cs[bb->len++] = MI_NOOP;
> +
> +		if (!copy_only_ccs)
> +			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
> +
> +		if (needs_ccs_emit)
> +			flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
> +							  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
> +							  dst_L0_ofs,
> +							  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
> +							  src_L0, ccs_ofs, copy_ccs);
> +		bb->cs[bb->len++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
> +					MI_FLUSH_IMM_DW | flush_flags;
> +		bb->cs[bb->len++] = MI_NOOP;
> +		bb->cs[bb->len++] = MI_NOOP;
> +
> +		size -= src_L0;
> +
> +		src_bo->bb_ccs_rw[read_write] = bb;
> +	}
> +	return 0;
> +
> +err_ret:
> +	return err;
> +}
> +
>   static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
>   				 u32 size, u32 pitch)
>   {
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index fb9839c1bae0..2a2f6c4690fb 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -112,6 +112,14 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>   				  struct ttm_resource *dst,
>   				  bool copy_only_ccs);
>   
> +int xe_migrate_ccs_rw_copy(struct xe_migrate *m,
> +			   struct xe_bo *src_bo,
> +			   struct xe_bo *dst_bo,
> +			   struct ttm_resource *src,
> +			   struct ttm_resource *dst,
> +			   bool copy_only_ccs,
> +			   bool read_write);
> +
>   int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
>   			     unsigned long offset, void *buf, int len,
>   			     int write);
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> index a8a21336dc12..96e7710aa0b7 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> @@ -4,6 +4,7 @@
>    */
>   
>   #include "instructions/xe_mi_commands.h"
> +#include "xe_bb.h"
>   #include "xe_bo.h"
>   #include "xe_device.h"
>   #include "xe_migrate.h"
> @@ -154,3 +155,70 @@ int xe_sriov_vf_ccs_rw_init(struct xe_device *xe)
>   err_ret:
>   	return err;
>   }
> +
> +/**
> + * xe_sriov_vf_ccs_rw_attach_bo - Insert CCS read write commands in the BO.
> + * @bo: the &buffer object to which batch buffer commands will be added.
> + *
> + * This function shall be called only by VF. It inserts the PTEs and copy
> + * command instructions in the BO by calling xe_migrate_ccs_rw_copy()
> + * function.
> + *
> + * Returns: 0 if successful, negative error code on failure.
> + */
> +int xe_sriov_vf_ccs_rw_attach_bo(struct xe_bo *bo)
> +{
> +	struct xe_device *xe = xe_bo_device(bo);
> +	struct xe_migrate *migrate;
> +	bool copy_only_ccs = 1;
> +	struct xe_tile *tile;
> +	int tile_id, ctx_id;
> +	int err = 0;
> +
> +	if (!IS_SRIOV_VF(xe) || IS_DGFX(xe))
> +		return 0;
> +
> +	for_each_tile(tile, xe, tile_id) {
> +		for_each_ccs_rw_ctx(ctx_id) {
> +			migrate = tile->sriov.vf.ccs_rw_ctx[ctx_id].migrate;
> +			err = xe_migrate_ccs_rw_copy(migrate, bo, bo,
> +						     bo->ttm.resource,
> +						     bo->ttm.resource,
> +						     copy_only_ccs, ctx_id);
> +		}
> +	}
> +	return err;
> +}
> +
> +/**
> + * xe_sriov_vf_ccs_rw_detach_bo - Remove CCS read write commands from the BO.
> + * @bo: the &buffer object from which batch buffer commands will be removed.
> + *
> + * This function shall be called only by VF. It removes the PTEs and copy
> + * command instructions from the BO. Make sure to update the BB with MI_NOOP
> + * before freeing.
> + *
> + * Returns: 0 if successful, negative error code on failure.
> + */
> +int xe_sriov_vf_ccs_rw_detach_bo(struct xe_bo *bo)
> +{
> +	struct xe_device *xe = xe_bo_device(bo);
> +	struct xe_tile *tile;
> +	int tile_id, ctx_id;
> +	struct xe_bb *bb;
> +
> +	if (!IS_SRIOV_VF(xe) || IS_DGFX(xe))
> +		return 0;
> +
> +	for_each_tile(tile, xe, tile_id) {
> +		for_each_ccs_rw_ctx(ctx_id) {
> +			bb = bo->bb_ccs_rw[ctx_id];
> +			if (!bb)
> +				continue;
> +
> +			memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
> +			xe_bb_free(bb, NULL);
> +		}
> +	}
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> index c371aabb4d21..a42d92adeefc 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
> @@ -7,7 +7,10 @@
>   #define _XE_SRIOV_VF_CCS_H_
>   
>   struct xe_device;
> +struct xe_bo;
>   
>   int xe_sriov_vf_ccs_rw_init(struct xe_device *xe);
> +int xe_sriov_vf_ccs_rw_attach_bo(struct xe_bo *bo);
> +int xe_sriov_vf_ccs_rw_detach_bo(struct xe_bo *bo);
>   
>   #endif


