[PATCH v5 4/8] drm/xe/xe2: Introduce identity map for compressed pat for vram
Matthew Brost
matthew.brost at intel.com
Tue Jul 16 23:19:49 UTC 2024
On Tue, Jul 16, 2024 at 10:54:05PM +0000, Akshata Jahagirdar wrote:
> Xe2+ has unified compression (exactly one compression mode/format),
> where compression is now controlled via PAT at PTE level.
> This simplifies KMD operations, as the KMD can now decompress freely
> without concern for the buffer's original compression format, unlike DG2,
> which had multiple compression formats and thus required copying the
> raw CCS state during VRAM eviction. In addition, on DG2 mixed VRAM and
> system memory buffers were not supported with compression enabled.
>
> On Xe2 dGPU compression is still only supported with VRAM, however we
> can now support compression for buffers that may be placed in either
> VRAM or system memory, with GPU access being seamless underneath, so
> long as the KMD uses a compressed -> uncompressed copy when doing
> VRAM -> system memory, to decompress it. This also allows CPU access
> to such buffers, assuming that userspace first decompresses the
> corresponding pages being accessed.
> If the pages are already in system memory then the KMD will have already
> decompressed them. When restoring such buffers with sysmem -> VRAM
> the KMD can't easily know which pages were originally compressed,
> so we always use uncompressed -> uncompressed here.
> With this it also means we can drop all the raw CCS handling on such
> platforms (including needing to allocate extra CCS storage).
>
> In order to support this we now need to have two different identity
> mappings for compressed and uncompressed VRAM.
> In this patch, we set up the additional identity map for the VRAM with
> compressed pat_index. We then select the appropriate mapping during
> migration/clear. During eviction (vram->sysmem), we use the mapping
> from compressed -> uncompressed. During restore (sysmem->vram), we need
> the mapping from uncompressed -> uncompressed.
>
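The eviction / restore selection described above isn't wired up in this
patch (the pte_update_size() callers below still pass false for the new
argument), presumably that happens later in the series. Just so we're on
the same page, a rough sketch of what I'd expect the copy path selection
to look like with this patch's signature (use_comp_pat is only an
illustrative local name here):

	/* Evicting VRAM -> sysmem: read the source through the compressed
	 * identity map so the copy decompresses in flight (Xe2+ with flat
	 * CCS only). */
	bool use_comp_pat = GRAPHICS_VER(xe) >= 20 &&
		xe_device_has_flat_ccs(xe) && src_is_vram && !dst_is_vram;

	batch_size += pte_update_size(m, src_is_vram, use_comp_pat, src,
				      &src_it, &src_L0, &src_L0_ofs,
				      &src_L0_pt, 0, 0, avail_pts);

	/* Restoring sysmem -> VRAM: always uncompressed -> uncompressed,
	 * since the KMD can't know which pages were compressed before. */
	batch_size += pte_update_size(m, dst_is_vram, false, dst, &dst_it,
				      &src_L0, &dst_L0_ofs, &dst_L0_pt, 0,
				      avail_pts, avail_pts);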
> v2: Formatting nits, Updated code to match recent changes in
> xe_migrate_prepare_vm(). (Matt)
>
> v3: Move identity map loop to a helper function. (Matt Brost)
>
> v4: Split helper function into a separate patch, add asserts and
> address nits. (Matt Brost)
>
> Signed-off-by: Akshata Jahagirdar <akshata.jahagirdar at intel.com>
> Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_migrate.c | 65 +++++++++++++++++++++++----------
> 1 file changed, 45 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 6b952ed98a51..601c9e790dae 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -73,6 +73,7 @@ struct xe_migrate {
> #define NUM_PT_SLOTS 32
> #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
> #define MAX_NUM_PTE 512
> +#define IDENTITY_OFFSET 256ULL
>
> /*
> * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest
> @@ -120,14 +121,19 @@ static u64 xe_migrate_vm_addr(u64 slot, u32 level)
> return (slot + 1ULL) << xe_pt_shift(level + 1);
> }
>
> -static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
> +static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
> {
> /*
> * Remove the DPA to get a correct offset into identity table for the
> * migrate offset
> */
> + u64 identity_offset = IDENTITY_OFFSET;
> +
> + if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
> + identity_offset += DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
> +
> addr -= xe->mem.vram.dpa_base;
> - return addr + (256ULL << xe_pt_shift(2));
> + return addr + (identity_offset << xe_pt_shift(2));
> }
>
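Just to spell out the offset math for my own sanity, taking a hypothetical
16GiB VRAM device as an example: the uncompressed identity map still sits at
[256GiB, 272GiB) of the migrate VM, and with is_comp_pte the offset becomes
256 + DIV_ROUND_UP(16, 1) = 272, i.e. the compressed map starts right after
it at [272GiB, 288GiB). Looks right to me.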
> static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo,
> @@ -182,10 +188,10 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> u16 pat_index = xe->pat.idx[XE_CACHE_WB];
> u8 id = tile->id;
> u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level,
> - num_setup = num_level + 1;
> + num_setup = num_level + 2;
Nit, maybe do something like the following to better self-document:
#define VRAM_IDENTITY_MAP_COUNT 2
num_setup = num_level + VRAM_IDENTITY_MAP_COUNT;
#undef VRAM_IDENTITY_MAP_COUNT
> u32 map_ofs, level, i;
> struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
> - u64 entry, pt30_ofs;
> + u64 entry, pt29_ofs;
>
> /* Can't bump NUM_PT_SLOTS too high */
> BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
> @@ -205,9 +211,9 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> if (IS_ERR(bo))
> return PTR_ERR(bo);
>
> - /* PT31 reserved for 2M identity map */
> - pt30_ofs = bo->size - 2 * XE_PAGE_SIZE;
> - entry = vm->pt_ops->pde_encode_bo(bo, pt30_ofs, pat_index);
> + /* PT30 & PT31 reserved for 2M identity map */
> + pt29_ofs = bo->size - 3 * XE_PAGE_SIZE;
> + entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs, pat_index);
> xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
>
> map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE;
> @@ -259,12 +265,12 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> } else {
> u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
>
> - m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
> + m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
>
> if (xe->info.has_usm) {
> batch = tile->primary_gt->usm.bb_pool->bo;
> batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
> - m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
> + m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
> }
> }
>
> @@ -298,10 +304,27 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
>
> /* Identity map the entire vram at 256GiB offset */
> if (IS_DGFX(xe)) {
> - u64 pt31_ofs = bo->size - XE_PAGE_SIZE;
> + u64 pt30_ofs = bo->size - 2 * XE_PAGE_SIZE;
> +
> + xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET,
> + pat_index, pt30_ofs);
> + xe_assert(xe, (xe->mem.vram.actual_physical_size <= IDENTITY_OFFSET * SZ_1G));
Sorry to change my comment from my previous review, I realized this after
sending.
Technically this should be:
xe->mem.vram.actual_physical_size <= (512 - IDENTITY_OFFSET) * SZ_1G
Or with 512 replaced by a define for the number of PTEs we can map in a
4k page table.
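i.e. roughly (reusing the existing MAX_NUM_PTE define is only a suggestion,
any define for the 512 entries of a 4k page table works):

	xe_assert(xe, xe->mem.vram.actual_physical_size <=
		  (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G);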
>
> - xe_migrate_program_identity(xe, vm, bo, map_ofs, 256, pat_index, pt31_ofs);
> - xe_assert(xe, (xe->mem.vram.actual_physical_size <= SZ_256G));
> + /*
> + * Identity map the entire vram for compressed pat_index for xe2+
> + * if flat ccs is enabled.
> + */
> + if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
> + u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
> +
> + xe_assert(xe, xe->mem.vram.actual_physical_size <= 128ULL * SZ_1G);
The assert needs to go after the variable declarations, pretty sure the
compiler will complain about this.
Also s/128ULL/(512 - IDENTITY_OFFSET - IDENTITY_OFFSET / 2)
Same comment as above for the 512.
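So something along these lines (just a sketch of the reordering, with the
same define suggestion as above standing in for 512):

	if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
		u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
		u64 vram_offset = IDENTITY_OFFSET +
			DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
		u64 pt31_ofs = bo->size - XE_PAGE_SIZE;

		xe_assert(xe, xe->mem.vram.actual_physical_size <=
			  (MAX_NUM_PTE - IDENTITY_OFFSET - IDENTITY_OFFSET / 2) * SZ_1G);
		xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
					    comp_pat_index, pt31_ofs);
	}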
> + u64 vram_offset = IDENTITY_OFFSET +
> + DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
> + u64 pt31_ofs = bo->size - XE_PAGE_SIZE;
> +
> + xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
> + comp_pat_index, pt31_ofs);
> + }
> }
>
> /*
> @@ -309,7 +332,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
> * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
> * [PT9...PT27]: Userspace PT's for VM_BIND, 4 KiB PTE's
> - * [PT28 = PDE 0] [PT29 = PDE 1] [PT30 = PDE 2] [PT31 = 2M vram identity map]
> + * [PT28 = PDE 0] [PT29 = PDE 1] [PT30 & PT31 = 2M vram identity map]
So this would now be:
[PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map]
> *
> * This makes the lowest part of the VM point to the pagetables.
> * Hence the lowest 2M in the vm should point to itself, with a few writes
> @@ -488,7 +511,7 @@ static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
> }
>
> static u32 pte_update_size(struct xe_migrate *m,
> - bool is_vram,
> + bool is_vram, bool is_comp_pte,
So two bools get confusing and are prone to bugs by mixing up the
argument order at the caller. Can we switch this to a flags field?
e.g.
#define PTE_UPDATE_FLAG_IS_VRAM		BIT(0)
#define PTE_UPDATE_FLAG_IS_COMP_PTE	BIT(1)

static u32 pte_update_size(struct xe_migrate *m, unsigned int flags, ...)
{
	...
	bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags;
	bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags;
	...
}
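The call sites would then read something like (sketch only, using the copy
path arguments from this patch; the comp flag just ORs in the same way when
needed):

	batch_size += pte_update_size(m,
				      src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0,
				      src, &src_it, &src_L0, &src_L0_ofs,
				      &src_L0_pt, 0, 0, avail_pts);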
Other than these nits, the patch looks functionally correct to me.
Matt
> struct ttm_resource *res,
> struct xe_res_cursor *cur,
> u64 *L0, u64 *L0_ofs, u32 *L0_pt,
> @@ -500,7 +523,8 @@ static u32 pte_update_size(struct xe_migrate *m,
> if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
> /* Offset into identity map. */
> *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
> - cur->start + vram_region_gpu_offset(res));
> + cur->start + vram_region_gpu_offset(res),
> + is_comp_pte);
> cmds += cmd_size;
> } else {
> /* Clip L0 to available size */
> @@ -791,17 +815,17 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>
> src_L0 = min(src_L0, dst_L0);
>
> - batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
> + batch_size += pte_update_size(m, src_is_vram, false, src, &src_it, &src_L0,
> &src_L0_ofs, &src_L0_pt, 0, 0,
> avail_pts);
>
> - batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
> + batch_size += pte_update_size(m, dst_is_vram, false, dst, &dst_it, &src_L0,
> &dst_L0_ofs, &dst_L0_pt, 0,
> avail_pts, avail_pts);
>
> if (copy_system_ccs) {
> ccs_size = xe_device_ccs_bytes(xe, src_L0);
> - batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
> + batch_size += pte_update_size(m, false, false, NULL, &ccs_it, &ccs_size,
> &ccs_ofs, &ccs_pt, 0,
> 2 * avail_pts,
> avail_pts);
> @@ -1042,7 +1066,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>
> /* Calculate final sizes and batch size.. */
> batch_size = 2 +
> - pte_update_size(m, clear_vram, src, &src_it,
> + pte_update_size(m, clear_vram, false, src, &src_it,
> &clear_L0, &clear_L0_ofs, &clear_L0_pt,
> clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
> avail_pts);
> @@ -1159,7 +1183,7 @@ static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
> if (!ppgtt_ofs)
> ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
> xe_bo_addr(update->pt_bo, 0,
> - XE_PAGE_SIZE));
> + XE_PAGE_SIZE), false);
>
> do {
> u64 addr = ppgtt_ofs + ofs * 8;
> @@ -1493,3 +1517,4 @@ void xe_migrate_wait(struct xe_migrate *m)
> #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
> #include "tests/xe_migrate.c"
> #endif
> +
> --
> 2.34.1
>