[PATCH 3/6] drm/xe/xe2: Introduce identity map for compressed pat for vram

Matthew Auld matthew.auld at intel.com
Thu Jul 11 12:32:52 UTC 2024


On 11/07/2024 10:19, Akshata Jahagirdar wrote:
> Xe2+ has unified compression (exactly one compression mode/format),
> where compression is now controlled via PAT at PTE level.
> This simplifies KMD operations, as it can now decompress freely
> without concern for the buffer's original compression format, unlike
> DG2, which had multiple compression formats and thus required copying
> the raw CCS state during VRAM eviction. In addition, mixed VRAM and
> system memory buffers were not supported with compression enabled.
> 
> On Xe2 dGPU compression is still only supported with VRAM; however, we
> can now support compression with mixed VRAM and system memory buffers,
> with GPU access being seamless underneath, so long as the KMD uses a
> compressed -> uncompressed copy when moving VRAM -> system memory, in
> order to decompress it. This also allows CPU access to such buffers,
> assuming that userspace first decompresses the corresponding pages
> being accessed.
> If the pages are already in system memory then the KMD would have
> already decompressed them. When restoring such buffers with
> sysmem -> VRAM, the KMD can't easily know which pages were originally
> compressed, so we always use uncompressed -> uncompressed here.
> With this, it also means we can drop all the raw CCS handling on such
> platforms (including the need to allocate extra CCS storage).
> 
> In order to support this we now need two different identity mappings
> for compressed and uncompressed VRAM. In this patch, we set up the
> additional identity map for the VRAM with the compressed pat_index.
> We then select the appropriate mapping during migration/clear: during
> eviction (vram -> sysmem) we use the compressed -> uncompressed
> mapping, and during restore (sysmem -> vram) we use the
> uncompressed -> uncompressed mapping.

Nit: Formatting looks off.

> 
> Signed-off-by: Akshata Jahagirdar <akshata.jahagirdar at intel.com>

Should this not be earlier in the series? I would have expected it to 
be the first patch, since both the new clearing and copy logic are 
built on top of this, AFAICT.

> ---
>   drivers/gpu/drm/xe/xe_migrate.c | 55 +++++++++++++++++++++++++--------
>   1 file changed, 42 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 2fc2cf375b1e..a3d6d3113ac2 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -120,14 +120,20 @@ static u64 xe_migrate_vm_addr(u64 slot, u32 level)
>   	return (slot + 1ULL) << xe_pt_shift(level + 1);
>   }
>   
> -static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
> +static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
>   {
>   	/*
>   	 * Remove the DPA to get a correct offset into identity table for the
>   	 * migrate offset
>   	 */
> +	u64 identity_offset = 256ULL;
> +
> +	if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
> +		identity_offset = 256ULL +
> +				  DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
> +
>   	addr -= xe->mem.vram.dpa_base;
> -	return addr + (256ULL << xe_pt_shift(2));
> +	return addr + (identity_offset << xe_pt_shift(2));
>   }
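
The 256 + DIV_ROUND_UP() calculation is now duplicated here and in 
xe_migrate_prepare_vm() below; might be worth pulling it into a tiny 
helper that both sides can share, something like (untested sketch, 
helper name just a suggestion):

static u64 xe_migrate_identity_offset(struct xe_device *xe, bool is_comp_pte)
{
	/* The uncompressed identity map lives at the 256GiB mark. */
	u64 offset = 256ULL;

	/* The compressed map (xe2+) starts right after it, 1GiB aligned. */
	if (is_comp_pte && GRAPHICS_VER(xe) >= 20)
		offset += DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size,
					   SZ_1G);

	return offset;
}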
>   
>   static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> @@ -214,12 +220,12 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
>   	} else {
>   		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
>   
> -		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
> +		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
>   
>   		if (xe->info.has_usm) {
>   			batch = tile->primary_gt->usm.bb_pool->bo;
>   			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
> -			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
> +			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
>   		}
>   	}
>   
> @@ -251,7 +257,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
>   		  | XE_PTE_NULL);
>   	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
>   
> -	/* Identity map the entire vram at 256GiB offset */
> +	/* Identity map the entire vram for uncompressed pat_index at 256GiB offset */
>   	if (IS_DGFX(xe)) {
>   		u64 pos, ofs, flags;
>   		/* XXX: Unclear if this should be usable_size? */
> @@ -294,6 +300,30 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
>   		}
>   
>   		xe_assert(xe, pos == vram_limit);
> +
> +		/*
> +		 * Identity map the entire vram for compressed pat_index for xe2+
> +		 * if flat ccs is enabled.
> +		 */
> +		if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
> +			u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
> +			u64 vram_offset = 256 +
> +				DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
> +
> +			level = 2;
> +			ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
> +			flags = vm->pt_ops->pte_encode_addr(xe, 0, comp_pat_index, level,
> +								true, 0);
> +
> +			/*
> +			 * Use 1GB pages, it shouldn't matter the physical amount of
> +			 * vram is less, when we don't access it.
> +			 */
> +			for (pos = xe->mem.vram.dpa_base;
> +				 pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
> +				 pos += SZ_1G, ofs += 8)

Nit: Formatting looks off? There are some other formatting issues 
reported by checkpatch in the CI results.

Also, it looks like there were some recent changes to how the identity 
map is constructed in xe_migrate_prepare_vm, so I think this will need 
to be updated to match? See: 
6d3581edffea0b3a64b0d3094d3f09222e0024f7.

> +				xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
> +		}
>   	}
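
Related to the above: this new loop is more or less a copy of the 
uncompressed one, just with a different pat_index and starting offset. 
Once this is rebased it might be nicer to have a single helper that 
programs an identity map for a given pat_index and offset, roughly 
(sketch only, untested, helper name just a suggestion):

static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm,
					struct xe_bo *bo, u64 map_ofs,
					u64 vram_offset, u16 pat_index)
{
	u64 pos, ofs, flags;
	u32 level = 2;

	ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
	flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
					    true, 0);

	/*
	 * Use 1GiB pages; anything past the real vram size is never
	 * accessed through this mapping.
	 */
	for (pos = xe->mem.vram.dpa_base;
	     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
	     pos += SZ_1G, ofs += 8)
		xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
}

with the uncompressed path then calling the same thing.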
>   
>   	/*
> @@ -475,7 +505,7 @@ static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
>   }
>   
>   static u32 pte_update_size(struct xe_migrate *m,
> -			   bool is_vram,
> +			   bool is_vram, bool is_comp_pte,
>   			   struct ttm_resource *res,
>   			   struct xe_res_cursor *cur,
>   			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
> @@ -487,7 +517,7 @@ static u32 pte_update_size(struct xe_migrate *m,
>   	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
>   		/* Offset into identity map. */
>   		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
> -					      cur->start + vram_region_gpu_offset(res));
> +					      cur->start + vram_region_gpu_offset(res), is_comp_pte);
>   		cmds += cmd_size;
>   	} else {
>   		/* Clip L0 to available size */
> @@ -778,17 +808,17 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>   
>   		src_L0 = min(src_L0, dst_L0);
>   
> -		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
> +		batch_size += pte_update_size(m, src_is_vram, false, src, &src_it, &src_L0,
>   					      &src_L0_ofs, &src_L0_pt, 0, 0,
>   					      avail_pts);
>   
> -		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
> +		batch_size += pte_update_size(m, dst_is_vram, false, dst, &dst_it, &src_L0,
>   					      &dst_L0_ofs, &dst_L0_pt, 0,
>   					      avail_pts, avail_pts);
>   
>   		if (copy_system_ccs) {
>   			ccs_size = xe_device_ccs_bytes(xe, src_L0);
> -			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
> +			batch_size += pte_update_size(m, false, false, NULL, &ccs_it, &ccs_size,
>   						      &ccs_ofs, &ccs_pt, 0,
>   						      2 * avail_pts,
>   						      avail_pts);
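
I assume a later patch in the series then flips the src side here for 
the eviction case, something along the lines of (guessing at the exact 
condition):

		batch_size += pte_update_size(m, src_is_vram,
					      src_is_vram &&
					      GRAPHICS_VER(xe) >= 20 &&
					      xe_device_has_flat_ccs(xe),
					      src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

while the dst side stays uncompressed, to get the compressed -> 
uncompressed behaviour for eviction and uncompressed -> uncompressed 
for restore described in the commit message?
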
> @@ -1029,14 +1059,13 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>   
>   		/* Calculate final sizes and batch size.. */
>   		batch_size = 2 +
> -			pte_update_size(m, clear_vram, src, &src_it,
> +			pte_update_size(m, clear_vram, false, src, &src_it,
>   					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
>   					clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
>   					avail_pts);
>   
>   		if (xe_device_needs_ccs_emit(xe))
>   			batch_size += EMIT_COPY_CCS_DW;
> -
>   		/* Clear commands */
>   
>   		if (WARN_ON_ONCE(!clear_L0))
> @@ -1146,7 +1175,7 @@ static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
>   	if (!ppgtt_ofs)
>   		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
>   						xe_bo_addr(update->pt_bo, 0,
> -							   XE_PAGE_SIZE));
> +							   XE_PAGE_SIZE), false);
>   
>   	do {
>   		u64 addr = ppgtt_ofs + ofs * 8;

