[PATCH v5 3/4] drm/xe: Split xe_device_td_flush()

Matthew Auld matthew.auld at intel.com
Fri Jun 20 11:01:08 UTC 2025


On 18/06/2025 19:50, Lucas De Marchi wrote:
> xe_device_td_flush() has two possible implementations: an entire L2 flush
> or a transient flush, depending on WA 16023588340. Make this clear by
> splitting the function so it dispatches to one or the other.
> 
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>

Reviewed-by: Matthew Auld <matthew.auld at intel.com>

> ---
>   drivers/gpu/drm/xe/xe_device.c | 68 +++++++++++++++++++++++++-----------------
>   1 file changed, 40 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 8cfcfff250ca5..8396612b68d4b 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -981,38 +981,15 @@ void xe_device_wmb(struct xe_device *xe)
>   		xe_mmio_write32(xe_root_tile_mmio(xe), VF_CAP_REG, 0);
>   }
>   
> -/**
> - * xe_device_td_flush() - Flush transient L3 cache entries
> - * @xe: The device
> - *
> - * Display engine has direct access to memory and is never coherent with L3/L4
> - * caches (or CPU caches); however, KMD is responsible for explicitly flushing
> - * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
> - * can happen from such a surface without seeing corruption.
> - *
> - * Display surfaces can be tagged as transient by mapping them using one of the
> - * various L3:XD PAT index modes on Xe2.
> - *
> - * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
> - * at the end of each submission via PIPE_CONTROL for compute/render, since SA
> - * Media is not coherent with L3 and we want to support render-vs-media
> - * use cases. For other engines like copy/blt the HW internally forces uncached
> - * behaviour, hence the TDF can be skipped on such platforms.
> +/*
> + * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt.
>    */
> -void xe_device_td_flush(struct xe_device *xe)
> +static void tdf_request_sync(struct xe_device *xe)
>   {
> -	struct xe_gt *gt;
>   	unsigned int fw_ref;
> +	struct xe_gt *gt;
>   	u8 id;
>   
> -	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
> -		return;
> -
> -	if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
> -		xe_device_l2_flush(xe);
> -		return;
> -	}
> -
>   	for_each_gt(gt, xe, id) {
>   		if (xe_gt_is_media_type(gt))
>   			continue;
> @@ -1022,6 +999,7 @@ void xe_device_td_flush(struct xe_device *xe)
>   			return;
>   
>   		xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
> +
>   		/*
>   		 * FIXME: We can likely do better here with our choice of
>   		 * timeout. Currently we just assume the worst case, i.e. 150us,
> @@ -1052,15 +1030,49 @@ void xe_device_l2_flush(struct xe_device *xe)
>   		return;
>   
>   	spin_lock(&gt->global_invl_lock);
> -	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
>   
> +	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
>   	if (xe_mmio_wait32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true))
>   		xe_gt_err_once(gt, "Global invalidation timeout\n");
> +
>   	spin_unlock(&gt->global_invl_lock);
>   
>   	xe_force_wake_put(gt_to_fw(gt), fw_ref);
>   }
>   
> +/**
> + * xe_device_td_flush() - Flush transient L3 cache entries
> + * @xe: The device
> + *
> + * Display engine has direct access to memory and is never coherent with L3/L4
> + * caches (or CPU caches); however, KMD is responsible for explicitly flushing
> + * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
> + * can happen from such a surface without seeing corruption.
> + *
> + * Display surfaces can be tagged as transient by mapping them using one of the
> + * various L3:XD PAT index modes on Xe2.
> + *
> + * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
> + * at the end of each submission via PIPE_CONTROL for compute/render, since SA
> + * Media is not coherent with L3 and we want to support render-vs-media
> + * use cases. For other engines like copy/blt the HW internally forces uncached
> + * behaviour, hence the TDF can be skipped on such platforms.
> + */
> +void xe_device_td_flush(struct xe_device *xe)
> +{
> +	struct xe_gt *root_gt;
> +
> +	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
> +		return;
> +
> +	root_gt = xe_root_mmio_gt(xe);
> +	if (XE_WA(root_gt, 16023588340))
> +		/* A transient flush is not sufficient: flush the L2 */
> +		xe_device_l2_flush(xe);
> +	else
> +		tdf_request_sync(xe);
> +}
> +
>   u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>   {
>   	return xe_device_has_flat_ccs(xe) ?
> 
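
A nice side effect of the split is that the flush policy now lives
entirely behind xe_device_td_flush(), so callers stay trivial. A minimal
sketch of a hypothetical caller - the function name and surrounding
context here are illustrative, not taken from the driver:

	/* Flush transient L3 entries before scanout consumes a surface
	 * mapped with one of the L3:XD PAT index modes, so the flip
	 * cannot observe stale data.
	 */
	static void example_prepare_flip(struct xe_device *xe)
	{
		/*
		 * With this patch applied: a no-op on non-discrete or
		 * pre-Xe2 devices, an L2 flush where WA 16023588340
		 * applies, and otherwise a synchronous TDF request on
		 * each non-media GT.
		 */
		xe_device_td_flush(xe);
	}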