[igt-dev] [PATCH i-g-t v2 2/4] lib/DG2: create flat ccs framebuffers with 4-tile

Fri May 13 08:47:05 UTC 2022

> -----Original Message-----
> From: igt-dev <igt-dev-bounces at lists.freedesktop.org> On Behalf Of Jeevan B
> Sent: Wednesday, April 20, 2022 1:39 PM
> To: igt-dev at lists.freedesktop.org
> Cc: Heikkila, Juha-pekka <juha-pekka.heikkila at intel.com>
> Subject: [igt-dev] [PATCH i-g-t v2 2/4] lib/DG2: create flat ccs framebuffers with
> 4-tile
> 
> From: Juha-Pekka Heikkilä <juha-pekka.heikkila at intel.com>
> 
> Add support for DG2 flat ccs framebuffers with tile-4.
> 
> Signed-off-by: Juha-Pekka Heikkilä <juha-pekka.heikkila at intel.com>
> Signed-off-by: Jeevan B <jeevan.b at intel.com>

Reviewed-by: Mika Kahola <mika.kahola at intel.com>

> ---
>  lib/gen9_render.h       |  40 ++++++++++---
>  lib/igt_fb.c            |  49 ++++++++++++----
>  lib/intel_aux_pgtable.c |   6 +-
>  lib/intel_batchbuffer.c |   2 +-
>  lib/intel_bufops.c      | 119 +++++++++++++++++++++++++++++++++----
>  lib/intel_chipset.h     |   3 +-
>  lib/rendercopy_gen9.c   | 127 +++++++++++++++++++++++++++-------------
>  lib/veboxcopy_gen12.c   | 109 +++++++++++++++++++++++++---------
>  8 files changed, 348 insertions(+), 107 deletions(-)
> 
> diff --git a/lib/gen9_render.h b/lib/gen9_render.h index 06d9718c..82a9f99c
> 100644
> --- a/lib/gen9_render.h
> +++ b/lib/gen9_render.h
> @@ -59,9 +59,15 @@ struct gen9_surface_state {
>  		uint32_t depth:11;
>  	} ss3;
> 
> -	struct {
> -		uint32_t minimum_array_element:27;
> -		uint32_t pad0:5;
> +	union {
> +		struct {
> +			uint32_t minimum_array_element:27;
> +			uint32_t pad0:5;
> +		} skl;
> +		struct {
> +			uint32_t decompress_in_l3:1;
> +			uint32_t pad0:31;
> +		} dg2;
>  	} ss4;
> 
>  	struct {
> @@ -116,6 +122,15 @@ struct gen9_surface_state {
>  			uint32_t media_compression:1;
>  			uint32_t pad2:1;
>  		} tgl;
> +
> +		struct {
> +			uint32_t pad0:14;
> +			uint32_t
> disable_support_for_multi_gpu_partial_writes:1;
> +			uint32_t disable_support_for_multi_gpu_atomics:1;
> +			uint32_t pad1:14;
> +			uint32_t memory_compression_enable:1;
> +			uint32_t memory_compression_type:1;
> +		} dg2;
>  	} ss7;
> 
>  	struct {
> @@ -138,15 +153,22 @@ struct gen9_surface_state {
>  		uint32_t aux_base_addr_hi;
>  	} ss11;
> 
> -	/* register can be used for either
> -	 * clear value or depth clear value
> -	 */
>  	struct {
> -		uint32_t clear_address;
> -	} ss12;
> +		/*
> +		 * compression_format is used only from dg2 onward.
> +		 * Prior to dg2 the full ss12 is used for the address,
> +		 * but due to alignment bits 0..5 will be zero,
> +		 * and this is asserted in the code.
> +		 */
> +		uint32_t compression_format:5;
> +		uint32_t pad0:1;
> +		uint32_t clear_address:26;
> +        } ss12;
> 
>  	struct {
> -		uint32_t clear_address_hi;
> +		uint32_t clear_address_hi:16;
> +		uint32_t pad0:16;
> +
>  	} ss13;
> 
>  	struct {
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c index eafbe7fd..93e98733 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -457,6 +457,9 @@ void igt_get_fb_tile_size(int fd, uint64_t modifier, int
> fb_bpp,
>  	case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC:
>  	case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS:
>  	case I915_FORMAT_MOD_4_TILED:
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
> +	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
>  		igt_require_intel(fd);
>  		if (intel_display_ver(intel_get_drm_devid(fd)) == 2) {
>  			*width_ret = 128;
> @@ -565,14 +568,17 @@ void igt_get_fb_tile_size(int fd, uint64_t modifier, int
> fb_bpp,
> 
>  static bool is_gen12_mc_ccs_modifier(uint64_t modifier)  {
> -	return modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS;
> +	return modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS ||
> +		modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS;
>  }
> 
>  static bool is_gen12_ccs_modifier(uint64_t modifier)  {
>  	return is_gen12_mc_ccs_modifier(modifier) ||
>  		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS ||
> -		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC;
> +		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC
> ||
> +		modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS ||
> +		modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC;
>  }
> 
>  static bool is_ccs_modifier(uint64_t modifier) @@ -584,7 +590,7 @@ static
> bool is_ccs_modifier(uint64_t modifier)
> 
>  static bool is_ccs_plane(const struct igt_fb *fb, int plane)  {
> -	if (!is_ccs_modifier(fb->modifier))
> +	if (!is_ccs_modifier(fb->modifier) ||
> +HAS_FLATCCS(intel_get_drm_devid(fb->fd)))
>  		return false;
> 
>  	return plane >= fb->num_planes / 2;
> @@ -602,8 +608,15 @@ static bool is_gen12_ccs_plane(const struct igt_fb *fb,
> int plane)
> 
>  static bool is_gen12_ccs_cc_plane(const struct igt_fb *fb, int plane)  {
> -	return fb->modifier ==
> I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC &&
> -	       plane == 2;
> +	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC
> &&
> +	    plane == 2)
> +		return true;
> +
> +	if (fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC &&
> +            plane == 1)
> +		return true;
> +
> +	return false;
>  }
> 
>  bool igt_fb_is_gen12_ccs_cc_plane(const struct igt_fb *fb, int plane) @@ -
> 689,7 +702,8 @@ static int fb_num_planes(const struct igt_fb *fb)
>  	if (is_ccs_modifier(fb->modifier))
>  		num_planes *= 2;
> 
> -	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC)
> +	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC
> ||
> +	    fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC)
>  		num_planes++;
> 
>  	return num_planes;
> @@ -763,7 +777,7 @@ static uint32_t calc_plane_stride(struct igt_fb *fb, int
> plane)
>  		return ALIGN(min_stride, tile_width);
>  	} else if (is_gen12_ccs_cc_plane(fb, plane)) {
>  		/* clear color always fixed to 64 bytes */
> -		return 64;
> +		return HAS_FLATCCS(intel_get_drm_devid(fb->fd)) ? 512 : 64;
>  	} else if (is_gen12_ccs_plane(fb, plane)) {
>  		/*
>  		 * The CCS surface stride is
> @@ -966,6 +980,9 @@ uint64_t igt_fb_mod_to_tiling(uint64_t modifier)
>  	case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS:
>  		return I915_TILING_Y;
>  	case I915_FORMAT_MOD_4_TILED:
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
> +	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
>  		return I915_TILING_4;
>  	case I915_FORMAT_MOD_Yf_TILED:
>  	case I915_FORMAT_MOD_Yf_TILED_CCS:
> @@ -2504,9 +2521,10 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
>  	if (is_ccs_modifier(fb->modifier)) {
>  		igt_assert_eq(fb->strides[0] & 127, 0);
> 
> -		if (is_gen12_ccs_modifier(fb->modifier))
> -			igt_assert_eq(fb->strides[1] & 63, 0);
> -		else
> +		if (is_gen12_ccs_modifier(fb->modifier)) {
> +			if (!HAS_FLATCCS(intel_get_drm_devid(fb->fd)))
> +				igt_assert_eq(fb->strides[1] & 63, 0);
> +		} else
>  			igt_assert_eq(fb->strides[1] & 127, 0);
> 
>  		if (is_gen12_mc_ccs_modifier(fb->modifier))
> @@ -2539,7 +2557,7 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
>  		buf->yuv_semiplanar_bpp = yuv_semiplanar_bpp(fb-
> >drm_format);
> 
>  	if (is_ccs_modifier(fb->modifier)) {
> -		num_surfaces = fb->num_planes / 2;
> +		num_surfaces = fb->num_planes /
> +(HAS_FLATCCS(intel_get_drm_devid(fb->fd)) ? 1 : 2);
>  		for (i = 0; i < num_surfaces; i++)
>  			init_buf_ccs(buf, i,
>  				     fb->offsets[num_surfaces + i], @@ -2560,6
> +2578,9 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
>  	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC)
>  		buf->cc.offset = fb->offsets[2];
> 
> +	if (fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC)
> +		buf->cc.offset = fb->offsets[1];
> +
>  	return buf;
>  }
> 
> @@ -4570,6 +4591,12 @@ const char *igt_fb_modifier_name(uint64_t
> modifier)
>  		return "Y-MC_CCS";
>  	case I915_FORMAT_MOD_4_TILED:
>  		return "4";
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
> +		return "4-RC_CCS";
> +	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
> +		return "4-MC_CCS";
> +	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
> +		return "4-RC_CCS-CC";
>  	default:
>  		return "?";
>  	}
> diff --git a/lib/intel_aux_pgtable.c b/lib/intel_aux_pgtable.c index
> f5796fdf..e31a6c34 100644
> --- a/lib/intel_aux_pgtable.c
> +++ b/lib/intel_aux_pgtable.c
> @@ -263,7 +263,8 @@ static uint64_t pgt_get_l1_flags(const struct intel_buf
> *buf, int surface_idx)
>  	} entry = {
>  		.e = {
>  			.valid = 1,
> -			.tile_mode = buf->tiling == I915_TILING_Y ? 1 : 0,
> +			.tile_mode = buf->tiling == I915_TILING_Y ? 1 :
> +				(buf->tiling == I915_TILING_4 ? 2 : 0),
>  		}
>  	};
> 
> @@ -274,7 +275,8 @@ static uint64_t pgt_get_l1_flags(const struct intel_buf
> *buf, int surface_idx)
>  	 */
>  	igt_assert(buf->tiling == I915_TILING_Y ||
>  		   buf->tiling == I915_TILING_Yf ||
> -		   buf->tiling == I915_TILING_Ys);
> +		   buf->tiling == I915_TILING_Ys ||
> +		   buf->tiling == I915_TILING_4);
> 
>  	entry.e.ycr = surface_idx > 0;
> 
> diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c index
> ebf3c598..81d2e140 100644
> --- a/lib/intel_batchbuffer.c
> +++ b/lib/intel_batchbuffer.c
> @@ -1146,7 +1146,7 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int
> devid)
>  		copy = gen9_render_copyfunc;
>  	else if (IS_GEN11(devid))
>  		copy = gen11_render_copyfunc;
> -	else if (IS_DG2(devid))
> +	else if (HAS_4TILE(devid))
>  		copy = gen12p71_render_copyfunc;
>  	else if (IS_GEN12(devid))
>  		copy = gen12_render_copyfunc;
> diff --git a/lib/intel_bufops.c b/lib/intel_bufops.c index f13063fa..72b1bafa
> 100644
> --- a/lib/intel_bufops.c
> +++ b/lib/intel_bufops.c
> @@ -89,6 +89,7 @@
>  #define TILE_Y      TILE_DEF(I915_TILING_Y)
>  #define TILE_Yf     TILE_DEF(I915_TILING_Yf)
>  #define TILE_Ys     TILE_DEF(I915_TILING_Ys)
> +#define TILE_4      TILE_DEF(I915_TILING_4)
> 
>  #define CCS_OFFSET(buf) (buf->ccs[0].offset)  #define CCS_SIZE(gen, buf) \ @@
> -105,16 +106,19 @@ struct buf_ops {
>  	uint32_t supported_hw_tiles;
>  	uint32_t swizzle_x;
>  	uint32_t swizzle_y;
> +	uint32_t swizzle_tile4;
>  	bo_copy linear_to;
>  	bo_copy linear_to_x;
>  	bo_copy linear_to_y;
>  	bo_copy linear_to_yf;
>  	bo_copy linear_to_ys;
> +	bo_copy linear_to_tile4;
>  	bo_copy to_linear;
>  	bo_copy x_to_linear;
>  	bo_copy y_to_linear;
>  	bo_copy yf_to_linear;
>  	bo_copy ys_to_linear;
> +	bo_copy tile4_to_linear;
>  };
> 
>  static const char *tiling_str(uint32_t tiling) @@ -125,6 +129,7 @@ static const
> char *tiling_str(uint32_t tiling)
>  	case I915_TILING_Y:    return "Y";
>  	case I915_TILING_Yf:   return "Yf";
>  	case I915_TILING_Ys:   return "Ys";
> +	case I915_TILING_4:    return "4";
>  	default:               return "UNKNOWN";
>  	}
>  }
> @@ -222,7 +227,8 @@ static void set_hw_tiled(struct buf_ops *bops, struct
> intel_buf *buf)  {
>  	uint32_t ret_tiling, ret_swizzle;
> 
> -	if (buf->tiling != I915_TILING_X && buf->tiling != I915_TILING_Y)
> +	if (buf->tiling != I915_TILING_X && buf->tiling != I915_TILING_Y &&
> +	    buf->tiling != I915_TILING_4)
>  		return;
> 
>  	if (!buf_ops_has_hw_fence(bops, buf->tiling)) { @@ -320,6 +326,50
> @@ static void *y_ptr(void *ptr,
>  	return ptr + pos;
>  }
> 
> +/*
> + * (x,y) to memory location in tiled-4 surface
> + *
> + * converted those divisions and multiplications to shifts and masks
> + * in the hope that this wouldn't be so slow.
> + */
> +static void *tile4_ptr(void *ptr,
> +			unsigned int x, unsigned int y,
> +			unsigned int stride, unsigned int cpp) {
> +	const int tile_width = 128;
> +	const int tile_height = 32;
> +	const int subtile_size = 64;
> +	const int owords = 16;
> +	int base, _x, _y, subtile, tile_x, tile_y;
> +	int x_loc = x << __builtin_ctz(cpp);
> +	int pos;
> +
> +	/* Pixel in tile via masks */
> +	tile_x = x_loc & (tile_width - 1);
> +	tile_y = y & (tile_height - 1);
> +
> +	/* subtile in 4k tile */
> +	_x = tile_x >> __builtin_ctz(owords);
> +	_y = tile_y >> 2;
> +
> +	/* tile-4 swizzle */
> +	subtile = ((_y >> 1) << 4) + ((_y & 1) << 2) + (_x & 3) + ((_x & 4) <<
> +1);
> +
> +	/* memory location */
> +	base = (y >> __builtin_ctz(tile_height)) *
> +		(stride << __builtin_ctz(tile_height)) +
> +		(((x_loc >> __builtin_ctz(tile_width)) << __builtin_ctz(4096)));
> +
> +	pos = base + (subtile << __builtin_ctz(subtile_size)) +
> +		((tile_y & 3) << __builtin_ctz(owords)) +
> +		(tile_x & (owords - 1));
> +	igt_assert((pos & (cpp - 1)) == 0);
> +	pos = pos >> __builtin_ctz(cpp);
> +
> +	return ptr + pos;
> +}
> +
> +
>  static void *yf_ptr(void *ptr,
>  		    unsigned int x, unsigned int y,
>  		    unsigned int stride, unsigned int cpp) @@ -365,6 +415,8 @@
> static tile_fn __get_tile_fn_ptr(int tiling)
>  	case I915_TILING_Yf:
>  		fn = yf_ptr;
>  		break;
> +	case I915_TILING_4:
> +		fn = tile4_ptr;
>  	case I915_TILING_Ys:
>  		/* To be implemented */
>  		break;
> @@ -391,7 +443,7 @@ static void __copy_ccs(struct buf_ops *bops, struct
> intel_buf *buf,
>  	void *map;
>  	int gen;
> 
> -	if (!buf->compression)
> +	if (!buf->compression || HAS_FLATCCS(intel_get_drm_devid(bops->fd)))
>  		return;
> 
>  	gen = bops->intel_gen;
> @@ -551,6 +603,13 @@ static void copy_linear_to_ys(struct buf_ops *bops,
> struct intel_buf *buf,
>  	__copy_linear_to(bops->fd, buf, linear, I915_TILING_Ys, 0);  }
> 
> +static void copy_linear_to_tile4(struct buf_ops *bops, struct intel_buf *buf,
> +				 uint32_t *linear)
> +{
> +	DEBUGFN();
> +	__copy_linear_to(bops->fd, buf, linear, I915_TILING_4,
> +bops->swizzle_tile4); }
> +
>  static void __copy_to_linear(int fd, struct intel_buf *buf,
>  			     uint32_t *linear, int tiling, uint32_t swizzle)  { @@ -
> 601,6 +660,13 @@ static void copy_ys_to_linear(struct buf_ops *bops, struct
> intel_buf *buf,
>  	__copy_to_linear(bops->fd, buf, linear, I915_TILING_Ys, 0);  }
> 
> +static void copy_tile4_to_linear(struct buf_ops *bops, struct intel_buf *buf,
> +				 uint32_t *linear)
> +{
> +	DEBUGFN();
> +	__copy_to_linear(bops->fd, buf, linear, I915_TILING_4, 0); }
> +
>  static void copy_linear_to_gtt(struct buf_ops *bops, struct intel_buf *buf,
>  			       uint32_t *linear)
>  {
> @@ -752,11 +818,10 @@ static void __intel_buf_init(struct buf_ops *bops,
>  	IGT_INIT_LIST_HEAD(&buf->link);
> 
>  	if (compression) {
> -		int aux_width, aux_height;
> -
>  		igt_require(bops->intel_gen >= 9);
>  		igt_assert(req_tiling == I915_TILING_Y ||
> -			   req_tiling == I915_TILING_Yf);
> +			   req_tiling == I915_TILING_Yf ||
> +			   req_tiling == I915_TILING_4);
>  		/*
>  		 * On GEN12+ we align the main surface to 4 * 4 main surface
>  		 * tiles, which is 64kB. These 16 tiles are mapped by 4 AUX @@ -
> 778,13 +843,19 @@ static void __intel_buf_init(struct buf_ops *bops,
>  		buf->bpp = bpp;
>  		buf->compression = compression;
> 
> -		aux_width = intel_buf_ccs_width(bops->intel_gen, buf);
> -		aux_height = intel_buf_ccs_height(bops->intel_gen, buf);
> +		if (!HAS_FLATCCS(intel_get_drm_devid(bops->fd))) {
> +			int aux_width, aux_height;
> 
> -		buf->ccs[0].offset = buf->surface[0].stride * ALIGN(height, 32);
> -		buf->ccs[0].stride = aux_width;
> +			aux_width = intel_buf_ccs_width(bops->intel_gen, buf);
> +			aux_height = intel_buf_ccs_height(bops->intel_gen,
> buf);
> 
> -		size = buf->ccs[0].offset + aux_width * aux_height;
> +			buf->ccs[0].offset = buf->surface[0].stride *
> ALIGN(height, 32);
> +			buf->ccs[0].stride = aux_width;
> +			size = buf->ccs[0].offset + aux_width * aux_height;
> +		}
> +		else {
> +			size = buf->ccs[0].offset;
> +		}
>  	} else {
>  		if (tiling) {
>  			devid =  intel_get_drm_devid(bops->fd); @@ -1176,17
> +1247,19 @@ void intel_buf_write_aux_to_png(struct intel_buf *buf, const
> char *filename)  #define DEFAULT_BUFOPS(__gen_start, __gen_end) \
>  	.gen_start          = __gen_start, \
>  	.gen_end            = __gen_end, \
> -	.supported_hw_tiles = TILE_X | TILE_Y, \
> +	.supported_hw_tiles = TILE_X | TILE_Y | TILE_4, \
>  	.linear_to          = copy_linear_to_wc, \
>  	.linear_to_x        = copy_linear_to_gtt, \
>  	.linear_to_y        = copy_linear_to_gtt, \
>  	.linear_to_yf       = copy_linear_to_yf, \
>  	.linear_to_ys       = copy_linear_to_ys, \
> +	.linear_to_tile4    = copy_linear_to_tile4, \
>  	.to_linear          = copy_wc_to_linear, \
>  	.x_to_linear        = copy_gtt_to_linear, \
>  	.y_to_linear        = copy_gtt_to_linear, \
>  	.yf_to_linear       = copy_yf_to_linear, \
> -	.ys_to_linear       = copy_ys_to_linear
> +	.ys_to_linear       = copy_ys_to_linear, \
> +	.tile4_to_linear    = copy_tile4_to_linear
> 
>  struct buf_ops buf_ops_arr[] = {
>  	{
> @@ -1201,7 +1274,7 @@ struct buf_ops buf_ops_arr[] = {
> 
>  	{
>  		DEFAULT_BUFOPS(12, 12),
> -		.supported_tiles   = TILE_NONE | TILE_X | TILE_Y | TILE_Yf |
> TILE_Ys,
> +		.supported_tiles   = TILE_NONE | TILE_X | TILE_Y | TILE_Yf |
> TILE_Ys | TILE_4,
>  	},
>  };
> 
> @@ -1230,6 +1303,8 @@ static bool probe_hw_tiling(struct buf_ops *bops,
> uint32_t tiling,
>  			bops->swizzle_x = buf_swizzle;
>  		else if (tiling == I915_TILING_Y)
>  			bops->swizzle_y = buf_swizzle;
> +		else if (tiling == I915_TILING_4)
> +			bops->swizzle_tile4 = buf_swizzle;
> 
>  		*swizzling_supported = buf_swizzle == phys_swizzle;
>  	}
> @@ -1390,6 +1465,24 @@ static struct buf_ops *__buf_ops_create(int fd, bool
> check_idempotency)
>  		}
>  	}
> 
> +	if (is_hw_tiling_supported(bops, I915_TILING_4)) {
> +		bool swizzling_supported;
> +		bool supported = probe_hw_tiling(bops, I915_TILING_4,
> +						 &swizzling_supported);
> +
> +		if (!swizzling_supported) {
> +			igt_debug("Swizzling for 4 is not supported\n");
> +			bops->supported_tiles &= ~TILE_4;
> +		}
> +
> +		igt_debug("4 fence support: %s\n", bool_str(supported));
> +		if (!supported) {
> +			bops->supported_hw_tiles &= ~TILE_4;
> +			bops->linear_to_tile4 = copy_linear_to_tile4;
> +			bops->tile4_to_linear = copy_tile4_to_linear;
> +		}
> +	}
> +
>  	/* Disable other tiling format functions if not supported */
>  	if (!is_tiling_supported(bops, I915_TILING_Yf)) {
>  		igt_debug("Yf format not supported\n"); diff --git
> a/lib/intel_chipset.h b/lib/intel_chipset.h index db75a829..4d9f4623 100644
> --- a/lib/intel_chipset.h
> +++ b/lib/intel_chipset.h
> @@ -219,6 +219,7 @@ void intel_check_pch(void);
> 
>  #define HAS_4TILE(devid)	(intel_get_device_info(devid)->has_4tile)
> 
> -#define HAS_FLATCCS(devid)	(intel_get_device_info(devid)->has_flatccs)
> +/* use HAS_4TILE here as all devices with 4-tile have flat ccs. */
> +#define HAS_FLATCCS(devid)	HAS_4TILE(devid)
> 
>  #endif /* _INTEL_CHIPSET_H */
> diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c index
> 6c45efb4..9d7e5b71 100644
> --- a/lib/rendercopy_gen9.c
> +++ b/lib/rendercopy_gen9.c
> @@ -165,7 +165,8 @@ intel_get_uc_mocs(int fd) {
> 
>  /* Mostly copy+paste from gen6, except height, width, pitch moved */  static
> uint32_t -gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int
> is_dst) {
> +gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
> +	      bool fast_clear) {
>  	struct gen9_surface_state *ss;
>  	uint32_t write_domain, read_domain;
>  	uint64_t address;
> @@ -192,15 +193,26 @@ gen8_bind_buf(struct intel_bb *ibb, const struct
> intel_buf *buf, int is_dst) {
>  		case 64: ss->ss0.surface_format =
> SURFACEFORMAT_R16G16B16A16_FLOAT; break;
>  		default: igt_assert(0);
>  	}
> -	ss->ss0.render_cache_read_write = 1;
>  	ss->ss0.vertical_alignment = 1; /* align 4 */
> -	ss->ss0.horizontal_alignment = 1; /* align 4 */
> +	ss->ss0.horizontal_alignment = 1; /* align 4 or HALIGN_32 on display
> +ver >= 13*/
> +
> +	if (HAS_4TILE(ibb->devid)) {
> +		/*
> +		 * mocs table version 1 index 3 group wb use l3
> +		 */
> +		ss->ss1.memory_object_control = 3 << 1;
> +		ss->ss5.mip_tail_start_lod = 0;
> +	} else {
> +		ss->ss0.render_cache_read_write = 1;
> +		ss->ss1.memory_object_control = intel_get_uc_mocs(i915);
> +		ss->ss5.mip_tail_start_lod = 1; /* needed with trmode */
> +	}
> +
>  	if (buf->tiling == I915_TILING_X)
>  		ss->ss0.tiled_mode = 2;
>  	else if (buf->tiling != I915_TILING_NONE)
>  		ss->ss0.tiled_mode = 3;
> 
> -	ss->ss1.memory_object_control = intel_get_uc_mocs(i915);
>  	if (intel_buf_pxp(buf))
>  		ss->ss1.memory_object_control |= 1;
> 
> @@ -208,7 +220,6 @@ gen8_bind_buf(struct intel_bb *ibb, const struct
> intel_buf *buf, int is_dst) {
>  		ss->ss5.trmode = 1;
>  	else if (buf->tiling == I915_TILING_Ys)
>  		ss->ss5.trmode = 2;
> -	ss->ss5.mip_tail_start_lod = 1; /* needed with trmode */
> 
>  	address = intel_bb_offset_reloc(ibb, buf->handle,
>  					read_domain, write_domain,
> @@ -229,20 +240,22 @@ gen8_bind_buf(struct intel_bb *ibb, const struct
> intel_buf *buf, int is_dst) {
>  	if (buf->compression == I915_COMPRESSION_MEDIA)
>  		ss->ss7.tgl.media_compression = 1;
>  	else if (buf->compression == I915_COMPRESSION_RENDER) {
> -		igt_assert(buf->ccs[0].stride);
> -
>  		ss->ss6.aux_mode = 0x5; /* AUX_CCS_E */
> -		ss->ss6.aux_pitch = (buf->ccs[0].stride / 128) - 1;
> 
> -		address = intel_bb_offset_reloc_with_delta(ibb, buf->handle,
> -							   read_domain,
> write_domain,
> -							   (buf->cc.offset ? (1
> << 10) : 0) | buf->ccs[0].offset,
> -							   intel_bb_offset(ibb)
> + 4 * 10,
> -							   buf->addr.offset);
> -		ss->ss10.aux_base_addr = (address + buf->ccs[0].offset) >> 12;
> -		ss->ss11.aux_base_addr_hi = (address + buf->ccs[0].offset) >>
> 32;
> +		if (buf->ccs[0].stride) {
> +
> +			ss->ss6.aux_pitch = (buf->ccs[0].stride / 128) - 1;
> +
> +			address = intel_bb_offset_reloc_with_delta(ibb, buf-
> >handle,
> +
> read_domain, write_domain,
> +								   (buf-
> >cc.offset ? (1 << 10) : 0) | buf->ccs[0].offset,
> +
> intel_bb_offset(ibb) + 4 * 10,
> +								   buf-
> >addr.offset);
> +			ss->ss10.aux_base_addr = (address + buf-
> >ccs[0].offset) >> 12;
> +			ss->ss11.aux_base_addr_hi = (address + buf-
> >ccs[0].offset) >> 32;
> +		}
> 
> -		if (buf->cc.offset) {
> +		if (fast_clear || (buf->cc.offset && !HAS_FLATCCS(ibb->devid)))
> {
>  			igt_assert(buf->compression ==
> I915_COMPRESSION_RENDER);
> 
>  			ss->ss10.clearvalue_addr_enable = 1; @@ -252,9
> +265,30 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int
> is_dst) {
>  								   buf-
> >cc.offset,
> 
> intel_bb_offset(ibb) + 4 * 12,
>  								   buf-
> >addr.offset);
> -			ss->ss12.clear_address = address + buf->cc.offset;
> -			ss->ss13.clear_address_hi = (address + buf->cc.offset)
> >> 32;
> -		}
> +
> +			/*
> +                         * If the assert below doesn't hold, the clear address
> +                         * will be written incorrectly.
> +                         */
> +                        igt_assert(__builtin_ctzl(address + buf->cc.offset) >= 6 &&
> +                                   (__builtin_clzl(address +
> +buf->cc.offset) >= 16));
> +
> +                        ss->ss12.clear_address = (address + buf->cc.offset) >> 6;
> +                        ss->ss13.clear_address_hi = (address + buf->cc.offset) >> 32;
> +                } else if (HAS_FLATCCS(ibb->devid)) {
> +                        ss->ss7.dg2.memory_compression_type = 0;
> +                        ss->ss7.dg2.memory_compression_enable = 0;
> +                        ss->ss7.dg2.disable_support_for_multi_gpu_partial_writes = 1;
> +
> + ss->ss7.dg2.disable_support_for_multi_gpu_atomics = 1;
> +
> +                        /*
> +                         * For now only the 32bpp rgb format arrives here,
> +                         * which is marked below as B8G8R8X8_UNORM = '8'.
> +                         * If other formats ever arrive here, the code below
> +                         * needs to be fixed to take that into account.
> +                         */
> +                        ss->ss12.compression_format = 8;
> +                }
>  	}
> 
>  	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss)); @@ -
> 266,14 +300,15 @@ gen8_bind_surfaces(struct intel_bb *ibb,
>  		   const struct intel_buf *dst)
>  {
>  	uint32_t *binding_table, binding_table_offset;
> +	bool fast_clear = !src;
> 
>  	binding_table = intel_bb_ptr_align(ibb, 32);
>  	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
> 
> -	binding_table[0] = gen8_bind_buf(ibb, dst, 1);
> +	binding_table[0] = gen8_bind_buf(ibb, dst, 1, fast_clear);
> 
>  	if (src != NULL)
> -		binding_table[1] = gen8_bind_buf(ibb, src, 0);
> +		binding_table[1] = gen8_bind_buf(ibb, src, 0, false);
> 
>  	return binding_table_offset;
>  }
> @@ -856,12 +891,14 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel,
> bool fast_clear) {  static void  gen9_emit_depth(struct intel_bb *ibb)  {
> +	bool need_10dw = HAS_4TILE(ibb->devid);
> +
>  	intel_bb_out(ibb, GEN8_3DSTATE_WM_DEPTH_STENCIL | (4 - 2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> 
> -	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
> +	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (need_10dw ? (10-
> 2) :
> +(8-2)));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> @@ -869,6 +906,10 @@ gen9_emit_depth(struct intel_bb *ibb)
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> +	if (need_10dw) {
> +		intel_bb_out(ibb, 0);
> +		intel_bb_out(ibb, 0);
> +	}
> 
>  	intel_bb_out(ibb, GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5-2));
>  	intel_bb_out(ibb, 0);
> @@ -1080,7 +1121,7 @@ void _gen9_render_op(struct intel_bb *ibb,
> 
>  	gen9_emit_state_base_address(ibb);
> 
> -	if (IS_DG2(ibb->devid) || intel_gen(ibb->devid) > 12) {
> +	if (HAS_4TILE(ibb->devid) || intel_gen(ibb->devid) > 12) {
>  		intel_bb_out(ibb,
> GEN4_3DSTATE_BINDING_TABLE_POOL_ALLOC | 2);
>  		intel_bb_emit_reloc(ibb, ibb->handle,
>  				    I915_GEM_DOMAIN_RENDER |
> I915_GEM_DOMAIN_INSTRUCTION, 0, @@ -1197,18 +1238,12 @@ void
> gen12p71_render_copyfunc(struct intel_bb *ibb,
>  			      struct intel_buf *dst,
>  			      unsigned int dst_x, unsigned int dst_y)  {
> -	struct aux_pgtable_info pgtable_info = { };
> -
> -	gen12_aux_pgtable_init(&pgtable_info, ibb, src, dst);
> -
>  	_gen9_render_op(ibb, src, src_x, src_y,
>  			width, height, dst, dst_x, dst_y,
> -			pgtable_info.pgtable_buf,
> +			NULL,
>  			NULL,
>  			gen12p71_render_copy,
>  			sizeof(gen12p71_render_copy));
> -
> -	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
>  }
> 
>  void gen12_render_clearfunc(struct intel_bb *ibb, @@ -1217,16 +1252,24 @@
> void gen12_render_clearfunc(struct intel_bb *ibb,
>  			    unsigned int width, unsigned int height,
>  			    const float clear_color[4])
>  {
> -	struct aux_pgtable_info pgtable_info = { };
> -
> -	gen12_aux_pgtable_init(&pgtable_info, ibb, NULL, dst);
> -
> -	_gen9_render_op(ibb, NULL, 0, 0,
> -		        width, height, dst, dst_x, dst_y,
> -		        pgtable_info.pgtable_buf,
> -		        clear_color,
> -		        gen12_render_copy,
> -		        sizeof(gen12_render_copy));
> -
> -	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
> +	if (!HAS_4TILE(ibb->devid)) {
> +		struct aux_pgtable_info pgtable_info = { };
> +		gen12_aux_pgtable_init(&pgtable_info, ibb, NULL, dst);
> +
> +		_gen9_render_op(ibb, NULL, 0, 0,
> +			        width, height, dst, dst_x, dst_y,
> +			        pgtable_info.pgtable_buf,
> +			        clear_color,
> +			        gen12_render_copy,
> +			        sizeof(gen12_render_copy));
> +
> +		gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
> +	} else {
> +			_gen9_render_op(ibb, NULL, 0, 0,
> +					width, height, dst, dst_x, dst_y,
> +					NULL,
> +					clear_color,
> +					gen12p71_render_copy,
> +					sizeof(gen12p71_render_copy));
> +	}
>  }
> diff --git a/lib/veboxcopy_gen12.c b/lib/veboxcopy_gen12.c index
> 17564493..7c3ca245 100644
> --- a/lib/veboxcopy_gen12.c
> +++ b/lib/veboxcopy_gen12.c
> @@ -53,19 +53,25 @@ struct vebox_surface_state {
>  		uint32_t width:14;
>  		uint32_t height:14;
>  	} ss2;
> -	struct {
> +	union {
> +		struct {
>  #define VEBOX_TILE_WALK_XMAJOR 0
>  #define VEBOX_TILE_WALK_YMAJOR 1
> -		uint32_t tile_walk:1;
> -		uint32_t tiled_surface:1;
> -		uint32_t chroma_half_pitch:1;
> -		uint32_t surface_pitch:17;
> -		uint32_t chroma_interleave:1;
> -		uint32_t lsb_packed_enable:1;
> -		uint32_t bayer_input_alignment:2;
> -		uint32_t bayer_pattern_format:1;
> -		uint32_t bayer_pattern_offset:2;
> -		uint32_t surface_format:5;
> +			uint32_t tile_walk:1;
> +			uint32_t tiled_surface:1;
> +			uint32_t chroma_half_pitch:1;
> +			uint32_t surface_pitch:17;
> +			uint32_t chroma_interleave:1;
> +			uint32_t lsb_packed_enable:1;
> +			uint32_t bayer_input_alignment:2;
> +			uint32_t bayer_pattern_format:1;
> +			uint32_t bayer_pattern_offset:2;
> +			uint32_t surface_format:5;
> +		} tgl;
> +		struct {
> +			uint32_t tile_mode:2;
> +			uint32_t pad0:30;
> +		} dg2;
>  	} ss3;
>  	struct {
>  		uint32_t u_y_offset:15;
> @@ -82,9 +88,15 @@ struct vebox_surface_state {
>  		uint32_t frame_x_offset:15;
>  		uint32_t pad:2;
>  	} ss6;
> -	struct {
> -		uint32_t derived_surface_pitch:17;
> -		uint32_t pad:15;
> +	union {
> +		struct {
> +			uint32_t derived_surface_pitch:17;
> +			uint32_t pad:15;
> +		} skl;
> +		struct {
> +			uint32_t pad:27;
> +			uint32_t compression_format:5;
> +		} dg2;
>  	} ss7;
>  	struct {
>  		uint32_t skin_score_output_surface_pitch:17;
> @@ -166,17 +178,46 @@ static void emit_surface_state_cmd(struct intel_bb
> *ibb,
>  	ss->ss2.height = height - 1;
>  	ss->ss2.width = width - 1;
> 
> -	ss->ss3.surface_format = format;
> +	ss->ss3.tgl.surface_format = format;
>  	if (format_is_interleaved_yuv(format))
> -		ss->ss3.chroma_interleave = 1;
> -	ss->ss3.surface_pitch = pitch - 1;
> -	ss->ss3.tile_walk = (tiling == I915_TILING_Y) ||
> -			    (tiling == I915_TILING_Yf);
> -	ss->ss3.tiled_surface = tiling != I915_TILING_NONE;
> +		ss->ss3.tgl.chroma_interleave = 1;
> +	ss->ss3.tgl.surface_pitch = pitch - 1;
> 
>  	ss->ss4.u_y_offset = uv_offset / pitch;
> 
> -	ss->ss7.derived_surface_pitch = pitch - 1;
> +	if (HAS_FLATCCS(ibb->devid)) {
> +                /*
> +                 * f-tile = 3 (Tile F)
> +                 */
> +                ss->ss3.dg2.tile_mode = (tiling != I915_TILING_NONE) ?
> +3 : 0;
> +
> +                switch (format) {
> +                case R8G8B8A8_UNORM:
> +                        ss->ss7.dg2.compression_format = 0xa;
> +                        break;
> +                case PLANAR_420_8:
> +                        ss->ss7.dg2.compression_format = 0xf;
> +                        break;
> +                case PLANAR_420_16:
> +                        ss->ss7.dg2.compression_format = 8;
> +                        break;
> +                case YCRCB_NORMAL:
> +                        ss->ss7.dg2.compression_format = 3;
> +                        break;
> +                case PACKED_444A_8:
> +                        ss->ss7.dg2.compression_format = 0x9;
> +                        break;
> +                default:
> +                        igt_assert(0);
> +                }
> +        } else {
> +                ss->ss3.tgl.tile_walk = (tiling == I915_TILING_Y) ||
> +                        (tiling == I915_TILING_Yf) ||
> +                        (tiling == I915_TILING_4);
> +                ss->ss3.tgl.tiled_surface = tiling != I915_TILING_NONE;
> +        }
> +
> +	ss->ss7.skl.derived_surface_pitch = pitch - 1;
> 
>  	intel_bb_ptr_add(ibb, sizeof(*ss));
>  }
> @@ -203,7 +244,11 @@ static void emit_tiling_convert_cmd(struct intel_bb
> *ibb,
>  		tc->tc1_2.input_compression_type =
>  			src->compression == I915_COMPRESSION_RENDER;
>  	}
> -	tc->tc1_2.input_tiled_resource_mode = src->tiling == I915_TILING_Yf;
> +
> +	if (HAS_4TILE(ibb->devid))
> +		tc->tc1_2.input_mocs_idx = 3;
> +	else
> +		tc->tc1_2.input_tiled_resource_mode = src->tiling ==
> I915_TILING_Yf;
>  	reloc_delta = tc->tc1_2_l;
> 
>  	igt_assert(src->addr.offset == ALIGN(src->addr.offset, 0x1000)); @@ -
> 220,7 +265,12 @@ static void emit_tiling_convert_cmd(struct intel_bb *ibb,
>  		tc->tc3_4.output_compression_type =
>  			dst->compression == I915_COMPRESSION_RENDER;
>  	}
> -	tc->tc3_4.output_tiled_resource_mode = dst->tiling == I915_TILING_Yf;
> +
> +	if (HAS_4TILE(ibb->devid))
> +		tc->tc3_4.output_mocs_idx = 3;
> +	else
> +		tc->tc3_4.output_tiled_resource_mode = dst->tiling ==
> I915_TILING_Yf;
> +
>  	reloc_delta = tc->tc3_4_l;
> 
>  	igt_assert(dst->addr.offset == ALIGN(dst->addr.offset, 0x1000)); @@ -
> 255,10 +305,12 @@ void gen12_vebox_copyfunc(struct intel_bb *ibb,
>  	intel_bb_add_intel_buf(ibb, dst, true);
>  	intel_bb_add_intel_buf(ibb, src, false);
> 
> -	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
> -	gen12_aux_pgtable_init(&aux_pgtable_info, ibb, src, dst);
> -	aux_pgtable_state = gen12_create_aux_pgtable_state(ibb,
> -
> aux_pgtable_info.pgtable_buf);
> +	if (!HAS_FLATCCS(ibb->devid)) {
> +		intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
> +		gen12_aux_pgtable_init(&aux_pgtable_info, ibb, src, dst);
> +		aux_pgtable_state = gen12_create_aux_pgtable_state(ibb,
> +
> aux_pgtable_info.pgtable_buf);
> +	}
> 
>  	intel_bb_ptr_set(ibb, 0);
>  	gen12_emit_aux_pgtable_state(ibb, aux_pgtable_state, false); @@ -
> 311,5 +363,6 @@ void gen12_vebox_copyfunc(struct intel_bb *ibb,
> 
>  	intel_bb_reset(ibb, false);
> 
> -	gen12_aux_pgtable_cleanup(ibb, &aux_pgtable_info);
> +	if (!HAS_FLATCCS(ibb->devid))
> +		gen12_aux_pgtable_cleanup(ibb, &aux_pgtable_info);
>  }
> --
> 2.35.1