[igt-dev] [PATCH i-g-t v3 2/4] lib/DG2: create flat ccs framebuffers with 4-tile

Thu May 12 11:01:11 UTC 2022

From: Juha-Pekka Heikkilä <juha-pekka.heikkila at intel.com>

Add support for DG2 flat ccs framebuffers with tile-4.

Signed-off-by: Juha-Pekka Heikkilä <juha-pekka.heikkila at intel.com>
Signed-off-by: Jeevan B <jeevan.b at intel.com>
---
 lib/gen9_render.h       |  38 +++++++++---
 lib/igt_fb.c            |  51 ++++++++++++----
 lib/intel_aux_pgtable.c |   6 +-
 lib/intel_batchbuffer.c |   2 +-
 lib/intel_bufops.c      | 118 ++++++++++++++++++++++++++++++++-----
 lib/intel_chipset.h     |   3 +-
 lib/rendercopy_gen9.c   | 125 +++++++++++++++++++++++++++-------------
 lib/veboxcopy_gen12.c   | 109 ++++++++++++++++++++++++++---------
 8 files changed, 347 insertions(+), 105 deletions(-)

diff --git a/lib/gen9_render.h b/lib/gen9_render.h
index 06d9718c..af3a2b3a 100644
--- a/lib/gen9_render.h
+++ b/lib/gen9_render.h
@@ -59,9 +59,15 @@ struct gen9_surface_state {
 		uint32_t depth:11;
 	} ss3;
 
-	struct {
-		uint32_t minimum_array_element:27;
-		uint32_t pad0:5;
+	union {
+		struct {
+			uint32_t minimum_array_element:27;
+			uint32_t pad0:5;
+		} skl;
+		struct {
+			uint32_t decompress_in_l3:1;
+			uint32_t pad0:31;
+		} dg2;
 	} ss4;
 
 	struct {
@@ -116,6 +122,15 @@ struct gen9_surface_state {
 			uint32_t media_compression:1;
 			uint32_t pad2:1;
 		} tgl;
+
+		struct {
+			uint32_t pad0:14;
+			uint32_t disable_support_for_multi_gpu_partial_writes:1;
+			uint32_t disable_support_for_multi_gpu_atomics:1;
+			uint32_t pad1:14;
+			uint32_t memory_compression_enable:1;
+			uint32_t memory_compression_type:1;
+		} dg2;
 	} ss7;
 
 	struct {
@@ -138,15 +153,22 @@ struct gen9_surface_state {
 		uint32_t aux_base_addr_hi;
 	} ss11;
 
-	/* register can be used for either
-	 * clear value or depth clear value
-	 */
 	struct {
-		uint32_t clear_address;
+		/*
+		 * compression_format is used only dg2 onward.
+		 * prior to dg2 full ss12 is used for the address
+		 * but due to alignments bits 0..6 will be zero
+		 * and asserted in code to be so
+		 */
+		uint32_t compression_format:5;
+		uint32_t pad0:1;
+		uint32_t clear_address:26;
 	} ss12;
 
 	struct {
-		uint32_t clear_address_hi;
+		uint32_t clear_address_hi:16;
+		uint32_t pad0:16;
+
 	} ss13;
 
 	struct {
diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index eafbe7fd..f3cb711e 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -457,6 +457,9 @@ void igt_get_fb_tile_size(int fd, uint64_t modifier, int fb_bpp,
 	case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC:
 	case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS:
 	case I915_FORMAT_MOD_4_TILED:
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
+	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
 		igt_require_intel(fd);
 		if (intel_display_ver(intel_get_drm_devid(fd)) == 2) {
 			*width_ret = 128;
@@ -565,14 +568,17 @@ void igt_get_fb_tile_size(int fd, uint64_t modifier, int fb_bpp,
 
 static bool is_gen12_mc_ccs_modifier(uint64_t modifier)
 {
-	return modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS;
+	return modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS ||
+		modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS;
 }
 
 static bool is_gen12_ccs_modifier(uint64_t modifier)
 {
 	return is_gen12_mc_ccs_modifier(modifier) ||
 		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS ||
-		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC;
+		modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC ||
+		modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS ||
+		modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC;
 }
 
 static bool is_ccs_modifier(uint64_t modifier)
@@ -584,7 +590,7 @@ static bool is_ccs_modifier(uint64_t modifier)
 
 static bool is_ccs_plane(const struct igt_fb *fb, int plane)
 {
-	if (!is_ccs_modifier(fb->modifier))
+	if (!is_ccs_modifier(fb->modifier) || HAS_FLATCCS(intel_get_drm_devid(fb->fd)))
 		return false;
 
 	return plane >= fb->num_planes / 2;
@@ -602,8 +608,15 @@ static bool is_gen12_ccs_plane(const struct igt_fb *fb, int plane)
 
 static bool is_gen12_ccs_cc_plane(const struct igt_fb *fb, int plane)
 {
-	return fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC &&
-	       plane == 2;
+	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC &&
+	    plane == 2)
+		return true;
+
+	if (fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC &&
+	    plane == 1)
+		return true;
+
+	return false;
 }
 
 bool igt_fb_is_gen12_ccs_cc_plane(const struct igt_fb *fb, int plane)
@@ -686,10 +699,11 @@ static int fb_num_planes(const struct igt_fb *fb)
 {
 	int num_planes = lookup_drm_format(fb->drm_format)->num_planes;
 
-	if (is_ccs_modifier(fb->modifier))
+	if (is_ccs_modifier(fb->modifier) && !HAS_FLATCCS(intel_get_drm_devid(fb->fd)))
 		num_planes *= 2;
 
-	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC)
+	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC ||
+	    fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC)
 		num_planes++;
 
 	return num_planes;
@@ -763,7 +777,7 @@ static uint32_t calc_plane_stride(struct igt_fb *fb, int plane)
 		return ALIGN(min_stride, tile_width);
 	} else if (is_gen12_ccs_cc_plane(fb, plane)) {
 		/* clear color always fixed to 64 bytes */
-		return 64;
+		return HAS_FLATCCS(intel_get_drm_devid(fb->fd)) ? 512 : 64;
 	} else if (is_gen12_ccs_plane(fb, plane)) {
 		/*
 		 * The CCS surface stride is
@@ -966,6 +980,9 @@ uint64_t igt_fb_mod_to_tiling(uint64_t modifier)
 	case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS:
 		return I915_TILING_Y;
 	case I915_FORMAT_MOD_4_TILED:
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
+	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
 		return I915_TILING_4;
 	case I915_FORMAT_MOD_Yf_TILED:
 	case I915_FORMAT_MOD_Yf_TILED_CCS:
@@ -2504,9 +2521,10 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
 	if (is_ccs_modifier(fb->modifier)) {
 		igt_assert_eq(fb->strides[0] & 127, 0);
 
-		if (is_gen12_ccs_modifier(fb->modifier))
-			igt_assert_eq(fb->strides[1] & 63, 0);
-		else
+		if (is_gen12_ccs_modifier(fb->modifier)) {
+			if (!HAS_FLATCCS(intel_get_drm_devid(fb->fd)))
+				igt_assert_eq(fb->strides[1] & 63, 0);
+		} else
 			igt_assert_eq(fb->strides[1] & 127, 0);
 
 		if (is_gen12_mc_ccs_modifier(fb->modifier))
@@ -2539,7 +2557,7 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
 		buf->yuv_semiplanar_bpp = yuv_semiplanar_bpp(fb->drm_format);
 
 	if (is_ccs_modifier(fb->modifier)) {
-		num_surfaces = fb->num_planes / 2;
+		num_surfaces = fb->num_planes / (HAS_FLATCCS(intel_get_drm_devid(fb->fd)) ? 1 : 2);
 		for (i = 0; i < num_surfaces; i++)
 			init_buf_ccs(buf, i,
 				     fb->offsets[num_surfaces + i],
@@ -2560,6 +2578,9 @@ igt_fb_create_intel_buf(int fd, struct buf_ops *bops,
 	if (fb->modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC)
 		buf->cc.offset = fb->offsets[2];
 
+	if (fb->modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC)
+		buf->cc.offset = fb->offsets[1];
+
 	return buf;
 }
 
@@ -4570,6 +4591,12 @@ const char *igt_fb_modifier_name(uint64_t modifier)
 		return "Y-MC_CCS";
 	case I915_FORMAT_MOD_4_TILED:
 		return "4";
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS:
+		return "4-RC_CCS";
+	case I915_FORMAT_MOD_4_TILED_DG2_MC_CCS:
+		return "4-MC_CCS";
+	case I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC:
+		return "4-RC_CCS-CC";
 	default:
 		return "?";
 	}
diff --git a/lib/intel_aux_pgtable.c b/lib/intel_aux_pgtable.c
index f5796fdf..e31a6c34 100644
--- a/lib/intel_aux_pgtable.c
+++ b/lib/intel_aux_pgtable.c
@@ -263,7 +263,8 @@ static uint64_t pgt_get_l1_flags(const struct intel_buf *buf, int surface_idx)
 	} entry = {
 		.e = {
 			.valid = 1,
-			.tile_mode = buf->tiling == I915_TILING_Y ? 1 : 0,
+			.tile_mode = buf->tiling == I915_TILING_Y ? 1 :
+				(buf->tiling == I915_TILING_4 ? 2 : 0),
 		}
 	};
 
@@ -274,7 +275,8 @@ static uint64_t pgt_get_l1_flags(const struct intel_buf *buf, int surface_idx)
 	 */
 	igt_assert(buf->tiling == I915_TILING_Y ||
 		   buf->tiling == I915_TILING_Yf ||
-		   buf->tiling == I915_TILING_Ys);
+		   buf->tiling == I915_TILING_Ys ||
+		   buf->tiling == I915_TILING_4);
 
 	entry.e.ycr = surface_idx > 0;
 
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index ebf3c598..81d2e140 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -1146,7 +1146,7 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid)
 		copy = gen9_render_copyfunc;
 	else if (IS_GEN11(devid))
 		copy = gen11_render_copyfunc;
-	else if (IS_DG2(devid))
+	else if (HAS_4TILE(devid))
 		copy = gen12p71_render_copyfunc;
 	else if (IS_GEN12(devid))
 		copy = gen12_render_copyfunc;
diff --git a/lib/intel_bufops.c b/lib/intel_bufops.c
index f13063fa..05c0b0d4 100644
--- a/lib/intel_bufops.c
+++ b/lib/intel_bufops.c
@@ -89,6 +89,7 @@
 #define TILE_Y      TILE_DEF(I915_TILING_Y)
 #define TILE_Yf     TILE_DEF(I915_TILING_Yf)
 #define TILE_Ys     TILE_DEF(I915_TILING_Ys)
+#define TILE_4      TILE_DEF(I915_TILING_4)
 
 #define CCS_OFFSET(buf) (buf->ccs[0].offset)
 #define CCS_SIZE(gen, buf) \
@@ -105,16 +106,19 @@ struct buf_ops {
 	uint32_t supported_hw_tiles;
 	uint32_t swizzle_x;
 	uint32_t swizzle_y;
+	uint32_t swizzle_tile4;
 	bo_copy linear_to;
 	bo_copy linear_to_x;
 	bo_copy linear_to_y;
 	bo_copy linear_to_yf;
 	bo_copy linear_to_ys;
+	bo_copy linear_to_tile4;
 	bo_copy to_linear;
 	bo_copy x_to_linear;
 	bo_copy y_to_linear;
 	bo_copy yf_to_linear;
 	bo_copy ys_to_linear;
+	bo_copy tile4_to_linear;
 };
 
 static const char *tiling_str(uint32_t tiling)
@@ -125,6 +129,7 @@ static const char *tiling_str(uint32_t tiling)
 	case I915_TILING_Y:    return "Y";
 	case I915_TILING_Yf:   return "Yf";
 	case I915_TILING_Ys:   return "Ys";
+	case I915_TILING_4:    return "4";
 	default:               return "UNKNOWN";
 	}
 }
@@ -222,7 +227,8 @@ static void set_hw_tiled(struct buf_ops *bops, struct intel_buf *buf)
 {
 	uint32_t ret_tiling, ret_swizzle;
 
-	if (buf->tiling != I915_TILING_X && buf->tiling != I915_TILING_Y)
+	if (buf->tiling != I915_TILING_X && buf->tiling != I915_TILING_Y &&
+	    buf->tiling != I915_TILING_4)
 		return;
 
 	if (!buf_ops_has_hw_fence(bops, buf->tiling)) {
@@ -320,6 +326,50 @@ static void *y_ptr(void *ptr,
 	return ptr + pos;
 }
 
+/*
+ * (x,y) to memory location in tiled-4 surface
+ *
+ * coverted those divisions and multiplications to shifts and masks
+ * in hope this wouldn't be so slow.
+ */
+static void *tile4_ptr(void *ptr,
+			unsigned int x, unsigned int y,
+			unsigned int stride, unsigned int cpp)
+{
+	const int tile_width = 128;
+	const int tile_height = 32;
+	const int subtile_size = 64;
+	const int owords = 16;
+	int base, _x, _y, subtile, tile_x, tile_y;
+	int x_loc = x << __builtin_ctz(cpp);
+	int pos;
+
+	/* Pixel in tile via masks */
+	tile_x = x_loc & (tile_width - 1);
+	tile_y = y & (tile_height - 1);
+
+	/* subtile in 4k tile */
+	_x = tile_x >> __builtin_ctz(owords);
+	_y = tile_y >> 2;
+
+	/* tile-4 swizzle */
+	subtile = ((_y >> 1) << 4) + ((_y & 1) << 2) + (_x & 3) + ((_x & 4) << 1);
+
+	/* memory location */
+	base = (y >> __builtin_ctz(tile_height)) *
+		(stride << __builtin_ctz(tile_height)) +
+		(((x_loc >> __builtin_ctz(tile_width)) << __builtin_ctz(4096)));
+
+	pos = base + (subtile << __builtin_ctz(subtile_size)) +
+		((tile_y & 3) << __builtin_ctz(owords)) +
+		(tile_x & (owords - 1));
+	igt_assert((pos & (cpp - 1)) == 0);
+	pos = pos >> __builtin_ctz(cpp);
+
+	return ptr + pos;
+}
+
+
 static void *yf_ptr(void *ptr,
 		    unsigned int x, unsigned int y,
 		    unsigned int stride, unsigned int cpp)
@@ -365,6 +415,8 @@ static tile_fn __get_tile_fn_ptr(int tiling)
 	case I915_TILING_Yf:
 		fn = yf_ptr;
 		break;
+	case I915_TILING_4:
+		fn = tile4_ptr;
 	case I915_TILING_Ys:
 		/* To be implemented */
 		break;
@@ -391,7 +443,7 @@ static void __copy_ccs(struct buf_ops *bops, struct intel_buf *buf,
 	void *map;
 	int gen;
 
-	if (!buf->compression)
+	if (!buf->compression || HAS_FLATCCS(intel_get_drm_devid(bops->fd)))
 		return;
 
 	gen = bops->intel_gen;
@@ -551,6 +603,13 @@ static void copy_linear_to_ys(struct buf_ops *bops, struct intel_buf *buf,
 	__copy_linear_to(bops->fd, buf, linear, I915_TILING_Ys, 0);
 }
 
+static void copy_linear_to_tile4(struct buf_ops *bops, struct intel_buf *buf,
+				 uint32_t *linear)
+{
+	DEBUGFN();
+	__copy_linear_to(bops->fd, buf, linear, I915_TILING_4, bops->swizzle_tile4);
+}
+
 static void __copy_to_linear(int fd, struct intel_buf *buf,
 			     uint32_t *linear, int tiling, uint32_t swizzle)
 {
@@ -601,6 +660,13 @@ static void copy_ys_to_linear(struct buf_ops *bops, struct intel_buf *buf,
 	__copy_to_linear(bops->fd, buf, linear, I915_TILING_Ys, 0);
 }
 
+static void copy_tile4_to_linear(struct buf_ops *bops, struct intel_buf *buf,
+				 uint32_t *linear)
+{
+	DEBUGFN();
+	__copy_to_linear(bops->fd, buf, linear, I915_TILING_4, 0);
+}
+
 static void copy_linear_to_gtt(struct buf_ops *bops, struct intel_buf *buf,
 			       uint32_t *linear)
 {
@@ -752,11 +818,10 @@ static void __intel_buf_init(struct buf_ops *bops,
 	IGT_INIT_LIST_HEAD(&buf->link);
 
 	if (compression) {
-		int aux_width, aux_height;
-
 		igt_require(bops->intel_gen >= 9);
 		igt_assert(req_tiling == I915_TILING_Y ||
-			   req_tiling == I915_TILING_Yf);
+			   req_tiling == I915_TILING_Yf ||
+			   req_tiling == I915_TILING_4);
 		/*
 		 * On GEN12+ we align the main surface to 4 * 4 main surface
 		 * tiles, which is 64kB. These 16 tiles are mapped by 4 AUX
@@ -778,13 +843,18 @@ static void __intel_buf_init(struct buf_ops *bops,
 		buf->bpp = bpp;
 		buf->compression = compression;
 
-		aux_width = intel_buf_ccs_width(bops->intel_gen, buf);
-		aux_height = intel_buf_ccs_height(bops->intel_gen, buf);
+		if (!HAS_FLATCCS(intel_get_drm_devid(bops->fd))) {
+			int aux_width, aux_height;
 
-		buf->ccs[0].offset = buf->surface[0].stride * ALIGN(height, 32);
-		buf->ccs[0].stride = aux_width;
+			aux_width = intel_buf_ccs_width(bops->intel_gen, buf);
+			aux_height = intel_buf_ccs_height(bops->intel_gen, buf);
 
-		size = buf->ccs[0].offset + aux_width * aux_height;
+			buf->ccs[0].offset = buf->surface[0].stride * ALIGN(height, 32);
+			buf->ccs[0].stride = aux_width;
+			size = buf->ccs[0].offset + aux_width * aux_height;
+		} else {
+			size = buf->ccs[0].offset;
+		}
 	} else {
 		if (tiling) {
 			devid =  intel_get_drm_devid(bops->fd);
@@ -1176,17 +1246,19 @@ void intel_buf_write_aux_to_png(struct intel_buf *buf, const char *filename)
 #define DEFAULT_BUFOPS(__gen_start, __gen_end) \
 	.gen_start          = __gen_start, \
 	.gen_end            = __gen_end, \
-	.supported_hw_tiles = TILE_X | TILE_Y, \
+	.supported_hw_tiles = TILE_X | TILE_Y | TILE_4, \
 	.linear_to          = copy_linear_to_wc, \
 	.linear_to_x        = copy_linear_to_gtt, \
 	.linear_to_y        = copy_linear_to_gtt, \
 	.linear_to_yf       = copy_linear_to_yf, \
 	.linear_to_ys       = copy_linear_to_ys, \
+	.linear_to_tile4    = copy_linear_to_tile4, \
 	.to_linear          = copy_wc_to_linear, \
 	.x_to_linear        = copy_gtt_to_linear, \
 	.y_to_linear        = copy_gtt_to_linear, \
 	.yf_to_linear       = copy_yf_to_linear, \
-	.ys_to_linear       = copy_ys_to_linear
+	.ys_to_linear       = copy_ys_to_linear, \
+	.tile4_to_linear    = copy_tile4_to_linear
 
 struct buf_ops buf_ops_arr[] = {
 	{
@@ -1201,7 +1273,7 @@ struct buf_ops buf_ops_arr[] = {
 
 	{
 		DEFAULT_BUFOPS(12, 12),
-		.supported_tiles   = TILE_NONE | TILE_X | TILE_Y | TILE_Yf | TILE_Ys,
+		.supported_tiles   = TILE_NONE | TILE_X | TILE_Y | TILE_Yf | TILE_Ys | TILE_4,
 	},
 };
 
@@ -1230,6 +1302,8 @@ static bool probe_hw_tiling(struct buf_ops *bops, uint32_t tiling,
 			bops->swizzle_x = buf_swizzle;
 		else if (tiling == I915_TILING_Y)
 			bops->swizzle_y = buf_swizzle;
+		else if (tiling == I915_TILING_4)
+			bops->swizzle_tile4 = buf_swizzle;
 
 		*swizzling_supported = buf_swizzle == phys_swizzle;
 	}
@@ -1390,6 +1464,24 @@ static struct buf_ops *__buf_ops_create(int fd, bool check_idempotency)
 		}
 	}
 
+	if (is_hw_tiling_supported(bops, I915_TILING_4)) {
+		bool swizzling_supported;
+		bool supported = probe_hw_tiling(bops, I915_TILING_4,
+						 &swizzling_supported);
+
+		if (!swizzling_supported) {
+			igt_debug("Swizzling for 4 is not supported\n");
+			bops->supported_tiles &= ~TILE_4;
+		}
+
+		igt_debug("4 fence support: %s\n", bool_str(supported));
+		if (!supported) {
+			bops->supported_hw_tiles &= ~TILE_4;
+			bops->linear_to_tile4 = copy_linear_to_tile4;
+			bops->tile4_to_linear = copy_tile4_to_linear;
+		}
+	}
+
 	/* Disable other tiling format functions if not supported */
 	if (!is_tiling_supported(bops, I915_TILING_Yf)) {
 		igt_debug("Yf format not supported\n");
diff --git a/lib/intel_chipset.h b/lib/intel_chipset.h
index db75a829..4d9f4623 100644
--- a/lib/intel_chipset.h
+++ b/lib/intel_chipset.h
@@ -219,6 +219,7 @@ void intel_check_pch(void);
 
 #define HAS_4TILE(devid)	(intel_get_device_info(devid)->has_4tile)
 
-#define HAS_FLATCCS(devid)	(intel_get_device_info(devid)->has_flatccs)
+/* use HAS_4TILE here as all devices with 4-tile have flat ccs. */
+#define HAS_FLATCCS(devid)	HAS_4TILE(devid)
 
 #endif /* _INTEL_CHIPSET_H */
diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
index 6c45efb4..ae0f775a 100644
--- a/lib/rendercopy_gen9.c
+++ b/lib/rendercopy_gen9.c
@@ -165,7 +165,8 @@ intel_get_uc_mocs(int fd) {
 
 /* Mostly copy+paste from gen6, except height, width, pitch moved */
 static uint32_t
-gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
+gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
+	      bool fast_clear) {
 	struct gen9_surface_state *ss;
 	uint32_t write_domain, read_domain;
 	uint64_t address;
@@ -192,15 +193,26 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
 		case 64: ss->ss0.surface_format = SURFACEFORMAT_R16G16B16A16_FLOAT; break;
 		default: igt_assert(0);
 	}
-	ss->ss0.render_cache_read_write = 1;
 	ss->ss0.vertical_alignment = 1; /* align 4 */
-	ss->ss0.horizontal_alignment = 1; /* align 4 */
+	ss->ss0.horizontal_alignment = 1; /* align 4 or HALIGN_32 on display ver >= 13*/
+
+	if (HAS_4TILE(ibb->devid)) {
+		/*
+		 * mocs table version 1 index 3 groub wb use l3
+		 */
+		ss->ss1.memory_object_control = 3 << 1;
+		ss->ss5.mip_tail_start_lod = 0;
+	} else {
+		ss->ss0.render_cache_read_write = 1;
+		ss->ss1.memory_object_control = intel_get_uc_mocs(i915);
+		ss->ss5.mip_tail_start_lod = 1; /* needed with trmode */
+	}
+
 	if (buf->tiling == I915_TILING_X)
 		ss->ss0.tiled_mode = 2;
 	else if (buf->tiling != I915_TILING_NONE)
 		ss->ss0.tiled_mode = 3;
 
-	ss->ss1.memory_object_control = intel_get_uc_mocs(i915);
 	if (intel_buf_pxp(buf))
 		ss->ss1.memory_object_control |= 1;
 
@@ -208,7 +220,6 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
 		ss->ss5.trmode = 1;
 	else if (buf->tiling == I915_TILING_Ys)
 		ss->ss5.trmode = 2;
-	ss->ss5.mip_tail_start_lod = 1; /* needed with trmode */
 
 	address = intel_bb_offset_reloc(ibb, buf->handle,
 					read_domain, write_domain,
@@ -229,20 +240,23 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
 	if (buf->compression == I915_COMPRESSION_MEDIA)
 		ss->ss7.tgl.media_compression = 1;
 	else if (buf->compression == I915_COMPRESSION_RENDER) {
-		igt_assert(buf->ccs[0].stride);
-
 		ss->ss6.aux_mode = 0x5; /* AUX_CCS_E */
-		ss->ss6.aux_pitch = (buf->ccs[0].stride / 128) - 1;
 
-		address = intel_bb_offset_reloc_with_delta(ibb, buf->handle,
-							   read_domain, write_domain,
-							   (buf->cc.offset ? (1 << 10) : 0) | buf->ccs[0].offset,
-							   intel_bb_offset(ibb) + 4 * 10,
-							   buf->addr.offset);
-		ss->ss10.aux_base_addr = (address + buf->ccs[0].offset) >> 12;
-		ss->ss11.aux_base_addr_hi = (address + buf->ccs[0].offset) >> 32;
+		if (buf->ccs[0].stride) {
+
+			ss->ss6.aux_pitch = (buf->ccs[0].stride / 128) - 1;
+
+			address = intel_bb_offset_reloc_with_delta(ibb, buf->handle,
+								   read_domain, write_domain,
+								   (buf->cc.offset ? (1 << 10) : 0)
+								   | buf->ccs[0].offset,
+								   intel_bb_offset(ibb) + 4 * 10,
+								   buf->addr.offset);
+			ss->ss10.aux_base_addr = (address + buf->ccs[0].offset) >> 12;
+			ss->ss11.aux_base_addr_hi = (address + buf->ccs[0].offset) >> 32;
+		}
 
-		if (buf->cc.offset) {
+		if (fast_clear || (buf->cc.offset && !HAS_FLATCCS(ibb->devid))) {
 			igt_assert(buf->compression == I915_COMPRESSION_RENDER);
 
 			ss->ss10.clearvalue_addr_enable = 1;
@@ -252,8 +266,30 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
 								   buf->cc.offset,
 								   intel_bb_offset(ibb) + 4 * 12,
 								   buf->addr.offset);
-			ss->ss12.clear_address = address + buf->cc.offset;
+
+			/*
+			 * If this assert doesn't hold below clear address will be
+			 * written wrong.
+			 */
+
+			igt_assert(__builtin_ctzl(address + buf->cc.offset) >= 6 &&
+				   (__builtin_clzl(address + buf->cc.offset) >= 16));
+
+			ss->ss12.clear_address = (address + buf->cc.offset) >> 6;
 			ss->ss13.clear_address_hi = (address + buf->cc.offset) >> 32;
+		} else if (HAS_FLATCCS(ibb->devid)) {
+			ss->ss7.dg2.memory_compression_type = 0;
+			ss->ss7.dg2.memory_compression_enable = 0;
+			ss->ss7.dg2.disable_support_for_multi_gpu_partial_writes = 1;
+			ss->ss7.dg2.disable_support_for_multi_gpu_atomics = 1;
+
+			/*
+			 * For now here is coming only 32bpp rgb format
+			 * which is marked below as B8G8R8X8_UNORM = '8'
+			 * If here ever arrive other formats below need to be
+			 * fixed to take that into account.
+			 */
+			ss->ss12.compression_format = 8;
 		}
 	}
 
@@ -266,14 +302,15 @@ gen8_bind_surfaces(struct intel_bb *ibb,
 		   const struct intel_buf *dst)
 {
 	uint32_t *binding_table, binding_table_offset;
+	bool fast_clear = !src;
 
 	binding_table = intel_bb_ptr_align(ibb, 32);
 	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
 
-	binding_table[0] = gen8_bind_buf(ibb, dst, 1);
+	binding_table[0] = gen8_bind_buf(ibb, dst, 1, fast_clear);
 
 	if (src != NULL)
-		binding_table[1] = gen8_bind_buf(ibb, src, 0);
+		binding_table[1] = gen8_bind_buf(ibb, src, 0, false);
 
 	return binding_table_offset;
 }
@@ -856,12 +893,14 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
 static void
 gen9_emit_depth(struct intel_bb *ibb)
 {
+	bool need_10dw = HAS_4TILE(ibb->devid);
+
 	intel_bb_out(ibb, GEN8_3DSTATE_WM_DEPTH_STENCIL | (4 - 2));
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 
-	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
+	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (need_10dw ? (10-2) : (8-2)));
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
@@ -869,6 +908,10 @@ gen9_emit_depth(struct intel_bb *ibb)
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
+	if (need_10dw) {
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+	}
 
 	intel_bb_out(ibb, GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5-2));
 	intel_bb_out(ibb, 0);
@@ -1080,7 +1123,7 @@ void _gen9_render_op(struct intel_bb *ibb,
 
 	gen9_emit_state_base_address(ibb);
 
-	if (IS_DG2(ibb->devid) || intel_gen(ibb->devid) > 12) {
+	if (HAS_4TILE(ibb->devid) || intel_gen(ibb->devid) > 12) {
 		intel_bb_out(ibb, GEN4_3DSTATE_BINDING_TABLE_POOL_ALLOC | 2);
 		intel_bb_emit_reloc(ibb, ibb->handle,
 				    I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
@@ -1197,18 +1240,12 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
 			      struct intel_buf *dst,
 			      unsigned int dst_x, unsigned int dst_y)
 {
-	struct aux_pgtable_info pgtable_info = { };
-
-	gen12_aux_pgtable_init(&pgtable_info, ibb, src, dst);
-
 	_gen9_render_op(ibb, src, src_x, src_y,
 			width, height, dst, dst_x, dst_y,
-			pgtable_info.pgtable_buf,
+			NULL,
 			NULL,
 			gen12p71_render_copy,
 			sizeof(gen12p71_render_copy));
-
-	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
 }
 
 void gen12_render_clearfunc(struct intel_bb *ibb,
@@ -1217,16 +1254,24 @@ void gen12_render_clearfunc(struct intel_bb *ibb,
 			    unsigned int width, unsigned int height,
 			    const float clear_color[4])
 {
-	struct aux_pgtable_info pgtable_info = { };
-
-	gen12_aux_pgtable_init(&pgtable_info, ibb, NULL, dst);
-
-	_gen9_render_op(ibb, NULL, 0, 0,
-		        width, height, dst, dst_x, dst_y,
-		        pgtable_info.pgtable_buf,
-		        clear_color,
-		        gen12_render_copy,
-		        sizeof(gen12_render_copy));
-
-	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
+	if (!HAS_4TILE(ibb->devid)) {
+		struct aux_pgtable_info pgtable_info = { };
+
+		gen12_aux_pgtable_init(&pgtable_info, ibb, NULL, dst);
+
+		_gen9_render_op(ibb, NULL, 0, 0,
+				width, height, dst, dst_x, dst_y,
+				pgtable_info.pgtable_buf,
+				clear_color,
+				gen12_render_copy,
+				sizeof(gen12_render_copy));
+		gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
+	} else {
+			_gen9_render_op(ibb, NULL, 0, 0,
+					width, height, dst, dst_x, dst_y,
+					NULL,
+					clear_color,
+					gen12p71_render_copy,
+					sizeof(gen12p71_render_copy));
+	}
 }
diff --git a/lib/veboxcopy_gen12.c b/lib/veboxcopy_gen12.c
index 17564493..aa90939b 100644
--- a/lib/veboxcopy_gen12.c
+++ b/lib/veboxcopy_gen12.c
@@ -53,19 +53,25 @@ struct vebox_surface_state {
 		uint32_t width:14;
 		uint32_t height:14;
 	} ss2;
-	struct {
+	union {
+		struct {
 #define VEBOX_TILE_WALK_XMAJOR 0
 #define VEBOX_TILE_WALK_YMAJOR 1
-		uint32_t tile_walk:1;
-		uint32_t tiled_surface:1;
-		uint32_t chroma_half_pitch:1;
-		uint32_t surface_pitch:17;
-		uint32_t chroma_interleave:1;
-		uint32_t lsb_packed_enable:1;
-		uint32_t bayer_input_alignment:2;
-		uint32_t bayer_pattern_format:1;
-		uint32_t bayer_pattern_offset:2;
-		uint32_t surface_format:5;
+			uint32_t tile_walk:1;
+			uint32_t tiled_surface:1;
+			uint32_t chroma_half_pitch:1;
+			uint32_t surface_pitch:17;
+			uint32_t chroma_interleave:1;
+			uint32_t lsb_packed_enable:1;
+			uint32_t bayer_input_alignment:2;
+			uint32_t bayer_pattern_format:1;
+			uint32_t bayer_pattern_offset:2;
+			uint32_t surface_format:5;
+		} tgl;
+		struct {
+			uint32_t tile_mode:2;
+			uint32_t pad0:30;
+		} dg2;
 	} ss3;
 	struct {
 		uint32_t u_y_offset:15;
@@ -82,9 +88,15 @@ struct vebox_surface_state {
 		uint32_t frame_x_offset:15;
 		uint32_t pad:2;
 	} ss6;
-	struct {
-		uint32_t derived_surface_pitch:17;
-		uint32_t pad:15;
+	union {
+		struct {
+			uint32_t derived_surface_pitch:17;
+			uint32_t pad:15;
+		} skl;
+		struct {
+			uint32_t pad:27;
+			uint32_t compression_format:5;
+		} dg2;
 	} ss7;
 	struct {
 		uint32_t skin_score_output_surface_pitch:17;
@@ -166,17 +178,46 @@ static void emit_surface_state_cmd(struct intel_bb *ibb,
 	ss->ss2.height = height - 1;
 	ss->ss2.width = width - 1;
 
-	ss->ss3.surface_format = format;
+	ss->ss3.tgl.surface_format = format;
 	if (format_is_interleaved_yuv(format))
-		ss->ss3.chroma_interleave = 1;
-	ss->ss3.surface_pitch = pitch - 1;
-	ss->ss3.tile_walk = (tiling == I915_TILING_Y) ||
-			    (tiling == I915_TILING_Yf);
-	ss->ss3.tiled_surface = tiling != I915_TILING_NONE;
+		ss->ss3.tgl.chroma_interleave = 1;
+	ss->ss3.tgl.surface_pitch = pitch - 1;
 
 	ss->ss4.u_y_offset = uv_offset / pitch;
 
-	ss->ss7.derived_surface_pitch = pitch - 1;
+	if (HAS_FLATCCS(ibb->devid)) {
+		/*
+		 * f-tile = 3 (Tile F)
+		 */
+		ss->ss3.dg2.tile_mode = (tiling != I915_TILING_NONE) ? 3 : 0;
+
+		switch (format) {
+		case R8G8B8A8_UNORM:
+			ss->ss7.dg2.compression_format = 0xa;
+			break;
+		case PLANAR_420_8:
+			ss->ss7.dg2.compression_format = 0xf;
+			break;
+		case PLANAR_420_16:
+			ss->ss7.dg2.compression_format = 8;
+			break;
+		case YCRCB_NORMAL:
+			ss->ss7.dg2.compression_format = 3;
+			break;
+		case PACKED_444A_8:
+			ss->ss7.dg2.compression_format = 0x9;
+			break;
+		default:
+			igt_assert(0);
+		}
+	} else {
+		ss->ss3.tgl.tile_walk = (tiling == I915_TILING_Y) ||
+			(tiling == I915_TILING_Yf) ||
+			(tiling == I915_TILING_4);
+		ss->ss3.tgl.tiled_surface = tiling != I915_TILING_NONE;
+	}
+
+	ss->ss7.skl.derived_surface_pitch = pitch - 1;
 
 	intel_bb_ptr_add(ibb, sizeof(*ss));
 }
@@ -203,7 +244,11 @@ static void emit_tiling_convert_cmd(struct intel_bb *ibb,
 		tc->tc1_2.input_compression_type =
 			src->compression == I915_COMPRESSION_RENDER;
 	}
-	tc->tc1_2.input_tiled_resource_mode = src->tiling == I915_TILING_Yf;
+
+	if (HAS_4TILE(ibb->devid))
+		tc->tc1_2.input_mocs_idx = 3;
+	else
+		tc->tc1_2.input_tiled_resource_mode = src->tiling == I915_TILING_Yf;
 	reloc_delta = tc->tc1_2_l;
 
 	igt_assert(src->addr.offset == ALIGN(src->addr.offset, 0x1000));
@@ -220,7 +265,12 @@ static void emit_tiling_convert_cmd(struct intel_bb *ibb,
 		tc->tc3_4.output_compression_type =
 			dst->compression == I915_COMPRESSION_RENDER;
 	}
-	tc->tc3_4.output_tiled_resource_mode = dst->tiling == I915_TILING_Yf;
+
+	if (HAS_4TILE(ibb->devid))
+		tc->tc3_4.output_mocs_idx = 3;
+	else
+		tc->tc3_4.output_tiled_resource_mode = dst->tiling == I915_TILING_Yf;
+
 	reloc_delta = tc->tc3_4_l;
 
 	igt_assert(dst->addr.offset == ALIGN(dst->addr.offset, 0x1000));
@@ -255,10 +305,12 @@ void gen12_vebox_copyfunc(struct intel_bb *ibb,
 	intel_bb_add_intel_buf(ibb, dst, true);
 	intel_bb_add_intel_buf(ibb, src, false);
 
-	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
-	gen12_aux_pgtable_init(&aux_pgtable_info, ibb, src, dst);
-	aux_pgtable_state = gen12_create_aux_pgtable_state(ibb,
-							   aux_pgtable_info.pgtable_buf);
+	if (!HAS_FLATCCS(ibb->devid)) {
+		intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
+		gen12_aux_pgtable_init(&aux_pgtable_info, ibb, src, dst);
+		aux_pgtable_state = gen12_create_aux_pgtable_state(ibb,
+								   aux_pgtable_info.pgtable_buf);
+	}
 
 	intel_bb_ptr_set(ibb, 0);
 	gen12_emit_aux_pgtable_state(ibb, aux_pgtable_state, false);
@@ -311,5 +363,6 @@ void gen12_vebox_copyfunc(struct intel_bb *ibb,
 
 	intel_bb_reset(ibb, false);
 
-	gen12_aux_pgtable_cleanup(ibb, &aux_pgtable_info);
+	if (!HAS_FLATCCS(ibb->devid))
+		gen12_aux_pgtable_cleanup(ibb, &aux_pgtable_info);
 }
-- 
2.36.0