[PATCH i-g-t v3 1/3] lib/rendercopy: Add render-copy xe2 implementation

Zbigniew Kempczyński zbigniew.kempczynski at intel.com
Wed Jan 10 12:25:12 UTC 2024


Because the differences between the Xe2 and previous 3D pipelines are small,
I decided to adapt the gen9 render-copy function instead of introducing a new
one. Xe2 uses large GRFs (512-bit), so coordinates occupy only 2 GRF
registers (instead of 4 with 256-bit GRFs). This requires adapting the
shader's data preparation for the sampler / render target write.

Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
Cc: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
Cc: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
Reviewed-by: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
---
v2/v3: Addressing review comments (Dominik)
---
 lib/i915/shaders/ps/gen20_render_copy.asm |  8 ++++
 lib/rendercopy.h                          |  4 ++
 lib/rendercopy_gen9.c                     | 56 +++++++++++++++++++++--
 lib/xe2_render.h                          | 14 ++++++
 4 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 lib/i915/shaders/ps/gen20_render_copy.asm
 create mode 100644 lib/xe2_render.h

diff --git a/lib/i915/shaders/ps/gen20_render_copy.asm b/lib/i915/shaders/ps/gen20_render_copy.asm
new file mode 100644
index 0000000000..330417966d
--- /dev/null
+++ b/lib/i915/shaders/ps/gen20_render_copy.asm
@@ -0,0 +1,8 @@
+L0:
+(W)     mad (16|M0)              acc0.0<1>:f   r6.3<0;0>:f      r1.0<1;1>:f       r6.0<0>:f
+(W)     mad (16|M0)              r113.0<1>:f   acc0.0<1;1>:f    r1.0<1;1>:f       r6.1<0>:f
+(W)     mad (16|M0)              acc0.0<1>:f   r6.7<0;0>:f      r1.0<1;1>:f       r6.4<0>:f
+(W)     mad (16|M0)              r114.0<1>:f   acc0.0<1;1>:f    r2.0<1;1>:f       r6.5<0>:f
+(W)     send.smpl (16|M0)        r12      r113  null:0  0x0            0x04420001           {F@1,$0} // wr:2+0, rd:4; simd16 sample:u+v+r+ai+mlod using sampler index 0
+(W)     send.rc (16|M0)          null     r12   null:0  0x0            0x08031400           {EOT,$0} // wr:4+0, rd:0; full-precision render target write SIMD16; last render target to surface 0
+L96:
diff --git a/lib/rendercopy.h b/lib/rendercopy.h
index 0d81d27f83..3733e8a09f 100644
--- a/lib/rendercopy.h
+++ b/lib/rendercopy.h
@@ -43,6 +43,10 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
 			      struct intel_buf *src, uint32_t src_x, uint32_t src_y,
 			      uint32_t width, uint32_t height,
 			      struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void xe2_render_copyfunc(struct intel_bb *ibb,
+			 struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			 uint32_t width, uint32_t height,
+			 struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
 void gen12_render_copyfunc(struct intel_bb *ibb,
 			   struct intel_buf *src, uint32_t src_x, uint32_t src_y,
 			   uint32_t width, uint32_t height,
diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
index 363bc6c1b2..bdfd9be087 100644
--- a/lib/rendercopy_gen9.c
+++ b/lib/rendercopy_gen9.c
@@ -22,6 +22,7 @@
 #include "intel_mocs.h"
 #include "rendercopy.h"
 #include "gen9_render.h"
+#include "xe2_render.h"
 #include "intel_reg.h"
 #include "igt_aux.h"
 #include "intel_chipset.h"
@@ -136,6 +137,15 @@ static const uint32_t gen12p71_render_copy[][4] = {
 	{ 0x80041131, 0x00000004, 0x50007144, 0x00c40000 },
 };
 
+static const uint32_t xe2_render_copy[][4] = { /* Xe2 kernel words passed to _gen9_render_op(); six 4-dword rows, presumably one per instruction of gen20_render_copy.asm in listing order -- TODO confirm against assembler output */
+	{ 0x8010005b, 0x200002a0, 0x020a0634, 0x06040105 }, /* presumably 1st mad, result to acc0 */
+	{ 0x8010005b, 0x710402a8, 0x020a2001, 0x06140105 }, /* presumably 2nd mad, result to r113 (0x71) */
+	{ 0x8010005b, 0x200002a0, 0x020a0674, 0x06440105 }, /* presumably 3rd mad, result to acc0 */
+	{ 0x8010005b, 0x720402a8, 0x020a2001, 0x06540205 }, /* presumably 4th mad, result to r114 (0x72) */
+	{ 0x80122031, 0x0c240000, 0x20027114, 0x00800000 }, /* presumably send.smpl: payload r113 (0x71), response to r12 (0x0c) */
+	{ 0x8010c031, 0x00000004, 0x58000c24, 0x00c40000 }, /* presumably send.rc: payload r12 (0x0c), the EOT render target write */
+};
+
 /* Mostly copy+paste from gen6, except height, width, pitch moved */
 static uint32_t
 gen9_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
@@ -545,7 +555,10 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
 	/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
 	/* The length has to be one less if we dont modify
 	   bindless state */
-	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | 20);
+	else
+		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
 
 	/* general */
 	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
@@ -586,6 +599,13 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
+
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20)) {
+		/* Bindless sampler */
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+	}
 }
 
 static void
@@ -753,7 +773,12 @@ gen9_emit_ds(struct intel_bb *ibb) {
 
 static void
 gen8_emit_wm_hz_op(struct intel_bb *ibb) {
-	intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20)) {
+		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (6-2));
+		intel_bb_out(ibb, 0);
+	} else {
+		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
+	}
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 0);
@@ -852,7 +877,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
 	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
 	             GEN6_3DSTATE_WM_16_DISPATCH_ENABLE |
 	             (fast_clear ? GEN8_3DSTATE_FAST_CLEAR_ENABLE : 0));
-	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT |
+			     GENXE_KERNEL0_POLY_PACK16_FIXED << GENXE_KERNEL0_PACKING_POLICY);
+	else
+		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
 	intel_bb_out(ibb, 0); // kernel 1
 	intel_bb_out(ibb, 0); /* kernel 1 hi */
 	intel_bb_out(ibb, 0); // kernel 2
@@ -903,6 +932,9 @@ gen9_emit_depth(struct intel_bb *ibb)
 
 static void
 gen7_emit_clear(struct intel_bb *ibb) {
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+		return;
+
 	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, 1); // clear valid
@@ -911,7 +943,10 @@ gen7_emit_clear(struct intel_bb *ibb) {
 static void
 gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
 {
-	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+		intel_bb_out(ibb, GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST | (4 - 2));
+	else
+		intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
 	intel_bb_out(ibb, 0);
 	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
 	intel_bb_out(ibb, 0);
@@ -1220,6 +1255,19 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
 			sizeof(gen12p71_render_copy));
 }
 
+void xe2_render_copyfunc(struct intel_bb *ibb,
+			 struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			 uint32_t width, uint32_t height,
+			 struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y)
+{
+	/* Delegate the copy to the shared _gen9_render_op() using the Xe2 kernel. */
+	/* The two pointer arguments that precede the kernel are passed as NULL. */
+	_gen9_render_op(ibb, src, src_x, src_y, width, height,
+			dst, dst_x, dst_y,
+			NULL, NULL,
+			xe2_render_copy, sizeof(xe2_render_copy));
+}
+
 void mtl_render_copyfunc(struct intel_bb *ibb,
 			 struct intel_buf *src,
 			 unsigned int src_x, unsigned int src_y,
diff --git a/lib/xe2_render.h b/lib/xe2_render.h
new file mode 100644
index 0000000000..adca29c57b
--- /dev/null
+++ b/lib/xe2_render.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+#ifndef XE2_RENDER_H
+#define XE2_RENDER_H
+
+#define GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST	GEN4_3D(3, 0, 0) /* command header emitted in place of GEN4_3DSTATE_DRAWING_RECTANGLE on gen20+; NOTE(review): GEN4_3D is not defined in this header, so the includer must already have it in scope */
+
+/* 3DSTATE_PS dword6: kernel 0 packing policy, OR'ed into the same dword as the dispatch start GRF */
+# define GENXE_KERNEL0_PACKING_POLICY		24 /* left-shift amount of the packing policy field */
+#  define GENXE_KERNEL0_POLY_PACK16_FIXED	3 /* field value the render copy programs on gen20+ */
+
+#endif /* XE2_RENDER_H */
-- 
2.34.1



More information about the igt-dev mailing list