[PATCH i-g-t 1/3] lib/rendercopy: Add render-copy xe2 implementation
Zbigniew Kempczyński
zbigniew.kempczynski at intel.com
Mon Jan 8 11:30:10 UTC 2024
Due to small differences between xe2 and previous 3d pipeline I decided
to adopt this in gen9 render-copy function instead of introducing a new one.
Xe2 uses large GRFs (512-bit) where coordinates occupy only 2 GRF
registers (instead 4 on 256-bit GRFs). This requires shader adoption
on data preparation for sampler/render target write.
Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
Cc: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
---
lib/genxe2_render.h | 14 ++++++
lib/i915/shaders/ps/gen20_render_copy.asm | 8 +++
lib/rendercopy.h | 4 ++
lib/rendercopy_gen9.c | 60 +++++++++++++++++++++--
4 files changed, 81 insertions(+), 5 deletions(-)
create mode 100644 lib/genxe2_render.h
create mode 100644 lib/i915/shaders/ps/gen20_render_copy.asm
diff --git a/lib/genxe2_render.h b/lib/genxe2_render.h
new file mode 100644
index 0000000000..3db7a84894
--- /dev/null
+++ b/lib/genxe2_render.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+#ifndef GENXE2_RENDER_H
+#define GENXE2_RENDER_H
+
+#define GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST GEN4_3D(3, 0, 0)
+
+/* 3DSTATE_PS dword6 */
+# define GENXE_KERNEL0_PACKING_POLICY 24
+# define GENXE_KERNEL0_POLY_PACK16_FIXED 3
+
+#endif
diff --git a/lib/i915/shaders/ps/gen20_render_copy.asm b/lib/i915/shaders/ps/gen20_render_copy.asm
new file mode 100644
index 0000000000..330417966d
--- /dev/null
+++ b/lib/i915/shaders/ps/gen20_render_copy.asm
@@ -0,0 +1,8 @@
+L0:
+(W) mad (16|M0) acc0.0<1>:f r6.3<0;0>:f r1.0<1;1>:f r6.0<0>:f
+(W) mad (16|M0) r113.0<1>:f acc0.0<1;1>:f r1.0<1;1>:f r6.1<0>:f
+(W) mad (16|M0) acc0.0<1>:f r6.7<0;0>:f r1.0<1;1>:f r6.4<0>:f
+(W) mad (16|M0) r114.0<1>:f acc0.0<1;1>:f r2.0<1;1>:f r6.5<0>:f
+(W) send.smpl (16|M0) r12 r113 null:0 0x0 0x04420001 {F at 1,$0} // wr:2+0, rd:4; simd16 sample:u+v+r+ai+mlod using sampler index 0
+(W) send.rc (16|M0) null r12 null:0 0x0 0x08031400 {EOT,$0} // wr:4+0, rd:0; full-precision render target write SIMD16; last render target to surface 0
+L96:
diff --git a/lib/rendercopy.h b/lib/rendercopy.h
index 0d81d27f83..1a97a72573 100644
--- a/lib/rendercopy.h
+++ b/lib/rendercopy.h
@@ -43,6 +43,10 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
struct intel_buf *src, uint32_t src_x, uint32_t src_y,
uint32_t width, uint32_t height,
struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void genxe2_render_copyfunc(struct intel_bb *ibb,
+ struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+ uint32_t width, uint32_t height,
+ struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
void gen12_render_copyfunc(struct intel_bb *ibb,
struct intel_buf *src, uint32_t src_x, uint32_t src_y,
uint32_t width, uint32_t height,
diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
index 363bc6c1b2..f0efadeb50 100644
--- a/lib/rendercopy_gen9.c
+++ b/lib/rendercopy_gen9.c
@@ -22,6 +22,7 @@
#include "intel_mocs.h"
#include "rendercopy.h"
#include "gen9_render.h"
+#include "genxe2_render.h"
#include "intel_reg.h"
#include "igt_aux.h"
#include "intel_chipset.h"
@@ -136,6 +137,15 @@ static const uint32_t gen12p71_render_copy[][4] = {
{ 0x80041131, 0x00000004, 0x50007144, 0x00c40000 },
};
+static const uint32_t xe2_render_copy[][4] = {
+ { 0x8010005b, 0x200002a0, 0x020a0634, 0x06040105 },
+ { 0x8010005b, 0x710402a8, 0x020a2001, 0x06140105 },
+ { 0x8010005b, 0x200002a0, 0x020a0674, 0x06440105 },
+ { 0x8010005b, 0x720402a8, 0x020a2001, 0x06540205 },
+ { 0x80122031, 0x0c240000, 0x20027114, 0x00800000 },
+ { 0x8010c031, 0x00000004, 0x58000c24, 0x00c40000 },
+};
+
/* Mostly copy+paste from gen6, except height, width, pitch moved */
static uint32_t
gen9_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
@@ -545,7 +555,10 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
/* The length has to be one less if we dont modify
bindless state */
- intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | 20);
+ else
+ intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
/* general */
intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
@@ -586,6 +599,13 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
intel_bb_out(ibb, 0);
intel_bb_out(ibb, 0);
intel_bb_out(ibb, 0);
+
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20)) {
+ /* Bindless sampler */
+ intel_bb_out(ibb, 0);
+ intel_bb_out(ibb, 0);
+ intel_bb_out(ibb, 0);
+ }
}
static void
@@ -753,7 +773,10 @@ gen9_emit_ds(struct intel_bb *ibb) {
static void
gen8_emit_wm_hz_op(struct intel_bb *ibb) {
- intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (6-2));
+ else
+ intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
intel_bb_out(ibb, 0);
intel_bb_out(ibb, 0);
intel_bb_out(ibb, 0);
@@ -852,7 +875,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
GEN6_3DSTATE_WM_16_DISPATCH_ENABLE |
(fast_clear ? GEN8_3DSTATE_FAST_CLEAR_ENABLE : 0));
- intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT |
+ GENXE_KERNEL0_POLY_PACK16_FIXED << GENXE_KERNEL0_PACKING_POLICY);
+ else
+ intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
intel_bb_out(ibb, 0); // kernel 1
intel_bb_out(ibb, 0); /* kernel 1 hi */
intel_bb_out(ibb, 0); // kernel 2
@@ -862,7 +889,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
intel_bb_out(ibb, GEN8_PS_BLEND_HAS_WRITEABLE_RT);
intel_bb_out(ibb, GEN8_3DSTATE_PS_EXTRA | (2 - 2));
- intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
+
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID);
+ else
+ intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
}
static void
@@ -903,6 +934,9 @@ gen9_emit_depth(struct intel_bb *ibb)
static void
gen7_emit_clear(struct intel_bb *ibb) {
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ return;
+
intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
intel_bb_out(ibb, 0);
intel_bb_out(ibb, 1); // clear valid
@@ -911,7 +945,10 @@ gen7_emit_clear(struct intel_bb *ibb) {
static void
gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
{
- intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+ if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
+ intel_bb_out(ibb, GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST | (4 - 2));
+ else
+ intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
intel_bb_out(ibb, 0);
intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
intel_bb_out(ibb, 0);
@@ -1220,6 +1257,19 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
sizeof(gen12p71_render_copy));
}
+void genxe2_render_copyfunc(struct intel_bb *ibb,
+ struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+ uint32_t width, uint32_t height,
+ struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y)
+{
+ _gen9_render_op(ibb, src, src_x, src_y,
+ width, height, dst, dst_x, dst_y,
+ NULL,
+ NULL,
+ xe2_render_copy,
+ sizeof(xe2_render_copy));
+}
+
void mtl_render_copyfunc(struct intel_bb *ibb,
struct intel_buf *src,
unsigned int src_x, unsigned int src_y,
--
2.34.1
More information about the igt-dev
mailing list