[PATCH i-g-t 1/3] lib/rendercopy: Add render-copy xe2 implementation

Grzegorzek, Dominik dominik.grzegorzek at intel.com
Mon Jan 8 15:59:21 UTC 2024


On Mon, 2024-01-08 at 12:30 +0100, Zbigniew Kempczyński wrote:
> Due to small differences between xe2 and previous 3d pipeline I decided
> to adopt this in gen9 render-copy function instead of introducing a new one.
> Xe2 uses large GRFs (512-bit) where coordinates occupy only 2 GRF
> registers (instead 4 on 256-bit GRFs). This requires shader adoption
> on data preparation for sampler/render target write.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
> ---
>  lib/genxe2_render.h                       | 14 ++++++
> 
I believe we should not use 'gen' notation anymore. So either use as a prefix
first platform which uses this (xe2lpg_ in this case) or simply architecture xe2_. Applies to all
'GENXE2, genxe2' prefixes.

>  lib/i915/shaders/ps/gen20_render_copy.asm |  8 +++
>  lib/rendercopy.h                          |  4 ++
>  lib/rendercopy_gen9.c                     | 60 +++++++++++++++++++++--
>  4 files changed, 81 insertions(+), 5 deletions(-)
>  create mode 100644 lib/genxe2_render.h
>  create mode 100644 lib/i915/shaders/ps/gen20_render_copy.asm
> 
> diff --git a/lib/genxe2_render.h b/lib/genxe2_render.h
> new file mode 100644
> index 0000000000..3db7a84894
> --- /dev/null
> +++ b/lib/genxe2_render.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +#ifndef GENXE2_RENDER_H
> +#define GENXE2_RENDER_H
> +
> +#define GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST	GEN4_3D(3, 0, 0)
> +
> +/* 3DSTATE_PS dword6 */
> +# define GENXE_KERNEL0_PACKING_POLICY		24
> +#  define GENXE_KERNEL0_POLY_PACK16_FIXED	3
> +
> +#endif
> diff --git a/lib/i915/shaders/ps/gen20_render_copy.asm b/lib/i915/shaders/ps/gen20_render_copy.asm
> new file mode 100644
> index 0000000000..330417966d
> --- /dev/null
> +++ b/lib/i915/shaders/ps/gen20_render_copy.asm
> @@ -0,0 +1,8 @@
> +L0:
> +(W)     mad (16|M0)              acc0.0<1>:f   r6.3<0;0>:f      r1.0<1;1>:f       r6.0<0>:f
> +(W)     mad (16|M0)              r113.0<1>:f   acc0.0<1;1>:f    r1.0<1;1>:f       r6.1<0>:f
> +(W)     mad (16|M0)              acc0.0<1>:f   r6.7<0;0>:f      r1.0<1;1>:f       r6.4<0>:f
> +(W)     mad (16|M0)              r114.0<1>:f   acc0.0<1;1>:f    r2.0<1;1>:f       r6.5<0>:f
> +(W)     send.smpl (16|M0)        r12      r113  null:0  0x0            0x04420001           {F at 1,$0} // wr:2+0, rd:4; simd16 sample:u+v+r+ai+mlod using sampler index 0
> +(W)     send.rc (16|M0)          null     r12   null:0  0x0            0x08031400           {EOT,$0} // wr:4+0, rd:0; full-precision render target write SIMD16; last render target to surface 0
> +L96:
> diff --git a/lib/rendercopy.h b/lib/rendercopy.h

> index 0d81d27f83..1a97a72573 100644
> --- a/lib/rendercopy.h
> +++ b/lib/rendercopy.h
> @@ -43,6 +43,10 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
>  			      struct intel_buf *src, uint32_t src_x, uint32_t src_y,
>  			      uint32_t width, uint32_t height,
>  			      struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
> +void genxe2_render_copyfunc(struct intel_bb *ibb,
> +			    struct intel_buf *src, uint32_t src_x, uint32_t src_y,
> +			    uint32_t width, uint32_t height,
> +			    struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
>  void gen12_render_copyfunc(struct intel_bb *ibb,
>  			   struct intel_buf *src, uint32_t src_x, uint32_t src_y,
>  			   uint32_t width, uint32_t height,
> diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
> index 363bc6c1b2..f0efadeb50 100644
> --- a/lib/rendercopy_gen9.c
> +++ b/lib/rendercopy_gen9.c
> @@ -22,6 +22,7 @@
>  #include "intel_mocs.h"
>  #include "rendercopy.h"
>  #include "gen9_render.h"
> +#include "genxe2_render.h"
>  #include "intel_reg.h"
>  #include "igt_aux.h"
>  #include "intel_chipset.h"
> @@ -136,6 +137,15 @@ static const uint32_t gen12p71_render_copy[][4] = {
>  	{ 0x80041131, 0x00000004, 0x50007144, 0x00c40000 },
>  };
>  
> +static const uint32_t xe2_render_copy[][4] = {
> +	{ 0x8010005b, 0x200002a0, 0x020a0634, 0x06040105 },
> +	{ 0x8010005b, 0x710402a8, 0x020a2001, 0x06140105 },
> +	{ 0x8010005b, 0x200002a0, 0x020a0674, 0x06440105 },
> +	{ 0x8010005b, 0x720402a8, 0x020a2001, 0x06540205 },
> +	{ 0x80122031, 0x0c240000, 0x20027114, 0x00800000 },
> +	{ 0x8010c031, 0x00000004, 0x58000c24, 0x00c40000 },
> +};
> +
>  /* Mostly copy+paste from gen6, except height, width, pitch moved */
>  static uint32_t
>  gen9_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst,
> @@ -545,7 +555,10 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
>  	/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
>  	/* The length has to be one less if we dont modify
>  	   bindless state */
> -	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | 20);
> +	else
> +		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
>  
>  	/* general */
>  	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
> @@ -586,6 +599,13 @@ gen9_emit_state_base_address(struct intel_bb *ibb) {
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> +
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20)) {
> +		/* Bindless sampler */
> +		intel_bb_out(ibb, 0);
> +		intel_bb_out(ibb, 0);
> +		intel_bb_out(ibb, 0);
> +	}
>  }
>  
>  static void
> @@ -753,7 +773,10 @@ gen9_emit_ds(struct intel_bb *ibb) {
>  
>  static void
>  gen8_emit_wm_hz_op(struct intel_bb *ibb) {
> -	intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (6-2));
> +	else
> +		intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
> 
Shouldn't we add here intel_bb_out(ibb, 0); as well? You increased lenght of the instruction.
> @@ -852,7 +875,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
>  	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
>  	             GEN6_3DSTATE_WM_16_DISPATCH_ENABLE |
>  	             (fast_clear ? GEN8_3DSTATE_FAST_CLEAR_ENABLE : 0));
> -	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT |
> +			     GENXE_KERNEL0_POLY_PACK16_FIXED << GENXE_KERNEL0_PACKING_POLICY);
Looks sane.
> +	else
> +		intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
>  	intel_bb_out(ibb, 0); // kernel 1
>  	intel_bb_out(ibb, 0); /* kernel 1 hi */
>  	intel_bb_out(ibb, 0); // kernel 2
> @@ -862,7 +889,11 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
>  	intel_bb_out(ibb, GEN8_PS_BLEND_HAS_WRITEABLE_RT);
>  
>  	intel_bb_out(ibb, GEN8_3DSTATE_PS_EXTRA | (2 - 2));
> -	intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
> +
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID);
> +	else
> +		intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
Looking at the spec, meaning of that field has not change. So why do we need to clear this? Was
wrong before?

Overall, it looks good to me, just minor nits spotted.

Regards, 
Dominik 
>  }
>  
>  static void
> @@ -903,6 +934,9 @@ gen9_emit_depth(struct intel_bb *ibb)
>  
>  static void
>  gen7_emit_clear(struct intel_bb *ibb) {
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		return;
> +
>  	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 1); // clear valid
> @@ -911,7 +945,10 @@ gen7_emit_clear(struct intel_bb *ibb) {
>  static void
>  gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
>  {
> -	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
> +	if (AT_LEAST_GEN(intel_get_drm_devid(ibb->fd), 20))
> +		intel_bb_out(ibb, GENXE2_3DSTATE_DRAWING_RECTANGLE_FAST | (4 - 2));
> +	else
> +		intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
>  	intel_bb_out(ibb, 0);
> @@ -1220,6 +1257,19 @@ void gen12p71_render_copyfunc(struct intel_bb *ibb,
>  			sizeof(gen12p71_render_copy));
>  }
>  
> +void genxe2_render_copyfunc(struct intel_bb *ibb,
> +			    struct intel_buf *src, uint32_t src_x, uint32_t src_y,
> +			    uint32_t width, uint32_t height,
> +			    struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y)
> +{
> +	_gen9_render_op(ibb, src, src_x, src_y,
> +			  width, height, dst, dst_x, dst_y,
> +			  NULL,
> +			  NULL,
> +			  xe2_render_copy,
> +			  sizeof(xe2_render_copy));
> +}
> +
>  void mtl_render_copyfunc(struct intel_bb *ibb,
>  			 struct intel_buf *src,
>  			 unsigned int src_x, unsigned int src_y,



More information about the igt-dev mailing list