Mesa (main): freedreno/a6xx: Add EARLYPREAMBLE flag to all a6xx_sp_xs_ctrl_reg0

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed May 18 11:38:53 UTC 2022


Module: Mesa
Branch: main
Commit: 5d377f435b4e64762ce706f6082005e974b894ee
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=5d377f435b4e64762ce706f6082005e974b894ee

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Tue Apr 12 21:04:35 2022 +0300

freedreno/a6xx: Add EARLYPREAMBLE flag to all a6xx_sp_xs_ctrl_reg0

Each shader stage has its own "early preamble" flag.

Early preamble is likely an optimization to hide some of the latency
when loading UBOs into consts in the preamble.

Early preamble has the following limitations:
- Only shared, a1, and consts regs could be used
  (accessing other regs would result in GPU fault);
- No cat5/cat6, only stc/ldc variants are working;
- Values written to shared regs are not accessible by the rest
  of the shader;
- Instructions before shps are also considered to be a part of
  early preamble.

Note: for all shaders from d3d11 games, the blob produced preambles
compatible with early preamble mode.

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15901>

---

 src/freedreno/computerator/a6xx.c                  |  1 +
 .../computerator/examples/early_preamble.asm       | 25 ++++++++++++++
 src/freedreno/ir3/ir3_assembler.h                  |  1 +
 src/freedreno/ir3/ir3_lexer.l                      |  1 +
 src/freedreno/ir3/ir3_parser.y                     |  4 +++
 src/freedreno/registers/adreno/a6xx.xml            | 40 +++++++++++++---------
 6 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/src/freedreno/computerator/a6xx.c b/src/freedreno/computerator/a6xx.c
index a0ce6f986da..59c3ebfc196 100644
--- a/src/freedreno/computerator/a6xx.c
+++ b/src/freedreno/computerator/a6xx.c
@@ -153,6 +153,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
                A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
                A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
                COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
+               COND(ir3_kernel->info.early_preamble, A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE) |
                A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
 
    OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
diff --git a/src/freedreno/computerator/examples/early_preamble.asm b/src/freedreno/computerator/examples/early_preamble.asm
new file mode 100644
index 00000000000..717b374deb6
--- /dev/null
+++ b/src/freedreno/computerator/examples/early_preamble.asm
@@ -0,0 +1,25 @@
+@localsize 1, 1, 1
+@buf 4  ; g[0]
+@invocationid(r0.x) ; r0.xyz
+@const(c0.x)  0.0, 0.0, 0.0, 0.0
+@earlypreamble
+
+shps #l_preamble_end
+getone #l_preamble_end
+
+mov.u32u32 r48.x, 1
+mov.u32u32 r48.y, 2
+mov.u32u32 r48.z, 3
+mov.u32u32 r48.w, 4
+(rpt5)nop
+stc.u32 c[0], r48.x, 4
+
+(sy)(ss)shpe
+
+l_preamble_end:
+(jp)nop
+
+(rpt3)mov.u32u32 r1.x, (r)c0.x
+(rpt5)nop
+stib.b.untyped.1d.u32.4.imm r1.x, r0.x, 0
+end
diff --git a/src/freedreno/ir3/ir3_assembler.h b/src/freedreno/ir3/ir3_assembler.h
index 5ff28242c80..328915e1ae9 100644
--- a/src/freedreno/ir3/ir3_assembler.h
+++ b/src/freedreno/ir3/ir3_assembler.h
@@ -37,6 +37,7 @@ struct ir3_kernel_info {
    /* driver-param / replaced uniforms: */
    unsigned numwg;
    unsigned wgid;
+   unsigned early_preamble;
 };
 
 struct ir3_shader;
diff --git a/src/freedreno/ir3/ir3_lexer.l b/src/freedreno/ir3/ir3_lexer.l
index 52b97789645..07843a84881 100644
--- a/src/freedreno/ir3/ir3_lexer.l
+++ b/src/freedreno/ir3/ir3_lexer.l
@@ -107,6 +107,7 @@ static int parse_reg(const char *str)
 "@out"                            return TOKEN(T_A_OUT);
 "@tex"                            return TOKEN(T_A_TEX);
 "@pvtmem"                         return TOKEN(T_A_PVTMEM);
+"@earlypreamble"                  return TOKEN(T_A_EARLYPREAMBLE);
 "(sy)"                            return TOKEN(T_SY);
 "(ss)"                            return TOKEN(T_SS);
 "(absneg)"                        return TOKEN(T_ABSNEG);
diff --git a/src/freedreno/ir3/ir3_parser.y b/src/freedreno/ir3/ir3_parser.y
index c306f9a7a6f..d2fb5753d7a 100644
--- a/src/freedreno/ir3/ir3_parser.y
+++ b/src/freedreno/ir3/ir3_parser.y
@@ -334,6 +334,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_A_OUT
 %token <tok> T_A_TEX
 %token <tok> T_A_PVTMEM
+%token <tok> T_A_EARLYPREAMBLE
 /* todo, re-add @sampler/@uniform/@varying if needed someday */
 
 /* src register flags */
@@ -701,6 +702,7 @@ header:            localsize_header
 |                  out_header
 |                  tex_header
 |                  pvtmem_header
+|                  earlypreamble_header
 
 const_val:         T_FLOAT   { $$ = fui($1); }
 |                  T_INT     { $$ = $1;      }
@@ -767,6 +769,8 @@ branchstack_header: T_A_BRANCHSTACK const_val { variant->branchstack = $2; }
 
 pvtmem_header: T_A_PVTMEM const_val { variant->pvtmem_size = $2; }
 
+earlypreamble_header: T_A_EARLYPREAMBLE { info->early_preamble = 1; }
+
 /* Stubs for now */
 in_header:         T_A_IN '(' T_REGISTER ')' T_IDENTIFIER '(' T_IDENTIFIER '=' integer ')' { }
 
diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml
index 22a9961755d..e8e6f8accee 100644
--- a/src/freedreno/registers/adreno/a6xx.xml
+++ b/src/freedreno/registers/adreno/a6xx.xml
@@ -2887,8 +2887,22 @@ to upconvert to 32b float internally?
 		GS must have the same mergedregs setting as VS.
 		-->
 		<bitfield name="MERGEDREGS" pos="20" type="boolean"/>
-		<!-- ??? (blob has it set) -->
-		<bitfield name="UNK21" pos="21" type="boolean"/>
+		<!--
+		Creates a separate preamble-only thread?
+
+		Early preamble has the following limitations:
+		- Only shared, a1, and consts regs could be used
+		  (accessing other regs would result in GPU fault);
+		- No cat5/cat6, only stc/ldc variants are working;
+		- Values written to shared regs are not accessible by the rest
+		  of the shader;
+		- Instructions before shps are also considered to be a part of
+		  early preamble;
+
+		Note, for all shaders from d3d11 games blob produced preambles
+		compatible with early preamble mode.
+		-->
+		<bitfield name="EARLYPREAMBLE" pos="21" type="boolean"/>
 	</reg32>
 	<!-- bitmask of true/false conditions for VS brac.N instructions,
 	     bit N corresponds to brac.N -->
@@ -3001,11 +3015,8 @@ to upconvert to 32b float internally?
 	<reg32 offset="0xa825" name="SP_VS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset"/>
 
 	<reg32 offset="0xa830" name="SP_HS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
-		<!--
-		There is no mergedregs bit, that comes from the VS.
-		No idea what this bit does here.
-		-->
-		<bitfield name="UNK20" pos="20" type="boolean"/>
+		<!-- There is no mergedregs bit, that comes from the VS. -->
+		<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
 	</reg32>
 	<!--
 	Total size of local storage in dwords divided by the wave size.
@@ -3029,7 +3040,7 @@ to upconvert to 32b float internally?
 
 	<reg32 offset="0xa840" name="SP_DS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
 		<!-- There is no mergedregs bit, that comes from the VS. -->
-		<bitfield name="UNK20" pos="20" type="boolean"/> <!-- something preamble-related -->
+		<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
 	</reg32>
 	<reg32 offset="0xa841" name="SP_DS_BRANCH_COND" type="hex"/>
 
@@ -3064,11 +3075,8 @@ to upconvert to 32b float internally?
 	<reg32 offset="0xa865" name="SP_DS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset"/>
 
 	<reg32 offset="0xa870" name="SP_GS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
-		<!--
-		There is no mergedregs bit, that comes from the VS.
-		No idea what this bit does here.
-		-->
-		<bitfield name="UNK20" pos="20" type="boolean"/>
+		<!-- There is no mergedregs bit, that comes from the VS. -->
+		<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
 	</reg32>
 	<reg32 offset="0xa871" name="SP_GS_PRIM_SIZE" low="0" high="7" type="uint">
 		<doc>
@@ -3137,7 +3145,8 @@ to upconvert to 32b float internally?
 		<bitfield name="UNK24" pos="24" type="boolean"/>
 		<bitfield name="UNK25" pos="25" type="boolean"/>
 		<bitfield name="PIXLODENABLE" pos="26" type="boolean"/>
-		<bitfield name="UNK27" low="27" high="28"/>
+		<bitfield name="UNK27" pos="27" type="boolean"/>
+		<bitfield name="EARLYPREAMBLE" pos="28" type="boolean"/>
 		<bitfield name="MERGEDREGS" pos="31" type="boolean"/>
 	</reg32>
 	<reg32 offset="0xa981" name="SP_FS_BRANCH_COND" type="hex"/>
@@ -3249,8 +3258,7 @@ to upconvert to 32b float internally?
 		<bitfield name="UNK21" pos="21" type="boolean"/>
 		<!-- has a small impact on performance, not clear what it does -->
 		<bitfield name="UNK22" pos="22" type="boolean"/>
-		<!-- creates a separate prolog-only thread? -->
-		<bitfield name="SEPARATEPROLOG" pos="23" type="boolean"/>
+		<bitfield name="EARLYPREAMBLE" pos="23" type="boolean"/>
 		<bitfield name="MERGEDREGS" pos="31" type="boolean"/>
 	</reg32>
 



More information about the mesa-commit mailing list