Mesa (master): turnip,freedreno/a6xx: tell hw the size of shared mem used by CS

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Feb 19 18:48:06 UTC 2021


Module: Mesa
Branch: master
Commit: 0fa7ec14734a640858e7f4047ffab78f71272ece
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0fa7ec14734a640858e7f4047ffab78f71272ece

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Mon Feb 15 13:14:56 2021 +0200

turnip,freedreno/a6xx: tell hw the size of shared mem used by CS

Before, SP_CS_UNKNOWN_A9B1 was hard-coded to 0x41, so we only used 2k of
shared memory.

It was found that the lower 5 bits of SP_CS_UNKNOWN_A9B1 control
the available size of shared memory for compute shaders, with
AVAILABLE_SIZE = (SP_CS_UNKNOWN_A9B1_SHARED_SIZE + 1) * 1k
up to 32k. As a special case, setting SP_CS_UNKNOWN_A9B1_SHARED_SIZE
to zero enables all 32k of shared memory rather than 1k.

Fixes tests:
 dEQP-VK.rasterization.line_continuity.line-strip
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp
 dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9157>

---

 ci-expects/freedreno/deqp-freedreno-a630-fails.txt |  4 ----
 src/freedreno/.gitlab-ci/reference/crash.log       |  4 ++--
 src/freedreno/ir3/ir3_compiler_nir.c               |  1 +
 src/freedreno/ir3/ir3_shader.h                     |  3 +++
 src/freedreno/registers/adreno/a6xx.xml            | 25 +++++++++++++---------
 src/freedreno/vulkan/tu_pipeline.c                 |  4 +++-
 src/gallium/drivers/freedreno/a6xx/fd6_compute.c   |  4 +++-
 7 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/ci-expects/freedreno/deqp-freedreno-a630-fails.txt b/ci-expects/freedreno/deqp-freedreno-a630-fails.txt
index eb6f905a3ca..f4156ff565c 100644
--- a/ci-expects/freedreno/deqp-freedreno-a630-fails.txt
+++ b/ci-expects/freedreno/deqp-freedreno-a630-fails.txt
@@ -49,9 +49,6 @@ dEQP-VK.image.subresource_layout.3d.all_levels.a8b8g8r8_snorm_pack32,Fail
 dEQP-VK.image.subresource_layout.3d.all_levels.r16g16b16a16_snorm,Fail
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp,Fail
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.image.comp,Fail
-dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp,Fail
-dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp,Fail
-dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp,Fail
 dEQP-VK.memory.requirements.dedicated_allocation.buffer.regular,Fail
 dEQP-VK.memory.requirements.dedicated_allocation.image.transient_tiling_optimal,Fail
 dEQP-VK.pipeline.extended_dynamic_state.after_pipelines.depth_compare_greater_equal_greater,Fail
@@ -66,7 +63,6 @@ dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_combined_image_sampl
 dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampled_image,Crash
 dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampler,Crash
 dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_storage_image,Crash
-dEQP-VK.rasterization.line_continuity.line-strip,Fail
 dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail
 dEQP-VK.spirv_assembly.instruction.compute.opquantize.infinities,Fail
 dEQP-VK.spirv_assembly.instruction.graphics.opquantize.carry_bit_geom,Fail
diff --git a/src/freedreno/.gitlab-ci/reference/crash.log b/src/freedreno/.gitlab-ci/reference/crash.log
index 3ca89220612..8389724b5bd 100644
--- a/src/freedreno/.gitlab-ci/reference/crash.log
+++ b/src/freedreno/.gitlab-ci/reference/crash.log
@@ -7412,7 +7412,7 @@ clusters:
 	00000080	SP_FS_TEX_COUNT: 128
 	0000f000	SP_UNKNOWN_A9A8: 0xf000
 	00421800	SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING }
-	0000001f	SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf }
+	0000001f	SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 }
 	00000000	SP_CS_BRANCH_COND: 0
 	00000000	SP_CS_OBJ_FIRST_EXEC_OFFSET: 0
 	8c415420	SP_CS_OBJ_START: 0x8c415420
@@ -7494,7 +7494,7 @@ clusters:
 	00000080	SP_FS_TEX_COUNT: 128
 	0000f000	SP_UNKNOWN_A9A8: 0xf000
 	00421800	SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING }
-	0000001f	SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf }
+	0000001f	SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 }
 	00000000	SP_CS_BRANCH_COND: 0
 	00000000	SP_CS_OBJ_FIRST_EXEC_OFFSET: 0
 	8c415420	SP_CS_OBJ_START: 0x8c415420
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 0694bd86018..7c06bf34553 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -3413,6 +3413,7 @@ emit_instructions(struct ir3_context *ctx)
 		ctx->s->info.clip_distance_array_size;
 
 	ctx->so->pvtmem_size = ctx->s->scratch_size;
+	ctx->so->shared_size = ctx->s->shared_size;
 
 	/* NOTE: need to do something more clever when we support >1 fxn */
 	nir_foreach_register (reg, &fxn->registers) {
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index d28567b0a60..5d2aaab1550 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -559,6 +559,9 @@ struct ir3_shader_variant {
 	/* Whether we should use the new per-wave layout rather than per-fiber. */
 	bool pvtmem_per_wave;
 
+	/* Size in bytes of required shared memory */
+	unsigned shared_size;
+
 	/* About Linkage:
 	 *   + Let the frag shader determine the position/compmask for the
 	 *     varyings, since it is the place where we know if the varying
diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml
index 6a50378ad2b..f1bac710944 100644
--- a/src/freedreno/registers/adreno/a6xx.xml
+++ b/src/freedreno/registers/adreno/a6xx.xml
@@ -3059,17 +3059,22 @@ to upconvert to 32b float internally?
 
 
 	<reg32 offset="0xa9b0" name="SP_CS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0"/>
-	<!-- set for compute shaders, always 0x41 -->
+
+	<!-- set for compute shaders -->
 	<reg32 offset="0xa9b1" name="SP_CS_UNKNOWN_A9B1">
-		<doc>
-			bit 0 seems to toggle between 2k and 32k of shared storage
-			the ldl/stl offset seems to be rewritten to 0 when it is beyond
-			this limit. This is different from ldlw/stlw, which wraps at
-			64k (and has 36k of storage on A640 - reads between 36k-64k
-			always return 0)
-		</doc>
-		<bitfield name="SHARED_SIZE_2K" pos="0" type="boolean"/>
-		<bitfield name="UNK1" low="1" high="6" type="uint"/>
+		<bitfield name="SHARED_SIZE" low="0" high="4" type="uint">
+			<doc>
+				If 0 - all 32k of shared storage is enabled, otherwise
+				(SHARED_SIZE + 1) * 1k is enabled.
+				The ldl/stl offset seems to be rewritten to 0 when it is beyond
+				this limit. This is different from ldlw/stlw, which wraps at
+				64k (and has 36k of storage on A640 - reads between 36k-64k
+				always return 0)
+			</doc>
+		</bitfield>
+		<bitfield name="UNK5" pos="5" type="boolean"/>
+		<!-- always 1 ? -->
+		<bitfield name="UNK6" pos="6" type="boolean"/>
 	</reg32>
 	<reg32 offset="0xa9b2" name="SP_CS_BRANCH_COND" type="hex"/>
 	<reg32 offset="0xa9b3" name="SP_CS_OBJ_FIRST_EXEC_OFFSET" type="uint"/>
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 3135311a108..1389f8ac134 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -532,8 +532,10 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
 
    tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
 
+   uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
-   tu_cs_emit(cs, 0x41);
+   tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
+                  A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
 
    uint32_t local_invocation_id =
       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
index 909b6780ebc..e6b42a84cc4 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
@@ -77,8 +77,10 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
 			COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
 
+	uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
 	OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
-	OUT_RING(ring, 0x41);
+	OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
+			A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
 
 	uint32_t local_invocation_id, work_group_id;
 	local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);



More information about the mesa-commit mailing list