Mesa (master): radeonsi: adjust tess SGPRs to allow fully occupied 3 HS waves of triangles
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Nov 23 03:15:53 UTC 2020
Module: Mesa
Branch: master
Commit: 9b5b5cbc53430d39993db5fb3bcd9f99ed006f2c
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9b5b5cbc53430d39993db5fb3bcd9f99ed006f2c
Author: Marek Olšák <marek.olsak at amd.com>
Date: Thu Nov 12 22:07:56 2020 -0500
radeonsi: adjust tess SGPRs to allow fully occupied 3 HS waves of triangles
With triangles and 3 HS waves, 3 lanes were unoccupied. Adjust the SGPR
encoding to allow 1 more triangle to fit there.
Some of the fields are not large enough, but they weren't large enough
before either.
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7623>
---
src/gallium/drivers/radeonsi/si_shader_internal.h | 18 +++++++++---------
src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 6 ++++--
src/gallium/drivers/radeonsi/si_state_draw.c | 7 +++++--
3 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 8649a78db5c..6722e581415 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -117,25 +117,25 @@ struct si_shader_context {
/* API TCS & TES */
/* Layout of TCS outputs in the offchip buffer
* # 6 bits
- * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
- * # 6 bits
- * [6:11] = the number of output vertices per patch, max = 32
- * # 20 bits
- * [12:31] = the offset of per patch attributes in the buffer in bytes.
- * max = NUM_PATCHES*32*32*16
+ * [0:5] = the number of patches per threadgroup - 1, max = 63
+ * # 5 bits
+ * [6:10] = the number of output vertices per patch - 1, max = 31
+ * # 21 bits
+ * [11:31] = the offset of per patch attributes in the buffer in bytes.
+ * max = NUM_PATCHES*32*32*16 = 1M
*/
struct ac_arg tcs_offchip_layout;
/* API TCS */
/* Offsets where TCS outputs and TCS patch outputs live in LDS:
- * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+ * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 = 64K (TODO: not enough bits)
* [16:31] = TCS output patch0 offset for per-patch / 16
- * max = (NUM_PATCHES + 1) * 32*32
+ * max = (NUM_PATCHES + 1) * 32*32 = 66560 (TODO: not enough bits)
*/
struct ac_arg tcs_out_lds_offsets;
/* Layout of TCS outputs / TES inputs:
* [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
- * max = 32*32*4 + 32*4
+ * max = 32*32*4 + 32*4 = 4224
* [13:18] = gl_PatchVerticesIn, max = 32
* [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
*/
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index 97e5db45955..1ee6ffd547e 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -145,7 +145,8 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices)
return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
- return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
+ return LLVMBuildAdd(ctx->ac.builder,
+ si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 5), ctx->ac.i32_1, "");
}
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
@@ -220,6 +221,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
vertices_per_patch = get_num_tcs_out_vertices(ctx);
num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
+ num_patches = LLVMBuildAdd(ctx->ac.builder, num_patches, ctx->ac.i32_1, "");
total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, "");
constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
@@ -235,7 +237,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
if (!vertex_index) {
- LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
+ LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 11, 21);
base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
}
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 4c739eeef2d..ef5eae2cc47 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -161,7 +161,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
* The hardware can do more, but the radeonsi shader constant is
* limited to 6 bits.
*/
- *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
+ *num_patches = MIN2(*num_patches, 64); /* triangles: 3 full waves */
/* When distributed tessellation is unsupported, switch between SEs
* at a higher frequency to compensate for it.
@@ -214,6 +214,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
assert(num_tcs_input_cp <= 32);
assert(num_tcs_output_cp <= 32);
+ assert(*num_patches <= 64);
+ assert(((pervertex_output_patch_size * *num_patches) & ~0x1fffff) == 0);
uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(sctx->gfx_cs)) ?
si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
@@ -224,7 +226,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
offchip_layout =
- *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12);
+ (*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
+ ((pervertex_output_patch_size * *num_patches) << 11);
/* Compute the LDS size. */
lds_size = output_patch0_offset + output_patch_size * *num_patches;
More information about the mesa-commit
mailing list