Mesa (master): radeonsi: adjust tess SGPRs to allow fully occupied 3 HS waves of triangles

Mon Nov 23 03:15:53 UTC 2020

Module: Mesa
Branch: master
Commit: 9b5b5cbc53430d39993db5fb3bcd9f99ed006f2c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9b5b5cbc53430d39993db5fb3bcd9f99ed006f2c

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Thu Nov 12 22:07:56 2020 -0500

radeonsi: adjust tess SGPRs to allow fully occupied 3 HS waves of triangles

With triangles and 3 HS waves, 3 lanes were unoccupied. Adjust the SGPR
encoding to allow 1 more triangle to fit there.

Some of the fields are not large enough, but they weren't large enough
before either.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7623>

---

 src/gallium/drivers/radeonsi/si_shader_internal.h  | 18 +++++++++---------
 src/gallium/drivers/radeonsi/si_shader_llvm_tess.c |  6 ++++--
 src/gallium/drivers/radeonsi/si_state_draw.c       |  7 +++++--
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 8649a78db5c..6722e581415 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -117,25 +117,25 @@ struct si_shader_context {
    /* API TCS & TES */
    /* Layout of TCS outputs in the offchip buffer
     * # 6 bits
-    *   [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
-    * # 6 bits
-    *   [6:11] = the number of output vertices per patch, max = 32
-    * # 20 bits
-    *   [12:31] = the offset of per patch attributes in the buffer in bytes.
-    *             max = NUM_PATCHES*32*32*16
+    *   [0:5] = the number of patches per threadgroup - 1, max = 63
+    * # 5 bits
+    *   [6:10] = the number of output vertices per patch - 1, max = 31
+    * # 21 bits
+    *   [11:31] = the offset of per patch attributes in the buffer in bytes.
+    *             max = NUM_PATCHES*32*32*16 = 1M
     */
    struct ac_arg tcs_offchip_layout;
 
    /* API TCS */
    /* Offsets where TCS outputs and TCS patch outputs live in LDS:
-    *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+    *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 = 64K (TODO: not enough bits)
     *   [16:31] = TCS output patch0 offset for per-patch / 16
-    *             max = (NUM_PATCHES + 1) * 32*32
+    *             max = (NUM_PATCHES + 1) * 32*32 = 66560 (TODO: not enough bits)
     */
    struct ac_arg tcs_out_lds_offsets;
    /* Layout of TCS outputs / TES inputs:
     *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
-    *            max = 32*32*4 + 32*4
+    *            max = 32*32*4 + 32*4 = 4224
     *   [13:18] = gl_PatchVerticesIn, max = 32
     *   [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
     */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index 97e5db45955..1ee6ffd547e 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -145,7 +145,8 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
    if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices)
       return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
 
-   return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
+   return LLVMBuildAdd(ctx->ac.builder,
+                       si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 5), ctx->ac.i32_1, "");
 }
 
 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
@@ -220,6 +221,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 
    vertices_per_patch = get_num_tcs_out_vertices(ctx);
    num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
+   num_patches = LLVMBuildAdd(ctx->ac.builder, num_patches, ctx->ac.i32_1, "");
    total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, "");
 
    constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
@@ -235,7 +237,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
    base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 
    if (!vertex_index) {
-      LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
+      LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 11, 21);
 
       base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
    }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 4c739eeef2d..ef5eae2cc47 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -161,7 +161,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
     * The hardware can do more, but the radeonsi shader constant is
     * limited to 6 bits.
     */
-   *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
+   *num_patches = MIN2(*num_patches, 64); /* triangles: 3 full waves */
 
    /* When distributed tessellation is unsupported, switch between SEs
     * at a higher frequency to compensate for it.
@@ -214,6 +214,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
    assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
    assert(num_tcs_input_cp <= 32);
    assert(num_tcs_output_cp <= 32);
+   assert(*num_patches <= 64);
+   assert(((pervertex_output_patch_size * *num_patches) & ~0x1fffff) == 0);
 
    uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(sctx->gfx_cs)) ?
       si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
@@ -224,7 +226,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pip
    tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
    tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
    offchip_layout =
-      *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12);
+      (*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
+      ((pervertex_output_patch_size * *num_patches) << 11);
 
    /* Compute the LDS size. */
    lds_size = output_patch0_offset + output_patch_size * *num_patches;