Mesa (main): radeonsi: replace the GS prolog with a monolithic shader variant

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Oct 18 18:36:12 UTC 2021


Module: Mesa
Branch: main
Commit: 8cf802e8effeaa324fcb1864048cfdaff5c7acfb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8cf802e8effeaa324fcb1864048cfdaff5c7acfb

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sat Oct 16 09:46:06 2021 -0400

radeonsi: replace the GS prolog with a monolithic shader variant

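The GS prolog only exists because of the gfx6-9 hw bug with triangle strips
with adjacency, and it is used very rarely. Let's simplify it: apply the
workaround directly in the main GS part, selected by the monolithic shader
key bit mono.u.gs_tri_strip_adj_fix.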

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13393>

---

 src/gallium/drivers/radeonsi/si_pipe.c            |   2 +-
 src/gallium/drivers/radeonsi/si_pipe.h            |   1 -
 src/gallium/drivers/radeonsi/si_shader.c          |  20 +---
 src/gallium/drivers/radeonsi/si_shader.h          |  14 +--
 src/gallium/drivers/radeonsi/si_shader_internal.h |   4 +-
 src/gallium/drivers/radeonsi/si_shader_llvm.c     |  53 +++++-----
 src/gallium/drivers/radeonsi/si_shader_llvm_gs.c  | 121 +---------------------
 src/gallium/drivers/radeonsi/si_state_draw.cpp    |   4 +-
 8 files changed, 45 insertions(+), 174 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index bb43377bfb6..7d378857377 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -881,7 +881,7 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v
 static void si_destroy_screen(struct pipe_screen *pscreen)
 {
    struct si_screen *sscreen = (struct si_screen *)pscreen;
-   struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs,
+   struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs,
                                      sscreen->ps_prologs, sscreen->ps_epilogs};
    unsigned i;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b6fe302e7d6..72e5e7e5c14 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -622,7 +622,6 @@ struct si_screen {
    simple_mtx_t shader_parts_mutex;
    struct si_shader_part *vs_prologs;
    struct si_shader_part *tcs_epilogs;
-   struct si_shader_part *gs_prologs;
    struct si_shader_part *ps_prologs;
    struct si_shader_part *ps_epilogs;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 655ffecea8b..c8892a31b53 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1207,8 +1207,7 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
           key->ge.part.gs.es->info.stage == MESA_SHADER_VERTEX) {
          si_dump_shader_key_vs(key, &key->ge.part.gs.vs_prolog, "part.gs.vs_prolog", f);
       }
-      fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n",
-              key->ge.part.gs.prolog.tri_strip_adj_fix);
+      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
       fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
       break;
 
@@ -1593,10 +1592,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
       assert(!prolog);
       shader.key.ge.part.tcs.epilog = key->tcs_epilog.states;
       break;
-   case MESA_SHADER_GEOMETRY:
-      assert(prolog);
-      shader.key.ge.as_ngg = key->gs_prolog.as_ngg;
-      break;
    case MESA_SHADER_FRAGMENT:
       if (prolog)
          shader.key.ps.part.prolog = key->ps_prolog.states;
@@ -1719,18 +1714,7 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_
       shader->previous_stage = es_main_part;
    }
 
-   if (!shader->key.ge.part.gs.prolog.tri_strip_adj_fix)
-      return true;
-
-   union si_shader_part_key prolog_key;
-   memset(&prolog_key, 0, sizeof(prolog_key));
-   prolog_key.gs_prolog.states = shader->key.ge.part.gs.prolog;
-   prolog_key.gs_prolog.as_ngg = shader->key.ge.as_ngg;
-
-   shader->prolog2 =
-      si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
-                         compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
-   return shader->prolog2 != NULL;
+   return true;
 }
 
 /**
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ae3e8b1f515..118c37ee5a7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -544,10 +544,6 @@ struct si_tcs_epilog_bits {
    unsigned tes_reads_tess_factors : 1;
 };
 
-struct si_gs_prolog_bits {
-   unsigned tri_strip_adj_fix : 1;
-};
-
 /* Common PS bits between the shader key and the prolog key. */
 struct si_ps_prolog_bits {
    unsigned color_two_side : 1;
@@ -591,10 +587,6 @@ union si_shader_part_key {
    struct {
       struct si_tcs_epilog_bits states;
    } tcs_epilog;
-   struct {
-      struct si_gs_prolog_bits states;
-      unsigned as_ngg : 1;
-   } gs_prolog;
    struct {
       struct si_ps_prolog_bits states;
       unsigned num_input_sgprs : 6;
@@ -633,7 +625,6 @@ struct si_shader_key_ge {
       struct {
          struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
          struct si_shader_selector *es;      /* for merged ES-GS */
-         struct si_gs_prolog_bits prolog;
       } gs;
    } part;
 
@@ -654,9 +645,10 @@ struct si_shader_key_ge {
       union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
 
       union {
-         uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
+         uint64_t ff_tcs_inputs_to_copy; /* fixed-func TCS only */
          /* When PS needs PrimID and GS is disabled. */
-         unsigned vs_export_prim_id : 1;
+         unsigned vs_export_prim_id : 1;    /* VS and TES only */
+         unsigned gs_tri_strip_adj_fix : 1; /* GS only */
       } u;
    } mono;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 106abde1c97..b99ded02a04 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -140,6 +140,9 @@ struct si_shader_context {
 
    struct ac_llvm_compiler *compiler;
 
+   /* GS vertex offsets unpacked with the gfx6-9 tristrip_adj bug workaround. */
+   LLVMValueRef gs_vtx_offset[6];
+
    /* Preloaded descriptors. */
    LLVMValueRef esgs_ring;
    LLVMValueRef gsvs_ring[4];
@@ -236,7 +239,6 @@ LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
 void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi);
 void si_preload_esgs_ring(struct si_shader_context *ctx);
 void si_preload_gs_rings(struct si_shader_context *ctx);
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);
 
 /* si_shader_llvm_tess.c */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index bd3fe0ea326..dd944e7f8b5 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -443,7 +443,32 @@ static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
 
 static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 {
-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+      /* Unpack GS vertex offsets. */
+      for (unsigned i = 0; i < 6; i++) {
+         if (ctx->screen->info.chip_class >= GFX9) {
+            ctx->gs_vtx_offset[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
+         } else {
+            ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[i]);
+         }
+      }
+
+      /* Apply the hw bug workaround for triangle strips with adjacency. */
+      if (ctx->screen->info.chip_class <= GFX9 &&
+          ctx->shader->key.ge.mono.u.gs_tri_strip_adj_fix) {
+         LLVMValueRef prim_id = ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
+         /* Remap GS vertex offsets for every other primitive. */
+         LLVMValueRef rotate = LLVMBuildTrunc(ctx->ac.builder, prim_id, ctx->ac.i1, "");
+         LLVMValueRef fixed[6];
+
+         for (unsigned i = 0; i < 6; i++) {
+            fixed[i] = LLVMBuildSelect(ctx->ac.builder, rotate,
+                                       ctx->gs_vtx_offset[(i + 4) % 6],
+                                       ctx->gs_vtx_offset[i], "");
+         }
+         memcpy(ctx->gs_vtx_offset, fixed, sizeof(fixed));
+      }
+   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
       unsigned colors_read = ctx->shader->selector->info.colors_read;
       LLVMValueRef main_fn = ctx->main_fn;
 
@@ -1205,17 +1230,8 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
          struct si_shader_selector *es = shader->key.ge.part.gs.es;
          LLVMValueRef es_prolog = NULL;
          LLVMValueRef es_main = NULL;
-         LLVMValueRef gs_prolog = NULL;
          LLVMValueRef gs_main = ctx.main_fn;
 
-         /* GS prolog */
-         union si_shader_part_key gs_prolog_key;
-         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
-         gs_prolog_key.gs_prolog.states = shader->key.ge.part.gs.prolog;
-         gs_prolog_key.gs_prolog.as_ngg = shader->key.ge.as_ngg;
-         si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
-         gs_prolog = ctx.main_fn;
-
          /* ES main part */
          struct si_shader shader_es = {};
          shader_es.selector = es;
@@ -1253,28 +1269,17 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
 
          /* Prepare the array of shader parts. */
          LLVMValueRef parts[4];
-         unsigned num_parts = 0, main_part, next_first_part;
+         unsigned num_parts = 0, main_part;
 
          if (es_prolog)
             parts[num_parts++] = es_prolog;
 
          parts[main_part = num_parts++] = es_main;
-         parts[next_first_part = num_parts++] = gs_prolog;
          parts[num_parts++] = gs_main;
 
-         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false);
+         si_build_wrapper_function(&ctx, parts, num_parts, main_part, main_part + 1, false);
       } else {
-         LLVMValueRef parts[2];
-         union si_shader_part_key prolog_key;
-
-         parts[1] = ctx.main_fn;
-
-         memset(&prolog_key, 0, sizeof(prolog_key));
-         prolog_key.gs_prolog.states = shader->key.ge.part.gs.prolog;
-         si_llvm_build_gs_prolog(&ctx, &prolog_key);
-         parts[0] = ctx.main_fn;
-
-         si_build_wrapper_function(&ctx, parts, 2, 1, 0, false);
+         /* Nothing to do for gfx6-8. The shader has only 1 part and it's ctx.main_fn. */
       }
    } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
       si_llvm_build_monolithic_ps(&ctx, shader);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 4a711c80539..0a9f503ddb4 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -56,13 +56,10 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in
 
    /* GFX9 has the ESGS ring in LDS. */
    if (ctx->screen->info.chip_class >= GFX9) {
-      unsigned index = vtx_offset_param;
-      vtx_offset =
-         si_unpack_param(ctx, ctx->args.gs_vtx_offset[index / 2], (index & 1) * 16, 16);
-
       unsigned offset = param * 4 + swizzle;
-      vtx_offset =
-         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
+
+      vtx_offset = LLVMBuildAdd(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
+                                LLVMConstInt(ctx->ac.i32, offset, false), "");
 
       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
       LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
@@ -71,9 +68,8 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in
 
    /* GFX6: input load from the ESGS ring in memory. */
    /* Get the vertex offset parameter on GFX6. */
-   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[vtx_offset_param]);
-
-   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+   vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
+                             LLVMConstInt(ctx->ac.i32, 4, 0), "");
 
    soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
 
@@ -545,113 +541,6 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
    return shader;
 }
 
-/**
- * Build the GS prolog function. Rotate the input vertices for triangle strips
- * with adjacency.
- */
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
-{
-   unsigned num_sgprs, num_vgprs;
-   LLVMBuilderRef builder = ctx->ac.builder;
-   LLVMTypeRef returns[AC_MAX_ARGS];
-   LLVMValueRef func, ret;
-
-   memset(&ctx->args, 0, sizeof(ctx->args));
-
-   if (ctx->screen->info.chip_class >= GFX9) {
-      /* Other user SGPRs are not needed by GS. */
-      num_sgprs = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
-      num_vgprs = 5; /* ES inputs are not needed by GS */
-   } else {
-      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-      num_vgprs = 8;
-   }
-
-   for (unsigned i = 0; i < num_sgprs; ++i) {
-      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-      returns[i] = ctx->ac.i32;
-   }
-
-   for (unsigned i = 0; i < num_vgprs; ++i) {
-      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
-      returns[num_sgprs + i] = ctx->ac.f32;
-   }
-
-   /* Create the function. */
-   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
-   func = ctx->main_fn;
-
-   /* Copy inputs to outputs. This should be no-op, as the registers match,
-    * but it will prevent the compiler from overwriting them unintentionally.
-    */
-   ret = ctx->return_value;
-   for (unsigned i = 0; i < num_sgprs; i++) {
-      LLVMValueRef p = LLVMGetParam(func, i);
-      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
-   }
-   for (unsigned i = 0; i < num_vgprs; i++) {
-      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
-      p = ac_to_float(&ctx->ac, p);
-      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
-   }
-
-   if (key->gs_prolog.states.tri_strip_adj_fix) {
-      /* Remap the input vertices for every other primitive. */
-      const struct ac_arg gfx6_vtx_params[6] = {
-         {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
-         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
-         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
-      };
-      const struct ac_arg gfx9_vtx_params[3] = {
-         {.used = true, .arg_index = num_sgprs},
-         {.used = true, .arg_index = num_sgprs + 1},
-         {.used = true, .arg_index = num_sgprs + 4},
-      };
-      LLVMValueRef vtx_in[6], vtx_out[6];
-      LLVMValueRef prim_id, rotate;
-
-      if (ctx->screen->info.chip_class >= GFX9) {
-         for (unsigned i = 0; i < 3; i++) {
-            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
-            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
-         }
-      } else {
-         for (unsigned i = 0; i < 6; i++)
-            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
-      }
-
-      prim_id = LLVMGetParam(func, num_sgprs + 2);
-      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
-
-      for (unsigned i = 0; i < 6; ++i) {
-         LLVMValueRef base, rotated;
-         base = vtx_in[i];
-         rotated = vtx_in[(i + 4) % 6];
-         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
-      }
-
-      if (ctx->screen->info.chip_class >= GFX9) {
-         for (unsigned i = 0; i < 3; i++) {
-            LLVMValueRef hi, out;
-
-            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
-            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
-            out = ac_to_float(&ctx->ac, out);
-            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
-         }
-      } else {
-         for (unsigned i = 0; i < 6; i++) {
-            LLVMValueRef out;
-
-            out = ac_to_float(&ctx->ac, vtx_out[i]);
-            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
-         }
-      }
-   }
-
-   LLVMBuildRet(builder, ret);
-}
-
 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
 {
    ctx->abi.load_inputs = si_nir_load_input_gs;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index fd863e26338..896abcf9e80 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2133,8 +2133,8 @@ static void si_draw(struct pipe_context *ctx,
       bool gs_tri_strip_adj_fix =
          !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
 
-      if (gs_tri_strip_adj_fix != sctx->shader.gs.key.ge.part.gs.prolog.tri_strip_adj_fix) {
-         sctx->shader.gs.key.ge.part.gs.prolog.tri_strip_adj_fix = gs_tri_strip_adj_fix;
+      if (gs_tri_strip_adj_fix != sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix) {
+         sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
          sctx->do_update_shaders = true;
       }
    }



More information about the mesa-commit mailing list