Mesa (main): radeonsi: cull against clip planes, clipvertex, clip/cull distances in shader

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Nov 16 20:00:21 UTC 2021


Module: Mesa
Branch: main
Commit: 513bd6acca866e82b54bf419d5ce20b36e2c5f21
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=513bd6acca866e82b54bf419d5ce20b36e2c5f21

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Nov  7 16:43:13 2021 -0500

radeonsi: cull against clip planes, clipvertex, clip/cull distances in shader

The downside is that the shader code for clip/cull distances is duplicated:
it now appears in both the position and the parameter portions of the shader.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13811>

---

 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 101 ++++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_pipe.h          |   2 +-
 src/gallium/drivers/radeonsi/si_shader.h        |   4 +-
 src/gallium/drivers/radeonsi/si_state.c         |   6 +-
 src/gallium/drivers/radeonsi/si_state.h         |   6 +-
 src/gallium/drivers/radeonsi/si_state_draw.cpp  |   4 +-
 6 files changed, 109 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 6f6f769dce5..17a3747db29 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -554,12 +554,14 @@ enum
    /* Byte 0: Boolean ES thread accepted (unculled) flag.
     * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
     * Byte 2: TES rel patch ID
-    * Byte 3: Unused
+    * Byte 3: 8-bit clip distance mask: 1 means the clip distance is negative.
+    *         The mask from all vertices is AND'ed. If the result is non-zero,
+    *         the primitive is culled.
     */
    lds_byte0_accept_flag = 0,
    lds_byte1_new_thread_id,
    lds_byte2_tes_rel_patch_id,
-   lds_byte3_unused,
+   lds_byte3_clipdist_neg_mask,
 
    lds_packed_data = 0, /* lds_byteN_... */
    lds_pos_cull_x_div_w,
@@ -804,6 +806,37 @@ static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValue
    ac_build_endif(&ctx->ac, 0);
 }
 
+static void add_clipdist_bit(struct si_shader_context *ctx, LLVMValueRef distance, unsigned i,
+                             LLVMValueRef *packed_data)
+{
+   LLVMValueRef neg = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, distance, ctx->ac.f32_0, "");
+   neg = LLVMBuildZExt(ctx->ac.builder, neg, ctx->ac.i32, "");
+   /* Put the negative distance flag into lds_byte3_clipdist_neg_mask. */
+   neg = LLVMBuildShl(ctx->ac.builder, neg, LLVMConstInt(ctx->ac.i32, 24 + i, 0), "");
+   *packed_data = LLVMBuildOr(ctx->ac.builder, *packed_data, neg, "");
+}
+
+static bool add_clipdist_bits_for_clipvertex(struct si_shader_context *ctx,
+                                             unsigned clipdist_enable,
+                                             LLVMValueRef clipvertex[4],
+                                             LLVMValueRef *packed_data)
+{
+   struct ac_export_args clipdist[2];
+   bool added = false;
+
+   si_llvm_clipvertex_to_clipdist(ctx, clipdist, clipvertex);
+
+   for (unsigned j = 0; j < 8; j++) {
+      if (!(clipdist_enable & BITFIELD_BIT(j)))
+         continue;
+
+      LLVMValueRef distance = clipdist[j / 4].out[j % 4];
+      add_clipdist_bit(ctx, distance, j, packed_data);
+      added = true;
+   }
+   return added;
+}
+
 /**
  * Cull primitives for NGG VS or TES, then compact vertices, which happens
  * before the VS or TES main function. Return values for the main function.
@@ -826,10 +859,16 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
           (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es));
 
    LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+   LLVMValueRef packed_data = ctx->ac.i32_0;
+   LLVMValueRef position[4] = {};
    unsigned pos_index = 0;
+   unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(shader->key.ge.opt.ngg_culling);
+   unsigned clipdist_enable = (sel->clipdist_mask & clip_plane_enable) | sel->culldist_mask;
+   bool has_clipdist_mask = false;
 
    for (unsigned i = 0; i < info->num_outputs; i++) {
-      LLVMValueRef position[4];
+      LLVMValueRef clipvertex[4];
+      unsigned base;
 
       switch (info->output_semantic[i]) {
       case VARYING_SLOT_POS:
@@ -862,12 +901,45 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
                ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
          }
          break;
+
+      case VARYING_SLOT_CLIP_DIST0:
+      case VARYING_SLOT_CLIP_DIST1:
+         base = info->output_semantic[i] == VARYING_SLOT_CLIP_DIST1 ? 4 : 0;
+
+         for (unsigned j = 0; j < 4; j++) {
+            unsigned index = base + j;
+
+            if (!(clipdist_enable & BITFIELD_BIT(index)))
+               continue;
+
+            LLVMValueRef distance = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+            add_clipdist_bit(ctx, distance, index, &packed_data);
+            has_clipdist_mask = true;
+         }
+         break;
+
+      case VARYING_SLOT_CLIP_VERTEX:
+         for (unsigned j = 0; j < 4; j++)
+            clipvertex[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+
+         if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, clipvertex, &packed_data))
+            has_clipdist_mask = true;
+         break;
       }
    }
 
+   if (clip_plane_enable && !sel->clipdist_mask) {
+      /* When clip planes are enabled and there are no clip distance outputs,
+       * we should use user clip planes and cull against the position.
+       */
+      assert(!has_clipdist_mask);
+      if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, position, &packed_data))
+         has_clipdist_mask = true;
+   }
+
    /* Initialize the packed data. */
    LLVMBuildStore(
-      builder, ctx->ac.i32_0,
+      builder, packed_data,
       ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
    ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
    ac_build_s_barrier(&ctx->ac);
@@ -950,6 +1022,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
    {
       /* Load positions. */
       LLVMValueRef pos[3][4] = {};
+      LLVMValueRef clipdist_neg_mask = NULL;
+
       for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
          for (unsigned chan = 0; chan < 4; chan++) {
             unsigned index;
@@ -965,8 +1039,25 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
             pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
             pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
          }
+
+         if (has_clipdist_mask) {
+            /* Load and AND clip distance masks. Each bit means whether that clip distance is
+             * negative. If all masks are AND'ed and the result is 0, the primitive isn't culled
+             * by clip distances.
+             */
+            LLVMValueRef addr = si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte3_clipdist_neg_mask);
+            LLVMValueRef mask = LLVMBuildLoad(builder, addr, "");
+            if (!clipdist_neg_mask)
+               clipdist_neg_mask = mask;
+            else
+               clipdist_neg_mask = LLVMBuildAnd(builder, clipdist_neg_mask, mask, "");
+         }
       }
 
+      LLVMValueRef clipdist_accepted =
+         has_clipdist_mask ? LLVMBuildICmp(builder, LLVMIntEQ, clipdist_neg_mask, ctx->ac.i8_0, "")
+                           : ctx->ac.i1true;
+
       LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL;
       LLVMValueRef clip_half_line_width[2] = {};
 
@@ -1020,7 +1111,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
          gs_accepted,
          (void*)gs_vtxptr,
       };
-      ac_cull_primitive(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
+      ac_cull_primitive(&ctx->ac, pos, clipdist_accepted, vp_scale, vp_translate,
                         small_prim_precision, clip_half_line_width,
                         &options, gfx10_build_primitive_accepted, params);
    }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d5d88a4a9a2..e695b2a8d2b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1134,7 +1134,7 @@ struct si_context {
 
    /* Emitted draw state. */
    bool ngg : 1;
-   uint8_t ngg_culling;
+   uint16_t ngg_culling;
    unsigned last_index_size;
    int last_base_vertex;
    unsigned last_start_instance;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 44021cab095..887cdc1e7b5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -283,6 +283,8 @@ enum
 #define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
 #define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
 #define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)   /* cull small lines according to the diamond exit rule */
+#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
+#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)
 
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
@@ -660,7 +662,7 @@ struct si_shader_key_ge {
       unsigned kill_pointsize : 1;
 
       /* For NGG VS and TES. */
-      unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
+      unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
 
       /* For shaders where monolithic variants have better code.
        *
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index c618e03f522..c29a184680c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -960,11 +960,13 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
                          S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
                          S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
 
-   rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES;
+   rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES |
+                             SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
    rs->ngg_cull_flags_tris_y_inverted = rs->ngg_cull_flags_tris;
 
    rs->ngg_cull_flags_lines = SI_NGG_CULL_LINES |
-                              (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0);
+                              (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0) |
+                              SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
 
    if (rs->rasterizer_discard) {
       rs->ngg_cull_flags_tris |= SI_NGG_CULL_FRONT_FACE |
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 4c5941388ac..6d233925631 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -76,9 +76,9 @@ struct si_state_rasterizer {
    unsigned pa_cl_clip_cntl;
    float line_width;
    float max_point_size;
-   unsigned ngg_cull_flags_tris : 8;
-   unsigned ngg_cull_flags_tris_y_inverted : 8;
-   unsigned ngg_cull_flags_lines : 8;
+   unsigned ngg_cull_flags_tris : 16;
+   unsigned ngg_cull_flags_tris_y_inverted : 16;
+   unsigned ngg_cull_flags_lines : 16;
    unsigned sprite_coord_enable : 8;
    unsigned clip_plane_enable : 8;
    unsigned half_pixel_center : 1;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 38bcce94056..84a189be71e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2263,7 +2263,7 @@ static void si_draw(struct pipe_context *ctx,
    }
 
    /* Update NGG culling settings. */
-   uint8_t old_ngg_culling = sctx->ngg_culling;
+   uint16_t old_ngg_culling = sctx->ngg_culling;
    if (GFX_VERSION >= GFX10) {
       struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
 
@@ -2278,7 +2278,7 @@ static void si_draw(struct pipe_context *ctx,
          /* Check that the current shader allows culling. */
          assert(hw_vs->ngg_cull_vert_threshold != UINT_MAX);
 
-         uint8_t ngg_culling;
+         uint16_t ngg_culling;
 
          if (util_prim_is_lines(sctx->current_rast_prim)) {
             /* Overwrite it to mask out face cull flags. */



More information about the mesa-commit mailing list