Mesa (main): radeonsi: cull against clip planes, clipvertex, clip/cull distances in shader
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Nov 16 20:00:21 UTC 2021
Module: Mesa
Branch: main
Commit: 513bd6acca866e82b54bf419d5ce20b36e2c5f21
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=513bd6acca866e82b54bf419d5ce20b36e2c5f21
Author: Marek Olšák <marek.olsak at amd.com>
Date: Sun Nov 7 16:43:13 2021 -0500
radeonsi: cull against clip planes, clipvertex, clip/cull distances in shader
The downside is that this duplicates shader code for clip/cull distances
in both the position and parameter portions of the shader.
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13811>
---
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 101 ++++++++++++++++++++++--
src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
src/gallium/drivers/radeonsi/si_shader.h | 4 +-
src/gallium/drivers/radeonsi/si_state.c | 6 +-
src/gallium/drivers/radeonsi/si_state.h | 6 +-
src/gallium/drivers/radeonsi/si_state_draw.cpp | 4 +-
6 files changed, 109 insertions(+), 14 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 6f6f769dce5..17a3747db29 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -554,12 +554,14 @@ enum
/* Byte 0: Boolean ES thread accepted (unculled) flag.
* Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
* Byte 2: TES rel patch ID
- * Byte 3: Unused
+ * Byte 3: 8-bit clip distance mask: 1 means the clip distance is negative.
+ * The mask from all vertices is AND'ed. If the result is non-zero,
+ * the primitive is culled.
*/
lds_byte0_accept_flag = 0,
lds_byte1_new_thread_id,
lds_byte2_tes_rel_patch_id,
- lds_byte3_unused,
+ lds_byte3_clipdist_neg_mask,
lds_packed_data = 0, /* lds_byteN_... */
lds_pos_cull_x_div_w,
@@ -804,6 +806,37 @@ static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValue
ac_build_endif(&ctx->ac, 0);
}
+static void add_clipdist_bit(struct si_shader_context *ctx, LLVMValueRef distance, unsigned i,
+ LLVMValueRef *packed_data)
+{
+ LLVMValueRef neg = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, distance, ctx->ac.f32_0, "");
+ neg = LLVMBuildZExt(ctx->ac.builder, neg, ctx->ac.i32, "");
+ /* Put the negative distance flag into lds_byte3_clipdist_neg_mask. */
+ neg = LLVMBuildShl(ctx->ac.builder, neg, LLVMConstInt(ctx->ac.i32, 24 + i, 0), "");
+ *packed_data = LLVMBuildOr(ctx->ac.builder, *packed_data, neg, "");
+}
+
+static bool add_clipdist_bits_for_clipvertex(struct si_shader_context *ctx,
+ unsigned clipdist_enable,
+ LLVMValueRef clipvertex[4],
+ LLVMValueRef *packed_data)
+{
+ struct ac_export_args clipdist[2];
+ bool added = false;
+
+ si_llvm_clipvertex_to_clipdist(ctx, clipdist, clipvertex);
+
+ for (unsigned j = 0; j < 8; j++) {
+ if (!(clipdist_enable & BITFIELD_BIT(j)))
+ continue;
+
+ LLVMValueRef distance = clipdist[j / 4].out[j % 4];
+ add_clipdist_bit(ctx, distance, j, packed_data);
+ added = true;
+ }
+ return added;
+}
+
/**
* Cull primitives for NGG VS or TES, then compact vertices, which happens
* before the VS or TES main function. Return values for the main function.
@@ -826,10 +859,16 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
(sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es));
LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+ LLVMValueRef packed_data = ctx->ac.i32_0;
+ LLVMValueRef position[4] = {};
unsigned pos_index = 0;
+ unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(shader->key.ge.opt.ngg_culling);
+ unsigned clipdist_enable = (sel->clipdist_mask & clip_plane_enable) | sel->culldist_mask;
+ bool has_clipdist_mask = false;
for (unsigned i = 0; i < info->num_outputs; i++) {
- LLVMValueRef position[4];
+ LLVMValueRef clipvertex[4];
+ unsigned base;
switch (info->output_semantic[i]) {
case VARYING_SLOT_POS:
@@ -862,12 +901,45 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
}
break;
+
+ case VARYING_SLOT_CLIP_DIST0:
+ case VARYING_SLOT_CLIP_DIST1:
+ base = info->output_semantic[i] == VARYING_SLOT_CLIP_DIST1 ? 4 : 0;
+
+ for (unsigned j = 0; j < 4; j++) {
+ unsigned index = base + j;
+
+ if (!(clipdist_enable & BITFIELD_BIT(index)))
+ continue;
+
+ LLVMValueRef distance = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+ add_clipdist_bit(ctx, distance, index, &packed_data);
+ has_clipdist_mask = true;
+ }
+ break;
+
+ case VARYING_SLOT_CLIP_VERTEX:
+ for (unsigned j = 0; j < 4; j++)
+ clipvertex[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+
+ if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, clipvertex, &packed_data))
+ has_clipdist_mask = true;
+ break;
}
}
+ if (clip_plane_enable && !sel->clipdist_mask) {
+ /* When clip planes are enabled and there are no clip distance outputs,
+ * we should use user clip planes and cull against the position.
+ */
+ assert(!has_clipdist_mask);
+ if (add_clipdist_bits_for_clipvertex(ctx, clipdist_enable, position, &packed_data))
+ has_clipdist_mask = true;
+ }
+
/* Initialize the packed data. */
LLVMBuildStore(
- builder, ctx->ac.i32_0,
+ builder, packed_data,
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_s_barrier(&ctx->ac);
@@ -950,6 +1022,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
{
/* Load positions. */
LLVMValueRef pos[3][4] = {};
+ LLVMValueRef clipdist_neg_mask = NULL;
+
for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
for (unsigned chan = 0; chan < 4; chan++) {
unsigned index;
@@ -965,8 +1039,25 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
}
+
+ if (has_clipdist_mask) {
+ /* Load and AND clip distance masks. Each bit means whether that clip distance is
+ * negative. If all masks are AND'ed and the result is 0, the primitive isn't culled
+ * by clip distances.
+ */
+ LLVMValueRef addr = si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte3_clipdist_neg_mask);
+ LLVMValueRef mask = LLVMBuildLoad(builder, addr, "");
+ if (!clipdist_neg_mask)
+ clipdist_neg_mask = mask;
+ else
+ clipdist_neg_mask = LLVMBuildAnd(builder, clipdist_neg_mask, mask, "");
+ }
}
+ LLVMValueRef clipdist_accepted =
+ has_clipdist_mask ? LLVMBuildICmp(builder, LLVMIntEQ, clipdist_neg_mask, ctx->ac.i8_0, "")
+ : ctx->ac.i1true;
+
LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL;
LLVMValueRef clip_half_line_width[2] = {};
@@ -1020,7 +1111,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
gs_accepted,
(void*)gs_vtxptr,
};
- ac_cull_primitive(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
+ ac_cull_primitive(&ctx->ac, pos, clipdist_accepted, vp_scale, vp_translate,
small_prim_precision, clip_half_line_width,
&options, gfx10_build_primitive_accepted, params);
}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d5d88a4a9a2..e695b2a8d2b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1134,7 +1134,7 @@ struct si_context {
/* Emitted draw state. */
bool ngg : 1;
- uint8_t ngg_culling;
+ uint16_t ngg_culling;
unsigned last_index_size;
int last_base_vertex;
unsigned last_start_instance;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 44021cab095..887cdc1e7b5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -283,6 +283,8 @@ enum
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */
+#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
+#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@@ -660,7 +662,7 @@ struct si_shader_key_ge {
unsigned kill_pointsize : 1;
/* For NGG VS and TES. */
- unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
+ unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index c618e03f522..c29a184680c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -960,11 +960,13 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
- rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES;
+ rs->ngg_cull_flags_tris = SI_NGG_CULL_TRIANGLES |
+ SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
rs->ngg_cull_flags_tris_y_inverted = rs->ngg_cull_flags_tris;
rs->ngg_cull_flags_lines = SI_NGG_CULL_LINES |
- (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0);
+ (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0) |
+ SI_NGG_CULL_CLIP_PLANE_ENABLE(state->clip_plane_enable);
if (rs->rasterizer_discard) {
rs->ngg_cull_flags_tris |= SI_NGG_CULL_FRONT_FACE |
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 4c5941388ac..6d233925631 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -76,9 +76,9 @@ struct si_state_rasterizer {
unsigned pa_cl_clip_cntl;
float line_width;
float max_point_size;
- unsigned ngg_cull_flags_tris : 8;
- unsigned ngg_cull_flags_tris_y_inverted : 8;
- unsigned ngg_cull_flags_lines : 8;
+ unsigned ngg_cull_flags_tris : 16;
+ unsigned ngg_cull_flags_tris_y_inverted : 16;
+ unsigned ngg_cull_flags_lines : 16;
unsigned sprite_coord_enable : 8;
unsigned clip_plane_enable : 8;
unsigned half_pixel_center : 1;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 38bcce94056..84a189be71e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2263,7 +2263,7 @@ static void si_draw(struct pipe_context *ctx,
}
/* Update NGG culling settings. */
- uint8_t old_ngg_culling = sctx->ngg_culling;
+ uint16_t old_ngg_culling = sctx->ngg_culling;
if (GFX_VERSION >= GFX10) {
struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
@@ -2278,7 +2278,7 @@ static void si_draw(struct pipe_context *ctx,
/* Check that the current shader allows culling. */
assert(hw_vs->ngg_cull_vert_threshold != UINT_MAX);
- uint8_t ngg_culling;
+ uint16_t ngg_culling;
if (util_prim_is_lines(sctx->current_rast_prim)) {
/* Overwrite it to mask out face cull flags. */
More information about the mesa-commit mailing list