Mesa (main): ac,radeonsi: cull small lines in the shader using the diamond exit rule

Tue Nov 16 02:41:11 UTC 2021

Module: Mesa
Branch: main
Commit: 9151ac3531f05a825b6a07c4977251b45ed34141
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9151ac3531f05a825b6a07c4977251b45ed34141

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Fri Nov  5 21:56:24 2021 -0400

ac,radeonsi: cull small lines in the shader using the diamond exit rule

It also splits clip_half_line_width into X and Y components for tighter
view culling.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13700>

---

 src/amd/llvm/ac_llvm_cull.c                      | 96 +++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c  | 31 ++++----
 src/gallium/drivers/radeonsi/si_pipe.h           |  2 +
 src/gallium/drivers/radeonsi/si_shader.h         |  3 +-
 src/gallium/drivers/radeonsi/si_state.c          |  3 +-
 src/gallium/drivers/radeonsi/si_state_viewport.c | 11 ++-
 6 files changed, 125 insertions(+), 21 deletions(-)

diff --git a/src/amd/llvm/ac_llvm_cull.c b/src/amd/llvm/ac_llvm_cull.c
index 87d201f0781..d37a9f847f6 100644
--- a/src/amd/llvm/ac_llvm_cull.c
+++ b/src/amd/llvm/ac_llvm_cull.c
@@ -120,6 +120,25 @@ static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3
    return accepted;
 }
 
+static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2])
+{
+   /* Rotate the 2D vector v by 45 degrees in place. sin(45) == cos(45) == sqrt(0.5) */
+   LLVMValueRef sincos45 = LLVMConstReal(ctx->f32, 0.707106781);
+
+   /* x2  =  x*cos45 - y*sin45  =  x*sincos45 - y*sincos45
+    * y2  =  x*sin45 + y*cos45  =  x*sincos45 + y*sincos45
+    */
+   LLVMValueRef first = LLVMBuildFMul(ctx->builder, v[0], sincos45, ""); /* x*sincos45, shared */
+
+   /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. */
+   LLVMValueRef result[2] = {
+      ac_build_fmad(ctx, LLVMBuildFNeg(ctx->builder, v[1], ""), sincos45, first), /* x2 */
+      ac_build_fmad(ctx, v[1], sincos45, first), /* y2 */
+   };
+
+   memcpy(v, result, sizeof(result)); /* overwrite the caller's v[0..1] with the rotated values */
+}
+
 /* Perform view culling and small primitive elimination and return true
  * if the primitive is accepted and initially_accepted == true. */
 static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
@@ -181,8 +200,8 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
          }
       }
 
-      /* Small primitive elimination. */
-      if (options->cull_small_prims) {
+      /* Small primitive culling - triangles. */
+      if (options->cull_small_prims && options->num_vertices == 3) {
          /* Assuming a sample position at (0.5, 0.5), if we round
           * the bounding box min/max extents and the results of
           * the rounding are equal in either the X or Y direction,
@@ -214,6 +233,79 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
          accepted = LLVMBuildAnd(builder, accepted, visible, "");
       }
 
+      /* Small primitive culling - lines. */
+      if (options->cull_small_prims && options->num_vertices == 2) {
+         /* This only works with lines without perpendicular end caps (lines with perpendicular
+          * end caps are rasterized as quads and thus can't be culled as small prims in 99% of
+          * cases because line_width >= 1).
+          *
+          * This takes advantage of the diamond exit rule, which says that every pixel
+          * has a diamond inside it touching the pixel boundary and only if a line exits
+          * the diamond, that pixel is filled. If a line enters the diamond or stays
+          * outside the diamond, the pixel isn't filled.
+          *
+          * This algorithm is a little simpler than that. The space outside all diamonds also
+          * has the same diamond shape, which we'll call corner diamonds.
+          *
+          * The idea is to cull all lines that are entirely inside a diamond, including
+          * corner diamonds. If a line is entirely inside a diamond, it can be culled because
+          * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
+          * because it doesn't enter any diamond and thus can't exit any diamond.
+          *
+          * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
+          * box test is used to determine whether a line is entirely inside any square (diamond).
+          *
+          * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
+          * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use
+          * perpendicular end caps that enable quad rasterization for lines. Thus, this should
+          * always use non-MSAA viewport transformation and non-MSAA small prim precision.
+          *
+          * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle.
+          * It should contain no holes if this matches hw behavior.
+          */
+         LLVMValueRef v0[2], v1[2];
+
+         /* Get vertex positions in pixels. */
+         for (unsigned chan = 0; chan < 2; chan++) {
+            v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]);
+            v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]);
+         }
+
+         /* Rotate the viewport by 45 degrees, so that diamonds become squares. */
+         rotate_45degrees(ctx, v0);
+         rotate_45degrees(ctx, v1);
+
+         LLVMValueRef not_equal[2];
+
+         for (unsigned chan = 0; chan < 2; chan++) {
+            /* The width of each square is sqrt(0.5), so scale it to 1 because we want
+             * round() to give us the position of the closest center of a square (diamond).
+             */
+            v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
+            v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
+
+            /* Compute the bounding box around both vertices. We do this because we must
+             * enlarge the line area by the precision of the rasterizer.
+             */
+            LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]);
+            LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]);
+
+            /* Enlarge the bounding box by the precision of the rasterizer. */
+            min = LLVMBuildFSub(builder, min, small_prim_precision, "");
+            max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
+
+            /* Round the bounding box corners. If both rounded corners are equal,
+             * the bounding box is entirely inside a square (diamond).
+             */
+            min = ac_build_round(ctx, min);
+            max = ac_build_round(ctx, max);
+            not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
+         }
+
+         accepted = LLVMBuildAnd(builder, accepted,
+                                 LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), "");
+      }
+
       /* Disregard the bounding box culling if any W is negative because the code
        * doesn't work with that.
        */
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 71a14a5f721..b5369faf6ec 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -970,44 +970,47 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
          }
       }
 
+      LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL;
       LLVMValueRef clip_half_line_width[2] = {};
 
       /* Load the viewport state for small prim culling. */
+      bool prim_is_lines = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES;
       LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->small_prim_cull_info);
-      LLVMValueRef vp = ac_build_load_to_sgpr(&ctx->ac, ptr, ctx->ac.i32_0);
+      /* Lines will always use the non-AA viewport transformation. */
+      LLVMValueRef vp = ac_build_load_to_sgpr(&ctx->ac, ptr,
+                                              prim_is_lines ? ctx->ac.i32_1 : ctx->ac.i32_0);
       vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
-      LLVMValueRef vp_scale[2], vp_translate[2];
       vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
       vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
       vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
       vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
 
-      /* Get the small prim filter precision. */
-      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
-      small_prim_precision =
-         LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
-      small_prim_precision =
-         LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
-      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
-
       /* Execute culling code. */
       struct ac_cull_options options = {};
       options.cull_view_xy = true;
       options.cull_w = true;
 
-      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES) {
-         ptr = LLVMBuildPointerCast(builder, ptr,
-                                    LLVMPointerType(ctx->ac.v2i32, AC_ADDR_SPACE_CONST_32BIT), "");
+      if (prim_is_lines) {
          LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 2, 0));
-         terms = LLVMBuildBitCast(builder, terms, ctx->ac.v2f32, "");
+         terms = LLVMBuildBitCast(builder, terms, ctx->ac.v4f32, "");
          clip_half_line_width[0] = ac_llvm_extract_elem(&ctx->ac, terms, 0);
          clip_half_line_width[1] = ac_llvm_extract_elem(&ctx->ac, terms, 1);
+         small_prim_precision = ac_llvm_extract_elem(&ctx->ac, terms, 2);
 
          options.num_vertices = 2;
+         options.cull_small_prims = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
 
          assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE));
          assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE));
       } else {
+         /* Get the small prim filter precision. */
+         small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
+         small_prim_precision =
+            LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
+         small_prim_precision =
+            LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
+         small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
+
          options.num_vertices = 3;
          options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
          options.cull_back = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a98b6484318..d5d88a4a9a2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -899,7 +899,9 @@ struct si_saved_cs {
 
 struct si_small_prim_cull_info {
    float scale[2], translate[2];
+   float scale_no_aa[2], translate_no_aa[2]; /* viewport transform that ignores MSAA */
    float clip_half_line_width[2];      /* line_width * 0.5 in clip space in X and Y directions */
+   float small_prim_precision_no_aa;   /* same as the small prim precision, but ignores MSAA */
    /* The above fields are uploaded to memory. The below fields are passed via user SGPRs. */
    float small_prim_precision;
 };
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 118c37ee5a7..bc27e82e696 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -283,6 +283,7 @@ enum
 #define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
 #define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
 #define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
+#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)   /* cull small lines according to the diamond exit rule */
 
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
@@ -660,7 +661,7 @@ struct si_shader_key_ge {
       unsigned kill_pointsize : 1;
 
       /* For NGG VS and TES. */
-      unsigned ngg_culling : 4; /* SI_NGG_CULL_* */
+      unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
 
       /* For shaders where monolithic variants have better code.
        *
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d12424cbe29..ffff6234572 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -969,7 +969,8 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
    } else {
       rs->ngg_cull_flags_tris = rs->ngg_cull_flags_tris_y_inverted = SI_NGG_CULL_ENABLED;
       rs->ngg_cull_flags_lines = SI_NGG_CULL_ENABLED |
-                                 SI_NGG_CULL_LINES;
+                                 SI_NGG_CULL_LINES |
+                                 (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0);
 
       bool cull_front, cull_back;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index be184bf5eb9..7db69b9df0d 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -70,6 +70,9 @@ static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small
       info.translate[1] += 0.5;
    }
 
+   memcpy(info.scale_no_aa, info.scale, sizeof(info.scale));
+   memcpy(info.translate_no_aa, info.translate, sizeof(info.translate));
+
    /* Scale the framebuffer up, so that samples become pixels and small
     * primitive culling is the same for all sample counts.
     * This only works with the standard DX sample positions, because
@@ -87,11 +90,13 @@ static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small
    unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
 
    if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-      info.small_prim_precision = num_samples / 4096.0;
+      info.small_prim_precision_no_aa = 1.0 / 4096.0;
    else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-      info.small_prim_precision = num_samples / 1024.0;
+      info.small_prim_precision_no_aa = 1.0 / 1024.0;
    else
-      info.small_prim_precision = num_samples / 256.0;
+      info.small_prim_precision_no_aa = 1.0 / 256.0;
+
+   info.small_prim_precision = num_samples * info.small_prim_precision_no_aa;
 
    *out = info;
 }