Mesa (staging/20.3): radeonsi: fix small primitive culling with MSAA force-disabled and smoothing

Thu Dec 17 21:21:46 UTC 2020

Module: Mesa
Branch: staging/20.3
Commit: 840a9a6c67bdeb110a2c1c6412c4a27670d2c3da
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=840a9a6c67bdeb110a2c1c6412c4a27670d2c3da

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed Dec  9 19:18:37 2020 -0500

radeonsi: fix small primitive culling with MSAA force-disabled and smoothing

The problem was that the shader constants were based on the framebuffer
sample count and ignored the multisample enable state and the line/polygon
smoothing state, which uses MSAA rasterization that only sets SampleMaskIn
to get the coverage for alpha-blended smoothing (the PS epilog computes
the alpha channel from SampleMaskIn and blending generates the AA results).

- This is a complete rework that adds a new state for NGG cull constants.
- It fixes the same thing for the prim discard compute shader.
- It documents how VS_STATE.SMALL_PRIM_PRECISION is encoded.

It fixes blue corruption in Unigine Heaven with MSAA and Medium details
or better.

Fixes: 7648060dc03 - radeonsi: enable NGG culling by default on gfx10.3 dGPUs
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8134>

---

 .pick_status.json                                  |   2 +-
 .../drivers/radeonsi/si_compute_prim_discard.c     |  15 +--
 src/gallium/drivers/radeonsi/si_gfx_cs.c           |  13 ++-
 src/gallium/drivers/radeonsi/si_pipe.h             |  17 +++-
 src/gallium/drivers/radeonsi/si_state.c            |  16 +--
 src/gallium/drivers/radeonsi/si_state.h            |   1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c    |   4 +
 src/gallium/drivers/radeonsi/si_state_viewport.c   | 112 +++++++++++----------
 8 files changed, 99 insertions(+), 81 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 3a3e0f1ff28..c04b56572dd 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -274,7 +274,7 @@
         "description": "radeonsi: fix small primitive culling with MSAA force-disabled and smoothing",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 3,
         "master_sha": null,
         "because_sha": "7648060dc03775979e3fa8904c4948c084e82b6a"
     },
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index d540f260c9b..b4053cae9ec 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -1314,19 +1314,6 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
    desc[10] = fui(cull_info.translate[0]);
    desc[11] = fui(cull_info.translate[1]);
 
-   /* Better subpixel precision increases the efficiency of small
-    * primitive culling. */
-   unsigned num_samples = sctx->framebuffer.nr_samples;
-   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
-   float small_prim_cull_precision;
-
-   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-      small_prim_cull_precision = num_samples / 4096.0;
-   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-      small_prim_cull_precision = num_samples / 1024.0;
-   else
-      small_prim_cull_precision = num_samples / 256.0;
-
    /* Set user data SGPRs. */
    /* This can't be greater than 14 if we want the fastest launch rate. */
    unsigned user_sgprs = 13;
@@ -1490,7 +1477,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
          radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
          radeon_emit(cs, info->restart_index);
          /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
-         radeon_emit(cs, fui(small_prim_cull_precision));
+         radeon_emit(cs, fui(cull_info.small_prim_precision));
       } else {
          assert(VERTEX_COUNTER_GDS_MODE == 2);
          /* Only update the SGPRs that changed. */
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 436a1116360..e7c169e7408 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -494,15 +494,14 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
       ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
       ctx->framebuffer.dirty_zsbuf = true;
    }
-   /* This should always be marked as dirty to set the framebuffer scissor
-    * at least.
-    *
-    * Even with shadowed registers, we have to add buffers to the buffer list.
-    * All of these do that.
+
+   /* Even with shadowed registers, we have to add buffers to the buffer list.
+    * These atoms are the only ones that add buffers.
     */
    si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
    si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
-   si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+   if (ctx->screen->use_ngg_culling)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
 
    if (first_cs || !ctx->shadowed_regs) {
       /* These don't add any buffers, so skip them with shadowing. */
@@ -532,6 +531,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
          si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 
       /* Invalidate various draw states so that they are emitted before
        * the first draw call. */
@@ -576,7 +576,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
 
    assert(!ctx->gfx_cs->prev_dw);
    ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
-   ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
    ctx->prim_discard_compute_ib_initialized = false;
 
    /* Compute-based primitive discard:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a5c7daeacdb..b36d695727d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -891,6 +891,7 @@ struct si_sdma_upload {
 
 struct si_small_prim_cull_info {
    float scale[2], translate[2];
+   float small_prim_precision;
 };
 
 struct si_context {
@@ -1133,7 +1134,6 @@ struct si_context {
    struct si_small_prim_cull_info last_small_prim_cull_info;
    struct si_resource *small_prim_cull_info_buf;
    uint64_t small_prim_cull_info_address;
-   bool small_prim_cull_info_dirty;
 
    /* Scratch buffer */
    struct si_resource *scratch_buffer;
@@ -1518,7 +1518,6 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                                  const struct pipe_video_buffer *tmpl);
 
 /* si_viewport.c */
-void si_update_ngg_small_prim_precision(struct si_context *ctx);
 void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
 void si_update_vs_viewport_state(struct si_context *ctx);
 void si_init_viewport_functions(struct si_context *ctx);
@@ -1933,6 +1932,20 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
                            shader->key.opt.vs_as_prim_discard_cs);
 }
 
+/* Return the number of samples that the rasterizer uses. */
+static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
+{
+   if (sctx->framebuffer.nr_samples > 1 &&
+       sctx->queued.named.rasterizer->multisample_enable)
+      return sctx->framebuffer.nr_samples;
+
+   /* Note that smoothing_enabled is set by si_update_shaders. */
+   if (sctx->smoothing_enabled)
+      return SI_NUM_SMOOTH_AA_SAMPLES;
+
+   return 1;
+}
+
 #define PRINT_ERR(fmt, args...)                                                                    \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 0352345bbc9..6f018007ea7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -988,6 +988,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
       /* Update the small primitive filter workaround if necessary. */
       if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
          si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+
+      /* NGG cull state uses multisample_enable. */
+      if (sctx->screen->use_ngg_culling)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
    }
 
    sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
@@ -2812,10 +2816,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
    si_update_ps_colorbuf0_slot(sctx);
    si_update_poly_offset_state(sctx);
-   si_update_ngg_small_prim_precision(sctx);
    si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
    si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
 
+   /* NGG cull state uses the sample count. */
+   if (sctx->screen->use_ngg_culling)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
    if (sctx->screen->dpbb_allowed)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
@@ -3417,8 +3424,9 @@ static void si_emit_msaa_config(struct si_context *sctx)
     *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
     *   EQAA  2s 2z 2f = 2x MSAA
     */
+   coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
+
    if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
-      coverage_samples = sctx->framebuffer.nr_samples;
       color_samples = sctx->framebuffer.nr_color_samples;
 
       if (sctx->framebuffer.state.zsbuf) {
@@ -3427,10 +3435,6 @@ static void si_emit_msaa_config(struct si_context *sctx)
       } else {
          z_samples = coverage_samples;
       }
-   } else if (sctx->smoothing_enabled) {
-      coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
-   } else {
-      coverage_samples = color_samples = z_samples = 1;
    }
 
    /* Required by OpenGL line rasterization.
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 9ac4d51889c..6f585afd7db 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,7 @@ union si_state_atoms {
       struct si_atom scratch_state;
       struct si_atom window_rectangles;
       struct si_atom shader_query;
+      struct si_atom ngg_cull_state;
    } s;
    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8df7b9b5fca..8370ed7db10 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -4061,6 +4061,10 @@ bool si_update_shaders(struct si_context *sctx)
          sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
          si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
+         /* NGG cull state uses smoothing_enabled. */
+         if (sctx->screen->use_ngg_culling)
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
+
          if (sctx->chip_class == GFX6)
             si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 9d62b2c85f2..c431c6a72cf 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -28,34 +28,13 @@
 
 #define SI_MAX_SCISSOR 16384
 
-void si_update_ngg_small_prim_precision(struct si_context *ctx)
-{
-   if (!ctx->screen->use_ngg_culling)
-      return;
-
-   /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
-   unsigned num_samples = ctx->framebuffer.nr_samples;
-   unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
-   float precision;
-
-   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-      precision = num_samples / 4096.0;
-   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-      precision = num_samples / 1024.0;
-   else
-      precision = num_samples / 256.0;
-
-   ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
-   ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
-}
-
 void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
 {
    /* This is needed by the small primitive culling, because it's done
     * in screen space.
     */
    struct si_small_prim_cull_info info;
-   unsigned num_samples = sctx->framebuffer.nr_samples;
+   unsigned num_samples = si_get_num_coverage_samples(sctx);
    assert(num_samples >= 1);
 
    info.scale[0] = sctx->viewports.states[0].scale[0];
@@ -85,9 +64,64 @@ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_c
       info.scale[i] *= num_samples;
       info.translate[i] *= num_samples;
    }
+
+   /* Better subpixel precision increases the efficiency of small
+    * primitive culling. (more precision means a tighter bounding box
+    * around primitives and more accurate elimination)
+    */
+   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+
+   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+      info.small_prim_precision = num_samples / 4096.0;
+   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+      info.small_prim_precision = num_samples / 1024.0;
+   else
+      info.small_prim_precision = num_samples / 256.0;
+
    *out = info;
 }
 
+static void si_emit_cull_state(struct si_context *sctx)
+{
+   assert(sctx->screen->use_ngg_culling);
+
+   struct si_small_prim_cull_info info;
+   si_get_small_prim_cull_info(sctx, &info);
+
+   if (!sctx->small_prim_cull_info_buf ||
+       memcmp(&info, &sctx->last_small_prim_cull_info, sizeof(info))) {
+      unsigned offset = 0;
+
+      /* Align to 256, because the address is shifted by 8 bits. */
+      u_upload_data(sctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
+                    (struct pipe_resource **)&sctx->small_prim_cull_info_buf);
+
+      sctx->small_prim_cull_info_address = sctx->small_prim_cull_info_buf->gpu_address + offset;
+      sctx->last_small_prim_cull_info = info;
+   }
+
+   /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->small_prim_cull_info_buf,
+                             RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+   radeon_set_sh_reg(sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+                     sctx->small_prim_cull_info_address >> 8);
+
+   /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
+    *
+    * small_prim_precision is 1 / 2^n. We only need n between 5 (1/32) and 12 (1/4096).
+    * Such a floating point value can be packed into 4 bits as follows:
+    * If we pass the first 4 bits of the exponent to the shader and set the next 3 bits
+    * to 1, we'll get the number exactly because all other bits are always 0. See:
+    *                                                               1
+    * value  =  (0x70 | value.exponent[0:3]) << 23  =  ------------------------------
+    *                                                  2 ^ (15 - value.exponent[0:3])
+    *
+    * So pass only the first 4 bits of the float exponent to the shader.
+    */
+   sctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+   sctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(info.small_prim_precision) >> 23);
+}
+
 static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
                                   unsigned num_scissors, const struct pipe_scissor_state *state)
 {
@@ -330,8 +364,6 @@ static void si_emit_guardband(struct si_context *ctx)
          S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
    if (initial_cdw != ctx->gfx_cs->current.cdw)
       ctx->context_roll = true;
-
-   si_update_ngg_small_prim_precision(ctx);
 }
 
 static void si_emit_scissors(struct si_context *ctx)
@@ -430,6 +462,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo
    if (start_slot == 0) {
       ctx->viewports.y_inverted =
          -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
+
+      /* NGG cull state uses the viewport and quant mode. */
+      if (ctx->screen->use_ngg_culling)
+         si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);
    }
 
    si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
@@ -454,33 +490,6 @@ static void si_emit_viewports(struct si_context *ctx)
    struct radeon_cmdbuf *cs = ctx->gfx_cs;
    struct pipe_viewport_state *states = ctx->viewports.states;
 
-   if (ctx->screen->use_ngg_culling) {
-      /* Set the viewport info for small primitive culling. */
-      struct si_small_prim_cull_info info;
-      si_get_small_prim_cull_info(ctx, &info);
-
-      if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
-         unsigned offset = 0;
-
-         /* Align to 256, because the address is shifted by 8 bits. */
-         u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
-                       (struct pipe_resource **)&ctx->small_prim_cull_info_buf);
-
-         ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
-         ctx->last_small_prim_cull_info = info;
-         ctx->small_prim_cull_info_dirty = true;
-      }
-
-      if (ctx->small_prim_cull_info_dirty) {
-         /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
-         radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
-                                   RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
-         radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
-                           ctx->small_prim_cull_info_address >> 8);
-         ctx->small_prim_cull_info_dirty = false;
-      }
-   }
-
    /* The simple case: Only 1 viewport is active. */
    if (!ctx->vs_writes_viewport_index) {
       radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
@@ -655,6 +664,7 @@ void si_init_viewport_functions(struct si_context *ctx)
    ctx->atoms.s.scissors.emit = si_emit_scissors;
    ctx->atoms.s.viewports.emit = si_emit_viewport_states;
    ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+   ctx->atoms.s.ngg_cull_state.emit = si_emit_cull_state;
 
    ctx->b.set_scissor_states = si_set_scissor_states;
    ctx->b.set_viewport_states = si_set_viewport_states;