[Mesa-dev] [PATCH 02/28] i965/fs: Rework the persample shading key/prog_data bits

Jason Ekstrand jason at jlekstrand.net
Tue May 10 23:16:22 UTC 2016


This commit reworks and simplifies the way we handle persample shading in
the shader key and prog_data.  The previous approach had three different
key bits that had slightly different and hard-to-decern meanings while the
new bits are far more clear.  This commit changes it to two easily
understood bits that communicate everything we need:

 1) key->persample_interp: means that the user has requested persample
    interpolation through the API.  This is equivalent to having
    SAMPLE_SHADING enabled and having MIN_SAMPLE_SHADING_VALUE set high
    enough that you actually get multiple per-sample invocations.

 2) key->multisample_fbo: means that the shader will be running on an
    actual multi-sampled framebuffer.

This commit also adds a new "persample_dispatch" bit to prog_data which
indicates that the shader should be run in persample mode.  This way the
state setup code doesn't have to look at the fragment program or GL state
and can just pull that data out of the prog_data.

In theory, this shuffle could mean more recompiles.  However, in practice,
we were shoving enough state into the key before that we were probably
hitting a recompile on every per-sample shader anyway.
---
 src/intel/vulkan/anv_pipeline.c           |  5 ++--
 src/mesa/drivers/dri/i965/brw_compiler.h  |  4 ++--
 src/mesa/drivers/dri/i965/brw_fs.cpp      | 40 +++++++++++++++++++++----------
 src/mesa/drivers/dri/i965/brw_wm.c        | 19 +++++++--------
 src/mesa/drivers/dri/i965/gen6_wm_state.c |  6 ++---
 src/mesa/drivers/dri/i965/gen7_wm_state.c | 20 +++++++---------
 src/mesa/drivers/dri/i965/gen8_ps_state.c | 22 +++++++----------
 7 files changed, 60 insertions(+), 56 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index ba088b6..4e52c91 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -286,8 +286,9 @@ populate_wm_prog_key(const struct brw_device_info *devinfo,
       /* We should probably pull this out of the shader, but it's fairly
        * harmless to compute it and then let dead-code take care of it.
        */
-      key->persample_shading = info->pMultisampleState->sampleShadingEnable;
-      key->compute_pos_offset = info->pMultisampleState->sampleShadingEnable;
+      key->persample_interp =
+         (info->pMultisampleState->minSampleShading *
+          info->pMultisampleState->rasterizationSamples) > 1;
       key->multisample_fbo = true;
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 5807305..2482516 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -242,12 +242,11 @@ struct brw_wm_prog_key {
    uint8_t iz_lookup;
    bool stats_wm:1;
    bool flat_shade:1;
-   bool persample_shading:1;
    unsigned nr_color_regions:5;
    bool replicate_alpha:1;
    bool render_to_fbo:1;
    bool clamp_fragment_color:1;
-   bool compute_pos_offset:1;
+   bool persample_interp:1;
    bool multisample_fbo:1;
    unsigned line_aa:2;
    bool high_quality_derivatives:1;
@@ -386,6 +385,7 @@ struct brw_wm_prog_data {
    bool early_fragment_tests;
    bool no_8;
    bool dual_src_blend;
+   bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2a542b8..71e759d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1195,8 +1195,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
                   inst->no_dd_clear = true;
 
                inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                                   mod_centroid && !key->persample_shading,
-                                   mod_sample || key->persample_shading);
+                                   mod_centroid && !key->persample_interp,
+                                   mod_sample || key->persample_interp);
                inst->predicate = BRW_PREDICATE_NORMAL;
                inst->predicate_inverse = false;
                if (devinfo->has_pln)
@@ -1204,8 +1204,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
 
             } else {
                emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                            mod_centroid && !key->persample_shading,
-                            mod_sample || key->persample_shading);
+                            mod_centroid && !key->persample_interp,
+                            mod_sample || key->persample_interp);
             }
             if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                bld.MUL(*attr, *attr, this->pixel_w);
@@ -1262,10 +1262,10 @@ void
 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    assert(dst.type == BRW_REGISTER_TYPE_F);
 
-   if (key->compute_pos_offset) {
+   if (wm_prog_data->persample_dispatch) {
       /* Convert int_sample_pos to floating point */
       bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
@@ -1430,7 +1430,7 @@ fs_reg *
 fs_visitor::emit_samplemaskin_setup()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    assert(devinfo->gen >= 6);
 
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
@@ -1438,7 +1438,7 @@ fs_visitor::emit_samplemaskin_setup()
    fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
                                BRW_REGISTER_TYPE_D));
 
-   if (key->persample_shading) {
+   if (wm_prog_data->persample_dispatch) {
       /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
        * and a mask representing which sample is being processed by the
        * current shader invocation.
@@ -5098,7 +5098,6 @@ fs_visitor::setup_fs_payload_gen6()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    unsigned barycentric_interp_modes =
       (stage == MESA_SHADER_FRAGMENT) ?
@@ -5151,9 +5150,19 @@ fs_visitor::setup_fs_payload_gen6()
       }
    }
 
-   prog_data->uses_pos_offset = key->compute_pos_offset;
    /* R31: MSAA position offsets. */
-   if (prog_data->uses_pos_offset) {
+   if (prog_data->persample_dispatch &&
+       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+       *
+       *    "MSDISPMODE_PERSAMPLE is required in order to select
+       *    POSOFFSET_SAMPLE"
+       *
+       * So we can only really get sample positions if we are doing real
+       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+       * persample dispatch, we hard-code it to 0.5.
+       */
+      prog_data->uses_pos_offset = true;
       payload.sample_pos_reg = payload.num_regs;
       payload.num_regs++;
    }
@@ -5993,12 +6002,19 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    prog_data->computed_stencil =
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                           SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info.fs.uses_sample_qualifier);
+
    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo,
                                            key->flat_shade,
-                                           key->persample_shading,
+                                           key->persample_interp,
                                            shader);
 
    fs_visitor v(compiler, log_data, mem_ctx, key,
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index dbc626c..22d640f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -250,8 +250,8 @@ brw_wm_debug_recompile(struct brw_context *brw,
                       old_key->stats_wm, key->stats_wm);
    found |= key_debug(brw, "flat shading",
                       old_key->flat_shade, key->flat_shade);
-   found |= key_debug(brw, "per-sample shading",
-                      old_key->persample_shading, key->persample_shading);
+   found |= key_debug(brw, "per-sample interpolation",
+                      old_key->persample_interp, key->persample_interp);
    found |= key_debug(brw, "number of color buffers",
                       old_key->nr_color_regions, key->nr_color_regions);
    found |= key_debug(brw, "MRT alpha test or alpha-to-coverage",
@@ -524,15 +524,14 @@ brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
 
    /* _NEW_BUFFERS _NEW_MULTISAMPLE */
    /* Ignore sample qualifier while computing this flag. */
-   key->persample_shading =
-      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
+   if (ctx->Multisample.Enabled) {
+      key->persample_interp =
+         ctx->Multisample.SampleShading &&
+         (ctx->Multisample.MinSampleShadingValue *
+          _mesa_geometric_samples(ctx->DrawBuffer) > 1);
 
-   key->compute_pos_offset =
-      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
-      fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS;
-
-   key->multisample_fbo = ctx->Multisample.Enabled &&
-                          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+      key->multisample_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+   }
 
    /* BRW_NEW_VUE_MAP_GEOM_OUT */
    if (brw->gen < 6 || _mesa_bitcount_64(fp->program.Base.InputsRead &
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 335920c..dd33926 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -130,12 +130,10 @@ gen6_upload_wm_state(struct brw_context *brw,
 
    dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
-   assert(min_inv_per_frag >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
 
-      if (!prog_data->no_8 && min_inv_per_frag == 1) {
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
          dw4 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
@@ -198,7 +196,7 @@ gen6_upload_wm_state(struct brw_context *brw,
       else
          dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
 
-      if (min_inv_per_frag > 1)
+      if (prog_data->persample_dispatch)
          dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
       else {
          dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 2c3930f..945fbbd 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -91,7 +91,7 @@ upload_wm_state(struct brw_context *brw)
       else
          dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
 
-      if (_mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false) > 1)
+      if (prog_data->persample_dispatch)
          dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
       else
          dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
@@ -152,7 +152,6 @@ gen7_upload_ps_state(struct brw_context *brw,
                      bool enable_dual_src_blend, unsigned sample_mask,
                      unsigned fast_clear_op)
 {
-   struct gl_context *ctx = &brw->ctx;
    uint32_t dw2, dw4, dw5, ksp0, ksp2;
    const int max_threads_shift = brw->is_haswell ?
       HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
@@ -216,18 +215,15 @@ gen7_upload_ps_state(struct brw_context *brw,
    if (prog_data->num_varying_inputs != 0)
       dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
 
-   /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
-    * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
-    * is successfully compiled. In majority of the cases that bring us
-    * better performance than 'SIMD8 only' dispatch.
-    */
-   int min_inv_per_frag =
-      _mesa_get_min_invocations_per_fragment(ctx, fp, false);
-   assert(min_inv_per_frag >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
-      if (!prog_data->no_8 && min_inv_per_frag == 1) {
+
+      /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
+       * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
+       * is successfully compiled. In majority of the cases that bring us
+       * better performance than 'SIMD8 only' dispatch.
+       */
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
          dw5 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 9269a79..d3e1ca3 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -52,8 +52,7 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_src_w)
       dw1 |= GEN8_PSX_USES_SOURCE_W;
 
-   if (multisampled_fbo &&
-       _mesa_get_min_invocations_per_fragment(ctx, fp, false) > 1)
+   if (prog_data->persample_dispatch)
       dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE;
 
    if (prog_data->uses_sample_mask) {
@@ -192,7 +191,6 @@ gen8_upload_ps_state(struct brw_context *brw,
                      const struct brw_wm_prog_data *prog_data,
                      uint32_t fast_clear_op)
 {
-   struct gl_context *ctx = &brw->ctx;
    uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;
 
    /* Initialize the execution mask with VMask.  Otherwise, derivatives are
@@ -246,19 +244,15 @@ gen8_upload_ps_state(struct brw_context *brw,
 
    dw6 |= fast_clear_op;
 
-   /* _NEW_MULTISAMPLE
-    * In case of non 1x per sample shading, only one of SIMD8 and SIMD16
-    * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
-    * is successfully compiled. In majority of the cases that bring us
-    * better performance than 'SIMD8 only' dispatch.
-    */
-   int min_invocations_per_fragment =
-      _mesa_get_min_invocations_per_fragment(ctx, fp, false);
-   assert(min_invocations_per_fragment >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
-      if (!prog_data->no_8 && min_invocations_per_fragment == 1) {
+
+      /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
+       * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
+       * is successfully compiled. In majority of the cases that bring us
+       * better performance than 'SIMD8 only' dispatch.
+       */
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
          dw7 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
-- 
2.5.0.400.gff86faf



More information about the mesa-dev mailing list