Mesa (master): i965,iris,anv: Make alpha to coverage work with sample mask

Mon Mar 25 20:57:34 UTC 2019

Module: Mesa
Branch: master
Commit: c8abe03f3b65505d2c1c165d88efb3bb62e06db1
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c8abe03f3b65505d2c1c165d88efb3bb62e06db1

Author: Danylo Piliaiev <danylo.piliaiev at globallogic.com>
Date:   Wed Feb 20 19:39:18 2019 +0200

i965,iris,anv: Make alpha to coverage work with sample mask

From the "Alpha Coverage" section of SKL PRM Volume 7:
 "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
  hardware, regardless of the state setting for this feature."

From OpenGL spec 4.6, "15.2 Shader Execution":
 "The built-in integer array gl_SampleMask can be used to change
 the sample coverage for a fragment from within the shader."

From OpenGL spec 4.6, "17.3.1 Alpha To Coverage":
 "If SAMPLE_ALPHA_TO_COVERAGE is enabled, a temporary coverage value
  is generated where each bit is determined by the alpha value at the
  corresponding sample location. The temporary coverage value is then
  ANDed with the fragment coverage value to generate a new fragment
  coverage value."

Similar wording can be found in Vulkan spec 1.1.100,
"25.6. Multisample Coverage".

Thus we need to compute the alpha to coverage dithering manually in the
shader and replace the sample mask store with the bitwise-AND of the
sample mask and the alpha to coverage dither mask.

The following formula is used to compute final sample mask:
  m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
  dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
     0x0808 * (m & 2) | 0x0100 * (m & 1)
  sample_mask = sample_mask & dither_mask
Credits to Francisco Jerez <currojerez at riseup.net> for creating it.

It gives a number of ones proportional to the alpha in the 2, 4, 8 or 16
least significant bits of the result.

GEN6 hardware does not have an issue with simultaneous usage of sample
mask and alpha to coverage; however, due to the wrong sending order of
oMask and src0_alpha, it is still affected.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109743

Signed-off-by: Danylo Piliaiev <danylo.piliaiev at globallogic.com>
Reviewed-by: Francisco Jerez <currojerez at riseup.net>

---

 src/gallium/drivers/iris/iris_state.c |  5 +-
 src/intel/compiler/brw_compiler.h     |  4 +-
 src/intel/compiler/brw_fs.cpp         |  4 +-
 src/intel/compiler/brw_fs.h           |  1 +
 src/intel/compiler/brw_fs_visitor.cpp | 94 ++++++++++++++++++++++++++++++++++-
 src/intel/vulkan/anv_pipeline.c       | 11 +++-
 src/mesa/drivers/dri/i965/brw_wm.c    | 18 ++++---
 7 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 4ab3662da7b..1ae9c557a27 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -3335,8 +3335,9 @@ iris_populate_fs_key(const struct iris_context *ice,
 
    key->clamp_fragment_color = rast->clamp_fragment_color;
 
-   key->replicate_alpha = fb->nr_cbufs > 1 &&
-      (zsa->alpha.enabled || blend->alpha_to_coverage);
+   key->alpha_to_coverage = blend->alpha_to_coverage;
+
+   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha.enabled;
 
    /* XXX: only bother if COL0/1 are read */
    key->flat_shade = rast->flatshade;
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 375705821c9..fb7ab289723 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -398,7 +398,8 @@ struct brw_wm_prog_key {
    bool stats_wm:1;
    bool flat_shade:1;
    unsigned nr_color_regions:5;
-   bool replicate_alpha:1;
+   bool alpha_test_replicate_alpha:1;
+   bool alpha_to_coverage:1;
    bool clamp_fragment_color:1;
    bool persample_interp:1;
    bool multisample_fbo:1;
@@ -707,6 +708,7 @@ struct brw_wm_prog_data {
    bool dispatch_16;
    bool dispatch_32;
    bool dual_src_blend;
+   bool replicate_alpha;
    bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 964cde07215..635f72721d9 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4237,7 +4237,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
       /* Set "Source0 Alpha Present to RenderTarget" bit in message
        * header.
        */
-      if (inst->target > 0 && key->replicate_alpha)
+      if (inst->target > 0 && prog_data->replicate_alpha)
          g00_bits |= 1 << 11;
 
       /* Set computes stencil to render target */
@@ -4314,7 +4314,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        */
       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
       length++;
-   } else if (key->replicate_alpha && inst->target != 0) {
+   } else if (prog_data->replicate_alpha && inst->target != 0) {
       /* Handle the case when fragment shader doesn't write to draw buffer
        * zero. No need to call setup_color_payload() for src0_alpha because
        * alpha value will be undefined.
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 97956003973..391d46d5471 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -259,6 +259,7 @@ public:
    fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                  fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components);
+   void emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha);
    void emit_fb_writes();
    fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld,
                                       const fs_reg &dst, unsigned target);
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 741b226b150..f8e8d36360e 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -400,6 +400,82 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 }
 
 void
+fs_visitor::emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha)
+{
+   /* We need to compute alpha to coverage dithering manually in shader
+    * and replace sample mask store with the bitwise-AND of sample mask and
+    * alpha to coverage dithering.
+    *
+    * The following formula is used to compute final sample mask:
+    *  m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
+    *  dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
+    *     0x0808 * (m & 2) | 0x0100 * (m & 1)
+    *  sample_mask = sample_mask & dither_mask
+    *
+    * It gives a number of ones proportional to the alpha for 2, 4, 8 or 16
+    * least significant bits of the result:
+    *  0.0000 0000000000000000
+    *  0.0625 0000000100000000
+    *  0.1250 0001000000010000
+    *  0.1875 0001000100010000
+    *  0.2500 1000100010001000
+    *  0.3125 1000100110001000
+    *  0.3750 1001100010011000
+    *  0.4375 1001100110011000
+    *  0.5000 1010101010101010
+    *  0.5625 1010101110101010
+    *  0.6250 1011101010111010
+    *  0.6875 1011101110111010
+    *  0.7500 1110111011101110
+    *  0.8125 1110111111101110
+    *  0.8750 1111111011111110
+    *  0.9375 1111111111111110
+    *  1.0000 1111111111111111
+    */
+   const fs_builder abld = bld.annotate("compute alpha_to_coverage & "
+      "sample_mask");
+
+   /* clamp(src0_alpha, 0.f, 1.f) */
+   const fs_reg float_tmp = abld.vgrf(BRW_REGISTER_TYPE_F);
+   set_saturate(true, abld.MOV(float_tmp, src0_alpha));
+
+   /* 16.0 * clamp(src0_alpha, 0.0, 1.0) */
+   abld.MUL(float_tmp, float_tmp, brw_imm_f(16.0));
+
+   /* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) */
+   const fs_reg m = abld.vgrf(BRW_REGISTER_TYPE_UW);
+   abld.MOV(m, float_tmp);
+
+   /* 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) */
+   const fs_reg int_tmp_1 = abld.vgrf(BRW_REGISTER_TYPE_UW);
+   const fs_reg shift_const = abld.vgrf(BRW_REGISTER_TYPE_UD);
+   abld.MOV(shift_const, brw_imm_d(0xfea80));
+   abld.AND(int_tmp_1, m, brw_imm_uw(~3));
+   abld.SHR(int_tmp_1, shift_const, int_tmp_1);
+   abld.AND(int_tmp_1, int_tmp_1, brw_imm_uw(0xf));
+   abld.MUL(int_tmp_1, int_tmp_1, brw_imm_uw(0x1111));
+
+   /* 0x0808 * (m & 2) */
+   const fs_reg int_tmp_2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
+   abld.AND(int_tmp_2, m, brw_imm_uw(2));
+   abld.MUL(int_tmp_2, int_tmp_2, brw_imm_uw(0x0808));
+
+   abld.OR(int_tmp_1, int_tmp_1, int_tmp_2);
+
+   /* 0x0100 * (m & 1) */
+   const fs_reg int_tmp_3 = abld.vgrf(BRW_REGISTER_TYPE_UW);
+   abld.AND(int_tmp_3, m, brw_imm_uw(1));
+   abld.MUL(int_tmp_3, int_tmp_3, brw_imm_uw(0x0100));
+
+   abld.OR(int_tmp_1, int_tmp_1, int_tmp_3);
+
+   /* sample_mask = sample_mask & dither_mask */
+   const fs_reg mask = abld.vgrf(BRW_REGISTER_TYPE_UD);
+   abld.AND(mask, sample_mask, int_tmp_1);
+   sample_mask = mask;
+}
+
+void
 fs_visitor::emit_fb_writes()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
@@ -427,6 +503,22 @@ fs_visitor::emit_fb_writes()
                            "in SIMD16+ mode.\n");
    }
 
+   /* ANV doesn't know about sample mask output during the wm key creation
+    * so we compute if we need replicate alpha and emit alpha to coverage
+    * workaround here.
+    */
+   prog_data->replicate_alpha = key->alpha_test_replicate_alpha ||
+      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
+       (sample_mask.file == BAD_FILE || devinfo->gen == 6));
+
+   /* From the SKL PRM, Volume 7, "Alpha Coverage":
+    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
+    *   hardware, regardless of the state setting for this feature."
+    */
+   if (devinfo->gen > 6 && key->alpha_to_coverage &&
+       sample_mask.file != BAD_FILE && this->outputs[0].file != BAD_FILE)
+      emit_alpha_to_coverage_workaround(offset(this->outputs[0], bld, 3));
+
    for (int target = 0; target < key->nr_color_regions; target++) {
       /* Skip over outputs that weren't written. */
       if (this->outputs[target].file == BAD_FILE)
@@ -436,7 +528,7 @@ fs_visitor::emit_fb_writes()
          ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
 
       fs_reg src0_alpha;
-      if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
+      if (devinfo->gen >= 6 && prog_data->replicate_alpha && target != 0)
          src0_alpha = offset(outputs[0], bld, 3);
 
       inst = emit_single_fb_write(abld, this->outputs[target],
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 90942a4524a..1c51b83b5ba 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -375,8 +375,15 @@ populate_wm_prog_key(const struct gen_device_info *devinfo,
 
    key->nr_color_regions = util_bitcount(key->color_outputs_valid);
 
-   key->replicate_alpha = key->nr_color_regions > 1 &&
-                          ms_info && ms_info->alphaToCoverageEnable;
+   /* To reduce possible shader recompilations we would need to know if
+    * there is a SampleMask output variable to compute if we should emit
+    * code to workaround the issue that hardware disables alpha to coverage
+    * when there is SampleMask output.
+    */
+   key->alpha_to_coverage = ms_info && ms_info->alphaToCoverageEnable;
+
+   /* Vulkan doesn't support fixed-function alpha test */
+   key->alpha_test_replicate_alpha = false;
 
    if (ms_info) {
       /* We should probably pull this out of the shader, but it's fairly
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 47905ca5549..d2d7974e841 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -93,8 +93,11 @@ brw_wm_debug_recompile(struct brw_context *brw, struct gl_program *prog,
                       old_key->flat_shade, key->flat_shade);
    found |= key_debug(brw, "number of color buffers",
                       old_key->nr_color_regions, key->nr_color_regions);
-   found |= key_debug(brw, "MRT alpha test or alpha-to-coverage",
-                      old_key->replicate_alpha, key->replicate_alpha);
+   found |= key_debug(brw, "MRT alpha test",
+                      old_key->alpha_test_replicate_alpha,
+                      key->alpha_test_replicate_alpha);
+   found |= key_debug(brw, "alpha to coverage",
+                      old_key->alpha_to_coverage, key->alpha_to_coverage);
    found |= key_debug(brw, "fragment color clamping",
                       old_key->clamp_fragment_color, key->clamp_fragment_color);
    found |= key_debug(brw, "per-sample interpolation",
@@ -569,10 +572,13 @@ brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
    key->force_dual_color_blend = brw->dual_color_blend_by_location &&
       (ctx->Color.BlendEnabled & 1) && ctx->Color.Blend[0]._UsesDualSrc;
 
-   /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
-   key->replicate_alpha = ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
-      (_mesa_is_alpha_test_enabled(ctx) ||
-       _mesa_is_alpha_to_coverage_enabled(ctx));
+   /* _NEW_MULTISAMPLE, _NEW_BUFFERS */
+   key->alpha_to_coverage =  _mesa_is_alpha_to_coverage_enabled(ctx);
+
+   /* _NEW_COLOR, _NEW_BUFFERS */
+   key->alpha_test_replicate_alpha =
+      ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
+      _mesa_is_alpha_test_enabled(ctx);
 
    /* _NEW_BUFFERS _NEW_MULTISAMPLE */
    /* Ignore sample qualifier while computing this flag. */