Mesa (master): intel/compiler: add support for fragment coordinate with coarse pixels

Sun May 2 20:26:07 UTC 2021

Module: Mesa
Branch: master
Commit: 6d4070f3ddb5a5aafaf7f7f51e2f503b78fd0868
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6d4070f3ddb5a5aafaf7f7f51e2f503b78fd0868

Author: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Date:   Thu Oct 29 15:10:59 2020 +0200

intel/compiler: add support for fragment coordinate with coarse pixels

v2: Drop new internal opcodes (Jason)
    Simplify code (Jason)

v3: Add Z computation for coarse pixels

v4: Document things a little

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7455>

---

 src/intel/compiler/brw_compiler.h       |   1 +
 src/intel/compiler/brw_fs.cpp           |  27 ++++--
 src/intel/compiler/brw_fs.h             |   2 +
 src/intel/compiler/brw_fs_generator.cpp |  18 +++-
 src/intel/compiler/brw_fs_visitor.cpp   | 141 ++++++++++++++++++++++++++++++--
 5 files changed, 172 insertions(+), 17 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 03eb7e1a7d7..805537488e4 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -845,6 +845,7 @@ struct brw_wm_prog_data {
    bool uses_kill;
    bool uses_src_depth;
    bool uses_src_w;
+   bool uses_depth_w_coefficients;
    bool uses_sample_mask;
    bool has_render_target_reads;
    bool has_side_effects;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 957f5a2838f..46a9e7cefc4 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -754,8 +754,11 @@ fs_inst::components_read(unsigned i) const
 
    case FS_OPCODE_PIXEL_X:
    case FS_OPCODE_PIXEL_Y:
-      assert(i == 0);
-      return 2;
+      assert(i < 2);
+      if (i == 0)
+         return 2;
+      else
+         return 1;
 
    case FS_OPCODE_FB_WRITE_LOGICAL:
       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
@@ -1241,7 +1244,7 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
 
    /* gl_FragCoord.z */
    if (devinfo->ver >= 6) {
-      bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
+      bld.MOV(wpos, this->pixel_z);
    } else {
       bld.emit(FS_OPCODE_LINTERP, wpos,
                this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
@@ -7829,6 +7832,12 @@ fs_visitor::setup_fs_payload_gfx6()
          payload.sample_mask_in_reg[j] = payload.num_regs;
          payload.num_regs += payload_width / 8;
       }
+
+      /* R66: Source Depth and/or W Attribute Vertex Deltas */
+      if (prog_data->uses_depth_w_coefficients) {
+         payload.depth_w_coef_reg[j] = payload.num_regs;
+         payload.num_regs++;
+      }
    }
 
    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
@@ -9099,9 +9108,6 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
                               const struct brw_wm_prog_key *key,
                               struct brw_wm_prog_data *prog_data)
 {
-   prog_data->uses_src_depth = prog_data->uses_src_w =
-      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
-
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
     */
@@ -9154,6 +9160,15 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
       (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
       !prog_data->computed_stencil;
 
+   prog_data->uses_src_w =
+      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
+   prog_data->uses_src_depth =
+      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
+      !prog_data->per_coarse_pixel_dispatch;
+   prog_data->uses_depth_w_coefficients =
+      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
+      prog_data->per_coarse_pixel_dispatch;
+
    calculate_urb_setup(devinfo, key, prog_data, shader);
    brw_compute_flat_inputs(prog_data, shader);
 }
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 3b13e557568..8e4212d2273 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -397,6 +397,7 @@ public:
       uint8_t dest_depth_reg[2];
       uint8_t sample_pos_reg[2];
       uint8_t sample_mask_in_reg[2];
+      uint8_t depth_w_coef_reg[2];
       uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
       uint8_t local_invocation_id_reg[2];
 
@@ -409,6 +410,7 @@ public:
 
    fs_reg pixel_x;
    fs_reg pixel_y;
+   fs_reg pixel_z;
    fs_reg wpos_w;
    fs_reg pixel_w;
    fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index cdd361e138c..ad7686e179e 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2289,13 +2289,27 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 	 break;
       case FS_OPCODE_PIXEL_X:
          assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         assert(src[1].type == BRW_REGISTER_TYPE_UW);
          src[0].subnr = 0 * type_sz(src[0].type);
-         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         if (src[1].file == BRW_IMMEDIATE_VALUE) {
+            assert(src[1].ud == 0);
+            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         } else {
+            /* Coarse pixel case */
+            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
+         }
          break;
       case FS_OPCODE_PIXEL_Y:
          assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         assert(src[1].type == BRW_REGISTER_TYPE_UW);
          src[0].subnr = 4 * type_sz(src[0].type);
-         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         if (src[1].file == BRW_IMMEDIATE_VALUE) {
+            assert(src[1].ud == 0);
+            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         } else {
+            /* Coarse pixel case */
+            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
+         }
          break;
 
       case SHADER_OPCODE_SEND:
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 2dc5d8fb3d9..081023ec16d 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -189,6 +189,8 @@ fs_visitor::emit_interpolation_setup_gfx4()
       abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
    }
 
+   this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
+
    abld = bld.annotate("compute pos.w and 1/pos.w");
    /* Compute wpos.w.  It's always in our setup, since it's needed to
     * interpolate the other attributes.
@@ -274,6 +276,76 @@ fs_visitor::emit_interpolation_setup_gfx6()
    this->pixel_x = vgrf(glsl_type::float_type);
    this->pixel_y = vgrf(glsl_type::float_type);
 
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
+
+   fs_reg int_pixel_offset_xy, half_int_pixel_offset_x, half_int_pixel_offset_y;
+   if (!wm_prog_data->per_coarse_pixel_dispatch) {
+      /* The thread payload only delivers subspan locations (ss0, ss1,
+       * ss2, ...). Since subspans cover 2x2 pixel blocks, we need to
+       * generate 4 pixel coordinates out of each subspan location. We do this
+       * by replicating a subspan coordinate 4 times and adding an offset of 1
+       * in each direction from the initial top left (tl) location to generate
+       * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
+       * (br = +1 in x, +1 in y).
+       *
+       * The locations we build look like this in SIMD8 :
+       *
+       *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+       *
+       * The value 0x11001010 is a vector of 8 half-byte values. It adds the
+       * following to generate the 4 pixel coordinates out of subspan 0:
+       *
+       *  0x
+       *    1 : ss0.y + 1 -> ss0.br.y
+       *    1 : ss0.y + 1 -> ss0.bl.y
+       *    0 : ss0.y + 0 -> ss0.tr.y
+       *    0 : ss0.y + 0 -> ss0.tl.y
+       *    1 : ss0.x + 1 -> ss0.br.x
+       *    0 : ss0.x + 0 -> ss0.bl.x
+       *    1 : ss0.x + 1 -> ss0.tr.x
+       *    0 : ss0.x + 0 -> ss0.tl.x
+       *
+       * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
+       * coordinates out of 2 subspan coordinates in a single ADD instruction
+       * (twice the operation above).
+       */
+      int_pixel_offset_xy = fs_reg(brw_imm_v(0x11001010));
+      half_int_pixel_offset_x = fs_reg(brw_imm_uw(0));
+      half_int_pixel_offset_y = fs_reg(brw_imm_uw(0));
+   } else {
+      /* In coarse pixel dispatch we have to do the same ADD instruction that
+       * we do in normal per pixel dispatch, except this time we're not adding
+       * 1 in each direction, but instead the coarse pixel size.
+       *
+       * The coarse pixel size is delivered as 2 u8 in r1.0
+       */
+      struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB);
+
+      const fs_builder dbld =
+         abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
+
+      /* To build the array of half bytes we do an AND operation with the
+       * right mask in X.
+       */
+      fs_reg int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
+
+      /* And the right mask in Y. */
+      fs_reg int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
+
+      /* Finally OR the 2 registers. */
+      int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+      dbld.OR(int_pixel_offset_xy, int_pixel_offset_x, int_pixel_offset_y);
+
+      /* Also compute the half pixel size used to center pixels. */
+      half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
+
+      bld.SHR(half_int_pixel_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
+      bld.SHR(half_int_pixel_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
+   }
+
    for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
       const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
       struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW);
@@ -311,10 +383,12 @@ fs_visitor::emit_interpolation_setup_gfx6()
 
          dbld.ADD(int_pixel_xy,
                   fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
-                  fs_reg(brw_imm_v(0x11001010)));
+                  int_pixel_offset_xy);
 
-         hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy);
-         hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy);
+         hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
+                                      horiz_stride(half_int_pixel_offset_x, 0));
+         hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
+                                      horiz_stride(half_int_pixel_offset_y, 0));
       } else {
          /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
           *
@@ -343,12 +417,61 @@ fs_visitor::emit_interpolation_setup_gfx6()
       }
    }
 
-   abld = bld.annotate("compute pos.w");
-   this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
-   this->wpos_w = vgrf(glsl_type::float_type);
-   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+   abld = bld.annotate("compute pos.z");
+   if (wm_prog_data->uses_depth_w_coefficients) {
+      assert(!wm_prog_data->uses_src_depth);
+      /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
+       * properly. In the same way that we have to add the coarse pixel size
+       * to pixel locations, here we recompute the Z value using 2
+       * coefficients, one along each of the X & Y axes.
+       */
+      fs_reg coef_payload = fetch_payload_reg(abld, payload.depth_w_coef_reg, BRW_REGISTER_TYPE_F);
+      const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
+      const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
+      const fs_reg z_cx    = brw_vec1_grf(coef_payload.nr, 1);
+      const fs_reg z_cy    = brw_vec1_grf(coef_payload.nr, 0);
+      const fs_reg z_c0    = brw_vec1_grf(coef_payload.nr, 3);
+
+      const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F);
+      const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F);
+
+      abld.ADD(float_pixel_x, this->pixel_x, negate(x_start));
+      abld.ADD(float_pixel_y, this->pixel_y, negate(y_start));
+
+      /* r1.0 - 7:0 ActualCoarsePixelShadingSize.X */
+      const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
+      /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
+      const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1);
+      const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD);
+      const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD);
+      abld.MOV(u32_cps_width, u8_cps_width);
+      abld.MOV(u32_cps_height, u8_cps_height);
+
+      const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F);
+      const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F);
+      abld.MOV(f_cps_width, u32_cps_width);
+      abld.MOV(f_cps_height, u32_cps_height);
+
+      /* Center in the middle of the coarse pixel. */
+      abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
+      abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
+
+      this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
+      abld.MAD(this->pixel_z, z_c0, z_cx, float_pixel_x);
+      abld.MAD(this->pixel_z, this->pixel_z, z_cy, float_pixel_y);
+   }
 
-   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
+   if (wm_prog_data->uses_src_depth) {
+      assert(!wm_prog_data->uses_depth_w_coefficients);
+      this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
+   }
+
+   if (wm_prog_data->uses_src_w) {
+      abld = bld.annotate("compute pos.w");
+      this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
+      this->wpos_w = vgrf(glsl_type::float_type);
+      abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+   }
 
    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
       this->delta_xy[i] = fetch_barycentric_reg(
@@ -462,7 +585,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
          src_depth = frag_depth;
       else
-         src_depth = fetch_payload_reg(bld, payload.source_depth_reg);
+         src_depth = this->pixel_z;
    }
 
    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))