Mesa (main): intel/compiler: Handle per-primitive inputs in FS

Sat Dec 4 01:33:15 UTC 2021

Module: Mesa
Branch: main
Commit: be89ea3231c92960cf9b23b37ce6402e4612a207
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=be89ea3231c92960cf9b23b37ce6402e4612a207

Author: Caio Oliveira <caio.oliveira at intel.com>
Date:   Tue May 18 10:17:43 2021 -0700

intel/compiler: Handle per-primitive inputs in FS

In the Fragment Shader, regular inputs are laid out in the thread payload
with one dword per half-GRF, which leaves room for the two delta dwords
needed for interpolation.

Per-primitive inputs are laid out before the regular inputs, and since
they need no delta information, they are packed: each half-GRF is fully
filled with 4 dwords of input.

When num_per_primitive_inputs is zero (the default case), behavior
should be the same as before.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13661>

---

 src/intel/compiler/brw_compiler.h     |  1 +
 src/intel/compiler/brw_fs.cpp         | 48 +++++++++++++++++++++++++++++------
 src/intel/compiler/brw_fs.h           |  1 +
 src/intel/compiler/brw_fs_nir.cpp     | 20 ++++++++++++---
 src/intel/compiler/brw_fs_visitor.cpp | 41 +++++++++++++++++++++++++++---
 5 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index f0e7a773849..16bce3222e2 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -848,6 +848,7 @@ enum brw_pixel_shader_computed_depth_mode {
 struct brw_wm_prog_data {
    struct brw_stage_prog_data base;
 
+   GLuint num_per_primitive_inputs;
    GLuint num_varying_inputs;
 
    uint8_t reg_blocks_8;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 8caae21d683..d12eeba6e7c 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1871,10 +1871,31 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
 
    int urb_next = 0;
+
+   /* Per-Primitive Attributes are laid out by Hardware before the regular
+    * attributes, so order them like this to make it easy later to map setup
+    * into real HW registers.
+    */
+   if (nir->info.per_primitive_inputs) {
+      for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
+         if (nir->info.per_primitive_inputs & BITFIELD64_BIT(i)) {
+            prog_data->urb_setup[i] = urb_next++;
+         }
+      }
+
+      /* The actual setup attributes later must be aligned to a full GRF. */
+      urb_next = ALIGN(urb_next, 2);
+
+      prog_data->num_per_primitive_inputs = urb_next;
+   }
+
+   const uint64_t inputs_read =
+      nir->info.inputs_read & ~nir->info.per_primitive_inputs;
+
    /* Figure out where each of the incoming setup attributes lands. */
    if (devinfo->ver >= 6) {
-      if (util_bitcount64(nir->info.inputs_read &
-                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
+      if (util_bitcount64(inputs_read &
+                          BRW_FS_VARYING_INPUT_MASK) <= 16) {
          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
           * first 16 varying inputs, so we can put them wherever we want.
           * Just put them in order.
@@ -1885,7 +1906,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
           * a different vertex (or geometry) shader.
           */
          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
+            if (inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(i)) {
                prog_data->urb_setup[i] = urb_next++;
             }
@@ -1897,6 +1918,11 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
           * (geometry or vertex shader).
           */
 
+         /* TODO(mesh): Implement this case for Mesh. Basically, having a
+          * large number of outputs in Mesh (hence a lot of inputs in
+          * Fragment) should already trigger this.
+          */
+
          /* Re-compute the VUE map here in the case that the one coming from
           * geometry has more than one position slot (used for Primitive
           * Replication).
@@ -1907,7 +1933,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
                              nir->info.separate_shader, 1);
 
          int first_slot =
-            brw_compute_first_urb_slot_required(nir->info.inputs_read,
+            brw_compute_first_urb_slot_required(inputs_read,
                                                 &prev_stage_vue_map);
 
          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
@@ -1915,7 +1941,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
               slot++) {
             int varying = prev_stage_vue_map.slot_to_varying[slot];
             if (varying != BRW_VARYING_SLOT_PAD &&
-                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
+                (inputs_read & BRW_FS_VARYING_INPUT_MASK &
                  BITFIELD64_BIT(varying))) {
                prog_data->urb_setup[varying] = slot - first_slot;
             }
@@ -1948,12 +1974,12 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
        *
        * See compile_sf_prog() for more info.
        */
-      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+      if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
    }
 
-   prog_data->num_varying_inputs = urb_next;
-   prog_data->inputs = nir->info.inputs_read;
+   prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
+   prog_data->inputs = inputs_read;
 
    brw_compute_urb_setup_index(prog_data);
 }
@@ -1995,6 +2021,12 @@ fs_visitor::assign_urb_setup()
 
    /* Each attribute is 4 setup channels, each of which is half a reg. */
    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
+
+   /* Unlike regular attributes, per-primitive attributes have all 4 channels
+    * in the same slot, so each GRF can store two slots.
+    */
+   assert(prog_data->num_per_primitive_inputs % 2 == 0);
+   this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
 }
 
 void
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 8bc2dd29a4f..fdeb821bf69 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -332,6 +332,7 @@ public:
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
    fs_reg interp_reg(int location, int channel);
+   fs_reg per_primitive_reg(int location);
 
    virtual void dump_instructions() const;
    virtual void dump_instructions(const char *name) const;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 2b2bccdf053..a6b13456ce5 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3620,21 +3620,33 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
    }
 
    case nir_intrinsic_load_input: {
-      /* load_input is only used for flat inputs */
+      /* In Fragment Shaders load_input is used either for flat inputs or
+       * per-primitive inputs.
+       */
       assert(nir_dest_bit_size(instr->dest) == 32);
       unsigned base = nir_intrinsic_base(instr);
       unsigned comp = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
 
+      /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
+
       /* Special case fields in the VUE header */
       if (base == VARYING_SLOT_LAYER)
          comp = 1;
       else if (base == VARYING_SLOT_VIEWPORT)
          comp = 2;
 
-      for (unsigned int i = 0; i < num_components; i++) {
-         bld.MOV(offset(dest, bld, i),
-                 retype(component(interp_reg(base, comp + i), 3), dest.type));
+      if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) {
+         assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
+         for (unsigned int i = 0; i < num_components; i++) {
+            bld.MOV(offset(dest, bld, i),
+                    retype(component(per_primitive_reg(base), comp + i), dest.type));
+         }
+      } else {
+         for (unsigned int i = 0; i < num_components; i++) {
+            bld.MOV(offset(dest, bld, i),
+                    retype(component(interp_reg(base, comp + i), 3), dest.type));
+         }
       }
       break;
    }
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 060cb83dfec..4de37671b90 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -136,6 +136,11 @@ fs_visitor::emit_dummy_fs()
    calculate_cfg();
 }
 
+/* Input data is organized with first the per-primitive values, followed
+ * by per-vertex values.  The per-vertex will have interpolation information
+ * associated, so use 4 components for each value.
+ */
+
 /* The register location here is relative to the start of the URB
  * data.  It will get adjusted to be a real location before
  * generate_code() time.
@@ -144,9 +149,39 @@ fs_reg
 fs_visitor::interp_reg(int location, int channel)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
-   int regnr = prog_data->urb_setup[location] * 4 + channel;
-   assert(prog_data->urb_setup[location] != -1);
+   assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);
+
+   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   assert(prog_data->urb_setup[location] >= 0);
+   unsigned nr = prog_data->urb_setup[location];
+
+   /* Adjust so we start counting from the first per_vertex input. */
+   assert(nr >= prog_data->num_per_primitive_inputs);
+   nr -= prog_data->num_per_primitive_inputs;
+
+   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
+   const unsigned regnr = per_vertex_start + (nr * 4) + channel;
+
+   return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
+}
+
+/* The register location here is relative to the start of the URB
+ * data.  It will get adjusted to be a real location before
+ * generate_code() time.
+ */
+fs_reg
+fs_visitor::per_primitive_reg(int location)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
+
+   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   assert(prog_data->urb_setup[location] >= 0);
+
+   const unsigned regnr = prog_data->urb_setup[location];
+   assert(regnr < prog_data->num_per_primitive_inputs);
 
    return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
 }