[Mesa-dev] [PATCH 4/6] i965: Don't re-layout varyings for separate shader programs.

Fri Sep 11 23:58:48 PDT 2015

Previously, our VUE map code always assigned slots to varyings
sequentially, in one contiguous block.

This was a bad fit for separate shaders - the GS input layout depended
or the VS output layout, so if we swapped out vertex shaders, we might
have to recompile the GS on the fly - which rather defeats the point of
using separate shader objects.  (Tessellation would suffer from this
as well - we could have to recompile the HS, DS, and GS.)

Instead, this patch makes the VUE map for separate shaders use a fixed
layout, based on the input/output variable's location field.  (This is
either specified by layout(location = ...) or assigned by the linker.)
Corresponding inputs/outputs will match up by location; if there's a
mismatch, we're allowed to have undefined behavior.

This may be less efficient - depending what locations were chosen, we
may have empty padding slots in the VUE.  But applications presumably
use small consecutive integers for locations, so it hopefully won't be
much worse in practice.

3% of Dota 2 Reborn shaders are hurt, but only by 2 instructions.
This seems like a small price to pay for avoiding recompiles.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h | 14 ++++++++-
 src/mesa/drivers/dri/i965/brw_fs.cpp    |  3 +-
 src/mesa/drivers/dri/i965/brw_gs.c      | 10 +++++--
 src/mesa/drivers/dri/i965/brw_vs.c      |  7 +++--
 src/mesa/drivers/dri/i965/brw_vue_map.c | 51 ++++++++++++++++++++++++++-------
 5 files changed, 67 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b05b8bd..772a9fd 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -541,6 +541,17 @@ struct brw_vue_map {
    GLbitfield64 slots_valid;
 
    /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
     * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
     * not stored in a slot (because they are not written, or because
     * additional processing is applied before storing them in the VUE), the
@@ -585,7 +596,8 @@ static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
 
 void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid);
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
 
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 320f612..3eca0f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1437,7 +1437,8 @@ fs_visitor::calculate_urb_setup()
           */
          struct brw_vue_map prev_stage_vue_map;
          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                             key->input_slots_valid);
+                             key->input_slots_valid,
+                             shader_prog->SeparateShader);
          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 16ea684..38b3e3a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -120,7 +120,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written);
+                       &c.prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    /* Compute the output vertex size.
     *
@@ -243,7 +244,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
       get_hw_prim_for_gl_prim(gp->program.OutputType);
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, c.key.input_varyings);
+                       &c.input_vue_map, c.key.input_varyings,
+                       prog->SeparateShader);
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
@@ -357,7 +359,9 @@ brw_upload_gs_prog(struct brw_context *brw)
    brw->gs.base.prog_data = &brw->gs.prog_data->base.base;
 
    if (brw->gs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->gs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 4e0d34f..adc1810 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -178,7 +178,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &prog_data.base.vue_map, outputs_written);
+                       &prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
@@ -369,7 +370,9 @@ brw_upload_vs_prog(struct brw_context *brw)
    brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
 
    if (brw->vs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->vs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_vs = brw->vs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_VS;
       if (brw->gen < 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 1ef5214..45662bd 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -59,10 +59,18 @@ assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
 void
 brw_compute_vue_map(const struct brw_device_info *devinfo,
                     struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
+                    GLbitfield64 slots_valid,
+                    bool separate)
 {
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6.  It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
    vue_map->slots_valid = slots_valid;
-   int i;
+   vue_map->separate = separate;
 
    /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
     * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
@@ -77,7 +85,7 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
     */
    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
 
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
       vue_map->varying_to_slot[i] = -1;
       vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
    }
@@ -131,21 +139,42 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
          assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
    }
 
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
     *
     * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
     * since it's encoded as the clip distances by emit_clip_distances().
     * However, it may be output by transform feedback, and we'd rather not
     * recompute state when TF changes, so we just always include it.
     */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i, slot++);
+   GLbitfield64 builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   GLbitfield64 generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+         assign_vue_slot(vue_map, varying, slot);
+      } else {
+         assign_vue_slot(vue_map, varying, slot++);
       }
+      generics &= ~BITFIELD64_BIT(varying);
    }
 
-   vue_map->num_slots = slot;
+   vue_map->num_slots = separate ? slot + 1 : slot;
 }
-- 
2.5.1