Mesa (main): freedreno/ir3: xfb fix for duplicate outputs

Sun Oct 31 16:49:58 UTC 2021

Module: Mesa
Branch: main
Commit: 7e998783db80a153978bea165132e349896792d8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7e998783db80a153978bea165132e349896792d8

Author: Rob Clark <robdclark at chromium.org>
Date:   Sat Oct 30 10:51:59 2021 -0700

freedreno/ir3: xfb fix for duplicate outputs

We can't rely on regid to be unique: shaders can have multiple varyings
with the same output value.  Normally shader linking deduplicates these,
but we still need to handle the case for transform feedback (xfb).  So
use the slot as the unique identifier instead.

Fixes KHR-GLES31.core.gpu_shader5.fma_precision_*

Signed-off-by: Rob Clark <robdclark at chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13605>

---

 src/freedreno/ci/freedreno-a630-fails.txt        |  6 ------
 src/freedreno/ir3/ir3_shader.c                   |  8 +++++---
 src/freedreno/ir3/ir3_shader.h                   |  9 ++++++---
 src/freedreno/vulkan/tu_pipeline.c               | 19 +++++++++++--------
 src/gallium/drivers/freedreno/a5xx/fd5_program.c | 12 +++++++-----
 src/gallium/drivers/freedreno/a6xx/fd6_program.c | 14 ++++++++------
 6 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/src/freedreno/ci/freedreno-a630-fails.txt b/src/freedreno/ci/freedreno-a630-fails.txt
index 186cbebbd69..744a708d4b4 100644
--- a/src/freedreno/ci/freedreno-a630-fails.txt
+++ b/src/freedreno/ci/freedreno-a630-fails.txt
@@ -14,12 +14,6 @@ KHR-GL33.transform_feedback.query_vertex_separate_test,Fail
 # "*** Color comparison failed"
 KHR-GLES3.packed_depth_stencil.verify_read_pixels.depth24_stencil8,Fail
 
-# "The values of resultStd[i] & 0xFFFFFFFE and resultFma[i] & 0xFFFFFFFE and resultCPU[i] & 0xFFFFFFFE are not bitwise equal for i = 0..99 "
-KHR-GLES31.core.gpu_shader5.fma_precision_float,Fail
-KHR-GLES31.core.gpu_shader5.fma_precision_vec2,Fail
-KHR-GLES31.core.gpu_shader5.fma_precision_vec3,Fail
-KHR-GLES31.core.gpu_shader5.fma_precision_vec4,Fail
-
 # Lots of errors like "[279] Check failed. Received: [3,0,0,2] instead of: [5,0,0,2]"
 KHR-GLES31.core.geometry_shader.layered_framebuffer.depth_support,Fail
 
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index f2b5afbb0dc..4be8c411c93 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -823,14 +823,16 @@ ir3_link_stream_out(struct ir3_shader_linkage *l,
          continue;
 
       for (idx = 0; idx < l->cnt; idx++) {
-         if (l->var[idx].regid == v->outputs[k].regid)
+         if (l->var[idx].slot == v->outputs[k].slot)
             break;
          nextloc = MAX2(nextloc, l->var[idx].loc + 4);
       }
 
       /* add if not already in linkage map: */
-      if (idx == l->cnt)
-         ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);
+      if (idx == l->cnt) {
+         ir3_link_add(l, v->outputs[k].slot, v->outputs[k].regid,
+                      compmask, nextloc);
+      }
 
       /* expand component-mask if needed, ie streaming out all components
        * but frag shader doesn't consume all components:
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 4a4fd663e35..b4585724934 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -891,6 +891,7 @@ struct ir3_shader_linkage {
 
    /* Map from VS output to location. */
    struct {
+      uint8_t slot;
       uint8_t regid;
       uint8_t compmask;
       uint8_t loc;
@@ -907,8 +908,8 @@ struct ir3_shader_linkage {
 };
 
 static inline void
-ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask,
-             uint8_t loc)
+ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
+             uint8_t compmask, uint8_t loc)
 {
    for (int j = 0; j < util_last_bit(compmask); j++) {
       uint8_t comploc = loc + j;
@@ -921,6 +922,7 @@ ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask,
       int i = l->cnt++;
       debug_assert(i < ARRAY_SIZE(l->var));
 
+      l->var[i].slot = slot;
       l->var[i].regid = regid_;
       l->var[i].compmask = compmask;
       l->var[i].loc = loc;
@@ -974,7 +976,8 @@ ir3_link_shaders(struct ir3_shader_linkage *l,
       if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
          l->clip1_loc = fs->inputs[j].inloc;
 
-      ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid,
+      ir3_link_add(l, fs->inputs[j].slot,
+                   k >= 0 ? vs->outputs[k].regid : default_regid,
                    fs->inputs[j].compmask, fs->inputs[j].inloc);
    }
 }
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 7543e4503c0..0072562e220 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -774,7 +774,7 @@ tu6_setup_streamout(struct tu_cs *cs,
        * a bit less ideal here..
        */
       for (idx = 0; idx < l->cnt; idx++)
-         if (l->var[idx].regid == v->outputs[k].regid)
+         if (l->var[idx].slot == v->outputs[k].slot)
             break;
 
       debug_assert(idx < l->cnt);
@@ -1006,12 +1006,12 @@ tu6_emit_vpc(struct tu_cs *cs,
 
    if (layer_regid != regid(63, 0)) {
       layer_loc = linkage.max_loc;
-      ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
+      ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
    }
 
    if (view_regid != regid(63, 0)) {
       view_loc = linkage.max_loc;
-      ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
+      ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
    }
 
    unsigned extra_pos = 0;
@@ -1023,14 +1023,15 @@ tu6_emit_vpc(struct tu_cs *cs,
       if (position_loc == 0xff)
          position_loc = linkage.max_loc;
 
-      ir3_link_add(&linkage, last_shader->outputs[i].regid,
+      ir3_link_add(&linkage, last_shader->outputs[i].slot,
+                   last_shader->outputs[i].regid,
                    0xf, position_loc + 4 * last_shader->outputs[i].view);
       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
    }
 
    if (pointsize_regid != regid(63, 0)) {
       pointsize_loc = linkage.max_loc;
-      ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
+      ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
    }
 
    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
@@ -1039,11 +1040,13 @@ tu6_emit_vpc(struct tu_cs *cs,
    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
       clip0_loc = linkage.max_loc;
-      ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
+      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
+                   clip_cull_mask & 0xf, linkage.max_loc);
    }
    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
       clip1_loc = linkage.max_loc;
-      ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
+      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
+                   clip_cull_mask >> 4, linkage.max_loc);
    }
 
    tu6_setup_streamout(cs, last_shader, &linkage);
@@ -1054,7 +1057,7 @@ tu6_emit_vpc(struct tu_cs *cs,
     * any unused code and make sure that optimizations don't remove it.
     */
    if (linkage.cnt == 0)
-      ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);
+      ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
 
    /* map outputs of the last shader to VPC */
    assert(linkage.cnt <= 32);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
index 2236420cdf1..36c54b3db38 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -105,7 +105,7 @@ emit_stream_out(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v,
        * a bit less ideal here..
        */
       for (idx = 0; idx < l->cnt; idx++)
-         if (l->var[idx].regid == v->outputs[k].regid)
+         if (l->var[idx].slot == v->outputs[k].slot)
             break;
 
       debug_assert(idx < l->cnt);
@@ -408,11 +408,11 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
    /* a5xx appends pos/psize to end of the linkage map: */
    if (VALIDREG(pos_regid))
-      ir3_link_add(&l, pos_regid, 0xf, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_POS, pos_regid, 0xf, l.max_loc);
 
    if (VALIDREG(psize_regid)) {
       psize_loc = l.max_loc;
-      ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_PSIZ, psize_regid, 0x1, l.max_loc);
    }
 
    /* Handle the case where clip/cull distances aren't read by the FS. Make
@@ -422,13 +422,15 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
    if (clip0_loc == 0xff && VALIDREG(clip0_regid) &&
        (clip_cull_mask & 0xf) != 0) {
       clip0_loc = l.max_loc;
-      ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_CLIP_DIST0, clip0_regid,
+                   clip_cull_mask & 0xf, l.max_loc);
    }
 
    if (clip1_loc == 0xff && VALIDREG(clip1_regid) &&
        (clip_cull_mask >> 4) != 0) {
       clip1_loc = l.max_loc;
-      ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_CLIP_DIST1, clip1_regid,
+                   clip_cull_mask >> 4, l.max_loc);
    }
 
    /* If we have stream-out, we use the full shader for binning
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 621cce2bb87..39ad4d928ec 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -210,7 +210,7 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state,
        * a bit less ideal here..
        */
       for (idx = 0; idx < l->cnt; idx++)
-         if (l->var[idx].regid == v->outputs[k].regid)
+         if (l->var[idx].slot == v->outputs[k].slot)
             break;
 
       debug_assert(idx < l->cnt);
@@ -560,17 +560,17 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx,
 
    if (VALIDREG(layer_regid)) {
       layer_loc = l.max_loc;
-      ir3_link_add(&l, layer_regid, 0x1, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_LAYER, layer_regid, 0x1, l.max_loc);
    }
 
    if (VALIDREG(pos_regid)) {
       pos_loc = l.max_loc;
-      ir3_link_add(&l, pos_regid, 0xf, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_POS, pos_regid, 0xf, l.max_loc);
    }
 
    if (VALIDREG(psize_regid)) {
       psize_loc = l.max_loc;
-      ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_PSIZ, psize_regid, 0x1, l.max_loc);
    }
 
    /* Handle the case where clip/cull distances aren't read by the FS. Make
@@ -580,13 +580,15 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx,
    if (clip0_loc == 0xff && VALIDREG(clip0_regid) &&
        (clip_cull_mask & 0xf) != 0) {
       clip0_loc = l.max_loc;
-      ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_CLIP_DIST0, clip0_regid,
+                   clip_cull_mask & 0xf, l.max_loc);
    }
 
    if (clip1_loc == 0xff && VALIDREG(clip1_regid) &&
        (clip_cull_mask >> 4) != 0) {
       clip1_loc = l.max_loc;
-      ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc);
+      ir3_link_add(&l, VARYING_SLOT_CLIP_DIST1, clip1_regid,
+                   clip_cull_mask >> 4, l.max_loc);
    }
 
    /* If we have stream-out, we use the full shader for binning