[Mesa-dev] [PATCH v2 7/7] nvc0: enable FBFETCH with a special slot for color buffer 0

Sat Jan 14 21:28:43 UTC 2017

We don't need to support all the color buffers for advanced blend, just
cb0. For Fermi, we use the special binding slots so that we don't
overlap with user textures, while Kepler+ gets a dedicated position for
the fb handle in the driver constbuf.

This logic is only triggered when the bound fragment shader actually
contains an FBFETCH instruction, so it should be a no-op most of the time.

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---

v1 -> v2: don't enable on fermi until it can be properly tested

 docs/features.txt                                  |  2 +-
 docs/relnotes/17.0.0.html                          |  1 +
 .../drivers/nouveau/codegen/nv50_ir_driver.h       |  2 +
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 41 ++++++++++
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 20 ++++-
 src/gallium/drivers/nouveau/nvc0/nvc0_context.h    |  4 +
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c    |  6 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_program.h    |  1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c     |  9 ++-
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.h     |  2 +
 .../drivers/nouveau/nvc0/nvc0_state_validate.c     | 92 +++++++++++++++++++++-
 11 files changed, 173 insertions(+), 7 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index dd40ac4..aff0016 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -253,7 +253,7 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, radeonsi
 GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+
 
   GL_EXT_color_buffer_float                             DONE (all drivers)
-  GL_KHR_blend_equation_advanced                        DONE (i965)
+  GL_KHR_blend_equation_advanced                        DONE (i965, nvc0)
   GL_KHR_debug                                          DONE (all drivers)
   GL_KHR_robustness                                     DONE (i965, nvc0, radeonsi)
   GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
diff --git a/docs/relnotes/17.0.0.html b/docs/relnotes/17.0.0.html
index b50f9e9..71fb4c3 100644
--- a/docs/relnotes/17.0.0.html
+++ b/docs/relnotes/17.0.0.html
@@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_ARB_post_depth_coverage on i965/gen9+</li>
+<li>GL_KHR_blend_equation_advanced on nvc0</li>
 <li>GL_INTEL_conservative_rasterization on i965/gen9+</li>
 <li>GL_NV_image_formats on any driver supporting GL_ARB_shader_image_load_store (i965, nvc0, radeonsi, softpipe)</li>
 <li>GL_ARB_gpu_shader_fp64 in i965/haswell</li>
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index e85b5fa..a038801 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -146,6 +146,7 @@ struct nv50_ir_prog_info
          bool usesDiscard;
          bool persampleInvocation;
          bool usesSampleMaskIn;
+         bool readsFramebuffer;
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
@@ -178,6 +179,7 @@ struct nv50_ir_prog_info
       bool fp64;                 /* program uses fp64 math */
       bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
+      uint16_t fbtexBindBase;    /* base address for fbtex handle (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
       uint16_t bufInfoBase;      /* base address for buffer info */
       uint16_t sampleInfoBase;   /* base address for sample positions */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 86348e7..7433187 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1457,6 +1457,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
    if (insn.getOpcode() == TGSI_OPCODE_BARRIER)
       info->numBarriers = 1;
 
+   if (insn.getOpcode() == TGSI_OPCODE_FBFETCH)
+      info->prop.fp.readsFramebuffer = true;
+
    if (insn.dstCount()) {
       Instruction::DstRegister dst = insn.getDst(0);
 
@@ -1572,6 +1575,7 @@ private:
    void handleTEX(Value *dst0[4], int R, int S, int L, int C, int Dx, int Dy);
    void handleTXF(Value *dst0[4], int R, int L_M);
    void handleTXQ(Value *dst0[4], enum TexQuery, int R);
+   void handleFBFETCH(Value *dst0[4]);
    void handleLIT(Value *dst0[4]);
    void handleUserClipPlanes();
 
@@ -2281,6 +2285,40 @@ Converter::handleTXF(Value *dst[4], int R, int L_M)
 }
 
 void
+Converter::handleFBFETCH(Value *dst[4])
+{
+   TexInstruction *texi = new_TexInstruction(func, OP_TXF);
+   unsigned int c, d;
+
+   texi->tex.target = TEX_TARGET_2D_MS_ARRAY;
+   texi->tex.levelZero = 1;
+   texi->tex.useOffsets = 0;
+
+   for (c = 0, d = 0; c < 4; ++c) {
+      if (dst[c]) {
+         texi->setDef(d++, dst[c]);
+         texi->tex.mask |= 1 << c;
+      }
+   }
+
+   Value *x = mkOp1v(OP_RDSV, TYPE_F32, getScratch(), mkSysVal(SV_POSITION, 0));
+   Value *y = mkOp1v(OP_RDSV, TYPE_F32, getScratch(), mkSysVal(SV_POSITION, 1));
+   Value *z = mkOp1v(OP_RDSV, TYPE_U32, getScratch(), mkSysVal(SV_LAYER, 0));
+   Value *ms = mkOp1v(OP_RDSV, TYPE_U32, getScratch(), mkSysVal(SV_SAMPLE_INDEX, 0));
+
+   mkCvt(OP_CVT, TYPE_U32, x, TYPE_F32, x)->rnd = ROUND_Z;
+   mkCvt(OP_CVT, TYPE_U32, y, TYPE_F32, y)->rnd = ROUND_Z;
+   texi->setSrc(0, x);
+   texi->setSrc(1, y);
+   texi->setSrc(2, z);
+   texi->setSrc(3, ms);
+
+   texi->tex.r = texi->tex.s = -1;
+
+   bb->insertTail(texi);
+}
+
+void
 Converter::handleLIT(Value *dst0[4])
 {
    Value *val0 = NULL;
@@ -3321,6 +3359,9 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       handleTXQ(dst0, TXQ_TYPE, 0);
       std::swap(dst0[0], dst0[2]);
       break;
+   case TGSI_OPCODE_FBFETCH:
+      handleFBFETCH(dst0);
+      break;
    case TGSI_OPCODE_F2I:
    case TGSI_OPCODE_F2U:
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 95de87c..ec50578 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -749,7 +749,10 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
          i->setIndirectR(hnd);
          i->setIndirectS(NULL);
       } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
-         i->tex.r += prog->driver->io.texBindBase / 4;
+         if (i->tex.r == 0xffff)
+            i->tex.r = prog->driver->io.fbtexBindBase / 4;
+         else
+            i->tex.r += prog->driver->io.texBindBase / 4;
          i->tex.s  = 0; // only a single cX[] value possible here
       } else {
          Value *hnd = bld.getScratch();
@@ -805,6 +808,11 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
       Value *ticRel = i->getIndirectR();
       Value *tscRel = i->getIndirectS();
 
+      if (i->tex.r == 0xffff) {
+         i->tex.r = 0x20;
+         i->tex.s = 0x10;
+      }
+
       if (ticRel) {
          i->setSrc(i->tex.rIndirectSrc, NULL);
          if (i->tex.r)
@@ -2507,9 +2515,13 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
    default:
       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
-      ld = bld.mkFetch(i->getDef(0), i->dType,
-                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
-      ld->perPatch = i->perPatch;
+      if (prog->getType() == Program::TYPE_FRAGMENT) {
+         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
+      } else {
+         ld = bld.mkFetch(i->getDef(0), i->dType,
+                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
+         ld->perPatch = i->perPatch;
+      }
       break;
    }
    bld.getBB()->remove(i);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 37aecae..79a5333 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -120,6 +120,9 @@
 /* block/grid size, at 3 32-bits integers each, gridid and work_dim */
 #define NVC0_CB_AUX_GRID_INFO(i)    0x100 + (i) * 4 /* CP */
 #define NVC0_CB_AUX_GRID_SIZE       (8 * 4)
+/* FB texture handle */
+#define NVC0_CB_AUX_FB_TEX_INFO     0x100 /* FP */
+#define NVC0_CB_AUX_FB_TEX_SIZE     (4)
 /* 8 user clip planes, at 4 32-bits floats each */
 #define NVC0_CB_AUX_UCP_INFO        0x120
 #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
@@ -206,6 +209,7 @@ struct nvc0_context {
    unsigned num_samplers[6];
    uint32_t samplers_dirty[6];
    bool seamless_cube_map;
+   struct pipe_sampler_view *fbtexture;
 
    uint32_t tex_handles[6][PIPE_MAX_SAMPLERS]; /* for nve4 */
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index a4a164f..6cc5183 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -486,6 +486,11 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
 
    fp->fp.early_z = info->prop.fp.earlyFragTests;
    fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;
+   fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;
+
+   /* Mark position xy and layer as read */
+   if (fp->fp.reads_framebuffer)
+      fp->hdr[5] |= 0x32000000;
 
    return 0;
 }
@@ -583,6 +588,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
    if (info->target >= NVISA_GK104_CHIPSET) {
       info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
+      info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO;
    }
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index d33aa04..421ca19 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -49,6 +49,7 @@ struct nvc0_program {
       bool sample_mask_in;
       bool force_persample_interp;
       bool flatshade;
+      bool reads_framebuffer;
    } fp;
    struct {
       uint32_t tess_mode; /* ~0 if defined by the other stage */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index ac9dd5b..1ef481b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -38,6 +38,8 @@
 #include "nvc0/mme/com9097.mme.h"
 #include "nvc0/mme/com90c0.mme.h"
 
+#include "nv50/g80_texture.xml.h"
+
 static boolean
 nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
@@ -247,6 +249,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
+   case PIPE_CAP_TGSI_FS_FBFETCH:
+      return class_3d >= NVE4_3D_CLASS; /* needs testing on fermi */
 
    /* unsupported caps */
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
@@ -275,7 +279,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
    case PIPE_CAP_NATIVE_FENCE_FD:
    case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
-   case PIPE_CAP_TGSI_FS_FBFETCH:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -535,6 +538,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    nouveau_heap_destroy(&screen->lib_code);
    nouveau_heap_destroy(&screen->text_heap);
 
+   FREE(screen->default_tsc);
    FREE(screen->tic.entries);
 
    nouveau_object_del(&screen->eng3d);
@@ -1226,6 +1230,9 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!nvc0_blitter_create(screen))
       goto fail;
 
+   screen->default_tsc = CALLOC_STRUCT(nv50_tsc_entry);
+   screen->default_tsc->tsc[0] = G80_TSC_0_SRGB_CONVERSION;
+
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return &screen->base;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index aff0308..a6d4a2b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -81,6 +81,8 @@ struct nvc0_screen {
 
    struct nvc0_blitter *blitter;
 
+   struct nv50_tsc_entry *default_tsc;
+
    struct {
       void **entries;
       int next;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 88766f4..30b971b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -604,7 +604,9 @@ nvc0_validate_min_samples(struct nvc0_context *nvc0)
       // If we're using the incoming sample mask and doing sample shading, we
       // have to do sample shading "to the max", otherwise there's no way to
       // tell which sets of samples are covered by the current invocation.
-      if (nvc0->fragprog->fp.sample_mask_in)
+      // Similarly for reading the framebuffer.
+      if (nvc0->fragprog->fp.sample_mask_in ||
+          nvc0->fragprog->fp.reads_framebuffer)
          samples = util_framebuffer_get_num_samples(&nvc0->framebuffer);
       samples |= NVC0_3D_SAMPLE_SHADING_ENABLE;
    }
@@ -700,6 +702,92 @@ nvc0_validate_tess_state(struct nvc0_context *nvc0)
    PUSH_DATAp(push, nvc0->default_tess_inner, 2);
 }
 
+/* If we have a frag shader bound which tries to read from the framebuffer, we
+ * have to make sure that the fb is bound as a texture in the expected
+ * location. For Fermi, that's in the special driver slot 16, while for Kepler
+ * it's a regular binding stored in the driver constbuf.
+ */
+static void
+nvc0_validate_fbread(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct pipe_sampler_view *old_view = nvc0->fbtexture;
+   struct pipe_sampler_view *new_view = NULL;
+
+   if (nvc0->fragprog->fp.reads_framebuffer &&
+       nvc0->framebuffer.nr_cbufs &&
+       nvc0->framebuffer.cbufs[0]) {
+      struct pipe_sampler_view tmpl;
+      struct pipe_surface *sf = nvc0->framebuffer.cbufs[0];
+
+      tmpl.target = PIPE_TEXTURE_2D_ARRAY;
+      tmpl.format = sf->format;
+      tmpl.u.tex.first_level = tmpl.u.tex.last_level = sf->u.tex.level;
+      tmpl.u.tex.first_layer = sf->u.tex.first_layer;
+      tmpl.u.tex.last_layer = sf->u.tex.last_layer;
+      tmpl.swizzle_r = PIPE_SWIZZLE_X;
+      tmpl.swizzle_g = PIPE_SWIZZLE_Y;
+      tmpl.swizzle_b = PIPE_SWIZZLE_Z;
+      tmpl.swizzle_a = PIPE_SWIZZLE_W;
+
+      /* Bail if it's the same parameters */
+      if (old_view && old_view->texture == sf->texture &&
+          old_view->format == sf->format &&
+          old_view->u.tex.first_level == sf->u.tex.level &&
+          old_view->u.tex.first_layer == sf->u.tex.first_layer &&
+          old_view->u.tex.last_layer == sf->u.tex.last_layer)
+         return;
+
+      new_view = pipe->create_sampler_view(pipe, sf->texture, &tmpl);
+   } else if (old_view == NULL) {
+      return;
+   }
+
+   if (old_view)
+      pipe_sampler_view_reference(&nvc0->fbtexture, NULL);
+   nvc0->fbtexture = new_view;
+
+   if (screen->default_tsc->id < 0) {
+      struct nv50_tsc_entry *tsc = nv50_tsc_entry(screen->default_tsc);
+      tsc->id = nvc0_screen_tsc_alloc(screen, tsc);
+      nvc0->base.push_data(&nvc0->base, screen->txc, 65536 + tsc->id * 32,
+                           NV_VRAM_DOMAIN(&screen->base), 32, tsc->tsc);
+      screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
+
+      IMMED_NVC0(push, NVC0_3D(TSC_FLUSH), 0);
+      if (screen->base.class_3d < NVE4_3D_CLASS) {
+         BEGIN_NVC0(push, NVC0_3D(BIND_TSC2(0)), 1);
+         PUSH_DATA (push, (tsc->id << 12) | 1);
+      }
+   }
+
+   if (new_view) {
+      struct nv50_tic_entry *tic = nv50_tic_entry(new_view);
+      assert(tic->id < 0);
+      tic->id = nvc0_screen_tic_alloc(screen, tic);
+      nvc0->base.push_data(&nvc0->base, screen->txc, tic->id * 32,
+                           NV_VRAM_DOMAIN(&screen->base), 32, tic->tic);
+      screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+      if (screen->base.class_3d >= NVE4_3D_CLASS) {
+         BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+         PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+         PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+         PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 1);
+         PUSH_DATA (push, NVC0_CB_AUX_FB_TEX_INFO);
+         PUSH_DATA (push, (screen->default_tsc->id << 20) | tic->id);
+      } else {
+         BEGIN_NVC0(push, NVC0_3D(BIND_TIC2(0)), 1);
+         PUSH_DATA (push, (tic->id << 9) | 1);
+      }
+
+      IMMED_NVC0(push, NVC0_3D(TIC_FLUSH), 0);
+   }
+}
+
 static void
 nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
 {
@@ -781,6 +869,8 @@ validate_list_3d[] = {
     { nvc0_validate_textures,      NVC0_NEW_3D_TEXTURES },
     { nvc0_validate_samplers,      NVC0_NEW_3D_SAMPLERS },
     { nve4_set_tex_handles,        NVC0_NEW_3D_TEXTURES | NVC0_NEW_3D_SAMPLERS },
+    { nvc0_validate_fbread,        NVC0_NEW_3D_FRAGPROG |
+                                   NVC0_NEW_3D_FRAMEBUFFER },
     { nvc0_vertex_arrays_validate, NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS },
     { nvc0_validate_surfaces,      NVC0_NEW_3D_SURFACES },
     { nvc0_validate_buffers,       NVC0_NEW_3D_BUFFERS },
-- 
2.10.2