Mesa (master): nv50: add support for doing membars

Sat May 1 20:13:56 UTC 2021

Module: Mesa
Branch: master
Commit: b53b96a86a13cc321ef8812b9ec96c2c229e8e5c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b53b96a86a13cc321ef8812b9ec96c2c229e8e5c

Author: Ilia Mirkin <imirkin at alum.mit.edu>
Date:   Sat Mar 20 23:40:00 2021 -0400

nv50: add support for doing membars

This requires an address that's safe to read from.

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>

---

 .../drivers/nouveau/codegen/nv50_ir_driver.h       |  3 ++
 .../nouveau/codegen/nv50_ir_lowering_nv50.cpp      | 39 ++++++++++++++++++++++
 src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp |  7 ++++
 .../drivers/nouveau/codegen/nv50_ir_target.cpp     |  5 ---
 src/gallium/drivers/nouveau/nv50/nv50_context.h    |  6 +++-
 src/gallium/drivers/nouveau/nv50/nv50_program.c    |  3 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c     |  6 ++++
 7 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 8bf0809fa10..83aae296fbb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -125,6 +125,9 @@ struct nv50_ir_prog_info
       uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
       uint16_t msInfoBase;       /* base address for multisample info */
       uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
+
+      uint16_t membarOffset;     /* base address for membar reads (nv50) */
+      uint8_t gmemMembar;        /* gX[] on which to perform membar reads (nv50) */
    } io;
 
    /* driver callback to assign input/output locations */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 9ebb780bfff..2b09855b19f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -647,6 +647,7 @@ private:
    bool handleEXPORT(Instruction *);
    bool handleLOAD(Instruction *);
    bool handleLDST(Instruction *);
+   bool handleMEMBAR(Instruction *);
    bool handleSharedATOM(Instruction *);
    bool handleSULDP(TexInstruction *);
    bool handleSUREDP(TexInstruction *);
@@ -1619,6 +1620,42 @@ NV50LoweringPreSSA::handleLDST(Instruction *i)
    return true;
 }
 
+// Lowers OP_MEMBAR. For global barriers, first emits 8 dummy loads from
+// g[io.gmemMembar]; the instruction itself always becomes a BAR sync.
+bool
+NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
+{
+   // For global memory, apparently doing a bunch of reads at different
+   // addresses forces things to get sufficiently flushed.
+   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
+      uint8_t b = prog->driver->io.auxCBSlot; // constbuf holding the read base
+      Value *base =
+         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
+                                            prog->driver->io.membarOffset), NULL);
+      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
+      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
+                                         physid, bld.loadImm(NULL, 0x1f)),
+                              bld.loadImm(NULL, 2)); // (physid & 0x1f) << 2
+      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
+      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
+      for (int rd = 0; rd < 8; rd++) { // every read after the first is 0x100 further on
+         if (rd != 0)
+            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
+         // Result is never used; the load is flagged 'fixed' (presumably so it is kept).
+         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)->fixed = 1;
+      }
+   }
+
+   // Both global and shared memory barriers also need a regular control bar
+   // TODO: double-check this is the case
+   i->op = OP_BAR;
+   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
+   i->setSrc(0, bld.mkImm(0u));
+   i->setSrc(1, bld.mkImm(0u));
+   return true;
+}
+
 // The type that bests represents how each component can be stored when packed.
 static DataType
 getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
@@ -2188,6 +2225,8 @@ NV50LoweringPreSSA::visit(Instruction *i)
       return handleEXPORT(i);
    case OP_LOAD:
       return handleLOAD(i);
+   case OP_MEMBAR:
+      return handleMEMBAR(i);
    case OP_ATOM:
    case OP_STORE:
       return handleLDST(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 6a6ad01ce40..f4340014396 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -2569,6 +2569,13 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
             addHazard(i, i->src(0).getIndirect(0));
          if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8)
             addHazard(i, i->src(0).getIndirect(1));
+         if (i->op == OP_LOAD && i->fixed && targ->getChipset() < 0xc0) {
+            // Add a hazard to make sure we keep the op around. These are used
+            // for membars.
+            Instruction *nop = new_Instruction(func, OP_NOP, i->dType);
+            nop->setSrc(0, i->getDef(0));
+            i->bb->insertAfter(i, nop);
+         }
       } else
       if (i->op == OP_UNION ||
           i->op == OP_MERGE ||
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index ccb14535b55..e5a3a995505 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -297,11 +297,6 @@ CodeEmitter::prepareEmission(BasicBlock *bb)
    for (i = bb->getEntry(); i; i = next) {
       next = i->next;
 
-      if (i->op == OP_MEMBAR && !targ->isOpSupported(OP_MEMBAR, TYPE_NONE)) {
-         bb->remove(i);
-         continue;
-      }
-
       i->encSize = getMinEncodingSize(i);
       if (next && i->encSize < 8)
          ++nShort;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index af8a290db71..ed89d54afe4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -123,7 +123,11 @@
 /* Compute buffer info: 16 surfaces, 12 32-bit integers each */
 #define NV50_CB_AUX_BUF_INFO(i)   (0x3c4 + (i) * 12 * 4)
 #define NV50_CB_AUX_BUF_SIZE      (NV50_MAX_GLOBALS * 12 * 4)
-/* next spot: 0x644 */
+/* Compute membar mapped area */
+#define NV50_CB_AUX_MEMBAR_OFFSET 0x6c4
+/* next spot: 0x6c8 */
+/* 0x800 from the end for compute shader membars, reads only. */
+#define NV50_CB_AUX_MEMBAR        (NV50_CB_AUX_SIZE - 0x800)
 /* 4 32-bit floats for the vertex runout, put at the end */
 #define NV50_CB_AUX_RUNOUT_OFFSET (NV50_CB_AUX_SIZE - 0x10)
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index d818d21ef19..90349a92b01 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -364,6 +364,9 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->io.msInfoCBSlot = 15;
    info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
 
+   info->io.membarOffset = NV50_CB_AUX_MEMBAR_OFFSET;
+   info->io.gmemMembar = 15;
+
    info->assignSlots = nv50_program_assign_varying_slots;
 
    prog->vp.bfc[0] = 0xff;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 2dffde2b2ad..7948a6d37cc 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -797,6 +797,12 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    PUSH_DATAh(push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_RUNOUT_OFFSET);
    PUSH_DATA (push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_RUNOUT_OFFSET);
 
+   /* set the membar offset */
+   BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
+   PUSH_DATA (push, (NV50_CB_AUX_MEMBAR_OFFSET << (8 - 2)) | NV50_CB_AUX);
+   BEGIN_NI04(push, NV50_3D(CB_DATA(0)), 1);
+   PUSH_DATA (push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_MEMBAR_OFFSET);
+
    nv50_upload_ms_info(push);
 
    /* max TIC (bits 4:8) & TSC bindings, per program type */