[Mesa-dev] [PATCH] WIP nv50/ir: optimize immediates mov to const buffer

Sat Mar 2 23:48:31 UTC 2019

Some instructions can read from a const buffer but cannot take long immediates,
and some can take one immediate and one const buffer source, but not two
immediates. Put the immediates that load propagation can't fold into the driver
const buffer, so that shaders can read them from there instead of using movs
that write the immediate into a register.

changes for gp107:

pixmark piano 512x320 20sec:
835 -> 840

shader-db:
total instructions in shared programs : 9649882 -> 9587964 (-0.64%)
total gprs used in shared programs    : 1058330 -> 1051417 (-0.65%)
total shared used in shared programs  : 702868 -> 702868 (0.00%)
total local used in shared programs   : 35368 -> 35344 (-0.07%)

                 local      shared         gpr        inst       bytes
    helped           1           0        5114       21817       21817
      hurt           0           0          71          29          29
---
 .../drivers/nouveau/codegen/nv50_ir.cpp       |  3 ++
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  2 +
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  4 ++
 .../nouveau/codegen/nv50_ir_peephole.cpp      | 49 ++++++++++++++++++-
 .../drivers/nouveau/codegen/nv50_ir_target.h  | 31 ++++++++++++
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp   | 27 ++++++++--
 .../nouveau/codegen/nv50_ir_target_nvc0.h     |  2 +
 .../drivers/nouveau/codegen/unordered_map.h   | 20 ++++++++
 .../drivers/nouveau/nv50/nv50_program.c       |  1 +
 .../drivers/nouveau/nvc0/nvc0_context.h       |  3 ++
 .../drivers/nouveau/nvc0/nvc0_program.c       |  7 +++
 .../drivers/nouveau/nvc0/nvc0_program.h       |  2 +
 .../drivers/nouveau/nvc0/nvc0_shader_state.c  | 21 +++++++-
 13 files changed, 166 insertions(+), 6 deletions(-)
 create mode 100644 src/gallium/drivers/nouveau/codegen/unordered_map.h

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 993d01c1e44..e2ca4fa88c1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1286,6 +1286,9 @@ out:
    info->bin.maxGPR = prog->maxGPR;
    info->bin.code = prog->code;
    info->bin.codeSize = prog->binSize;
+   info->bin.imms = (uint8_t*)malloc(prog->immediates.size());
+   info->bin.immsSize = prog->immediates.size();
+   std::copy(prog->immediates.begin(), prog->immediates.end(), info->bin.imms);
    info->bin.tlsSpace = prog->tlsSize;
 
    delete prog;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 8d32a25ec23..08e000b1fed 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -1330,6 +1330,8 @@ public:
 
    void releaseInstruction(Instruction *);
    void releaseValue(Value *);
+
+   std::vector<uint8_t> immediates;
 };
 
 // TODO: add const version
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 7c835ceab8d..9e05e271fb5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -92,6 +92,8 @@ struct nv50_ir_prog_info
       uint32_t smemSize;  /* required shared memory per block */
       uint32_t *code;
       uint32_t codeSize;
+      uint8_t *imms;
+      uint16_t immsSize;
       uint32_t instructions;
       uint8_t sourceRep;  /* PIPE_SHADER_IR_* */
       const void *source;
@@ -188,6 +190,8 @@ struct nv50_ir_prog_info
       uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
       uint16_t msInfoBase;       /* base address for multisample info */
       uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
+      uint8_t immCBSlot;         /* immediate constant buffer slot */
+      uint16_t immCBOffset;      /* immediate constant buffer offset */
    } io;
 
    /* driver callback to assign input/output locations */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 0b3220903b9..f5bc37612a3 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -20,6 +20,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "codegen/unordered_map.h"
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_target.h"
 #include "codegen/nv50_ir_build_util.h"
@@ -157,6 +158,9 @@ private:
    bool isCSpaceLoad(Instruction *);
    bool isImmdLoad(Instruction *);
    bool isAttribOrSharedLoad(Instruction *);
+
+   BuildUtil bld;
+   std::unordered_map<uint64_t,uint16_t> imms;
 };
 
 bool
@@ -276,8 +280,49 @@ LoadPropagation::visit(BasicBlock *bb)
 
          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
             continue;
-         if (!targ->insnCanLoad(i, s, ld))
-            continue;
+         if (!targ->insnCanLoad(i, s, ld)) {
+            if (ld->src(0).getFile() != FILE_IMMEDIATE)
+               continue;
+            // immCBSlot is a uint8_t: the driver's "-1" (no slot) is stored as 0xff
+            if (prog->driver->io.immCBSlot == (uint8_t)-1)
+               continue;
+            // the immediate can't be used directly; check if we can load from c[] instead
+            uint64_t iv = ld->getSrc(0)->asImm()->reg.data.u64;
+            int tySize = typeSizeof(ld->dType);
+            int offset;
+            // NOTE(review): keyed on raw bits only, so a 4-byte entry may be reused for an 8-byte load - confirm
+            if (imms.find(iv) != imms.end()) {
+               offset = imms.at(iv);
+            } else {
+               offset = prog->immediates.size();
+               if ((offset + tySize) >= 0x1000)
+                  continue;
+               offset += prog->driver->io.immCBOffset;
+
+               imms.emplace(std::make_pair(iv, offset));
+               const uint8_t *imm = (uint8_t*)&iv;
+               prog->immediates.insert(prog->immediates.end(), imm, &imm[tySize]);
+            }
+
+            if (!targ->insnCanLoad(i, s, Target::LoadTest::mem(FILE_MEMORY_CONST, ld->dType, offset, false)))
+               continue;
+            // give up if a later source of i is a mov/load that i can take directly
+            bool skip = false;
+            for (int s2 = s + 1; i->srcExists(s2); ++s2) {
+               Instruction *ld2 = i->getSrc(s2)->getInsn();
+               if (!ld2 || ld2->fixed || (ld2->op != OP_LOAD && ld2->op != OP_MOV))
+                  continue;
+               if (targ->insnCanLoad(i, s2, ld2)) {
+                  skip = true;
+                  break;
+               }
+            }
+            if (skip)
+               continue;
+
+            bld.setPosition(i, false);
+            ld = bld.mkLoad(ld->dType, bld.getSSA(tySize), bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.immCBSlot, ld->dType, offset), NULL);
+         }
 
          // propagate !
          i->setSrc(s, ld->getSrc(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
index afeca14d7d1..1e6f6a200c0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -210,6 +210,35 @@ public:
       unsigned int terminator  : 1;
    };
 
+   class LoadTest // describes a candidate load source without a real instruction
+   {
+   public:
+      static LoadTest gpr(DataType ty)
+      {
+         return LoadTest(FILE_GPR, ty, NULL, 0, false);
+      }
+
+      static LoadTest imm(DataType ty, const ImmediateValue *imm)
+      {
+         return LoadTest(FILE_IMMEDIATE, ty, imm, 0, false);
+      }
+
+      static LoadTest mem(DataFile f, DataType ty, int32_t offset, bool indirect)
+      {
+         return LoadTest(f, ty, NULL, offset, indirect);
+      }
+
+      const DataFile f;                // file the value would be read from
+      const DataType ty;               // type of the value
+      const ImmediateValue *const iv;  // set by imm(), NULL otherwise
+      const int32_t offset;            // set by mem(), 0 otherwise
+      const bool indirect;             // set by mem(), false otherwise
+   private:
+      LoadTest(DataFile file, DataType type, const ImmediateValue *value,
+               int32_t off, bool ind)
+      : f(file), ty(type), iv(value), offset(off), indirect(ind) {}
+   };
+
    inline const OpInfo& getOpInfo(const Instruction *) const;
    inline const OpInfo& getOpInfo(const operation) const;
 
@@ -217,6 +246,8 @@ public:
 
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const = 0;
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const LoadTest &t) const { return false; };
    virtual bool insnCanLoadOffset(const Instruction *insn, int s,
                                   int offset) const = 0;
    virtual bool isOpSupported(operation, DataType) const = 0;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 60134b445db..0dd00a9a20e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -339,9 +339,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
                         const Instruction *ld) const
 {
    DataFile sf = ld->src(0).getFile();
+   switch (sf) {
+   case FILE_GPR:
+      return insnCanLoad(i, s, LoadTest::gpr(ld->dType));
+   case FILE_MEMORY_CONST:
+   case FILE_MEMORY_GLOBAL:
+   case FILE_MEMORY_LOCAL:
+   case FILE_MEMORY_SHARED:
+      return insnCanLoad(i, s, LoadTest::mem(sf, ld->dType, ld->getSrc(0)->reg.data.offset, ld->src(0).isIndirect(0)));
+   case FILE_IMMEDIATE:
+      return insnCanLoad(i, s, LoadTest::imm(ld->dType, ld->getSrc(0)->asImm()));
+   default:
+      assert(false);
+      return false;
+   }
+}
+
+bool
+TargetNVC0::insnCanLoad(const Instruction *i, int s,
+                        const LoadTest &lt) const
+{
+   DataFile sf = lt.f;
 
    // immediate 0 can be represented by GPR $r63/$r255
-   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
+   if (sf == FILE_IMMEDIATE && lt.iv->reg.data.u64 == 0)
       return (!i->isPseudo() &&
               !i->asTex() &&
               i->op != OP_EXPORT && i->op != OP_STORE);
@@ -352,7 +373,7 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
       return false;
 
    // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
-   if (ld->src(0).isIndirect(0))
+   if (lt.indirect)
       return false;
    // these are implemented using shf.r and shf.l which can't load consts
    if ((i->op == OP_SHL || i->op == OP_SHR) && typeSizeof(i->sType) == 8 &&
@@ -389,7 +410,7 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
 
    // not all instructions support full 32 bit immediates
    if (sf == FILE_IMMEDIATE) {
-      Storage &reg = ld->getSrc(0)->asImm()->reg;
+      const Storage &reg = lt.iv->reg;
 
       if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) {
          switch (i->sType) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
index 2077207bb23..ab583f4c5fc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -52,6 +52,8 @@ public:
 
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const;
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const LoadTest &t) const;
    virtual bool insnCanLoadOffset(const Instruction *insn, int s,
                                   int offset) const;
    virtual bool isOpSupported(operation, DataType) const;
diff --git a/src/gallium/drivers/nouveau/codegen/unordered_map.h b/src/gallium/drivers/nouveau/codegen/unordered_map.h
new file mode 100644
index 00000000000..760a10e0a15
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/unordered_map.h
@@ -0,0 +1,20 @@
+#ifndef __NV50_UNORDERED_MAP_H__
+#define __NV50_UNORDERED_MAP_H__
+
+#if (__cplusplus >= 201103L) // C++11 or newer: use the standard header
+#include <unordered_map>
+#else // older language standards: fall back to the TR1 header
+#include <tr1/unordered_map>
+#endif
+
+namespace nv50_ir {
+// Expose whichever implementation was included as nv50_ir::unordered_map.
+#if __cplusplus >= 201103L
+using std::unordered_map;
+#else
+using std::tr1::unordered_map;
+#endif
+
+} // namespace nv50_ir
+
+#endif // __NV50_UNORDERED_MAP_H__
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index b117790d6ec..47cc8f9b9f8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -338,6 +338,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
 
    info->bin.smemSize = prog->cp.smem_size;
    info->io.auxCBSlot = 15;
+   info->io.immCBSlot = -1;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
    if (prog->fp.alphatest)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 4cfd207d4c0..5ee82b8dcd9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -156,6 +156,9 @@
 /* 512 64-byte blocks for bindless image handles */
 #define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4
 #define NVC0_CB_AUX_BINDLESS_SIZE   (NVE4_IMG_MAX_HANDLES * 16 * 4)
+/* 4 KiB (0x1000 bytes) of shader immediates, placed right after the bindless handles */
+#define NVC0_CB_AUX_IMMEDIATE_INFO  (NVC0_CB_AUX_BINDLESS_INFO(0) + NVC0_CB_AUX_BINDLESS_SIZE)
+#define NVC0_CB_AUX_IMMEDIATE_SIZE  0x1000
 /* 4 32-bits floats for the vertex runout, put at the end */
 #define NVC0_CB_AUX_RUNOUT_INFO     NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6)
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 1bbfa4a9428..4516a0820e3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -597,6 +597,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->io.genUserClip = prog->vp.num_ucps;
    info->io.auxCBSlot = 15;
    info->io.msInfoCBSlot = 15;
+   info->io.immCBOffset = NVC0_CB_AUX_IMMEDIATE_INFO;
    info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
    info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
    info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
@@ -615,8 +616,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
       }
       info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0);
+      info->io.immCBSlot = 7;
    } else {
       info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
+      // TODO: on maxwell we can use a separate buffer
+      info->io.immCBSlot = 15;
    }
 
    info->assignSlots = nvc0_program_assign_varying_slots;
@@ -631,6 +635,8 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    prog->code = info->bin.code;
    prog->code_size = info->bin.codeSize;
+   prog->imms = info->bin.imms;
+   prog->imms_size = info->bin.immsSize;
    prog->relocs = info->bin.relocData;
    prog->fixups = info->bin.fixupData;
    prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
@@ -915,6 +921,7 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
    if (prog->mem)
       nouveau_heap_free(&prog->mem);
    FREE(prog->code); /* may be 0 for hardcoded shaders */
+   FREE(prog->imms);
    FREE(prog->relocs);
    FREE(prog->fixups);
    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index b73822ea9f7..64f32affe3d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -29,6 +29,8 @@ struct nvc0_program {
    unsigned code_base;
    unsigned code_size;
    unsigned parm_size; /* size of non-bindable uniforms (c0[]) */
+   uint32_t *imms;
+   unsigned imms_size;
 
    uint32_t hdr[20];
    uint32_t flags[2];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 697bf491a01..8199987d94c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -46,11 +46,28 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
    }
 }
 
+static inline void /* upload prog's immediates into its stage's aux constbuf */
+nvc0_program_validate_imms(struct nvc0_context *nvc0, struct nvc0_program *prog)
+{
+   if (prog && prog->imms && prog->imms_size) { /* nothing to upload otherwise */
+      struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); /* bind this stage's aux buffer for CB_POS writes */
+      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(nvc0_shader_stage(prog->type)));
+      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(nvc0_shader_stage(prog->type)));
+      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + prog->imms_size / 4); /* imms_size is in bytes, push counts are 32-bit words */
+      PUSH_DATA (push, NVC0_CB_AUX_IMMEDIATE_INFO);
+      PUSH_DATAp(push, prog->imms, prog->imms_size / 4); /* assumes imms_size is a multiple of 4 - TODO confirm */
+   }
+}
+
 static inline bool
 nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
-   if (prog->mem)
+   if (prog->mem) {
+      nvc0_program_validate_imms(nvc0, prog);
       return true;
+   }
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
@@ -59,6 +76,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
          return false;
    }
 
+   nvc0_program_validate_imms(nvc0, prog);
    if (likely(prog->code_size))
       return nvc0_program_upload(nvc0, prog);
    return true; /* stream output info only */
@@ -135,6 +153,7 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0)
    }
 
    if (fp->mem && !(nvc0->dirty_3d & NVC0_NEW_3D_FRAGPROG)) {
+      nvc0_program_validate_imms(nvc0, fp);
       return;
    }
 
-- 
2.20.1