[Mesa-dev] [PATCH v2] nvc0: fix bindless multisampled images on Maxwell+

Mon Sep 17 15:13:36 UTC 2018

On Mon, Sep 17, 2018 at 11:00 AM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> NVC0_CB_AUX_BINDLESS_INFO isn't written to on Maxwell+ and it's too small
> anyway.
>
> With these changes, TXQ is used to determine the number of samples and
> the coordinate adjustment information looked up in a small array in the
> driver constant buffer.
>
> v2: rework to use TXQ and a small array instead of a larger array with an
>     entry for each texture
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
>  .../drivers/nouveau/codegen/nv50_ir_driver.h  |  1 +
>  .../codegen/nv50_ir_lowering_gm107.cpp        |  4 +--
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 31 +++++++++++++++++--
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.h   |  3 +-
>  .../nouveau/codegen/nv50_ir_peephole.cpp      |  1 +
>  .../drivers/nouveau/nvc0/mme/com9097.mme      |  8 ++---
>  .../drivers/nouveau/nvc0/mme/com9097.mme.h    |  8 ++---
>  .../drivers/nouveau/nvc0/nvc0_context.h       | 23 ++++++++------
>  .../drivers/nouveau/nvc0/nvc0_program.c       |  1 +
>  .../drivers/nouveau/nvc0/nvc0_screen.c        | 15 +++++++++
>  .../drivers/nouveau/nvc0/nve4_compute.c       | 22 +++++++++++++
>  11 files changed, 94 insertions(+), 23 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> index 7c835ceab8..b3da6fc3cf 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> @@ -188,6 +188,7 @@ struct nv50_ir_prog_info
>        uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
>        uint16_t msInfoBase;       /* base address for multisample info */
>        uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
> +      uint16_t msAdjInfoBase;    /* base address for MS coordinate adjustment info */
>     } io;
>
>     /* driver callback to assign input/output locations */
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> index c7436e2e29..49a5f3b01f 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> @@ -320,11 +320,11 @@ GM107LoweringPass::handleSUQ(TexInstruction *suq)
>
>        if (mask & 0x1)
>           bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
> -                   loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless));
> +                   loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
>        if (mask & 0x2) {
>           int d = util_bitcount(mask & 0x1);
>           bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
> -                   loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless));
> +                   loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
>        }
>     }
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 176e0cf608..5db29ba799 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -1732,6 +1732,33 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless
>                          prog->driver->io.suInfoBase);
>  }
>
> +inline Value *
> +NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
> +{
> +   if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
> +      return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
> +
> +   assert(bindless);
> +
> +   Value *samples = bld.getSSA();
> +   // This shouldn't be lowered because it's being inserted before the current instruction
> +   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
> +   tex->tex.target = target;
> +   tex->tex.query = TXQ_TYPE;
> +   tex->tex.mask = 0x4;
> +   tex->tex.r = 0xff;
> +   tex->tex.s = 0x1f;
> +   tex->tex.rIndirectSrc = 0;
> +   tex->setDef(0, samples);
> +   tex->setSrc(0, ind);
> +   tex->setSrc(1, bld.loadImm(NULL, 0));
> +   bld.insert(tex);
> +
> +   // XMAD has a higher throughput than SHL and we shouldn't be dealing with >65535 integers here
> +   Value *ptr = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(8), bld.mkImm(0));
> +   return loadResInfo32(ptr, index * 4, prog->driver->io.msAdjInfoBase);
> +}
> +
>  static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
>  {
>     switch (su->tex.target.getEnum()) {
> @@ -1817,8 +1844,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
>     Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
>     Value *ind = tex->getIndirectR();
>
> -   Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless);
> -   Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless);
> +   Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
> +   Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
>
>     bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
>     bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> index 5dbb3e4f00..4136b1ecfe 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> @@ -148,7 +148,7 @@ protected:
>     void handlePIXLD(Instruction *);
>
>     void checkPredicate(Instruction *);
> -   Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless);
> +   Value *loadMsAdjInfo32(TexInstruction::Target targ, uint32_t index, int slot, Value *ind, bool bindless);
>
>     virtual bool visit(Instruction *);
>
> @@ -161,6 +161,7 @@ private:
>     Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
>     Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
>     Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
> +   Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless);
>     Value *loadBufInfo64(Value *ptr, uint32_t off);
>     Value *loadBufLength32(Value *ptr, uint32_t off);
>     Value *loadUboInfo64(Value *ptr, uint32_t off);
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d851cf3c37..f91c502e9e 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -317,6 +317,7 @@ IndirectPropagation::visit(BasicBlock *bb)
>           ImmediateValue imm;
>           if (!i->src(s).isIndirect(0))
>              continue;
> +
>           insn = i->getIndirect(s, 0)->getInsn();
>           if (!insn)
>              continue;
> diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> index 38c2e86843..8ca8f34f9b 100644
> --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> @@ -255,7 +255,7 @@ dei_draw_again:
>     parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
>     parm $r4 send $r4 /* index_bias, send start */
>     maddr 0x18e3 /* CB_POS */
> -   send 0x1a0 /* 256 + 160 */
> +   send 0x1e0 /* 256 + 224 */
>     braz $r2 #dei_end
>     parm $r5 send $r4 /* start_instance, send index_bias */
>     send $r5 /* send start_instance */
> @@ -311,7 +311,7 @@ dai_draw_again:
>     braz $r3 #dai_end
>     parm $r4 send $r4 /* start_instance */
>     maddr 0x18e3 /* CB_POS */
> -   send 0x1a0 /* 256 + 160 */
> +   send 0x1e0 /* 256 + 224 */
>     send 0x0 /* send 0 as base_vertex */
>     send $r4 /* send start_instance */
>     send $r6 /* draw id */
> @@ -374,7 +374,7 @@ deic_draw_again:
>     parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
>     parm $r4 send $r4 /* index_bias, send start */
>     maddr 0x18e3 /* CB_POS */
> -   send 0x1a0 /* 256 + 160 */
> +   send 0x1e0 /* 256 + 224 */
>     braz $r2 #deic_end
>     parm $r5 send $r4 /* start_instance, send index_bias */
>     send $r5 /* send start_instance */
> @@ -455,7 +455,7 @@ daic_draw_again:
>     braz $r3 #daic_end
>     parm $r4 send $r4 /* start_instance */
>     maddr 0x18e3 /* CB_POS */
> -   send 0x1a0 /* 256 + 160 */
> +   send 0x1e0 /* 256 + 224 */
>     send 0x0 /* send 0 as base_vertex */
>     send $r4 /* send start_instance */
>     send $r6 /* draw id */
> diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> index 49c0891114..47c5e6c6e0 100644
> --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> @@ -140,7 +140,7 @@ uint32_t mme9097_draw_elts_indirect[] = {
>         0x017dc451,
>         0x00002431,
>         0x0638c021,
> -       0x00680041,
> +       0x00780041,
>         0x0004d007,
>         0x00002531,
>         0x00002841,
> @@ -185,7 +185,7 @@ uint32_t mme9097_draw_arrays_indirect[] = {
>         0x0004d807,
>         0x00002431,
>         0x0638c021,
> -       0x00680041,
> +       0x00780041,
>         0x00000041,
>         0x00002041,
>         0x00003041,
> @@ -233,7 +233,7 @@ uint32_t mme9097_draw_elts_indirect_count[] = {
>         0x017dc451,
>         0x00002431,
>         0x0638c021,
> -       0x00680041,
> +       0x00780041,
>         0x0004d007,
>         0x00002531,
>         0x00002841,
> @@ -300,7 +300,7 @@ uint32_t mme9097_draw_arrays_indirect_count[] = {
>         0x0004d807,
>         0x00002431,
>         0x0638c021,
> -       0x00680041,
> +       0x00780041,
>         0x00000041,
>         0x00002041,
>         0x00003041,
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> index 77237a3c0a..1d920c26f5 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> @@ -122,35 +122,38 @@
>  /* 8 sets of 32-bits coordinate offsets */
>  #define NVC0_CB_AUX_MS_INFO         0x0c0
>  #define NVC0_CB_AUX_MS_SIZE         (8 * 2 * 4)
> +/* 8 sets of 32-bit pairs containing coordinate adjustment information */
> +#define NVC0_CB_AUX_MS_ADJ_INFO(i)  0x100 + (i) * 4 * 2
> +#define NVC0_CB_AUX_MS_ADJ_SIZE     (8 * 2 * 4)
>  /* block/grid size, at 3 32-bits integers each, gridid and work_dim */
> -#define NVC0_CB_AUX_GRID_INFO(i)    0x100 + (i) * 4 /* CP */
> +#define NVC0_CB_AUX_GRID_INFO(i)    0x140 + (i) * 4 /* CP */
>  #define NVC0_CB_AUX_GRID_SIZE       (8 * 4)
>  /* FB texture handle */
> -#define NVC0_CB_AUX_FB_TEX_INFO     0x100 /* FP */
> +#define NVC0_CB_AUX_FB_TEX_INFO     0x140 /* FP */
>  #define NVC0_CB_AUX_FB_TEX_SIZE     (4)
>  /* 8 user clip planes, at 4 32-bits floats each */
> -#define NVC0_CB_AUX_UCP_INFO        0x120
> +#define NVC0_CB_AUX_UCP_INFO        0x160
>  #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
>  /* 13 ubos, at 4 32-bits integer each */
> -#define NVC0_CB_AUX_UBO_INFO(i)     0x120 + (i) * 4 * 4 /* CP */
> +#define NVC0_CB_AUX_UBO_INFO(i)     0x160 + (i) * 4 * 4 /* CP */
>  #define NVC0_CB_AUX_UBO_SIZE        ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
>  /* 8 sets of 32-bits integer pairs sample offsets */
> -#define NVC0_CB_AUX_SAMPLE_INFO     0x1a0 /* FP */
> +#define NVC0_CB_AUX_SAMPLE_INFO     0x1e0 /* FP */
>  /* 256 bytes, though only 64 bytes used before GM200 */
>  #define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 2 * 4 * 4)
>  /* draw parameters (index bais, base instance, drawid) */
> -#define NVC0_CB_AUX_DRAW_INFO       0x1a0 /* VP */
> +#define NVC0_CB_AUX_DRAW_INFO       0x1e0 /* VP */
>  /* 32 user buffers, at 4 32-bits integers each */
> -#define NVC0_CB_AUX_BUF_INFO(i)     0x2a0 + (i) * 4 * 4
> +#define NVC0_CB_AUX_BUF_INFO(i)     0x2e0 + (i) * 4 * 4
>  #define NVC0_CB_AUX_BUF_SIZE        (NVC0_MAX_BUFFERS * 4 * 4)
>  /* 8 surfaces, at 16 32-bits integers each */
> -#define NVC0_CB_AUX_SU_INFO(i)      0x4a0 + (i) * 16 * 4
> +#define NVC0_CB_AUX_SU_INFO(i)      0x4e0 + (i) * 16 * 4
>  #define NVC0_CB_AUX_SU_SIZE         (NVC0_MAX_IMAGES * 16 * 4)
>  /* 1 64-bits address and 1 32-bits sequence */
> -#define NVC0_CB_AUX_MP_INFO         0x6a0
> +#define NVC0_CB_AUX_MP_INFO         0x6e0
>  #define NVC0_CB_AUX_MP_SIZE         3 * 4
>  /* 512 64-byte blocks for bindless image handles */
> -#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4
> +#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6f0 + (i) * 16 * 4
>  #define NVC0_CB_AUX_BINDLESS_SIZE   (NVE4_IMG_MAX_HANDLES * 16 * 4)
>  /* 4 32-bits floats for the vertex runout, put at the end */
>  #define NVC0_CB_AUX_RUNOUT_INFO     NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6)
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> index 57d98753f4..b3a0954d76 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> @@ -600,6 +600,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
>     info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
>     info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
>     info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
> +   info->io.msAdjInfoBase = NVC0_CB_AUX_MS_ADJ_INFO(0);
>     info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
>     info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
>     if (info->target >= NVISA_GK104_CHIPSET) {
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> index 2eecf59ce0..f67e42052e 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> @@ -1362,6 +1362,21 @@ nvc0_screen_create(struct nouveau_device *dev)
>        PUSH_DATA (push, 1);
>        PUSH_DATA (push, 3); /* 7 */
>        PUSH_DATA (push, 1);
> +
> +      /* MS coordinate adjustment information */
> +      for (int i = 1; i <= 8; i *= 2) {
> +         BEGIN_1IC0(push, NVC0_3D(CB_POS), 3);
> +         PUSH_DATA (push, NVC0_CB_AUX_MS_ADJ_INFO(i));
> +         int ms_x = 0, ms_y = 0;
> +         switch (i) {
> +         case 1: break;
> +         case 2: ms_x = 1; break;
> +         case 4: ms_x = 1; ms_y = 1; break;
> +         case 8: ms_x = 2; ms_y = 1; break;
> +         }
> +         PUSH_DATA(push, ms_x);
> +         PUSH_DATA(push, ms_y);
> +      }
>     }
>     BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1);
>     PUSH_DATA (push, 0);
> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> index 8aa8d4936f..b7af7ab0d2 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> @@ -168,6 +168,28 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
>     PUSH_DATA (push, 3); /* 7 */
>     PUSH_DATA (push, 1);
>
> +   /* MS coordinate adjustment information */
> +   for (int i = 1; i <= 8; i *= 2) {
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 4);
> +      PUSH_DATA (push, 8);
> +      PUSH_DATA (push, 1);
> +      PUSH_DATAh(push, address + NVC0_CB_AUX_MS_ADJ_INFO(i));
> +      PUSH_DATA (push, address + NVC0_CB_AUX_MS_ADJ_INFO(i));
> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 3);
> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
> +
> +      int ms_x = 0, ms_y = 0;
> +      switch (i) {
> +      case 1: break;
> +      case 2: ms_x = 1; break;
> +      case 4: ms_x = 1; ms_y = 1; break;
> +      case 8: ms_x = 2; ms_y = 1; break;

Is this really necessary? Couldn't you just adjust the generated code to be like

ms_x = (samples+2)>>2
ms_y = samples > 2

And avoid the constbuf bit entirely? [And yeah, this falls down with a
number of samples that isn't 1/2/4/8, but that really shouldn't
happen.]

  -ilia