[Mesa-dev] [PATCH v2] nvc0: fix bindless multisampled images on Maxwell+
Ilia Mirkin
imirkin at alum.mit.edu
Mon Sep 17 15:13:36 UTC 2018
On Mon, Sep 17, 2018 at 11:00 AM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> NVC0_CB_AUX_BINDLESS_INFO isn't written to on Maxwell+ and it's too small
> anyway.
>
> With these changes, TXQ is used to determine the number of samples and
> the coordinate adjustment information looked up in a small array in the
> driver constant buffer.
>
> v2: rework to use TXQ and a small array instead of a larger array with an
> entry for each texture
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
> .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 +
> .../codegen/nv50_ir_lowering_gm107.cpp | 4 +--
> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 31 +++++++++++++++++--
> .../nouveau/codegen/nv50_ir_lowering_nvc0.h | 3 +-
> .../nouveau/codegen/nv50_ir_peephole.cpp | 1 +
> .../drivers/nouveau/nvc0/mme/com9097.mme | 8 ++---
> .../drivers/nouveau/nvc0/mme/com9097.mme.h | 8 ++---
> .../drivers/nouveau/nvc0/nvc0_context.h | 23 ++++++++------
> .../drivers/nouveau/nvc0/nvc0_program.c | 1 +
> .../drivers/nouveau/nvc0/nvc0_screen.c | 15 +++++++++
> .../drivers/nouveau/nvc0/nve4_compute.c | 22 +++++++++++++
> 11 files changed, 94 insertions(+), 23 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> index 7c835ceab8..b3da6fc3cf 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
> @@ -188,6 +188,7 @@ struct nv50_ir_prog_info
> uint8_t msInfoCBSlot; /* cX[] used for multisample info */
> uint16_t msInfoBase; /* base address for multisample info */
> uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */
> + uint16_t msAdjInfoBase; /* base address for MS coordinate adjustment info */
> } io;
>
> /* driver callback to assign input/output locations */
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> index c7436e2e29..49a5f3b01f 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
> @@ -320,11 +320,11 @@ GM107LoweringPass::handleSUQ(TexInstruction *suq)
>
> if (mask & 0x1)
> bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
> - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless));
> + loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
> if (mask & 0x2) {
> int d = util_bitcount(mask & 0x1);
> bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
> - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless));
> + loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
> }
> }
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 176e0cf608..5db29ba799 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -1732,6 +1732,33 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless
> prog->driver->io.suInfoBase);
> }
>
> +inline Value *
> +NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
> +{
> + if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
> + return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
> +
> + assert(bindless);
> +
> + Value *samples = bld.getSSA();
> + // This shouldn't be lowered because it's being inserted before the current instruction
> + TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
> + tex->tex.target = target;
> + tex->tex.query = TXQ_TYPE;
> + tex->tex.mask = 0x4;
> + tex->tex.r = 0xff;
> + tex->tex.s = 0x1f;
> + tex->tex.rIndirectSrc = 0;
> + tex->setDef(0, samples);
> + tex->setSrc(0, ind);
> + tex->setSrc(1, bld.loadImm(NULL, 0));
> + bld.insert(tex);
> +
> + // XMAD has a higher throughput than SHL and we shouldn't be dealing with >65535 integers here
> + Value *ptr = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(8), bld.mkImm(0));
> + return loadResInfo32(ptr, index * 4, prog->driver->io.msAdjInfoBase);
> +}
> +
> static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
> {
> switch (su->tex.target.getEnum()) {
> @@ -1817,8 +1844,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
> Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
> Value *ind = tex->getIndirectR();
>
> - Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless);
> - Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless);
> + Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
> + Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
>
> bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
> bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> index 5dbb3e4f00..4136b1ecfe 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
> @@ -148,7 +148,7 @@ protected:
> void handlePIXLD(Instruction *);
>
> void checkPredicate(Instruction *);
> - Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless);
> + Value *loadMsAdjInfo32(TexInstruction::Target targ, uint32_t index, int slot, Value *ind, bool bindless);
>
> virtual bool visit(Instruction *);
>
> @@ -161,6 +161,7 @@ private:
> Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
> Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
> Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
> + Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless);
> Value *loadBufInfo64(Value *ptr, uint32_t off);
> Value *loadBufLength32(Value *ptr, uint32_t off);
> Value *loadUboInfo64(Value *ptr, uint32_t off);
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d851cf3c37..f91c502e9e 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -317,6 +317,7 @@ IndirectPropagation::visit(BasicBlock *bb)
> ImmediateValue imm;
> if (!i->src(s).isIndirect(0))
> continue;
> +
> insn = i->getIndirect(s, 0)->getInsn();
> if (!insn)
> continue;
> diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> index 38c2e86843..8ca8f34f9b 100644
> --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
> @@ -255,7 +255,7 @@ dei_draw_again:
> parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
> parm $r4 send $r4 /* index_bias, send start */
> maddr 0x18e3 /* CB_POS */
> - send 0x1a0 /* 256 + 160 */
> + send 0x1e0 /* 256 + 224 */
> braz $r2 #dei_end
> parm $r5 send $r4 /* start_instance, send index_bias */
> send $r5 /* send start_instance */
> @@ -311,7 +311,7 @@ dai_draw_again:
> braz $r3 #dai_end
> parm $r4 send $r4 /* start_instance */
> maddr 0x18e3 /* CB_POS */
> - send 0x1a0 /* 256 + 160 */
> + send 0x1e0 /* 256 + 224 */
> send 0x0 /* send 0 as base_vertex */
> send $r4 /* send start_instance */
> send $r6 /* draw id */
> @@ -374,7 +374,7 @@ deic_draw_again:
> parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
> parm $r4 send $r4 /* index_bias, send start */
> maddr 0x18e3 /* CB_POS */
> - send 0x1a0 /* 256 + 160 */
> + send 0x1e0 /* 256 + 224 */
> braz $r2 #deic_end
> parm $r5 send $r4 /* start_instance, send index_bias */
> send $r5 /* send start_instance */
> @@ -455,7 +455,7 @@ daic_draw_again:
> braz $r3 #daic_end
> parm $r4 send $r4 /* start_instance */
> maddr 0x18e3 /* CB_POS */
> - send 0x1a0 /* 256 + 160 */
> + send 0x1e0 /* 256 + 224 */
> send 0x0 /* send 0 as base_vertex */
> send $r4 /* send start_instance */
> send $r6 /* draw id */
> diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> index 49c0891114..47c5e6c6e0 100644
> --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
> @@ -140,7 +140,7 @@ uint32_t mme9097_draw_elts_indirect[] = {
> 0x017dc451,
> 0x00002431,
> 0x0638c021,
> - 0x00680041,
> + 0x00780041,
> 0x0004d007,
> 0x00002531,
> 0x00002841,
> @@ -185,7 +185,7 @@ uint32_t mme9097_draw_arrays_indirect[] = {
> 0x0004d807,
> 0x00002431,
> 0x0638c021,
> - 0x00680041,
> + 0x00780041,
> 0x00000041,
> 0x00002041,
> 0x00003041,
> @@ -233,7 +233,7 @@ uint32_t mme9097_draw_elts_indirect_count[] = {
> 0x017dc451,
> 0x00002431,
> 0x0638c021,
> - 0x00680041,
> + 0x00780041,
> 0x0004d007,
> 0x00002531,
> 0x00002841,
> @@ -300,7 +300,7 @@ uint32_t mme9097_draw_arrays_indirect_count[] = {
> 0x0004d807,
> 0x00002431,
> 0x0638c021,
> - 0x00680041,
> + 0x00780041,
> 0x00000041,
> 0x00002041,
> 0x00003041,
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> index 77237a3c0a..1d920c26f5 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
> @@ -122,35 +122,38 @@
> /* 8 sets of 32-bits coordinate offsets */
> #define NVC0_CB_AUX_MS_INFO 0x0c0
> #define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4)
> +/* 8 sets of 32-bit pairs containing coordinate adjustment information */
> +#define NVC0_CB_AUX_MS_ADJ_INFO(i) 0x100 + (i) * 4 * 2
> +#define NVC0_CB_AUX_MS_ADJ_SIZE (8 * 2 * 4)
> /* block/grid size, at 3 32-bits integers each, gridid and work_dim */
> -#define NVC0_CB_AUX_GRID_INFO(i) 0x100 + (i) * 4 /* CP */
> +#define NVC0_CB_AUX_GRID_INFO(i) 0x140 + (i) * 4 /* CP */
> #define NVC0_CB_AUX_GRID_SIZE (8 * 4)
> /* FB texture handle */
> -#define NVC0_CB_AUX_FB_TEX_INFO 0x100 /* FP */
> +#define NVC0_CB_AUX_FB_TEX_INFO 0x140 /* FP */
> #define NVC0_CB_AUX_FB_TEX_SIZE (4)
> /* 8 user clip planes, at 4 32-bits floats each */
> -#define NVC0_CB_AUX_UCP_INFO 0x120
> +#define NVC0_CB_AUX_UCP_INFO 0x160
> #define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4)
> /* 13 ubos, at 4 32-bits integer each */
> -#define NVC0_CB_AUX_UBO_INFO(i) 0x120 + (i) * 4 * 4 /* CP */
> +#define NVC0_CB_AUX_UBO_INFO(i) 0x160 + (i) * 4 * 4 /* CP */
> #define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
> /* 8 sets of 32-bits integer pairs sample offsets */
> -#define NVC0_CB_AUX_SAMPLE_INFO 0x1a0 /* FP */
> +#define NVC0_CB_AUX_SAMPLE_INFO 0x1e0 /* FP */
> /* 256 bytes, though only 64 bytes used before GM200 */
> #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 2 * 4 * 4)
> /* draw parameters (index bais, base instance, drawid) */
> -#define NVC0_CB_AUX_DRAW_INFO 0x1a0 /* VP */
> +#define NVC0_CB_AUX_DRAW_INFO 0x1e0 /* VP */
> /* 32 user buffers, at 4 32-bits integers each */
> -#define NVC0_CB_AUX_BUF_INFO(i) 0x2a0 + (i) * 4 * 4
> +#define NVC0_CB_AUX_BUF_INFO(i) 0x2e0 + (i) * 4 * 4
> #define NVC0_CB_AUX_BUF_SIZE (NVC0_MAX_BUFFERS * 4 * 4)
> /* 8 surfaces, at 16 32-bits integers each */
> -#define NVC0_CB_AUX_SU_INFO(i) 0x4a0 + (i) * 16 * 4
> +#define NVC0_CB_AUX_SU_INFO(i) 0x4e0 + (i) * 16 * 4
> #define NVC0_CB_AUX_SU_SIZE (NVC0_MAX_IMAGES * 16 * 4)
> /* 1 64-bits address and 1 32-bits sequence */
> -#define NVC0_CB_AUX_MP_INFO 0x6a0
> +#define NVC0_CB_AUX_MP_INFO 0x6e0
> #define NVC0_CB_AUX_MP_SIZE 3 * 4
> /* 512 64-byte blocks for bindless image handles */
> -#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4
> +#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6f0 + (i) * 16 * 4
> #define NVC0_CB_AUX_BINDLESS_SIZE (NVE4_IMG_MAX_HANDLES * 16 * 4)
> /* 4 32-bits floats for the vertex runout, put at the end */
> #define NVC0_CB_AUX_RUNOUT_INFO NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6)
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> index 57d98753f4..b3a0954d76 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
> @@ -600,6 +600,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
> info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
> info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
> info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
> + info->io.msAdjInfoBase = NVC0_CB_AUX_MS_ADJ_INFO(0);
> info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
> info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
> if (info->target >= NVISA_GK104_CHIPSET) {
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> index 2eecf59ce0..f67e42052e 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> @@ -1362,6 +1362,21 @@ nvc0_screen_create(struct nouveau_device *dev)
> PUSH_DATA (push, 1);
> PUSH_DATA (push, 3); /* 7 */
> PUSH_DATA (push, 1);
> +
> + /* MS coordinate adjustment information */
> + for (int i = 1; i <= 8; i *= 2) {
> + BEGIN_1IC0(push, NVC0_3D(CB_POS), 3);
> + PUSH_DATA (push, NVC0_CB_AUX_MS_ADJ_INFO(i));
> + int ms_x = 0, ms_y = 0;
> + switch (i) {
> + case 1: break;
> + case 2: ms_x = 1; break;
> + case 4: ms_x = 1; ms_y = 1; break;
> + case 8: ms_x = 2; ms_y = 1; break;
> + }
> + PUSH_DATA(push, ms_x);
> + PUSH_DATA(push, ms_y);
> + }
> }
> BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1);
> PUSH_DATA (push, 0);
> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> index 8aa8d4936f..b7af7ab0d2 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> @@ -168,6 +168,28 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
> PUSH_DATA (push, 3); /* 7 */
> PUSH_DATA (push, 1);
>
> + /* MS coordinate adjustment information */
> + for (int i = 1; i <= 8; i *= 2) {
> + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 4);
> + PUSH_DATA (push, 8);
> + PUSH_DATA (push, 1);
> + PUSH_DATAh(push, address + NVC0_CB_AUX_MS_ADJ_INFO(i));
> + PUSH_DATA (push, address + NVC0_CB_AUX_MS_ADJ_INFO(i));
> + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 3);
> + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
> +
> + int ms_x = 0, ms_y = 0;
> + switch (i) {
> + case 1: break;
> + case 2: ms_x = 1; break;
> + case 4: ms_x = 1; ms_y = 1; break;
> + case 8: ms_x = 2; ms_y = 1; break;
Is this really necessary? Couldn't you just adjust the generated code to be like
ms_x = (samples+2)>>2
ms_y = samples > 2
And avoid the constbuf bit entirely? [And yeah, this falls down with a
number of samples that isn't 1/2/4/8, but that really shouldn't
happen.]
-ilia
More information about the mesa-dev
mailing list