[Mesa-dev] [PATCH 05/11] nvc0: allow to use more than 7 UBOs for compute on Kepler
Samuel Pitoiset
samuel.pitoiset at gmail.com
Mon Feb 29 17:27:51 UTC 2016
On 02/27/2016 11:26 PM, Ilia Mirkin wrote:
> On Sat, Feb 27, 2016 at 9:02 AM, Samuel Pitoiset
> <samuel.pitoiset at gmail.com> wrote:
>> The launch descriptor only allows to set up 8 CBs, but OpenGL
>> requires at least 14 UBOs. To bypass this limitation, we store
>> the addrs into the driver constbuf and we directly load from
>> the global memory.
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>> .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 +
>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 22 +++++++++++++++++++
>> src/gallium/drivers/nouveau/nvc0/nvc0_context.h | 6 +++++-
>> src/gallium/drivers/nouveau/nvc0/nvc0_program.c | 1 +
>> src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 25 ++++++++++++++++++++++
>> 5 files changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> index 479e426..a66aa67 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> @@ -183,6 +183,7 @@ struct nv50_ir_prog_info
>> uint16_t sampleInfoBase; /* base address for sample positions */
>> uint8_t msInfoCBSlot; /* cX[] used for multisample info */
>> uint16_t msInfoBase; /* base address for multisample info */
>> + uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */
>> } io;
>>
>> /* driver callback to assign input/output locations */
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index d6dfed3..2928963 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -1997,6 +1997,28 @@ NVC0LoweringPass::visit(Instruction *i)
>> i->setIndirect(0, 0, ptr);
>> i->subOp = NV50_IR_SUBOP_LDC_IS;
>> }
>> +
>> + if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
>> + prog->getType() == Program::TYPE_COMPUTE) {
>> + /* The launch descriptor only allows to set up 8 CBs, but OpenGL
>> + * requires at least 14 UBOs. To bypass this limitation, we store
>> + * the addrs into the driver constbuf and we directly load from the
>> + * global memory. */
>> + if (i->getSrc(0)->reg.fileIndex >= 7) {
>> + uint32_t addr = prog->driver->io.uboInfoBase;
>> + uint8_t b = prog->driver->io.resInfoCBSlot;
>> +
>> + addr += (i->getSrc(0)->reg.fileIndex % 7) * 0x8;
>
> I think you wanted - 7 here.
>
>> +
>> + Instruction *ld = bld.mkLoad(TYPE_U64, bld.getSSA(8, FILE_GPR),
>> + bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, addr), NULL);
>> +
>> + bld.mkLoad(i->dType, i->getDef(0),
>> + bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0),
>> + ld->getDef(0));
>> + bld.remove(i);
>
> So... let's say I make a UBO array with indirect block indexing...
> what do you do? (Hint: this won't work.) More interestingly, what does
> the blob do?
>
> Right now you're totally ignoring the indirect ptr for these >= 7
> things. But even if you did it "properly", if I create a ubo block
> array that spans the "real cb" and "fake cb" boundary... not sure what
> to do.
Yes, this has to be improved. I'll write a test and trace the blob to see
how it handles this weird case.
>
>> + }
>> + }
>> } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
>> assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
>> i->op = OP_VFETCH;
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
>> index dcb0bda..06c1fc6 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
>> @@ -91,7 +91,8 @@
>> #define NVC0_BIND_CP_SCREEN 51
>> #define NVC0_BIND_CP_QUERY 52
>> #define NVC0_BIND_CP_BUF 53
>> -#define NVC0_BIND_CP_COUNT 54
>> +#define NVC0_BIND_CP_UBO 54
>> +#define NVC0_BIND_CP_COUNT 55
>>
>> /* bufctx for other operations */
>> #define NVC0_BIND_2D 0
>> @@ -116,6 +117,9 @@
>> /* 8 sets of 32-buts pairs MS offsets */
>> #define NVC0_CB_AUX_MS_INFO 0x100 /* CP */
>> #define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4)
>> +/* 7 sets of 32-bits integer addrs */
>> +#define NVC0_CB_AUX_UBO_INFO 0x140 /* CP */
>> +#define NVC0_CB_AUX_UBO_SIZE (7 * 2 * 4)
>> /* 8 sets of 32-bits integer pairs sample offsets */
>> #define NVC0_CB_AUX_SAMPLE_INFO 0x180 /* FP */
>> #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2)
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
>> index afb909c..aba0eda 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
>> @@ -544,6 +544,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
>> info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
>> info->io.suInfoBase = NVC0_CB_AUX_SUF_INFO(0);
>> info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
>> + info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO;
>> } else {
>> info->io.resInfoCBSlot = 15;
>> info->io.suInfoBase = 0; /* TODO */
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> index 557dbdc..2640e0f 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> @@ -486,7 +486,9 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>> const uint *grid_layout)
>> {
>> const struct nvc0_screen *screen = nvc0->screen;
>> + struct nouveau_pushbuf *push = nvc0->base.pushbuf;
>> const struct nvc0_program *cp = nvc0->compprog;
>> + uint32_t address;
>> unsigned i;
>>
>> nve4_cp_launch_desc_init_default(desc);
>> @@ -521,6 +523,29 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>> }
>> nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
>> NVC0_CB_AUX_INFO(5), 1 << 10);
>> +
>> + address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
>> +
>> + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> + PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO);
>> + PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO);
>> + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> + PUSH_DATA (push, 7 * 2 * 4);
>
> I'd very much advise against hardcoding the 7 here.
How about (NVC0_MAX_PIPE_CONSTBUFS - i) ?
Or do you prefer a new constant?
>
>> + PUSH_DATA (push, 0x1);
>> + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7 * 2);
>> + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
>> +
>> + for (; i < NVC0_MAX_PIPE_CONSTBUFS; i++) {
>> + struct nv04_resource *res = nv04_resource(nvc0->constbuf[5][i].u.buf);
>> + if (res) {
>> + PUSH_DATA (push, res->address + nvc0->constbuf[5][i].offset);
>> + PUSH_DATAh(push, res->address + nvc0->constbuf[5][i].offset);
>> + BCTX_REFN(nvc0->bufctx_cp, CP_UBO, res, RD);
>
> Did I miss the spot where you clear out this bufctx bin?
No, you didn't miss it; you're right. I forgot to call
nouveau_bufctx_reset() at the right place.
>
>> + } else {
>> + PUSH_DATA (push, 0);
>> + PUSH_DATA (push, 0);
>> + }
>> + }
>> }
>>
>> static inline struct nve4_cp_launch_desc *
>> --
>> 2.7.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
-Samuel
More information about the mesa-dev
mailing list