[Mesa-dev] [PATCH] nv50/ir, nvc0: use constant buffers for compute when possible on Kepler+
Ilia Mirkin
imirkin at alum.mit.edu
Fri Aug 3 22:15:54 UTC 2018
Oh. That's the base relative to the bo offset, which is a u32. So it's
actually OK.
On Fri, Aug 3, 2018 at 6:06 PM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> Yeah
>
> "base" in nve4_cp_launch_desc_set_cb() and
> gp100_cp_launch_desc_set_cb() are uint32_t too. They should probably
> be updated as well.
>
> On Fri, Aug 3, 2018 at 11:04 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
>> On Fri, Aug 3, 2018 at 5:57 PM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
>>> Previously, UBOs were implemented using global memory loads for compute on
>>> Kepler+ because it only supported 8 constant buffers on compute. This
>>> required bounds checking and expensive load instructions.
>>>
>>> However, 6 of the constant buffer bindings were left unused. This patch
>>> uses them instead of loading from global memory in the shader for the
>>> first 6 non-user constant buffers when possible.
>>>
>>> total instructions in shared programs : 5787979 -> 5748677 (-0.68%)
>>> total gprs used in shared programs : 669901 -> 669373 (-0.08%)
>>> total shared used in shared programs : 548832 -> 548832 (0.00%)
>>> total local used in shared programs : 21068 -> 21064 (-0.02%)
>>>
>>> local shared gpr inst bytes
>>> helped 1 0 152 274 274
>>> hurt 0 0 0 0 0
>>>
>>> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
>>> ---
>>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 18 +++++++-------
>>> src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 28 ++++++++++++++++++++++
>>> 2 files changed, 36 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> index 1410cf26c8..0fba96f261 100644
>>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> @@ -2437,18 +2437,16 @@ NVC0LoweringPass::handleLDST(Instruction *i)
>>> assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
>>> }
>>> } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
>>> + int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
>>> + Value *ind = i->getIndirect(0, 1);
>>> +
>>> if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
>>> - prog->getType() == Program::TYPE_COMPUTE) {
>>> + prog->getType() == Program::TYPE_COMPUTE &&
>>> + (fileIndex >= 6 || ind)) {
>>> // The launch descriptor only allows to set up 8 CBs, but OpenGL
>>> - // requires at least 12 UBOs. To bypass this limitation, we store the
>>> - // addrs into the driver constbuf and we directly load from the global
>>> - // memory.
>>> - int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
>>> - Value *ind = i->getIndirect(0, 1);
>>> -
>>> - if (!ind && fileIndex == -1)
>>> - return;
>>> -
>>> + // requires at least 12 UBOs. To bypass this limitation, for constant
>>> + // buffers 7+, we store the addrs into the driver constbuf and we
>>> + // directly load from the global memory.
>>> if (ind) {
>>> // Clamp the UBO index when an indirect access is used to avoid
>>> // loading information from the wrong place in the driver cb.
>>> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>>> index 28460f8cbe..8aa8d4936f 100644
>>> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>>> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>>> @@ -551,6 +551,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
>>> return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
>>> }
>>>
>>> +static void
>>> +nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
>>> +{
>>> + // only user constant buffers 1-6 can be put in the descriptor, the rest are
>>> + // loaded through global memory
>>> + for (int i = 1; i <= 6; i++) {
>>> + if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
>>> + continue;
>>> +
>>> + struct nv04_resource *res =
>>> + nv04_resource(nvc0->constbuf[5][i].u.buf);
>>> +
>>> + uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
>>
>> No compiler error on this one? Ouch. This obviously needs to be a uint64_t.
>>
>>> + uint32_t size = nvc0->constbuf[5][i].size;
>>> + if (gp100)
>>> + gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
>>> + else
>>> + nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
>>> + }
>>> +
>>> + // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
>>> + // nve4_compute_upload_input() does it later
>>> +}
>>> +
>>> static void
>>> nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>>> struct nve4_cp_launch_desc *desc,
>>> @@ -588,6 +612,8 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>>> }
>>> nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
>>> NVC0_CB_AUX_INFO(5), 1 << 11);
>>> +
>>> + nve4_compute_setup_buf_cb(nvc0, false, desc);
>>> }
>>>
>>> static void
>>> @@ -626,6 +652,8 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
>>> }
>>> gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
>>> NVC0_CB_AUX_INFO(5), 1 << 11);
>>> +
>>> + nve4_compute_setup_buf_cb(nvc0, true, desc);
>>> }
>>>
>>> static inline void *
>>> --
>>> 2.14.4
>>>
More information about the mesa-dev
mailing list